struct jstruct_result _jstruct_import(struct json_object *obj, const void *data, const struct jstruct_object_property *properties, struct json_object *errors) { _init_importers(); if (errors != NULL && json_object_get_type(errors) != json_type_array) { return jstruct_error_new(jstruct_error_errors_not_array_or_null, NULL, json_object_get_type(errors)); } const struct jstruct_object_property *property; struct json_object *prop; struct jstruct_result result = JSTRUCT_OK; result.allocated = array_list_new(jstruct_allocated_free); for (property = properties; property->name; ++property) { void *ptr = jstruct_prop_ptr(data, property, JSTRUCT_PROP_PTR_GET_NO_DEREF); struct jstruct_result err; if (json_object_object_get_ex(obj, property->name, &prop)) { if (json_object_get_type(prop) != property->type.json) { err = jstruct_error_new(jstruct_error_incorrect_type, property->name, json_object_get_type(prop)); } else { jstruct_import_importer import = importers[json_type_index(property->type.json)]; err = import(prop, data, ptr, property); } } else { if (!set_null(ptr, property)) { err = jstruct_error_array_add(errors, jstruct_error_not_nullable, property->name, 0); } } jstruct_error_consume(&result, &err, errors, property->name, -1); } if (result.allocated->length == 0) { array_list_free(result.allocated); result.allocated = NULL; } return result; }
/**
 * Release a GFF batch: its text buffer, its record list (each element is
 * released through gff_record_free) and finally the batch structure itself.
 *
 * @param batch Batch to destroy; must not be NULL (asserted).
 */
void gff_batch_free(gff_batch_t* batch) {
    assert(batch);

    /* free(NULL) is a no-op, so the text buffer needs no explicit guard. */
    free(batch->text);

    array_list_free(batch->records, (void *)gff_record_free);
    free(batch);
}
/**
 * Write every alignment of a list to a BAM file, releasing each alignment
 * once written, and then free the list container (without an element
 * callback, since the elements were already released here).
 *
 * @param array_list List of alignment_t pointers.
 * @param bam_file   Destination BAM file.
 */
void write_mapped_read(array_list_t *array_list, bam_file_t *bam_file) {
    const size_t total = array_list_size(array_list);

    for (size_t idx = 0; idx < total; idx++) {
        alignment_t *current = (alignment_t *) array_list_get(idx, array_list);

        LOG_DEBUG("writting bam..\n");

        if (current == NULL) {
            LOG_FATAL_F("alig is NULL, num_items = %lu\n", total);
            continue;
        }

        /* 33 is the second argument the original passes to convert_to_bam. */
        bam1_t *record = convert_to_bam(current, 33);
        bam_fwrite(record, bam_file);
        bam_destroy1(record);
        alignment_free(current);
    }

    if (array_list != NULL) {
        array_list_free(array_list, NULL);
    }
}
/**
 * Walk a key path down from the preferences root and return the JSON object
 * that contains the final (leaf) key.
 *
 * @param thiz     Preferences instance; its obj member is the root.
 * @param key_path Key path to resolve.
 * @param leaf_key Out: newly allocated copy of the last key of the path. It
 *                 is only written when a parent is found; the caller owns it.
 * @return The parent object, or NULL on invalid arguments, parse failure,
 *         an empty key list, a missing intermediate key or allocation
 *         failure.
 */
json_object *
tg_shared_preferences_find_parent_of_leaf(SharedPreferences* thiz,const CHAR* key_path,CHAR** leaf_key)
{
    struct json_object *jso=NULL;
    struct array_list* key_list = NULL;
    INT32 key_list_len = 0;
    INT32 idx = 0;

    return_val_if_fail((thiz&&key_path&&leaf_key),NULL);
    key_list = array_list_new(tg_shared_preferences_key_free);
    return_val_if_fail((key_list),NULL);

    /* Explicit check instead of return_val_if_fail so that key_list is
     * released on failure; an empty list has no leaf to report. */
    if (!tg_shared_preferences_parse_keypath(key_path,key_list,&key_list_len) || key_list_len < 1)
    {
        array_list_free(key_list);
        return NULL;
    }

    /* Descend through every key except the last one. */
    for (jso=thiz->obj; idx<key_list_len-1; idx++)
    {
        jso = json_object_object_get(jso,(CHAR*)array_list_get_idx(key_list,idx));
        if (jso==NULL)
            break;
    }

    if (jso!=NULL)
    {
        CHAR* key = (CHAR*)array_list_get_idx(key_list,key_list_len-1);
        CHAR* copy = NULL;

        if (key!=NULL)
            copy = TG_CALLOC((strlen(key)+1),1);

        if (copy!=NULL)
        {
            strcpy(copy,key);
            *leaf_key = copy;
        }
        else
        {
            /* Missing key or allocation failure: report "not found". */
            jso = NULL;
        }
    }

    array_list_free(key_list);
    return jso;
}
/**
 * Merge every position still stored in the hash table and queue the merged
 * records for output.
 *
 * Each occupied bucket holds the list of record/file links read at one
 * position. The list is merged with merge_position(); on success the merged
 * record is inserted into @p output_list. The list is then freed and its
 * bucket removed from the table.
 *
 * @param positions_read      Hash table of per-position record lists.
 * @param files               Input VCF files.
 * @param shared_options_data Supplies the number of OpenMP threads.
 * @param options_data        Merge options (including num_files).
 * @param output_list         Receives one item per merged record.
 * @return Number of items inserted into @p output_list.
 */
int merge_remaining_interval(kh_pos_t* positions_read, vcf_file_t **files, shared_options_data_t *shared_options_data,
                             merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;

    #pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);

            // Launch merge
            int err_code = 0;
            vcf_record_t *merged = merge_position((vcf_record_file_link **) records_in_position->items,
                                                  records_in_position->size, files, options_data->num_files,
                                                  options_data, &err_code);

            if (!err_code) {
                list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                list_insert_item(item, output_list);
                num_entries += 1;
            }

            // Free empty nodes (lists of records in the same position)
            array_list_free(records_in_position, vcf_record_file_link_free);

            // Deleting a bucket updates bookkeeping shared by the whole table
            // (khash keeps packed flag words and an element count), so
            // deletions issued by different threads must not overlap.
            #pragma omp critical (positions_read_delete)
            {
                kh_del(pos, positions_read, k);
            }
        }
    }

    return num_entries;
}
/**
 * Destroy a VCF batch.
 *
 * The text buffer is released only when mmap_vcf is false; the record list
 * is released with vcf_record_free applied to every element, and then the
 * batch structure itself is freed.
 *
 * @param batch Batch to destroy; must not be NULL (asserted).
 */
void vcf_batch_free(vcf_batch_t* batch) {
    assert(batch);

    /* free(NULL) does nothing, so only the mmap flag has to be tested. */
    if (!mmap_vcf) {
        free(batch->text);
    }

    array_list_free(batch->records, vcf_record_free);
    free(batch);
}
/**
 * Destroy a VCF record together with every string field it points to
 * (chromosome, id, reference, alternate, filter, info, format), its sample
 * list (elements released with free) and the record structure itself.
 *
 * @param record Record to destroy; must not be NULL (asserted).
 */
void vcf_record_free_deep(vcf_record_t *record) {
    assert(record);

    /* Same release order as the field list above. */
    void *owned_fields[] = {
        record->chromosome,
        record->id,
        record->reference,
        record->alternate,
        record->filter,
        record->info,
        record->format
    };
    for (size_t i = 0; i < sizeof owned_fields / sizeof owned_fields[0]; i++) {
        free(owned_fields[i]);
    }

    array_list_free(record->samples, free);
    free(record);
}
/**
 * Element destructor for a list of jstruct_allocated entries.
 *
 * Releases the payload according to the entry's type tag (raw memory via
 * free, array lists via array_list_free) and then frees the entry itself.
 * Entries with any other type tag only have the entry freed.
 *
 * @param data Pointer to a struct jstruct_allocated.
 */
void jstruct_allocated_free(void *data) {
    struct jstruct_allocated *entry = (struct jstruct_allocated *)data;

    if (entry->type == jstruct_allocated_type_raw) {
        free(entry->data);
    } else if (entry->type == jstruct_allocated_type_arraylist) {
        array_list_free((array_list *)entry->data);
    }

    free(entry);
}
void *fastq_reader(void *input) { struct timeval start, end; double time; extern size_t fd_read_bytes; size_t read_bytes; //if (time_on) { start_timer(start); } wf_input_t *wf_input = (wf_input_t *) input; batch_t *new_batch = NULL; batch_t *batch = wf_input->batch; fastq_batch_reader_input_t *fq_reader_input = wf_input->fq_reader_input; array_list_t *reads = array_list_new(10000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); if (fq_reader_input->gzip) { //Gzip fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { fastq_gzread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1); } else { //printf("Gzip Reader for pair-end not implemented\n");; fastq_gzread_bytes_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2); //fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, // fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2); } } else { //Fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { read_bytes = fastq_fread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1); } else { read_bytes = fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1, fq_reader_input->fq_file2); } fd_read_bytes += read_bytes; } size_t num_reads = array_list_size(reads); if (num_reads == 0) { array_list_free(reads, (void *)fastq_read_free); } else { mapping_batch_t *mapping_batch = mapping_batch_new(reads, batch->pair_input->pair_mng); new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, batch->pair_input, batch->preprocess_rna, batch->sw_input, batch->writer_input, batch->mapping_mode, mapping_batch); } //if (time_on) { stop_timer(start, end, time); timing_add(time, FASTQ_READER, timing); } //printf("Read batch %i\n", num_reads); return new_batch; }
/**
 * Destroy a workflow and the arrays, lists and labels it owns.
 *
 * The per-stage pending lists and the completed list are freed without an
 * element callback. Stage labels are only released when num_stages is
 * non-zero.
 *
 * @param wf Workflow to destroy; NULL is accepted and ignored.
 */
void workflow_free(workflow_t *wf) {
    if (!wf) {
        return;
    }

    /* free(NULL) is a no-op, so plain members need no guard. */
    free(wf->stage_times);

    if (wf->pending_items != NULL) {
        for (int stage = 0; stage < wf->num_stages; stage++) {
            array_list_free(wf->pending_items[stage], NULL);
        }
        free(wf->pending_items);
    }

    if (wf->completed_items != NULL) {
        array_list_free(wf->completed_items, NULL);
    }

    if (wf->num_stages && wf->stage_labels) {
        for (int stage = 0; stage < wf->num_stages; stage++) {
            free(wf->stage_labels[stage]);
        }
        free(wf->stage_labels);
    }

    free(wf->producer_label);
    free(wf->consumer_label);
    free(wf->stage_times_mutex);

    free(wf);
}
void *sa_fq_reader(void *input) { sa_wf_input_t *wf_input = (sa_wf_input_t *) input; sa_wf_batch_t *new_wf_batch = NULL; sa_wf_batch_t *curr_wf_batch = wf_input->wf_batch; fastq_batch_reader_input_t *fq_reader_input = wf_input->fq_reader_input; array_list_t *reads = array_list_new(fq_reader_input->batch_size, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); if (fq_reader_input->gzip) { // Gzip fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { fastq_gzread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1); } else { fastq_gzread_bytes_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2); } } else { // Fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { fastq_fread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1); } else { fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1, fq_reader_input->fq_file2); } } size_t num_reads = array_list_size(reads); if (num_reads == 0) { array_list_free(reads, (void *)fastq_read_free); } else { sa_mapping_batch_t *sa_mapping_batch = sa_mapping_batch_new(reads); sa_mapping_batch->bam_format = wf_input->bam_format; new_wf_batch = sa_wf_batch_new(curr_wf_batch->options, curr_wf_batch->sa_index, curr_wf_batch->writer_input, sa_mapping_batch, NULL); } return new_wf_batch; }
int merge_interval(kh_pos_t* positions_read, char *max_chromosome_merged, unsigned long max_position_merged, char **chromosome_order, int num_chromosomes, vcf_file_t **files, shared_options_data_t *shared_options_data, merge_options_data_t *options_data, list_t *output_list) { int num_entries = 0; #pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries) for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) { if (kh_exist(positions_read, k)) { array_list_t *records_in_position = kh_value(positions_read, k); assert(records_in_position); vcf_record_t *record = ((vcf_record_file_link*) array_list_get(0, records_in_position))->record; vcf_record_file_link **links = NULL; int num_links = 0; // Remove positions prior to the last chromosome:position to merge int cmp_chrom = compare_chromosomes(record->chromosome, max_chromosome_merged, chromosome_order, num_chromosomes); if (cmp_chrom < 0 || (cmp_chrom == 0 && compare_positions(record->position, max_position_merged) <= 0)) { links = records_in_position->items; num_links = records_in_position->size; } // Launch merge if (num_links > 0) { // printf("links[0] = %s:%ld in file %s\n", links[0]->record->chromosome, links[0]->record->position, links[0]->file->filename); int err_code = 0; vcf_record_t *merged = merge_position(links, num_links, files, options_data->num_files, options_data, &err_code); if (!err_code) { list_item_t *item = list_item_new(k, MERGED_RECORD, merged); list_insert_item(item, output_list); num_entries += 1; } // Free empty nodes (lists of records in the same position) array_list_free(records_in_position, vcf_record_file_link_free); kh_del(pos, positions_read, k); } } // End kh_exist } return num_entries; }
/**
 * Store a batch of variant statistics in a SQLite database.
 *
 * Every entry of @p stats_batch is converted into a database-field record,
 * the records are inserted in one call, and the temporary list is released
 * together with its elements.
 *
 * @param db           Open database handle.
 * @param num_variants Number of entries in @p stats_batch.
 * @param stats_batch  Variant statistics to store.
 */
static void report_vcf_variant_stats_sqlite3(sqlite3 *db, int num_variants, variant_stats_t **stats_batch) {
    array_list_t *rows = array_list_new(num_variants + 1, 1.1, COLLECTION_MODE_ASYNCHRONIZED);

    int idx = 0;
    while (idx < num_variants) {
        variant_stats_t *vs = stats_batch[idx];
        variant_stats_db_fields_t *row = variant_stats_db_fields_new(
                vs->chromosome, vs->position, vs->ref_allele, vs->alt_alleles,
                vs->maf_allele, vs->maf, vs->mgf_genotype, vs->mgf,
                vs->missing_alleles, vs->missing_genotypes, vs->mendelian_errors, vs->is_indel,
                vs->cases_percent_dominant, vs->controls_percent_dominant,
                vs->cases_percent_recessive, vs->controls_percent_recessive);
        array_list_insert(row, rows);
        idx++;
    }

    insert_variant_stats_db_fields_list(rows, db);
    array_list_free(rows, (void *)variant_stats_db_fields_free);
}
/* require interface */
/**
 * Resolve a key path starting at the preferences root and return the JSON
 * object it designates.
 *
 * @param thiz     Preferences instance; its obj member is the root.
 * @param key_path Key path to resolve; "/" designates the root itself.
 * @return The object found, or NULL on invalid arguments, parse failure or
 *         when any key along the path is missing.
 */
json_object *
tg_shared_preferences_find_leaf_obj(SharedPreferences* thiz,const CHAR* key_path)
{
    struct json_object *jso=NULL;
    struct array_list* key_list = NULL;
    INT32 key_list_len = 0;
    INT32 idx = 0;

    return_val_if_fail((thiz&&key_path),NULL);
    if (strcmp(key_path,"/")==0)
        return thiz->obj;

    key_list = array_list_new(tg_shared_preferences_key_free);
    return_val_if_fail((key_list),NULL);

    /* Explicit check instead of return_val_if_fail so that key_list is
     * released when the path cannot be parsed. */
    if (!tg_shared_preferences_parse_keypath(key_path,key_list,&key_list_len))
    {
        array_list_free(key_list);
        return NULL;
    }

    /* Descend one key at a time; stop as soon as a key is missing. */
    for (jso=thiz->obj; idx<key_list_len && jso; idx++)
    {
        jso = json_object_object_get(jso,(CHAR*)array_list_get_idx(key_list,idx));
    }

    array_list_free(key_list);
    return jso;
}
static int filesystem_getattr(const char *path, struct stat *statbuf) { int retstat; logging_log("Filesystem", LOGGING_LEVEL_INFO, "filesystem_getattr(path=\"%s\", statbuf=0x%08x)", path, statbuf); struct timeval tv; gettimeofday(&tv, NULL); struct timespec ts; ts.tv_nsec = tv.tv_usec * 1000; ts.tv_sec = tv.tv_sec; statbuf->st_uid = getuid(); statbuf->st_size = 0; statbuf->st_rdev = 0; statbuf->st_nlink = 0; statbuf->st_mtime = ts.tv_sec; statbuf->st_mtimensec = ts.tv_nsec; statbuf->st_ino = 0; statbuf->st_gid = getgid(); statbuf->st_dev = 0; statbuf->st_ctime = ts.tv_sec; statbuf->st_ctimensec = ts.tv_nsec; statbuf->st_blocks = 0; statbuf->st_blksize = 0; statbuf->st_atime = ts.tv_sec; statbuf->st_atimensec = ts.tv_nsec; path_t *path_parsed = path_parse(path); logging_log("Filesystem", LOGGING_LEVEL_INFO, "path_parsed->parts_length: %lu...", path_parsed->parts_length); switch(path_parsed->parts_length) { case 0: { logging_log("Filesystem", LOGGING_LEVEL_INFO, "filesystem_getattr() - /..."); statbuf->st_mode = configuration->filesystem_directory_mode; retstat = 0; break; } case 1: { if(!strcmp(path_parsed->parts[0], "search")) { retstat = 0; statbuf->st_mode = configuration->filesystem_directory_mode; break; } // if(path_parsed->parts[0][0] == '.') // retstat = -1; // else { // statbuf->st_mode = FILESYSTEM_DIRECTORY_MODE; // retstat = 0; // } pthread_mutex_lock(filesystem_io_mutex); ArrayList *search_dirs = searcher_get_searches(); pthread_mutex_unlock(filesystem_io_mutex); if(search_dirs != NULL) { retstat = -1; for(size_t i = 0; i < array_list_get_length(search_dirs); i++) { char *name; array_list_get(search_dirs, (const void **)&name, i); if(!strcmp(path_parsed->parts[0], name)) { retstat = 0; statbuf->st_mode = configuration->filesystem_directory_mode; goto for_end; } } for_end: ; array_list_free(search_dirs); } else retstat = -1; break; } case 2: { if(!strcmp(path_parsed->parts[0], "search")) if(strlen(path_parsed->parts[0]) > 3) { ALDictionary *value = 
searcher_file_name_url_dictionary_get(path_parsed->parts[1]); pthread_mutex_lock(filesystem_io_mutex); searcher_add_search(path_parsed->parts[1], value); pthread_mutex_unlock(filesystem_io_mutex); retstat = 0; statbuf->st_mode = configuration->filesystem_file_mode; break; } pthread_mutex_lock(filesystem_io_mutex); ALDictionary *results = searcher_get_search_results(path_parsed->parts[0]); if(results != NULL) { logging_log("Filesystem", LOGGING_LEVEL_INFO, "results != NULL..."); char result; char *url = (char*)al_dictionary_get(results, &result, path_parsed->parts[1]); if(result) { logging_log("Filesystem", LOGGING_LEVEL_INFO, "Invalid file %s...", path); retstat = -1; } else { logging_log("Filesystem", LOGGING_LEVEL_INFO, "Url is %s...", url); retstat = 0; statbuf->st_mode = configuration->filesystem_file_mode; statbuf->st_size = downloader_file_size_try_get(url); } } else retstat = -1; pthread_mutex_unlock(filesystem_io_mutex); break; } default: { break; } } path_free(path_parsed); logging_log("Filesystem", LOGGING_LEVEL_INFO, "filesystem_getattr() finished..."); return retstat; }
/**
 * Shallow destructor for a VCF record: releases the sample list (elements
 * are released with free) and the record structure. The record's other
 * fields are not freed here.
 *
 * @param record Record to destroy; must not be NULL (asserted).
 */
void vcf_record_free(vcf_record_t *record)
{
    assert(record);

    array_list_free(
        record->samples,
        free
    );

    free(record);
}
/**
 * Seeding stage: for every target read of the batch, turn the anchors stored
 * in its mapping list into seed regions, according to the flag attached to
 * that list.
 *
 * The targets array is compacted in place: only reads that still have work
 * for the next stage are kept, and mapping_batch->num_targets is updated to
 * the new count. mapping_batch->num_to_do is reset to 0.
 *
 * @param input Seeker input; supplies seed sizes and the BWT index/options.
 * @param batch Batch whose mapping_batch is processed.
 * @return CAL_STAGE.
 */
int apply_seeding(region_seeker_input_t* input, batch_t *batch) {
  mapping_batch_t *mapping_batch = batch->mapping_batch;
  size_t num_mappings;
  int seed_size = input->cal_optarg_p->seed_size;
  size_t min_seed_size = input->cal_optarg_p->min_seed_size;
  size_t num_targets = mapping_batch->num_targets;
  size_t *targets = mapping_batch->targets;
  size_t new_num_targets = 0;
  fastq_read_t *read;
  int min_intron_size = 40;
  int target;
  bwt_anchor_t *bwt_anchor = NULL;
  region_t *region;
  int gap_nt;
  int start_search;
  int end_search;

  // set to zero
  mapping_batch->num_to_do = 0;

  //TODO: omp parallel for !!

  // Scratch list that receives the anchors removed from a mapping list.
  array_list_t *array_list_aux = array_list_new(256, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  //Flag 0: The read has simple anchor or any, and need seeds and normal Cal_Seeker
  //Flag 1: The read has double anchor and the gap is smaller than MIN_INTRON_SIZE. Cal_Seeker will be make one CAL
  //Flag 2: The read has double anchor but the gap is bigger than MIN_INTRON_SIZE.
  for (size_t i = 0; i < num_targets; i++) {
    read = array_list_get(targets[i], mapping_batch->fq_batch);

    if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 ||
        array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      //Flag 0 Case, Not anchors found, Make normal seeds
      // Move every anchor out of the mapping list (into the scratch list) so
      // the list can receive the seeds computed below.
      // NOTE(review): these anchors stay in array_list_aux until the final
      // array_list_free(array_list_aux, NULL); presumably that call does not
      // free the elements, in which case they leak - confirm.
      for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) {
        bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
        array_list_insert(bwt_anchor, array_list_aux);
      }

      num_mappings = 0;
      num_mappings = bwt_map_exact_seeds_seq(0, 0, read->sequence, seed_size, min_seed_size,
                                             input->bwt_optarg_p, input->bwt_index_p,
                                             mapping_batch->mapping_lists[targets[i]], 0);
      // Keep the read as a target only if seeding produced something.
      if (num_mappings > 0) {
        array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]);
        targets[new_num_targets++] = targets[i];
      }
    } else if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      // NOTE(review): flag == 1 is already accepted by the condition above,
      // so this branch looks unreachable (assuming array_list_get_flag has no
      // side effects) - confirm which branch was meant to handle flag 1.
      //Flag 1 Case, One anchor found, Make displacements seeds
      printf("***** Case 1. One anchor found!\n");
      for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) {
        bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
        array_list_insert(bwt_anchor, array_list_aux);
      }

      // bwt_anchor is the last anchor removed by the loop above.
      int anchor_nt = bwt_anchor->end - bwt_anchor->start;
      int seed_id = 0;
      int seed_start, seed_end;
      int extra_seed;

      // Choose the part of the read not covered by the anchor.
      if ((bwt_anchor->type == FORWARD_ANCHOR && bwt_anchor->strand == 0) ||
          (bwt_anchor->type == BACKWARD_ANCHOR && bwt_anchor->strand == 1 )) {
        start_search = anchor_nt + 1;
        end_search = read->length - 1;
        extra_seed = EXTRA_SEED_END;
      } else {
        start_search = 0;
        end_search = read->length - anchor_nt - 2;
        extra_seed = EXTRA_SEED_START;
      }

      printf("end_start %i - start_search %i = %i >= seed_size %i\n", end_search, start_search, end_search - start_search, seed_size);
      if (end_search - start_search >= seed_size) {
        // The call to bwt_map_exact_seeds_between_coords(start_search,
        // end_search, ..., extra_seed, &seed_id) is disabled; only the search
        // range is printed.
        printf("00 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
      }

      // Read coordinates covered by the anchor itself.
      if (bwt_anchor->type == FORWARD_ANCHOR) {
        seed_id = 0;
        seed_start = 0;
        seed_end = anchor_nt;
      } else {
        seed_id += 1;
        seed_start = read->length - anchor_nt - 1;
        seed_end = read->length - 1;
      }

      // Re-insert every saved anchor as a seed region (this inner bwt_anchor
      // shadows the outer variable).
      for (int j = 0; j < array_list_size(array_list_aux); j++) {
        bwt_anchor_t *bwt_anchor = array_list_get(j, array_list_aux);
        region = region_bwt_new(bwt_anchor->chromosome + 1,
                                bwt_anchor->strand,
                                bwt_anchor->start,
                                bwt_anchor->end,
                                seed_start,
                                seed_end,
                                read->length,
                                seed_id);
        array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);
      }

      array_list_clear(array_list_aux, (void *)bwt_anchor_free);
      array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]);
      targets[new_num_targets++] = targets[i];
    } else {
      //Flag 2 Case, Pair of anchors found
      printf("***** Case 2. Double anchor found!\n");
      bwt_anchor_t *bwt_anchor;
      bwt_anchor_t *bwt_anchor_forw, *bwt_anchor_back;
      int read_nt, genome_nt;
      int distance;
      int found = 0;
      region_t *region;
      int seed_id = 0;

      // Indices (of the forward element of each pair) of the pairs whose
      // distance is NOT below min_intron_size.
      int *anchors_targets = (int *)calloc(array_list_size(mapping_batch->mapping_lists[targets[i]]), sizeof(int));
      int num = 0;

      //Search if one anchor is at the same distance from the reference and the read
      // The list is read as consecutive (forward, backward) pairs.
      for (int b = 0; b < array_list_size(mapping_batch->mapping_lists[targets[i]]); b += 2) {
        bwt_anchor_forw = array_list_get(b, mapping_batch->mapping_lists[targets[i]]);
        bwt_anchor_back = array_list_get(b + 1, mapping_batch->mapping_lists[targets[i]]);

        // Nucleotides of the read not covered by either anchor.
        read_nt = read->length - ((bwt_anchor_forw->end - bwt_anchor_forw->start) + (bwt_anchor_back->end - bwt_anchor_back->start));
        // Distance between the two anchors on the genome.
        genome_nt = bwt_anchor_back->start - bwt_anchor_forw->end;
        distance = abs(genome_nt - read_nt);

        if (distance < min_intron_size) {
          found = 1;
        } else {
          anchors_targets[num++] = b;
        }
      }

      if (found) {
        // At least one pair is closer than min_intron_size: delete all the
        // other pairs, from the highest index down so that the remaining
        // indices stay valid, and mark the list with flag 1.
        for (int t = num - 1; t >= 0; t--) {
          target = anchors_targets[t];
          bwt_anchor = array_list_remove_at(target + 1, mapping_batch->mapping_lists[targets[i]]);
          bwt_anchor_free(bwt_anchor);

          bwt_anchor = array_list_remove_at(target, mapping_batch->mapping_lists[targets[i]]);
          bwt_anchor_free(bwt_anchor);
        }
        array_list_set_flag(1, mapping_batch->mapping_lists[targets[i]]);
      } else {
        //Seeding between anchors
        array_list_t *anchors_forward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]), 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
        array_list_t *anchors_backward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]), 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

        int big_gap = 0;
        int final_anchor_nt = 0;
        int anchor_nt;
        int anchor_type;
        int anchor_strand;

        // Empty the mapping list pair by pair (from the end), splitting the
        // anchors into a forward and a backward list, and remember the pair
        // with the largest uncovered gap in the read.
        for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j -= 2) {
          bwt_anchor_back = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
          array_list_insert(bwt_anchor_back, anchors_backward);

          bwt_anchor_forw = array_list_remove_at(j - 1, mapping_batch->mapping_lists[targets[i]]);
          array_list_insert(bwt_anchor_forw, anchors_forward);

          if (bwt_anchor_forw->strand == 0) {
            anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
            gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));
          } else {
            anchor_nt = bwt_anchor_back->end - bwt_anchor_back->start;
            gap_nt = read->length - (anchor_nt + (bwt_anchor_forw->end - bwt_anchor_forw->start));
          }

          if (gap_nt < 0) { gap_nt = 0; }

          if (gap_nt > big_gap) {
            big_gap = gap_nt;
            final_anchor_nt = anchor_nt;
            anchor_type = bwt_anchor_back->type;
            anchor_strand = bwt_anchor_back->strand;
          }
        }

        printf("%i, %i\n", big_gap - 2, seed_size);
        if (big_gap - 2 > seed_size) {
          // A strand/type dependent choice of the search range is disabled in
          // the original; the range below is always used.
          start_search = final_anchor_nt + 1;
          end_search = final_anchor_nt + big_gap - 1;

          // The call to bwt_map_exact_seeds_between_coords(start_search,
          // end_search, ..., EXTRA_SEED_NONE, &seed_id) is disabled; only the
          // search range is printed.
          printf("11 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
        }

        // Turn every (forward, backward) anchor pair into two seed regions.
        for (int a = 0; a < array_list_size(anchors_forward); a++) {
          //Insert the last anchor. (Create new seed)
          bwt_anchor_forw = array_list_get(a, anchors_forward);
          bwt_anchor_back = array_list_get(a, anchors_backward);

          anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
          gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));

          if (gap_nt < 0) {
            // The anchors overlap on the read: shrink both by the overlap
            // (gap_nt is negative here).
            bwt_anchor_forw->end += gap_nt;
            bwt_anchor_back->start -= gap_nt;
            anchor_nt += gap_nt;
            gap_nt = 0;
          } else if (gap_nt == 0) {
            // The anchors are adjacent: open a one-nucleotide gap.
            bwt_anchor_forw->end -= 1;
            bwt_anchor_back->start += 1;
            anchor_nt -= 1;
            gap_nt = 1;
          }

          // Forward anchor covers read positions [0, anchor_nt].
          region = region_bwt_new(bwt_anchor_forw->chromosome + 1,
                                  bwt_anchor_forw->strand,
                                  bwt_anchor_forw->start,
                                  bwt_anchor_forw->end,
                                  0,
                                  anchor_nt,
                                  read->length,
                                  0);
          array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);

          // Backward anchor covers the rest of the read after the gap.
          region = region_bwt_new(bwt_anchor_back->chromosome + 1,
                                  bwt_anchor_back->strand,
                                  bwt_anchor_back->start,
                                  bwt_anchor_back->end,
                                  anchor_nt + gap_nt,
                                  read->length - 1,
                                  read->length,
                                  seed_id + 1);
          array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);

          bwt_anchor_free(bwt_anchor_back);
          bwt_anchor_free(bwt_anchor_forw);
        }

        // The anchors were freed one by one above, hence no element callback.
        array_list_free(anchors_forward, NULL);
        array_list_free(anchors_backward, NULL);

        array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]);
      }

      free(anchors_targets);
      targets[new_num_targets++] = targets[i];
    }
  }

  mapping_batch->num_targets = new_num_targets;
  array_list_free(array_list_aux, NULL);

  return CAL_STAGE;
}
/**
 * Destroy a VCF header entry: its name, its list of values (elements are
 * released with free) and the entry structure itself.
 *
 * @param header_entry Entry to destroy; must not be NULL (asserted).
 */
void vcf_header_entry_free(vcf_header_entry_t *header_entry)
{
    assert(header_entry);

    free(header_entry->name);

    array_list_free(
        header_entry->values,
        free
    );

    free(header_entry);
}
/**
 * Smith-Waterman stage for bisulfite mapping: fill the gaps of the seed
 * regions for both mapping-list sets and replace every CAL list with the
 * list of alignments built from it.
 *
 * For each target read of each set (mapping_lists/targets, then
 * mapping_lists2/targets2), every CAL whose normalised score exceeds
 * input->min_score yields one single-end alignment carrying the optional
 * fields AS, NH and NM. The CAL list is freed and its slot in the mapping
 * lists is overwritten with the alignment list.
 *
 * @param input Stage input; supplies both genomes, the SW options and the
 *              minimum score.
 * @param batch Batch whose mapping_batch is processed.
 * @return BS_POST_PAIR_STAGE.
 */
int apply_sw_bs_4nt(sw_server_input_t* input, batch_t *batch) {
  mapping_batch_t *mapping_batch = batch->mapping_batch;
  genome_t *genome1 = input->genome1_p;
  genome_t *genome2 = input->genome2_p;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  // NOTE(review): this block prints a fixed region of both genomes to stdout
  // on every call and looks like debugging residue - confirm whether it can
  // be removed.
  {
    char r[1024];
    size_t start = 169312417;
    size_t end = start + 99;
    genome_read_sequence_by_chr_index(r, 0, 0, &start, &end, genome2);
    printf("+++++++++++++ genome2 = %s \n", r);
    genome_read_sequence_by_chr_index(r, 0, 0, &start, &end, genome1);
    printf("+++++++++++++ genome1 = %s \n", r);
  }

  // fill gaps between seeds
  fill_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 5, 1);
  merge_seed_regions_bs(mapping_batch, 1);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 400, 1);

  fill_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 5, 0);
  merge_seed_regions_bs(mapping_batch, 0);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 400, 0);

  // now we can create the alignments
  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  char *match_seq, *match_qual;
  size_t read_index, read_len, match_len, match_start;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals;

  seed_region_t *s;
  cigar_code_t *cigar_code;
  cigar_op_t *first_op;

  float norm_score, min_score = input->min_score;

  alignment_t *alignment;
  array_list_t *alignment_list;

  char *p, *optional_fields;
  int optional_fields_length, AS, NH;

  array_list_t **mapping_lists;
  size_t num_targets;
  size_t *targets;

  for (int bs_id = 0; bs_id < 2; bs_id++) {
    // Select the mapping-list set handled in this pass.
    if (bs_id == 0) {
      mapping_lists = mapping_batch->mapping_lists;
      num_targets = mapping_batch->num_targets;
      targets = mapping_batch->targets;
    } else {
      mapping_lists = mapping_batch->mapping_lists2;
      num_targets = mapping_batch->num_targets2;
      targets = mapping_batch->targets2;
    }

    for (size_t i = 0; i < num_targets; i++) {
      read_index = targets[i];
      read = (fastq_read_t *) array_list_get(read_index, fq_batch);

      cal_list = mapping_lists[read_index];
      num_cals = array_list_size(cal_list);

      if (num_cals <= 0) continue;

      read_len = read->length;
      alignment_list = array_list_new(num_cals, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

      // processing each CAL from this read
      for(size_t j = 0; j < num_cals; j++) {
        // get cal and read index
        cal = array_list_get(j, cal_list);
        if (cal->sr_list->size == 0) continue;

        // The first seed region carries the cigar of the CAL in its info field.
        s = (seed_region_t *) linked_list_get_first(cal->sr_list);
        cigar_code = (cigar_code_t *) s->info;

        norm_score = cigar_code_get_score(read_len, cigar_code);
        LOG_DEBUG_F("score = %0.2f\n", norm_score);

        // filter by SW score
        if (norm_score > min_score) {
          // update cigar and sequence and quality strings
          cigar_code_update(cigar_code);
          // NOTE(review): new_cigar_code_string() presumably allocates; if so
          // the string built for this log line is never released - confirm.
          LOG_DEBUG_F("\tcigar code = %s\n", new_cigar_code_string(cigar_code));

          match_len = cigar_code_nt_length(cigar_code);
          // A leading 'H' operation shifts the start of the matched sequence.
          first_op = cigar_code_get_first_op(cigar_code);
          match_start = (first_op && first_op->name == 'H' ? first_op->number : 0);

          match_seq = (char *) malloc((match_len + 1)* sizeof(char));
          memcpy(match_seq, &read->sequence[match_start], match_len);
          match_seq[match_len] = 0;

          match_qual = (char *) malloc((match_len + 1)* sizeof(char));
          memcpy(match_qual, &read->quality[match_start], match_len);
          match_qual[match_len] = 0;

          // set optional fields: each one is a 3-byte tag followed by the raw
          // bytes of an int
          optional_fields_length = 100;
          optional_fields = (char *) calloc(optional_fields_length, sizeof(char));

          p = optional_fields;

          // Scale first, then truncate: casting norm_score before the
          // multiplication would leave only 0 or 100.
          AS = (int) (norm_score * 100);
          sprintf(p, "ASi");
          p += 3;
          memcpy(p, &AS, sizeof(int));
          p += sizeof(int);

          // Convert to int before copying so that the bytes written do not
          // depend on the layout of size_t.
          NH = (int) num_cals;
          sprintf(p, "NHi");
          p += 3;
          memcpy(p, &NH, sizeof(int));
          p += sizeof(int);

          sprintf(p, "NMi");
          p += 3;
          memcpy(p, &cigar_code->distance, sizeof(int));
          p += sizeof(int);

          assert(read->length == cigar_code_nt_length(cigar_code));

          // create an alignment and insert it into the list
          alignment = alignment_new();

          // The alignment is named after the read id up to its first blank.
          size_t header_len = strlen(read->id);
          char *head_id = (char *) malloc(header_len + 1);
          get_to_first_blank(read->id, header_len, head_id);

          alignment_init_single_end(head_id, match_seq, match_qual,
                                    cal->strand, cal->chromosome_id - 1, cal->start - 1,
                                    new_cigar_code_string(cigar_code),
                                    cigar_code_get_num_ops(cigar_code),
                                    norm_score * 254, 1, (num_cals > 1),
                                    optional_fields_length, optional_fields, alignment);

          array_list_insert(alignment, alignment_list);

          LOG_DEBUG_F("creating alignment (bs_id = %i)...\n", bs_id);
        }
      }

      // free the cal list, and update the mapping list with the alignment list
      array_list_free(cal_list, (void *) cal_free);
      mapping_lists[read_index] = alignment_list;
    }
  }

  // go to the next stage
  return BS_POST_PAIR_STAGE;
}
// Counts the positions at which query and ref differ over their first len
// characters. *first and *last receive the index of the first and of the last
// mismatch, or -1 when no mismatch is found.
static int fill_gaps_count_mismatches(const char *query, const char *ref, size_t len,
                                      int *first, int *last) {
  int mismatches = 0;
  *first = -1;
  *last = -1;
  for (size_t k = 0; k < len; k++) {
    if (query[k] != ref[k]) {
      mismatches++;
      if (*first == -1) *first = (int) k;
      *last = (int) k;
    }
  }
  return mismatches;
}

// Returns the sequence to compare against the genome for the given strand:
// the sequence itself when strand is 0, otherwise its reverse complement,
// which is built on first use and cached in *revcomp_seq (the caller frees it).
static char *fill_gaps_strand_sequence(char *sequence, size_t len, int strand,
                                       char **revcomp_seq) {
  if (!strand) return sequence;
  if (*revcomp_seq == NULL) {
    *revcomp_seq = strdup(sequence);
    seq_reverse_complementary(*revcomp_seq, len);
  }
  return *revcomp_seq;
}

/**
 * Fills the gaps left between (and around) the seed regions of every CAL of
 * the target reads of a mapping batch.
 *
 * For each CAL, every seed region gets a plain 'M' cigar, and a new seed
 * region is inserted for each gap found before the first seed, between two
 * consecutive seeds and after the last seed:
 *   - a gap at a read end longer than min_gap gets an 'H' cigar;
 *   - a middle gap with no read bases is merged into the previous seed as a
 *     'D' operation;
 *   - a gap whose read and genome lengths match, and whose mismatch count is
 *     below the distance threshold, gets an 'M' cigar;
 *   - any other gap is queued for Smith-Waterman (with flanks); all queued
 *     gaps are aligned in a single smith_waterman_mqmr() call and their
 *     cigars are generated from its output.
 *
 * @param mapping_batch batch holding the reads, targets and CAL lists;
 *                      its num_gaps counter is incremented per gap
 * @param sw_optarg     Smith-Waterman options forwarded to smith_waterman_mqmr()
 * @param genome        genome used to read the reference sequences
 * @param min_gap       maximum length of a gap at a read end to be aligned;
 *                      longer end gaps are marked with 'H'
 * @param min_distance  overwritten for every read with read length * 0.2,
 *                      so the value passed in is not used
 */
void fill_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg,
               genome_t *genome, int min_gap, int min_distance) {
  int sw_count = 0;

  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  size_t read_index, read_len;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals, num_targets = mapping_batch->num_targets;

  // reverse complement of the current read, built lazily, freed per read
  char *revcomp_seq = NULL;

  seed_region_t *s, *prev_s, *new_s;
  linked_list_iterator_t* itr;

  cigar_code_t *cigar_code;

  size_t start, end;
  size_t gap_read_start, gap_read_end, gap_read_len;
  size_t gap_genome_start, gap_genome_end, gap_genome_len;

  int left_flank = 0, right_flank = 0;
  sw_prepare_t *sw_prepare;
  array_list_t *sw_prepare_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  char *query, *ref, *strand_seq;
  int distance, first = 0, last = 0;

  // P R E - P R O C E S S
  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_targets; i++) {
    read_index = mapping_batch->targets[i];
    read = (fastq_read_t *) array_list_get(read_index, fq_batch);

    cal_list = mapping_batch->mapping_lists[read_index];
    num_cals = array_list_size(cal_list);

    if (num_cals <= 0) continue;

    read_len = read->length;
    // the distance threshold is always derived from the read length
    min_distance = read_len*0.2;

    LOG_DEBUG_F(">>>>> read %s\n", read->id);

    // processing each CAL from this read
    for(size_t j = 0; j < num_cals; j++) {
      // get cal and read index
      cal = array_list_get(j, cal_list);
      LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, sr_duplicate_list size = %i\n", j, num_cals, cal->strand, cal->sr_list->size, cal->sr_duplicate_list->size);

      prev_s = NULL;
      itr = linked_list_iterator_new(cal->sr_list);
      s = (seed_region_t *) linked_list_iterator_curr(itr);
      while (s != NULL) {
        // set the cigar for the current region
        gap_read_len = s->read_end - s->read_start + 1;
        cigar_code = cigar_code_new();
        cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
        s->info = (void *) cigar_code;

        cigar_code = NULL;
        sw_prepare = NULL;

        if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) {
          distance = 0;
          mapping_batch->num_gaps++;
          if (prev_s == NULL) {
            // gap at the first position
            gap_read_start = 0;
            gap_read_end = s->read_start - 1;

            gap_genome_start = s->genome_start - s->read_start;
            gap_genome_end = s->genome_start - 1;

            gap_read_len = gap_read_end - gap_read_start + 1;
            gap_genome_len = gap_genome_end - gap_genome_start + 1;

            cal->start = gap_genome_start;

            assert(gap_read_len != 0);
            assert(gap_genome_len != 0);

            if (gap_read_len > min_gap) {
              // the gap is too big, may be there's another CAL to cover it
              cigar_code = cigar_code_new();
              cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);
            } else {
              left_flank = 0;
              right_flank = DOUBLE_FLANK;
            }
          } else {
            assert(prev_s->read_end < s->read_start);

            // gap in a middle position
            gap_read_start = prev_s->read_end + 1;
            gap_read_end = s->read_start - 1;

            gap_genome_start = prev_s->genome_end + 1;
            gap_genome_end = s->genome_start - 1;

            gap_read_len = gap_read_end - gap_read_start + 1;
            gap_genome_len = gap_genome_end - gap_genome_start + 1;

            LOG_DEBUG_F("gap (read, genome) = (%i, %i)\n", gap_read_len, gap_genome_len);

            if (gap_genome_len == 0) { printf("#@#: %s\n", read->id); }
            assert(gap_genome_len != 0);

            if (gap_read_len == 0) {
              // there's a deletion just between two consecutives seeds:
              // merge the current seed into the previous one
              cigar_code = (cigar_code_t *)prev_s->info;

              cigar_code_append_op(cigar_op_new(gap_genome_len, 'D'), cigar_code);
              cigar_code->distance += gap_genome_len;

              cigar_code_append_op(cigar_op_new(s->read_end - s->read_start + 1, 'M'), cigar_code);
              cigar_code->distance += ((cigar_code_t *)s->info)->distance;

              prev_s->read_end = s->read_end;
              prev_s->genome_end = s->genome_end;

              LOG_DEBUG_F("prev cigar = %s\n", new_cigar_code_string((cigar_code_t *)prev_s->info));

              // continue loop...
              linked_list_iterator_remove(itr);
              s = linked_list_iterator_curr(itr);
              continue;
            }

            left_flank = SINGLE_FLANK;
            right_flank = SINGLE_FLANK;
          }

          if (!cigar_code) {
            // we have to try to fill this gap and get a cigar
            if (gap_read_len == gap_genome_len) {
              // 1) first, compare read and reference base by base
              start = gap_genome_start;
              end = gap_genome_end;
              ref = (char *) malloc((gap_genome_len + 5) * sizeof(char));
              genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &start, &end, genome);

              // handle strand
              strand_seq = fill_gaps_strand_sequence(read->sequence, read_len, cal->strand, &revcomp_seq);
              query = &strand_seq[gap_read_start];

              distance = fill_gaps_count_mismatches(query, ref, gap_read_len, &first, &last);

              // the reference was only needed for the comparison; it used to leak
              free(ref);
              ref = NULL;

              if (distance < min_distance) {
                cigar_code = cigar_code_new();
                cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
                cigar_code_inc_distance(distance, cigar_code);
              }
            }
            if (!cigar_code) {
              // 2) second, prepare SW to run

              // get query sequence, revcomp if necessary
              size_t read_start = gap_read_start - left_flank;
              size_t read_end = gap_read_end + right_flank;
              int gap_read_len_ex = read_end - read_start + 1;
              query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));

              // handle strand
              strand_seq = fill_gaps_strand_sequence(read->sequence, read_len, cal->strand, &revcomp_seq);
              memcpy(query, &strand_seq[read_start], gap_read_len_ex);
              query[gap_read_len_ex] = '\0';

              // get ref. sequence
              size_t genome_start = gap_genome_start - left_flank;
              size_t genome_end = gap_genome_end + right_flank;
              int gap_genome_len_ex = genome_end - genome_start + 1;
              ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));
              genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &genome_start, &genome_end, genome);
              ref[gap_genome_len_ex] = '\0';

              // query and ref are handed over to sw_prepare
              if (prev_s == NULL) {
                sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, FIRST_SW);
              } else {
                sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, MIDDLE_SW);
              }
              array_list_insert(sw_prepare, sw_prepare_list);

              // increase counter
              sw_count++;

              LOG_DEBUG_F("query: %s\n", query);
              LOG_DEBUG_F("ref : %s\n", ref);
              LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", distance, min_distance, gap_read_len, first, last);
              LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, read->id);
            }
          }

          // insert gap in the list; its cigar stays NULL until the
          // post-process step when the gap goes through SW
          new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
          new_s->info = (void *) cigar_code;
          linked_list_iterator_insert(new_s, itr);

          if (sw_prepare) {
            sw_prepare->seed_region = new_s;
            sw_prepare->cal = cal;
            sw_prepare->read = read;
          }
        }

        // continue loop...
        prev_s = s;
        linked_list_iterator_next(itr);
        s = linked_list_iterator_curr(itr);
      }

      // check for a gap at the last position
      sw_prepare = NULL;
      if (prev_s != NULL && prev_s->read_end < read_len - 1) {
        cigar_code = NULL;
        mapping_batch->num_gaps++;

        // gap at the last position
        gap_read_start = prev_s->read_end + 1;
        gap_read_end = read_len - 1;
        gap_read_len = gap_read_end - gap_read_start + 1;
        assert(gap_read_len != 0);

        gap_genome_len = gap_read_len;
        gap_genome_start = prev_s->genome_end + 1;
        gap_genome_end = gap_genome_start + gap_genome_len - 1;

        cal->end = gap_genome_end;
        assert(gap_genome_len != 0);

        if (gap_read_len > min_gap) {
          // the gap is too big, may be there's another CAL to cover it
          cigar_code = cigar_code_new();
          cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);
        } else {
          // we have to try to fill this gap and get a cigar

          // 1) first, compare read and reference base by base
          start = gap_genome_start;
          end = gap_genome_end;
          ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));
          genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &start, &end, genome);

          // handle strand
          strand_seq = fill_gaps_strand_sequence(read->sequence, read_len, cal->strand, &revcomp_seq);
          query = &strand_seq[gap_read_start];

          distance = fill_gaps_count_mismatches(query, ref, gap_read_len, &first, &last);

          // the reference was only needed for the comparison; it used to leak
          free(ref);
          ref = NULL;

          if (distance < min_distance) {
            cigar_code = cigar_code_new();
            cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
            cigar_code_inc_distance(distance, cigar_code);
          } else {
            // 2) second, prepare SW to run
            left_flank = DOUBLE_FLANK;
            right_flank = 0;

            // get query sequence, revcomp if necessary
            size_t read_start = gap_read_start - left_flank;
            size_t read_end = gap_read_end + right_flank;
            int gap_read_len_ex = read_end - read_start + 1;
            query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));

            // handle strand
            strand_seq = fill_gaps_strand_sequence(read->sequence, read_len, cal->strand, &revcomp_seq);
            memcpy(query, &strand_seq[read_start], gap_read_len_ex);
            query[gap_read_len_ex] = '\0';

            // get ref. sequence
            size_t genome_start = gap_genome_start - left_flank;
            size_t genome_end = gap_genome_end + right_flank;
            int gap_genome_len_ex = genome_end - genome_start + 1;
            ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));
            genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &genome_start, &genome_end, genome);
            // terminate the reference (the terminator used to be written into query)
            ref[gap_genome_len_ex] = '\0';

            // query and ref are handed over to sw_prepare
            sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, LAST_SW);
            array_list_insert(sw_prepare, sw_prepare_list);

            // increase counter
            sw_count++;

            LOG_DEBUG_F("query: %s\n", query);
            LOG_DEBUG_F("ref : %s\n", ref);
            LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", distance, min_distance, gap_read_len, first, last);
            LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, read->id);
          }
        }

        // insert gap in the list
        new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
        new_s->info = (void *) cigar_code;
        linked_list_insert_last(new_s, cal->sr_list);

        if (sw_prepare) {
          sw_prepare->seed_region = new_s;
          sw_prepare->cal = cal;
          sw_prepare->read = read;
        }
      }
      linked_list_iterator_free(itr);
    }

    // free memory
    if (revcomp_seq) {
      free(revcomp_seq);
      revcomp_seq = NULL;
    }
  }

  LOG_DEBUG_F("\nR U N S W (sw_count = %i, sw_prepare_list size = %i)\n", sw_count, array_list_size(sw_prepare_list));
  assert(sw_count == array_list_size(sw_prepare_list));

  // a variable length array must have at least one element,
  // even when no gap was queued for Smith-Waterman
  const int sw_slots = (sw_count > 0 ? sw_count : 1);
  char *q[sw_slots], *r[sw_slots];
  for (int i = 0; i < sw_count; i++) {
    sw_prepare = array_list_get(i, sw_prepare_list);
    q[i] = sw_prepare->query;
    r[i] = sw_prepare->ref;
  }
  sw_multi_output_t *output = sw_multi_output_new(sw_count);

  // run Smith-Waterman
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

  LOG_DEBUG("P O S T - P R O C E S S\n");
  cigar_op_t* cigar_op;
  for (int i = 0; i < sw_count; i++) {
    sw_prepare = array_list_get(i, sw_prepare_list);
    s = sw_prepare->seed_region;

    int read_gap_len = s->read_end - s->read_start + 1;
    int genome_gap_len = s->genome_end - s->genome_start + 1;

    LOG_DEBUG_F("\tgap (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", s->read_start, s->read_end, s->genome_start, s->genome_end, read_gap_len, genome_gap_len, sw_prepare->read->id);
    LOG_DEBUG_F("\tflanks (left, right) = (%i, %i)\n", sw_prepare->left_flank, sw_prepare->right_flank);
    LOG_DEBUG_F("\tquery : %s\n", sw_prepare->query);
    LOG_DEBUG_F("\tref : %s\n", sw_prepare->ref);
    LOG_DEBUG_F("\tmquery: %s (start %i)\n", output->query_map_p[i], output->query_start_p[i]);
    LOG_DEBUG_F("\tmref : %s (start %i)\n", output->ref_map_p[i], output->ref_start_p[i]);

    cigar_code_t *cigar_c = generate_cigar_code(output->query_map_p[i], output->ref_map_p[i], strlen(output->query_map_p[i]), output->query_start_p[i], output->ref_start_p[i], read_gap_len, genome_gap_len, &distance, sw_prepare->ref_type);
    LOG_DEBUG_F("\tscore : %0.2f, cigar: %s (distance = %i)\n", output->score_p[i], new_cigar_code_string(cigar_c), distance);

    // a clipping at the beginning of the gap becomes an insertion when the
    // reference alignment starts at 0, otherwise a match; '=' becomes 'M'
    cigar_op = cigar_code_get_op(0, cigar_c);
    if (cigar_op) {
      if (cigar_op->name == 'H') {
        if (output->ref_start_p[i] == 0) {
          cigar_op->name = 'I';
        } else {
          cigar_op->name = 'M';
        }
      } else if (cigar_op->name == '=') {
        cigar_op->name = 'M';
      }
    }

    // a clipping at the end of the gap becomes an insertion
    cigar_op = cigar_code_get_last_op(cigar_c);
    if (cigar_op && cigar_op->name == 'H') cigar_op->name = 'I';

    LOG_DEBUG_F("gap_read_len = %i, cigar_code_length (%s) = %i\n", read_gap_len, new_cigar_code_string(cigar_c), cigar_code_nt_length(cigar_c));
    assert(read_gap_len == cigar_code_nt_length(cigar_c));

    // and now set the cigar for this gap
    s->info = (void *) cigar_c;

    // free
    sw_prepare_free(sw_prepare);
  }

  display_sr_lists("END of fill_gaps", mapping_batch);

  // free memory (the list items were already released one by one above)
  sw_multi_output_free(output);
  array_list_free(sw_prepare_list, (void *) NULL);
}
/**
 * Converts the BWT anchors stored in list into CALs, in place.
 *
 * The anchors are split by strand and direction, list is emptied, and then
 * it is refilled with CALs following the first rule that applies:
 *   1. anchors covering almost the whole read (read_length - length < 16)
 *      are converted on their own and list is flagged SINGLE_ANCHORS;
 *   2. forward/backward anchors on the same strand and chromosome, close
 *      enough to each other and long enough, are converted in pairs and
 *      list is flagged DOUBLE_ANCHORS (the search stops as soon as list
 *      holds more than 5 CALs);
 *   3. if no pair was found but some anchor was longer than the single
 *      anchor minimum, every anchor sharing strand and direction with the
 *      longest one is converted and list is flagged SINGLE_ANCHORS.
 *
 * @param list         in: BWT anchors; out: the resulting CALs
 * @param read_length  length of the read the anchors belong to
 * @return number of items in list on exit
 */
size_t bwt_search_pair_anchors(array_list_t *list, unsigned int read_length) {
  const int MIN_ANCHOR = 25;
  const int MIN_SINGLE_ANCHOR = 40;
  const int MAX_BWT_REGIONS = 50;
  const int MAX_BWT_ANCHOR_DISTANCE = 500000;

  cal_t *cal;

  // longest anchor seen so far, with its strand and direction
  int best_length = 0;
  int best_strand = 0, best_type = 0;
  int have_single = 0, have_pair = 0;

  array_list_t *backward_anchor_list_0 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *forward_anchor_list_0 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *backward_anchor_list_1 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *forward_anchor_list_1 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *big_anchor_list = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  // classify every anchor by strand and direction
  for (int i = 0; i < array_list_size(list); i++) {
    bwt_anchor_t *anchor = array_list_get(i, list);
    array_list_t *bucket;

    if (anchor->strand == 1) {
      bucket = (anchor->type == FORWARD_ANCHOR) ? forward_anchor_list_1 : backward_anchor_list_1;
    } else {
      bucket = (anchor->type == FORWARD_ANCHOR) ? forward_anchor_list_0 : backward_anchor_list_0;
    }
    array_list_insert(anchor, bucket);

    const int anchor_length = anchor->end - anchor->start + 1;
    if (anchor_length > MIN_SINGLE_ANCHOR && anchor_length > best_length) {
      best_length = anchor_length;
      best_strand = anchor->strand;
      best_type = anchor->type;
      have_single = 1;
    }

    // the anchor covers (almost) the whole read
    if (read_length - anchor_length < 16) {
      array_list_insert(anchor, big_anchor_list);
    }
  }

  // the anchors are now referenced by the per-strand lists
  array_list_clear(list, NULL);

  // 1) big anchors: one CAL per anchor, nothing else is tried
  if (array_list_size(big_anchor_list) > 0) {
    int idx = array_list_size(big_anchor_list);
    while (idx-- > 0) {
      bwt_anchor_t *anchor = array_list_remove_at(idx, big_anchor_list);
      size_t span = anchor->end - anchor->start;

      if (anchor->type == FORWARD_ANCHOR) {
        cal = convert_bwt_anchor_to_CAL(anchor, 0, span);
      } else {
        cal = convert_bwt_anchor_to_CAL(anchor, read_length - span - 1, read_length - 1);
      }
      array_list_insert(cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);
    goto cleanup;
  }

  // 2) pair forward and backward anchors: first the strand-0 lists,
  //    then the strand-1 lists
  for (int pass = 1; pass >= 0; pass--) {
    array_list_t *forward_anchor_list = pass ? forward_anchor_list_0 : forward_anchor_list_1;
    array_list_t *backward_anchor_list = pass ? backward_anchor_list_0 : backward_anchor_list_1;

    // marks for the anchors already used in a pair
    int *used_forward = (int *) calloc(array_list_size(forward_anchor_list), sizeof(int));
    int *used_backward = (int *) calloc(array_list_size(backward_anchor_list), sizeof(int));

    for (int i = 0; i < array_list_size(forward_anchor_list); i++) {
      if (used_forward[i]) continue;
      bwt_anchor_t *forw = array_list_get(i, forward_anchor_list);

      for (int j = 0; j < array_list_size(backward_anchor_list); j++) {
        if (used_backward[j]) continue;
        bwt_anchor_t *back = array_list_get(j, backward_anchor_list);

        const int forw_length = forw->end - forw->start + 1;
        const int back_length = back->end - back->start + 1;

        // discard candidates that cannot form a pair
        if (forw->chromosome != back->chromosome) continue;
        if (abs(back->start - forw->end) > MAX_BWT_ANCHOR_DISTANCE) continue;
        if (forw_length < MIN_ANCHOR || back_length < MIN_ANCHOR) continue;
        if (back->start < forw->end) continue;

        const int gap_read = read_length - (forw_length + back_length);
        const int gap_genome = back->start - forw->end;

        if (gap_read < 2 || gap_genome < 2) {
          // the anchors touch or overlap: shrink both of them by a flank
          int gap;
          if (gap_read < 0 && gap_genome < 0) {
            gap = abs(gap_read) > abs(gap_genome) ? abs(gap_read) : abs(gap_genome);
          } else if (gap_read < 0) {
            gap = abs(gap_read);
          } else if (gap_genome < 0) {
            gap = abs(gap_genome);
          } else {
            gap = 2;
          }

          int flank = 5;
          if (abs(gap) >= flank*2) {
            // solve read overlap
            flank = abs(gap)/2 + flank/2;
          }

          // never trim more than half of an anchor
          forw->end -= (flank >= forw_length) ? forw_length/2 : flank;
          back->start += (flank >= back_length) ? back_length/2 : flank;
        }

        cal = convert_bwt_anchor_to_CAL(forw, 0, forw->end - forw->start);
        array_list_insert(cal, list);

        int back_seed = back->end - back->start + 1;
        cal = convert_bwt_anchor_to_CAL(back, read_length - back_seed, read_length - 1);
        array_list_insert(cal, list);

        // too many CALs, stop here
        if (array_list_size(list) > 5) {
          free(used_backward);
          free(used_forward);
          goto cleanup;
        }

        array_list_set_flag(DOUBLE_ANCHORS, list);
        have_pair = 1;
        used_forward[i] = 1;
        used_backward[j] = 1;
        break;
      }
    }

    free(used_backward);
    free(used_forward);
  }

  // 3) no pair found, but there is a single anchor that is long enough
  if (!have_pair && have_single) {
    array_list_t *chosen;
    if (best_strand == 1) {
      chosen = (best_type == FORWARD_ANCHOR) ? forward_anchor_list_1 : backward_anchor_list_1;
    } else {
      chosen = (best_type == FORWARD_ANCHOR) ? forward_anchor_list_0 : backward_anchor_list_0;
    }

    for (int i = 0; i < array_list_size(chosen); i++) {
      bwt_anchor_t *anchor = array_list_get(i, chosen);
      size_t span = anchor->end - anchor->start;

      if (anchor->type == FORWARD_ANCHOR) {
        cal = convert_bwt_anchor_to_CAL(anchor, 0, span);
      } else {
        cal = convert_bwt_anchor_to_CAL(anchor, read_length - span - 1, read_length - 1);
      }
      array_list_insert(cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);
  }

 cleanup:
  // release the temporary lists together with the anchors they hold
  array_list_free(forward_anchor_list_1, (void *)bwt_anchor_free);
  array_list_free(backward_anchor_list_1, (void *)bwt_anchor_free);
  array_list_free(forward_anchor_list_0, (void *)bwt_anchor_free);
  array_list_free(backward_anchor_list_0, (void *)bwt_anchor_free);
  array_list_free(big_anchor_list, (void *)bwt_anchor_free);

  return array_list_size(list);
}
int main (int argc, char *argv[]) { if(!strcmp("count-lines", argv[1])) { fastq_file_t *file = fastq_fopen(argv[2]); array_list_t *reads = array_list_new(2000000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_fread_se(reads, 100000, file)) != 0) { count += nread; for(int i=0; i<reads->size; i++) { fastq_read_print(array_list_get(i, reads)); } // printf("Size: %i, Capacity: %i\n", reads->size, reads->capacity); array_list_clear(reads, fastq_read_free); } // printf("Total num reads: %i\n", reads->size); // fastq_read_print(array_list_get(0, reads)); // fastq_read_print(array_list_get(reads->size-1, reads)); array_list_free(reads, fastq_read_free); fastq_fclose(file); } if(!strcmp("count-lines-gz", argv[1])) { fastq_gzfile_t *file = fastq_gzopen(argv[2]); // printf("=>%i\n", file->ret); array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_gzread_se(reads, 100000, file)) != 0) { // nread = fastq_gzread_se(reads, 1000000, file); count += nread; // printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, reads->capacity, count, nread); for(int i=0; i<reads->size; i++) { fastq_read_print(array_list_get(i, reads)); } // fastq_read_print((fastq_read_t*)array_list_get(reads->size-1, reads)); array_list_clear(reads, fastq_read_free); } // printf("Total num reads: %i\n", count); // fastq_read_print(array_list_get(0, reads)); array_list_free(reads, fastq_read_free); fastq_gzclose(file); } if(!strcmp("count-bytes-gz", argv[1])) { fastq_gzfile_t *file = fastq_gzopen(argv[2]); // printf("=>%i\n", file->ret); array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_gzread_bytes_se(reads, 10000000, file)) != 0) { // nread = fastq_gzread_bytes_se(reads, 100000, file); count += reads->size; // printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, 
reads->capacity, count, nread); for(int i=0; i<reads->size; i++) { fastq_read_print(array_list_get(i, reads)); } // fastq_read_print(array_list_get(reads->size-1, reads)); array_list_clear(reads, fastq_read_free); } // printf("Total num reads: %i\n", count); // fastq_read_print(array_list_get(0, reads)); array_list_free(reads, fastq_read_free); fastq_gzclose(file); } if(!strcmp("filter", argv[1])) { fastq_file_t *file = fastq_fopen(argv[2]); fastq_filter_options_t *fastq_filter_options = fastq_filter_options_new(50,150, 30, 80, 2, 100); array_list_t *reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); array_list_t *passed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); array_list_t *failed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_fread_se(reads, 1000000, file)) != 0) { count += reads->size; passed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); failed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); // for(int i=0; i<reads->size; i++) { // fastq_read_print(array_list_get(i, reads)); // } fastq_filter(reads, passed_reads, failed_reads, fastq_filter_options); fastq_read_print(array_list_get(0, passed_reads)); fastq_read_print(array_list_get(0, failed_reads)); printf("Total Reads: %lu, Passed Reads: %lu, Reads failed: %lu\n", reads->size, passed_reads->size, failed_reads->size); array_list_clear(reads, fastq_read_free); array_list_free(passed_reads, NULL); array_list_free(failed_reads, NULL); // fastq_read_print(array_list_get(0, passed_reads)); // fastq_read_print(array_list_get(0, failed_reads)); // printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, passed_reads->size, failed_reads->size); } // fastq_read_print(array_list_get(0, passed_reads)); // fastq_read_print(array_list_get(0, failed_reads)); // printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, 
passed_reads->size, failed_reads->size); fastq_filter_options_free(fastq_filter_options); array_list_free(reads, NULL); // array_list_free(passed_reads, fastq_read_free); // array_list_free(failed_reads, fastq_read_free); fastq_fclose(file); } return 0; }
/**
 * Runs the filter chain over a VCF file and writes the records that pass,
 * and optionally the ones that fail, to the filtering output files.
 *
 * Two OpenMP sections cooperate: one parses the VCF file into batches, the
 * other fetches the batches, runs the filters and writes the results. The
 * VCF header (including the entries describing the filters) is written when
 * the first batch is fetched.
 *
 * @param shared_options_data  input file name, output directory, batch sizes
 *                             and filter chain
 * @param options_data         filter tool options; save_rejected selects
 *                             whether the failed records are written too
 * @return always 0 (the fatal conditions are reported through LOG_FATAL)
 */
int run_filter(shared_options_data_t *shared_options_data, filter_options_data_t *options_data) {
    int ret_code;
    double start, stop, total;

    vcf_file_t *file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            // Forget the result of create_directory (non-zero when the directory
            // already existed) so it is not reported below as a parsing error
            ret_code = 0;
            if (shared_options_data->batch_bytes > 0) {
                ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, file);
            } else if (shared_options_data->batch_lines > 0) {
                ret_code = vcf_parse_batches(shared_options_data->batch_lines, file);
            }

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(file);
        }

#pragma omp section
        {
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }

            FILE *passed_file = NULL, *failed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            if (!options_data->save_rejected && failed_file) {
                // The rejected records are not wanted: close the stream right away
                fclose(failed_file);
                failed_file = NULL;
            }
            LOG_DEBUG("File streams created\n");

            start = omp_get_wtime();

            int i = 0;
            vcf_batch_t *batch = NULL;
            while ((batch = fetch_vcf_batch(file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    // NOTE(review): who owns the returned array is not visible here;
                    // confirm whether it must be freed after adding the entries
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], file);
                    }

                    // Write file format, header entries and delimiter
                    write_vcf_header(file, passed_file);
                    if (options_data->save_rejected) {
                        write_vcf_header(file, failed_file);
                    }

                    LOG_DEBUG("VCF header written created\n");
                }

                array_list_t *input_records = batch->records;
                // Both start as NULL: failed_records is only created when there
                // are filters to run, and it is tested and freed further down
                array_list_t *passed_records = NULL, *failed_records = NULL;

                if (i % 100 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                               i, omp_get_thread_num(),
                               batch->records->size, batch->records->capacity);
                }

                if (filters == NULL) {
                    // Without filters every record passes
                    passed_records = input_records;
                } else {
                    failed_records = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
                    passed_records = run_filter_chain(input_records, failed_records, filters, num_filters);
                }

                // Write records that passed and failed to 2 new separated files
                if (passed_records != NULL && passed_records->size > 0) {
                    LOG_DEBUG_F("[batch %d] %zu passed records\n", i, passed_records->size);
#pragma omp critical
                    {
                        for (size_t r = 0; r < passed_records->size; r++) {
                            write_vcf_record(passed_records->items[r], passed_file);
                        }
                    }
                }

                if (options_data->save_rejected && failed_records != NULL && failed_records->size > 0) {
                    LOG_DEBUG_F("[batch %d] %zu failed records\n", i, failed_records->size);
#pragma omp critical
                    {
                        for (size_t r = 0; r < failed_records->size; r++) {
                            write_vcf_record(failed_records->items[r], failed_file);
                        }
                    }
                }

                // Free the helper lists first (not their items, which belong to the
                // batch) so that input_records is still valid when it is compared
                if (passed_records != NULL && passed_records != input_records) {
                    array_list_free(passed_records, NULL);
                }
                if (failed_records) {
                    array_list_free(failed_records, NULL);
                }

                // Free batch and its contents
                vcf_batch_free(batch);

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) {
                fclose(passed_file);
            }
            if (options_data->save_rejected && failed_file) {
                fclose(failed_file);
            }

            free_filters(filters, num_filters);
        }
    }

    vcf_close(file);

    return 0;
}
int sa_bam_writer(void *data) { sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data; sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch; if (mapping_batch == NULL) { printf("bam_writer1: error, NULL mapping batch\n"); return 0; } // for (int i = 0; i < NUM_COUNTERS; i++) { // counters[i] += mapping_batch->counters[i]; // } #ifdef _TIMING for (int i = 0; i < NUM_TIMING; i++) { func_times[i] += mapping_batch->func_times[i]; } #endif int flag, len; char *sequence, *quality; fastq_read_t *read; array_list_t *read_list = mapping_batch->fq_reads; bam1_t *bam1; alignment_t *alig; array_list_t *mapping_list; bam_file_t *out_file = wf_batch->writer_input->bam_file; sa_genome3_t *genome = wf_batch->sa_index->genome; size_t num_reads, num_mappings, num_mate_mappings; num_reads = mapping_batch->num_reads; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { alig = (alignment_t *) array_list_get(j, mapping_list); // update alignment if (num_mappings > 1) { alig->map_quality = 0; } else { alig->map_quality = alig->mapq; } bam1 = convert_to_bam(alig, 33); bam_fwrite(bam1, out_file); bam_destroy1(bam1); alignment_free(alig); } } else { num_unmapped_reads++; if (read->adapter) { // sequences and cigar len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && 
read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { // sequences sequence = read->sequence; quality = read->quality; } alig = alignment_new(); alignment_init_single_end(strdup(read->id), sequence, quality, 0, -1, -1, /*strdup(aux)*/"", 0, 0, 0, 0, 0, NULL, alig); bam1 = convert_to_bam(alig, 33); bam_fwrite(bam1, out_file); // free memory bam_destroy1(bam1); alig->sequence = NULL; alig->quality = NULL; alig->cigar = NULL; alignment_free(alig); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } // free memory sa_mapping_batch_free(mapping_batch); if (wf_batch) sa_wf_batch_free(wf_batch); return 0; }
//==================================================================================== // apply_caling //==================================================================================== int apply_caling(cal_seeker_input_t* input, batch_t *batch) { mapping_batch_t *mapping_batch = batch->mapping_batch; array_list_t *list = NULL; size_t read_index, num_cals; int min_seeds, max_seeds; cal_t *cal; array_list_t *cal_list; fastq_read_t *read; size_t num_chromosomes = input->genome->num_chromosomes + 1; size_t num_targets = mapping_batch->num_targets; size_t *targets = mapping_batch->targets; size_t new_num_targets = 0; array_list_t *region_list; bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw; linked_list_t *linked_list; int anchor_nt, gap_nt; seed_region_t *seed_region_start, *seed_region_end; //max_seeds = input->cal_optarg->num_seeds; // size_t *new_targets = (size_t *) calloc(num_targets, sizeof(size_t)); // set to zero mapping_batch->num_to_do = 0; for (size_t i = 0; i < num_targets; i++) { read_index = targets[i]; read = array_list_get(read_index, mapping_batch->fq_batch); region_list = mapping_batch->mapping_lists[read_index]; // for debugging // LOG_DEBUG_F("%s\n", ((fastq_read_t *) array_list_get(read_index, mapping_batch->fq_batch))->id); if (!list) { list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); } if (array_list_get_flag(region_list) == 0 || array_list_get_flag(region_list) == 2) { //We have normal and extend seeds (anchors) max_seeds = (read->length / 15)*2 + 10; num_cals = bwt_generate_cal_list_linked_list(region_list, input->cal_optarg, &min_seeds, &max_seeds, num_chromosomes, list, read->length, input->cal_optarg->min_cal_size, 0); } else { //We have double anchors with smaller distance between they //printf("Easy case... 
Two anchors and same distance between read gap and genome distance\n"); num_cals = 0; for (int a = array_list_size(region_list) - 1; a >= 0; a -= 2) { max_seeds = 2; min_seeds = 2; bwt_anchor_back = array_list_remove_at(a, region_list); bwt_anchor_forw = array_list_remove_at(a - 1, region_list); linked_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED); //Seed for the first anchor anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start; //printf("\t seed0[%i-%i][%lu-%lu]\n", 0, anchor_nt - 1, // bwt_anchor_forw->start, bwt_anchor_forw->end); seed_region_start = seed_region_new(0, anchor_nt - 1, bwt_anchor_forw->start, bwt_anchor_forw->end, 0, 0, 0); //Seed for the first anchor gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start)); //printf("\t gap_nt = %i, anchor_nt = %i\n", gap_nt, anchor_nt); //printf("\t seed1[%i-%i][%lu-%lu]\n", anchor_nt + gap_nt, read->length - 1, // bwt_anchor_back->start + 1, bwt_anchor_back->end); seed_region_end = seed_region_new(anchor_nt + gap_nt, read->length - 1, bwt_anchor_back->start + 1, bwt_anchor_back->end, 1, 0, 0); //The reference distance is 0 and the read distance not //The read distance is 0 and the reference distance not //if (seed_region_start->genome_end > seed_region_end->genome_start || // seed_region_start->read_end > seed_region_end->read_start) { //array_list_clear(region_list, NULL); //continue; if (seed_region_end->genome_start - seed_region_start->genome_end < 5 || seed_region_end->read_start - seed_region_start->read_end < 5) { seed_region_start->genome_end -= 5; seed_region_start->read_end -= 5; seed_region_end->genome_start += 5; seed_region_end->read_start += 5; } linked_list_insert(seed_region_start, linked_list); linked_list_insert_last(seed_region_end, linked_list); cal = cal_new(bwt_anchor_forw->chromosome + 1, bwt_anchor_forw->strand, bwt_anchor_forw->start, bwt_anchor_back->end + 1, 2, linked_list, linked_list_new(COLLECTION_MODE_ASYNCHRONIZED)); 
array_list_insert(cal, list); num_cals++; } } // for debugging LOG_DEBUG_F("read %s : num. cals = %i, min. seeds = %i, max. seeds = %i\n", read->id, num_cals, min_seeds, max_seeds); /* if (num_cals == 0) { int seed_size = 24; //First, Delete old regions array_list_clear(mapping_batch->mapping_lists[read_index], region_bwt_free); //Second, Create new regions with seed_size 24 and 1 Mismatch bwt_map_inexact_seeds_seq(read->sequence, seed_size, seed_size/2, bwt_optarg, bwt_index, mapping_batch->mapping_lists[read_index]); num_cals = bwt_generate_cal_list_linked_list(mapping_batch->mapping_lists[mapping_batch->targets[i]], input->cal_optarg, &min_seeds, &max_seeds, num_chromosomes, list, read->length); }*/ /* for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); LOG_DEBUG_F("\tchr: %i, strand: %i, start: %lu, end: %lu, num_seeds = %i, num. regions = %lu\n", cal->chromosome_id, cal->strand, cal->start, cal->end, cal->num_seeds, cal->sr_list->size); } */ // printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", // min_seeds, max_seeds, min_limit, array_list_size(list)); // filter incoherent CALs int founds[num_cals], found = 0; for (size_t j = 0; j < num_cals; j++) { founds[j] = 0; cal = array_list_get(j, list); LOG_DEBUG_F("\tcal %i of %i: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n", j, num_cals, cal->sr_list->size, cal->num_seeds, cal->chromosome_id, cal->start, cal->end); if (cal->sr_list->size > 0) { int start = 0; for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) { seed_region_t *s = list_item->item; LOG_DEBUG_F("\t\t:: star %lu > %lu s->read_start\n", start, s->read_start); if (start > s->read_start) { LOG_DEBUG("\t\t\t:: remove\n"); found++; founds[j] = 1; } start = s->read_end + 1; } } else { found++; founds[j] = 1; } } if (found) { min_seeds = 100000; max_seeds = 0; cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < 
num_cals; j++) { if (!founds[j]) { cal = array_list_get(j, list); cal->num_seeds = cal->sr_list->size; if (cal->num_seeds > max_seeds) max_seeds = cal->num_seeds; if (cal->num_seeds < min_seeds) min_seeds = cal->num_seeds; array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_free(list, (void *) cal_free); num_cals = array_list_size(cal_list); list = cal_list; } // LOG_FATAL_F("num. cals = %i, min. seeds = %i, max. seeds = %i\n", num_cals, min_seeds, max_seeds); // filter CALs by the number of seeds cal_list = list; list = NULL; /* int min_limit = input->cal_optarg->min_num_seeds_in_cal; if (min_limit < 0) min_limit = max_seeds; // min_limit -= 3; if (min_seeds == max_seeds || min_limit <= min_seeds) { cal_list = list; list = NULL; } else { cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); if (cal->num_seeds >= min_limit) { array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_clear(list, (void *) cal_free); num_cals = array_list_size(cal_list); } */ if (num_cals > MAX_CALS) { for (size_t j = num_cals - 1; j >= MAX_CALS; j--) { cal = (cal_t *) array_list_remove_at(j, cal_list); cal_free(cal); } num_cals = array_list_size(cal_list); } // LOG_DEBUG_F("num. 
cals = %i, MAX_CALS = %i\n", num_cals, MAX_CALS); if (num_cals > 0 && num_cals <= MAX_CALS) { array_list_set_flag(2, cal_list); targets[new_num_targets++] = read_index; /* int count1 = 0, count2 = 0; // count number of sw to do // method #1 // printf("method #1\n"); seed_region_t *s, *prev_s; linked_list_iterator_t* itr; for (size_t j = 0; j < num_cals; j++) { prev_s = NULL; cal = array_list_get(j, cal_list); itr = linked_list_iterator_new(cal->sr_list); s = (seed_region_t *) linked_list_iterator_curr(itr); while (s != NULL) { if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) { // printf("\t\t\tcase 1\n"); count1++; } prev_s = s; linked_list_iterator_next(itr); s = linked_list_iterator_curr(itr); } if (prev_s != NULL && prev_s->read_end < read->length - 1) { count1++; // printf("\t\t\tcase 2 (%i < %i)\n", prev_s->read_end, read->length - 1); } linked_list_iterator_free(itr); } // method #2 printf("method #2\n"); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, cal_list); printf("\t: %i\n", j); if (cal->sr_list->size > 0) { int start = 0; for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) { seed_region_t *s = list_item->item; printf("\t\t[%i|%i - %i|%i]\n", s->genome_start, s->read_start, s->read_end, s->genome_end); if (s->read_start != start) { count2++; } start = s->read_end + 1; } if (start < read->length) { count2++; } } } printf("count #1 = %i, count #2 = %i\n", count1, count2); assert(count1 == count2); mapping_batch->num_to_do += count1; */ // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; } else { array_list_set_flag(0, mapping_batch->mapping_lists[read_index]); // we have to free the region list array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); if (cal_list) array_list_free(cal_list, (void *) cal_free); if (list) 
array_list_clear(list, (void *) cal_free); } /* cal_list = list; list = NULL; array_list_set_flag(2, cal_list); // mapping_batch->num_to_do += num_cals; targets[new_num_targets++] = read_index; // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; */ /* // filter CALs by the number of seeds int min_limit = input->cal_optarg->min_num_seeds_in_cal; if (min_limit < 0) min_limit = max_seeds; printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", min_seeds, max_seeds, min_limit, array_list_size(list)); if (min_seeds == max_seeds || min_limit <= min_seeds) { cal_list = list; list = NULL; } else { cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); if (cal->num_seeds >= min_limit) { array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_clear(list, (void *) cal_free); num_cals = array_list_size(cal_list); printf("************, num_cals = %i\n", num_cals); } if (num_cals > MAX_CALS) { for (size_t j = num_cals - 1; j >= MAX_CALS; j--) { cal = (cal_t *) array_list_remove_at(j, cal_list); cal_free(cal); } num_cals = array_list_size(cal_list); } if (num_cals > 0 && num_cals <= MAX_CALS) { array_list_set_flag(2, cal_list); mapping_batch->num_to_do += num_cals; targets[new_num_targets++] = read_index; // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; } else { array_list_set_flag(0, mapping_batch->mapping_lists[read_index]); // we have to free the region list array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); if (cal_list) array_list_free(cal_list, (void *) cal_free); if (list) array_list_clear(list, (void *) cal_free); } */ } // end for 0 ... 
num_targets // update batch mapping_batch->num_targets = new_num_targets; // LOG_DEBUG_F("num. SW to do: %i\n", mapping_batch->num_to_do); // exit(-1); // free memory if (list) array_list_free(list, NULL); if (batch->mapping_mode == RNA_MODE) { return RNA_STAGE; } if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) { return PRE_PAIR_STAGE; } else if (batch->mapping_batch->num_targets > 0) { return SW_STAGE; } return DNA_POST_PAIR_STAGE; }
// Generates the CALs of every target read of the batch (RNA flavour).
//
// For each target, bwt_generate_cals() fills the read's mapping list. When it
// yields no CAL, the read is re-seeded with 24-nt seeds through
// bwt_map_inexact_seeds_seq() and the CALs are built from those regions.
// A CAL is then discarded when its seed list is empty, when a seed overlaps the
// previous one in the read, when a seed has read_start >= read_end, or when a
// seed starts before the previous one in the genome or more than 2 * read
// length after it. Only the first 100 CALs are kept.
//
// Every target is kept (even with zero CALs); mapping_batch->extra_stage_do is
// set to 1 and mapping_batch->num_targets is rewritten.
//
// Returns the next stage: RNA_STAGE in RNA mode, PRE_PAIR_STAGE when the pair
// mode is not single-end, SW_STAGE when targets remain, DNA_POST_PAIR_STAGE
// otherwise.
int apply_caling_rna(cal_seeker_input_t* input, batch_t *batch) {
  LOG_DEBUG("========= APPLY CALING RNA =========\n");

  bwt_optarg_t *bwt_optarg = input->bwt_optarg;
  bwt_index_t *bwt_index = input->index;
  cal_optarg_t *cal_optarg = input->cal_optarg;
  mapping_batch_t *mapping_batch = batch->mapping_batch;
  genome_t *genome = input->genome;

  const unsigned int num_chromosomes = genome->num_chromosomes;
  const int seed_size = input->cal_optarg->seed_size;
  const size_t num_targets = mapping_batch->num_targets;
  const size_t max_kept_cals = 100;   // CALs kept per read
  size_t target_pos = 0;

  mapping_batch->extra_stage_do = 1;

  // scratch list for the regions of the fallback seeding, reused by all reads
  array_list_t *region_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  for (size_t i = 0; i < num_targets; i++) {
    const size_t target = mapping_batch->targets[i];
    fastq_read_t *read = array_list_get(target, mapping_batch->fq_batch);
    array_list_t *list = mapping_batch->mapping_lists[target];

    size_t num_cals = bwt_generate_cals(read->sequence, seed_size, bwt_optarg,
                                        cal_optarg, bwt_index, list, num_chromosomes);

    if (num_cals == 0) {
      // no CALs: seed again with 24-length seeds
      const int fallback_seed_size = 24;
      int min_seeds = 0;
      int max_seeds = (read->length / 15) * 2 + 10;

      // First, delete old regions
      array_list_clear(region_list, (void *) region_bwt_free);

      // Second, create new regions with the fallback seed size
      bwt_map_inexact_seeds_seq(read->sequence, fallback_seed_size, fallback_seed_size / 2,
                                bwt_optarg, bwt_index, region_list);

      // NOTE(review): this call receives num_chromosomes + 1 while
      // bwt_generate_cals() above receives num_chromosomes -- kept as found
      num_cals = bwt_generate_cal_list_linked_list(region_list,
                                                   input->cal_optarg,
                                                   &min_seeds, &max_seeds,
                                                   genome->num_chromosomes + 1,
                                                   list, read->length,
                                                   cal_optarg->min_cal_size, 0);
    }

    array_list_clear(region_list, (void *) region_bwt_free);

    // filter incoherent CALs
    // (a variable-length array must have at least one element, so reserve one
    // slot even when the read produced no CALs)
    int founds[num_cals > 0 ? num_cals : 1];
    int found = 0;
    for (size_t j = 0; j < num_cals; j++) {
      founds[j] = 0;
      cal_t *cal = array_list_get(j, list);
      LOG_DEBUG_F("\tcal %zu of %zu: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n",
                  j, num_cals, cal->sr_list->size, cal->num_seeds,
                  cal->chromosome_id, cal->start, cal->end);

      if (cal->sr_list->size > 0) {
        int start = 0;              // first read position the next seed may use
        size_t genome_start = 0;    // same, in genome coordinates
        int first = 1;
        for (linked_list_item_t *list_item = cal->sr_list->first;
             list_item != NULL;
             list_item = list_item->next) {
          seed_region_t *s = list_item->item;

          LOG_DEBUG_F("\t\t:: star %i > %lu s->read_start\n", start, s->read_start);
          LOG_DEBUG_F("\t\t:: read_star %lu > read_end %lu \n", s->read_start, s->read_end);
          if (start > s->read_start || s->read_start >= s->read_end) {
            LOG_DEBUG("\t\t\t:: remove\n");
            found++;
            founds[j] = 1;
          }

          // from the second seed on, the genome positions must grow and stay
          // within 2 * read length of the previous seed
          if (!first &&
              ((s->genome_start < genome_start) ||
               (s->genome_start - genome_start) > 2 * read->length)) {
            found++;
            founds[j] = 1;
          }

          first = 0;
          start = s->read_end + 1;
          genome_start = s->genome_end + 1;
        }
      } else {
        found++;
        founds[j] = 1;
      }
    }

    if (found) {
      // move the coherent CALs to a new list and free the rest
      array_list_t *cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      for (size_t j = 0; j < num_cals; j++) {
        if (!founds[j]) {
          cal_t *cal = array_list_get(j, list);
          cal->num_seeds = cal->sr_list->size;
          array_list_insert(cal, cal_list);
          // detach it so that freeing the old list does not free the CAL
          array_list_set(j, NULL, list);
        }
      }
      array_list_free(list, (void *) cal_free);
      list = cal_list;
    }

    mapping_batch->mapping_lists[target] = list;
    num_cals = array_list_size(list);

    // drop the CALs beyond the limit, last one first
    for (size_t j = num_cals; j-- > max_kept_cals; ) {
      cal_free(array_list_remove_at(j, list));
    }

    mapping_batch->targets[target_pos++] = target;
  }

  mapping_batch->num_targets = target_pos;

  array_list_free(region_list, NULL);

  LOG_DEBUG("========= APPLY CALING RNA END =========\n");

  if (batch->mapping_mode == RNA_MODE) {
    return RNA_STAGE;
  }

  if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) {
    return PRE_PAIR_STAGE;
  } else if (batch->mapping_batch->num_targets > 0) {
    return SW_STAGE;
  }

  return DNA_POST_PAIR_STAGE;
}
// Directory listing handler (FUSE readdir signature).
//
// For "/" every name returned by searcher_get_searches() is reported through
// fill_dir. For any other path, the first path component is looked up with
// searcher_get_search_results() and every key of the resulting dictionary is
// reported. filesystem_io_mutex is held around the searcher calls and, for the
// results listing, for the whole enumeration.
//
// path      - path being listed
// buffer    - opaque buffer handed back to fill_dir
// fill_dir  - callback used to emit one entry; a non-zero return stops the listing
// offset    - only logged
// file_info - only logged
//
// Returns 0 on success, -ENOMEM when fill_dir reports failure, and -1 when the
// path has no components or the search has no results.
// NOTE(review): FUSE interprets the result as a negated errno, so -1 reads as
// -EPERM; -ENOENT may have been intended -- confirm before changing.
static int filesystem_readdir(const char *path, void *buffer, fuse_fill_dir_t fill_dir, off_t offset, struct fuse_file_info *file_info) {
    int retstat = 0;

    // pointers are printed with %p and off_t through long long: the previous
    // "0x%08x"/"%ld" specifiers did not match the argument types
    // (the function-pointer to void * cast relies on POSIX)
    logging_log("Filesystem", LOGGING_LEVEL_INFO,
                "filesystem_readdir(path=\"%s\", buf=%p, filler=%p, offset=%lld, fi=%p)...",
                path, buffer, (void *)fill_dir, (long long)offset, (void *)file_info);

    if (!strcmp(path, "/")) {
        // root: one entry per search
        pthread_mutex_lock(filesystem_io_mutex);
        ArrayList *search_dirs = searcher_get_searches();
        pthread_mutex_unlock(filesystem_io_mutex);

        if (search_dirs != NULL) {
            for (size_t i = 0; i < array_list_get_length(search_dirs); i++) {
                char *name = NULL;
                array_list_get(search_dirs, (const void **)&name, i);
                if (name == NULL) {
                    // nothing was stored for this slot: do not hand NULL to fill_dir
                    continue;
                }

                if (fill_dir(buffer, name, NULL, 0) != 0) {
                    array_list_free(search_dirs);
                    return -ENOMEM;
                }
            }

            array_list_free(search_dirs);
        }

        return retstat;
    }

    // any other path: list the results of the search named by the first component
    path_t *path_parsed = path_parse(path);
    logging_log("Filesystem", LOGGING_LEVEL_INFO, "path_parsed->parts_length: %lu...",
                (unsigned long)path_parsed->parts_length);

    if (!path_parsed->parts_length) {
        path_free(path_parsed);
        return -1;
    }

    pthread_mutex_lock(filesystem_io_mutex);

    ALDictionary *results = searcher_get_search_results(path_parsed->parts[0]);
    if (results != NULL) {
        logging_log("Filesystem", LOGGING_LEVEL_INFO, "results != NULL...");

        ALDictionaryEnumerator *e = al_dictionary_get_enumerator(results);
        while (al_dictionary_enumerator_move_next(e)) {
            ALDictionaryKeyValuePair *pair = NULL;
            al_dictionary_enumerator_get_current(e, &pair);
            if (pair == NULL) {
                continue;
            }

            logging_log("Filesystem", LOGGING_LEVEL_INFO, "pair->key: %s...", (char*)pair->key);

            if (fill_dir(buffer, (char*)pair->key, NULL, 0) != 0) {
                retstat = -ENOMEM;
                break;
            }
        }

        al_dictionary_enumerator_free(e);
    } else {
        retstat = -1;
    }

    pthread_mutex_unlock(filesystem_io_mutex);
    path_free(path_parsed);

    return retstat;
}
// Runs Smith-Waterman for every CAL of every target read and keeps the
// alignments whose score reaches input->min_score.
//
// For each target read, every CAL yields one (query, reference) pair: the query
// is a copy of the read (reverse-complemented when the CAL strand is 1) and the
// reference is the genome region of the CAL widened by input->flank_length on
// both sides. The CAL lists are freed and the reads' mapping lists reset to NULL.
// After smith_waterman_mqmr() has run, each result with score >= min_score
// becomes a sw_output_t appended to a new mapping list (flag 0) of its read.
//
// On return: batch->num_done holds the previous num_to_do, batch->num_to_do the
// number of accepted alignments, batch->targets/num_targets the reads having at
// least one accepted alignment, and thr_sw_items[thread id] has been increased
// by the number of Smith-Waterman problems.
void apply_sw(sw_server_input_t* input, aligner_batch_t *batch) {
  const int tid = omp_get_thread_num();

  fastq_batch_t *fq_batch = batch->fq_batch;
  genome_t *genome = input->genome_p;
  sw_optarg_t *sw_optarg = &input->sw_optarg;
  const size_t flank_length = input->flank_length;
  const float min_score = input->min_score;

  // The normalised score used to be computed here; that code is disabled, and
  // an uninitialised float was being passed to sw_output_new(). Pass a defined
  // value instead.
  // NOTE(review): confirm whether downstream code expects a real normalised score.
  const float norm_score = 0.0f;

  const size_t num_seqs = batch->num_targets;

  // set to zero
  batch->num_done = batch->num_to_do;
  batch->num_to_do = 0;

  // size the work arrays from the CAL lists themselves, so that they can never
  // be smaller than the number of entries written below
  size_t sw_total = 0;
  for (size_t i = 0; i < num_seqs; i++) {
    sw_total += array_list_size(batch->mapping_lists[batch->targets[i]]);
  }

  // a variable-length array must have at least one element
  const size_t slots = sw_total > 0 ? sw_total : 1;

  sw_multi_output_t *output = sw_multi_output_new(sw_total);

  char *q[slots], *r[slots];
  uint8_t strands[slots], chromosomes[slots];
  size_t starts[slots], read_indices[slots];
  size_t sw_count = 0;

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_seqs; i++) {
    const size_t index = batch->targets[i];
    array_list_t *cal_list = batch->mapping_lists[index];
    const size_t num_cals = array_list_size(cal_list);

    // processing each CAL from this read
    for (size_t j = 0; j < num_cals; j++) {
      cal_t *cal = array_list_get(j, cal_list);
      read_indices[sw_count] = index;

      // query sequence, revcomp if necessary
      int read_len = fq_batch->data_indices[index + 1] - fq_batch->data_indices[index];
      q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
      memcpy(q[sw_count], &(fq_batch->seq[fq_batch->data_indices[index]]), read_len);
      if (cal->strand == 1) {
        seq_reverse_complementary(q[sw_count], read_len);
      }

      // reference sequence: the CAL plus the flanks
      size_t start = cal->start - flank_length;
      size_t end = cal->end + flank_length;
      r[sw_count] = calloc(1, end - start + 2);
      genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
                                        cal->chromosome_id - 1, &start, &end, genome);

      // save some stuff, we'll use them after...
      // (start is stored after the genome call, which receives it by address)
      strands[sw_count] = cal->strand;
      chromosomes[sw_count] = cal->chromosome_id;
      starts[sw_count] = start;

      // increase counter
      sw_count++;
    }

    // free cal_list
    array_list_free(cal_list, (void *)cal_free);
    batch->mapping_lists[index] = NULL;
  }

  // run Smith-Waterman
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

  size_t num_targets = 0;

  // filter alignments by min_score
  for (size_t i = 0; i < sw_count; i++) {
    if (output->score_p[i] >= min_score) {
      // valid mapping, insert in the list for further processing
      const size_t index = read_indices[i];

      // the first accepted alignment of a read creates its mapping list and
      // re-registers the read as a target
      array_list_t *mapping_list = batch->mapping_lists[index];
      if (mapping_list == NULL) {
        mapping_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
        array_list_set_flag(0, mapping_list);

        batch->mapping_lists[index] = mapping_list;
        batch->targets[num_targets++] = index;
      }

      sw_output_t *sw_output = sw_output_new(strands[i],
                                             chromosomes[i],
                                             starts[i],
                                             strlen(r[i]),
                                             strlen(output->query_map_p[i]),
                                             output->query_start_p[i],
                                             output->ref_start_p[i],
                                             output->score_p[i],
                                             norm_score,
                                             output->query_map_p[i],
                                             output->ref_map_p[i]);
      array_list_insert(sw_output, mapping_list);

      batch->num_to_do++;
    }

    // free query and reference
    free(q[i]);
    free(r[i]);
  }
  batch->num_targets = num_targets;

  // update counter
  thr_sw_items[tid] += sw_count;

  // free
  sw_multi_output_free(output);
}
int sa_sam_writer(void *data) { sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data; sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch; if (mapping_batch == NULL) { printf("bam_writer1: error, NULL mapping batch\n"); return 0; } /* for (int i = 0; i < NUM_COUNTERS; i++) { counters[i] += mapping_batch->counters[i]; } */ #ifdef _TIMING for (int i = 0; i < NUM_TIMING; i++) { func_times[i] += mapping_batch->func_times[i]; } #endif int num_mismatches, num_cigar_ops; size_t flag, pnext = 0, tlen = 0; char *cigar_string, *cigar_M_string, *rnext = "*"; fastq_read_t *read; array_list_t *read_list = mapping_batch->fq_reads; array_list_t *mapping_list, *mate_list; FILE *out_file = (FILE *) wf_batch->writer_input->bam_file; sa_genome3_t *genome = wf_batch->sa_index->genome; size_t num_reads, num_mappings, num_mate_mappings; num_reads = mapping_batch->num_reads; if (mapping_batch->options->pair_mode != SINGLE_END_MODE) { // PAIR MODE int len; char *sequence, *quality; char *seq, *opt_fields; alignment_t *alig; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); // seq = read->sequence; /* if (i % 2 == 0) { mate_list = mapping_batch->mapping_lists[i+1]; num_mate_mappings = array_list_size(mate_list); } else { mate_list = mapping_list; num_mate_mappings = num_mappings; } */ mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { alig = (alignment_t *) array_list_get(j, mapping_list); /* // update alignment alig->secondary_alignment = 0; if (num_mate_mappings != 1) { alig->is_mate_mapped = 0; alig->is_paired_end_mapped = 0; alig->mate_strand = 0; } */ if (alig->optional_fields) { opt_fields = (char *) 
calloc(strlen(alig->optional_fields) + 100, sizeof(char)); sprintf(opt_fields, "NH:i:%i\t%s", num_mappings, alig->optional_fields); // sprintf(opt_fields, "NH:i:%i\t%s\tXU:i:%i", num_mappings, alig->optional_fields, mapping_batch->status[i]); } else { opt_fields = (char *) calloc(100, sizeof(char)); sprintf(opt_fields, "NH:i:%i", num_mappings); // sprintf(opt_fields, "NH:i:%i\tXU:i:%i", num_mappings, mapping_batch->status[i]); } /* // update alignment alig->secondary_alignment = 0; if (num_mate_mappings != 1) { alig->is_mate_mapped = 0; alig->is_paired_end_mapped = 0; alig->mate_strand = 0; } */ flag = 0; if (alig->is_paired_end) flag += BAM_FPAIRED; if (alig->is_paired_end_mapped) flag += BAM_FPROPER_PAIR; if (!alig->is_seq_mapped) flag += BAM_FUNMAP; if ((!alig->is_mate_mapped) && (alig->is_paired_end)) flag += BAM_FMUNMAP; if (alig->mate_strand) flag += BAM_FMREVERSE; if (alig->pair_num == 1) flag += BAM_FREAD1; if (alig->pair_num == 2) flag += BAM_FREAD2; if (alig->secondary_alignment) flag += BAM_FSECONDARY; if (alig->fails_quality_check) flag += BAM_FQCFAIL; if (alig->pc_optical_duplicate) flag += BAM_FDUP; if (alig->seq_strand) flag += BAM_FREVERSE; fprintf(out_file, "%s\t%lu\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n", read->id, flag, genome->chrom_names[alig->chromosome], alig->position + 1, (num_mappings > 1 ? 0 : alig->mapq), //60, //(alig->map_quality > 3 ? 0 : alig->map_quality), alig->cigar, (alig->chromosome == alig->mate_chromosome ? 
"=" : genome->chrom_names[alig->mate_chromosome]), alig->mate_position + 1, alig->template_length, alig->sequence, alig->quality, opt_fields ); // free memory free(opt_fields); alignment_free(alig); } // end for num_mappings } else { num_unmapped_reads++; opt_fields = (char *) calloc(100, sizeof(char)); sprintf(opt_fields, "XM:i:%i XU:i:%i", num_mappings, mapping_batch->status[i]); if (read->adapter) { len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { sequence = read->sequence; quality = read->quality; } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\t%s\n", read->id, sequence, quality, opt_fields ); free(opt_fields); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } } else { // SINGLE MODE int len, mapq; char *seq; seed_cal_t *cal; cigar_t *cigar; char *sequence, *revcomp, *quality; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { cal = (seed_cal_t *) array_list_get(j, mapping_list); if (read->adapter) { // sequences and cigar len = 
read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); revcomp = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); cigar = cigar_new_empty(); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ( (cal->strand == 1 && ((read->adapter_strand == 0 && read->adapter_length > 0) || (read->adapter_strand == 1 && read->adapter_length < 0))) || (cal->strand == 0 && ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0))) ) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); strcpy(revcomp, read->adapter_revcomp); strcat(revcomp, read->revcomp); cigar_append_op(abs(read->adapter_length), 'S', cigar); cigar_concat(&cal->cigar, cigar); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); strcpy(revcomp, read->revcomp); strcat(revcomp, read->adapter_revcomp); cigar_concat(&cal->cigar, cigar); cigar_append_op(read->adapter_length, 'S', cigar); } sequence[len] = 0; revcomp[len] = 0; quality[len] = 0; } else { // sequences and cigar sequence = read->sequence; revcomp = read->revcomp; quality = read->quality; cigar = &cal->cigar; } if (cal->strand) { flag = 16; seq = revcomp; } else { flag = 0; seq = sequence; } /* if (i == 0) { flag += BAM_FSECONDARY; } */ cigar_string = cigar_to_string(cigar); cigar_M_string = cigar_to_M_string(&num_mismatches, &num_cigar_ops, cigar); if (num_mappings > 1) { cal->mapq = 0; } fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%lu\t%i\t%s\t%s\tNH:i:%i\tNM:i:%i\n", read->id, flag, genome->chrom_names[cal->chromosome_id], cal->start + 1, (num_mappings == 1 ? 
cal->mapq : 0), cigar_M_string, rnext, pnext, tlen, seq, quality, num_mappings, num_mismatches ); // free memory free(cigar_M_string); free(cigar_string); seed_cal_free(cal); if (read->adapter) { free(sequence); free(revcomp); free(quality); cigar_free(cigar); } } } else { num_unmapped_reads++; if (read->adapter) { // sequences and cigar len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { // sequences sequence = read->sequence; quality = read->quality; } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", read->id, sequence, quality ); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } // end for num_reads } // free memory sa_mapping_batch_free(mapping_batch); if (wf_batch) sa_wf_batch_free(wf_batch); return 0; }
/**
 * Discards incoherent CALs from a list and caps the list at 100 items.
 *
 * A CAL is discarded when:
 *   - its seed-region list (sr_list) is empty, or
 *   - a seed region starts in the read before the previous one ended, or has
 *     read_start >= read_end, or
 *   - a seed region (other than the first) starts in the genome before the
 *     previous one ended, or more than 2 * read_length after it.
 *
 * If at least one CAL is discarded, the surviving CALs are moved to a new
 * list (their num_seeds is refreshed from sr_list->size) and the input list
 * is freed together with the discarded CALs. Finally, any CAL beyond the
 * 100th is removed from the end of the list and freed.
 *
 * @param num_cals     number of CALs of 'list' to examine
 * @param read_length  length of the read the CALs belong to
 * @param list         list of cal_t items; may be freed by this call
 * @return the filtered list: either 'list' itself or its replacement, so the
 *         caller must keep using the returned pointer, not 'list'
 */
array_list_t *filter_cals(size_t num_cals, size_t read_length, array_list_t *list) {
  cal_t *cal;
  array_list_t *cal_list;

  // filter-incoherent CALs
  // a variable-length array must have at least one element, even if
  // num_cals is 0
  int founds[num_cals > 0 ? num_cals : 1];
  int found = 0;

  for (size_t j = 0; j < num_cals; j++) {
    founds[j] = 0;
    cal = array_list_get(j, list);
    LOG_DEBUG_F("\tcal %i of %i: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n",
                (int) j, (int) num_cals, cal->sr_list->size, cal->num_seeds,
                cal->chromosome_id, cal->start, cal->end);

    if (cal->sr_list->size > 0) {
      int start = 0;             // first read position the next seed may use
      size_t genome_start = 0;   // first genome position the next seed may use
      int first = 1;

      for (linked_list_item_t *list_item = cal->sr_list->first;
           list_item != NULL;
           list_item = list_item->next) {
        seed_region_t *s = list_item->item;

        LOG_DEBUG_F("\t\t:: star %lu > %lu s->read_start\n", (unsigned long) start, s->read_start);
        LOG_DEBUG_F("\t\t:: read_star %lu > read_end %lu \n", s->read_start, s->read_end);

        // seed overlaps the previous one in the read, or is empty/inverted
        if (start > s->read_start || s->read_start >= s->read_end) {
          LOG_DEBUG("\t\t\t:: remove\n");
          found = 1;
          founds[j] = 1;
        }

        // seed goes backwards in the genome, or jumps too far ahead
        if (!first &&
            ((s->genome_start < genome_start) ||
             (s->genome_start - genome_start) > 2 * read_length)) {
          found = 1;
          founds[j] = 1;
        }

        first = 0;
        start = s->read_end + 1;
        genome_start = s->genome_end + 1;
      }
    } else {
      // a CAL without seed regions is always discarded
      found = 1;
      founds[j] = 1;
    }
  }

  if (found) {
    cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
    for (size_t j = 0; j < num_cals; j++) {
      if (!founds[j]) {
        cal = array_list_get(j, list);
        cal->num_seeds = cal->sr_list->size;
        array_list_insert(cal, cal_list);
        // detach the survivor so that freeing the old list does not free it
        array_list_set(j, NULL, list);
      }
    }
    array_list_free(list, (void *) cal_free);
    list = cal_list;
  }

  // keep at most max_kept_cals CALs, dropping the ones at the end
  const size_t max_kept_cals = 100;
  num_cals = array_list_size(list);
  for (size_t j = num_cals; j > max_kept_cals; j--) {
    cal_free(array_list_remove_at(j - 1, list));
  }

  return list;
}