/**
 * Create a BED file handle for @p filename.
 *
 * The file contents are obtained through mmap_file() and stored, together
 * with their length, in a freshly allocated bed_file_t. Two empty linked
 * lists (header entries and records) are created in
 * COLLECTION_MODE_SYNCHRONIZED.
 *
 * The handle stores the @p filename pointer itself, not a copy, so the
 * caller must keep that string alive for as long as the handle is used.
 *
 * @param filename Path of the BED file to open.
 * @return The new handle, or NULL if the handle itself cannot be allocated.
 */
bed_file_t *bed_open(char *filename) {
    // Allocate the handle first: if this fails we return before anything
    // has been mapped, so there is nothing to undo.
    bed_file_t *bed_file = malloc(sizeof *bed_file);
    if (bed_file == NULL) {
        return NULL;
    }

    size_t len = 0;
    // NOTE(review): the failure convention of mmap_file() is not visible
    // here (NULL? MAP_FAILED? abort?) - confirm and handle it if needed.
    char *data = mmap_file(&len, filename);

    bed_file->filename = filename;
    bed_file->data = data;
    bed_file->data_len = len;
    bed_file->header_entries = linked_list_new(COLLECTION_MODE_SYNCHRONIZED);
    bed_file->records = linked_list_new(COLLECTION_MODE_SYNCHRONIZED);

    return bed_file;
}
/**
 * Wrap a single BWT anchor into a CAL holding exactly one seed region.
 *
 * The seed region spans [read_start, read_end] on the read and
 * [bwt_anchor->start, bwt_anchor->end] on the genome. The CAL is created
 * with chromosome id `bwt_anchor->chromosome + 1`, the strand of the anchor
 * and a seed count of 1.
 *
 * @param bwt_anchor Anchor to convert (only read, not released here).
 * @param read_start Start position of the seed on the read.
 * @param read_end   End position of the seed on the read.
 * @return The newly created CAL.
 */
cal_t *convert_bwt_anchor_to_CAL(bwt_anchor_t *bwt_anchor, size_t read_start, size_t read_end) {
  linked_list_t *sr_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);

  // The only seed region of the CAL: the anchor itself.
  linked_list_insert_first(seed_region_new(read_start, read_end,
                                           bwt_anchor->start, bwt_anchor->end,
                                           0, 0, 0),
                           sr_list);

  // cal_new() takes a second list, which starts out empty.
  linked_list_t *aux_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);

  return cal_new(bwt_anchor->chromosome + 1,
                 bwt_anchor->strand,
                 bwt_anchor->start,
                 bwt_anchor->end,
                 1,
                 sr_list,
                 aux_list);
}
/**
 * Allocate a suffix manager sized for the chromosomes of @p genome.
 *
 * For every chromosome index i in [0, genome->num_chroms) the manager gets:
 *   - an empty linked list (COLLECTION_MODE_ASYNCHRONIZED) in suffix_lists[i];
 *   - an entry in the `subject` container: a pointer to a 64-byte buffer
 *     holding the decimal text of i.
 *
 * @param genome Genome descriptor; only num_chroms is read.
 * @return The new manager, with num_seeds set to 0.
 */
suffix_mng_t *suffix_mng_new(sa_genome3_t *genome) {
  suffix_mng_t *mng = calloc(1, sizeof *mng);
  const int n_chroms = genome->num_chroms;

  Container *names = malloc(sizeof *names);
  bl_containerInit(names, n_chroms, sizeof(char *));

  linked_list_t **lists = malloc(sizeof *lists * n_chroms);

  for (int chrom = 0; chrom < n_chroms; chrom++) {
    lists[chrom] = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);

    // The container stores the pointer value, hence the address-of below.
    char *label = calloc(64, sizeof(char));
    sprintf(label, "%i", chrom);
    bl_containerAdd(names, &label);
  }

  mng->num_seeds = 0;
  mng->num_chroms = n_chroms;
  mng->subject = names;
  mng->suffix_lists = lists;

  return mng;
}
void test_insert() { // regular insert linked_list* ll = linked_list_new(); linked_list_insert(ll, 2); linked_list_insert(ll, 4); linked_list_insert(ll, 6); assert_equals(linked_list_size(ll), 3); assert_true(linked_list_contains(ll, 2)); assert_true(linked_list_contains(ll, 4)); assert_true(linked_list_contains(ll, 6)); assert_false(linked_list_contains(ll, -1)); assert_false(linked_list_contains(ll, 0)); linked_list_clear(ll); linked_list* ll = linked_list_new(); assert_equals(linked_list_size(ll), 0); assert_false(linked_list_contains(ll, 1)); linked_list_insert(ll, 1); assert_equals(linked_list_size(ll), 1); assert_true(linked_list_contains(ll, 1)); linked_list_clear(ll); // insert by index // set by index }
/**
 * Bring a BAM framework structure to its initial, empty state.
 *
 * Every byte of the structure is cleared, an empty regions list is created
 * in COLLECTION_MODE_SYNCHRONIZED, and the four OpenMP locks of the
 * structure are initialised.
 *
 * @param fwork Structure to initialise; must not be NULL (asserted).
 */
void bfwork_init(bam_fwork_t *fwork)
{
	assert(fwork);

	// Start from an all-zero structure so every field has a known value
	memset(fwork, 0, sizeof *fwork);

	// Empty list of regions
	fwork->regions_list = linked_list_new(COLLECTION_MODE_SYNCHRONIZED);

	// Locks
	omp_init_lock(&fwork->regions_lock);
	omp_init_lock(&fwork->free_slots);
	omp_init_lock(&fwork->output_file_lock);
	omp_init_lock(&fwork->reference_lock);
}
void test_lookup() { // get linked_list* ll = linked_list_new(); int i; for (i = 0; i < 100; i++) { linked_list_insert(ll, i * 2); } assert_equals(linked_list_get(ll, 0), 0); assert_equals(linked_list_get(ll, 99), 99 * 2); assert_equals(linked_list_get(ll, 50), 50 * 2); // contains assert_true(linked_list_contains(ll, 10)); assert_false(linked_list_contains(ll, 1)); // peek //assert_equals(linked_list_first(ll), 0); // last //assert_equals(linked_list_last(ll), 99 * 2); // index of }
//==================================================================================== // apply_caling //==================================================================================== int apply_caling(cal_seeker_input_t* input, batch_t *batch) { mapping_batch_t *mapping_batch = batch->mapping_batch; array_list_t *list = NULL; size_t read_index, num_cals; int min_seeds, max_seeds; cal_t *cal; array_list_t *cal_list; fastq_read_t *read; size_t num_chromosomes = input->genome->num_chromosomes + 1; size_t num_targets = mapping_batch->num_targets; size_t *targets = mapping_batch->targets; size_t new_num_targets = 0; array_list_t *region_list; bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw; linked_list_t *linked_list; int anchor_nt, gap_nt; seed_region_t *seed_region_start, *seed_region_end; //max_seeds = input->cal_optarg->num_seeds; // size_t *new_targets = (size_t *) calloc(num_targets, sizeof(size_t)); // set to zero mapping_batch->num_to_do = 0; for (size_t i = 0; i < num_targets; i++) { read_index = targets[i]; read = array_list_get(read_index, mapping_batch->fq_batch); region_list = mapping_batch->mapping_lists[read_index]; // for debugging // LOG_DEBUG_F("%s\n", ((fastq_read_t *) array_list_get(read_index, mapping_batch->fq_batch))->id); if (!list) { list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); } if (array_list_get_flag(region_list) == 0 || array_list_get_flag(region_list) == 2) { //We have normal and extend seeds (anchors) max_seeds = (read->length / 15)*2 + 10; num_cals = bwt_generate_cal_list_linked_list(region_list, input->cal_optarg, &min_seeds, &max_seeds, num_chromosomes, list, read->length, input->cal_optarg->min_cal_size, 0); } else { //We have double anchors with smaller distance between they //printf("Easy case... 
Two anchors and same distance between read gap and genome distance\n"); num_cals = 0; for (int a = array_list_size(region_list) - 1; a >= 0; a -= 2) { max_seeds = 2; min_seeds = 2; bwt_anchor_back = array_list_remove_at(a, region_list); bwt_anchor_forw = array_list_remove_at(a - 1, region_list); linked_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED); //Seed for the first anchor anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start; //printf("\t seed0[%i-%i][%lu-%lu]\n", 0, anchor_nt - 1, // bwt_anchor_forw->start, bwt_anchor_forw->end); seed_region_start = seed_region_new(0, anchor_nt - 1, bwt_anchor_forw->start, bwt_anchor_forw->end, 0, 0, 0); //Seed for the first anchor gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start)); //printf("\t gap_nt = %i, anchor_nt = %i\n", gap_nt, anchor_nt); //printf("\t seed1[%i-%i][%lu-%lu]\n", anchor_nt + gap_nt, read->length - 1, // bwt_anchor_back->start + 1, bwt_anchor_back->end); seed_region_end = seed_region_new(anchor_nt + gap_nt, read->length - 1, bwt_anchor_back->start + 1, bwt_anchor_back->end, 1, 0, 0); //The reference distance is 0 and the read distance not //The read distance is 0 and the reference distance not //if (seed_region_start->genome_end > seed_region_end->genome_start || // seed_region_start->read_end > seed_region_end->read_start) { //array_list_clear(region_list, NULL); //continue; if (seed_region_end->genome_start - seed_region_start->genome_end < 5 || seed_region_end->read_start - seed_region_start->read_end < 5) { seed_region_start->genome_end -= 5; seed_region_start->read_end -= 5; seed_region_end->genome_start += 5; seed_region_end->read_start += 5; } linked_list_insert(seed_region_start, linked_list); linked_list_insert_last(seed_region_end, linked_list); cal = cal_new(bwt_anchor_forw->chromosome + 1, bwt_anchor_forw->strand, bwt_anchor_forw->start, bwt_anchor_back->end + 1, 2, linked_list, linked_list_new(COLLECTION_MODE_ASYNCHRONIZED)); 
array_list_insert(cal, list); num_cals++; } } // for debugging LOG_DEBUG_F("read %s : num. cals = %i, min. seeds = %i, max. seeds = %i\n", read->id, num_cals, min_seeds, max_seeds); /* if (num_cals == 0) { int seed_size = 24; //First, Delete old regions array_list_clear(mapping_batch->mapping_lists[read_index], region_bwt_free); //Second, Create new regions with seed_size 24 and 1 Mismatch bwt_map_inexact_seeds_seq(read->sequence, seed_size, seed_size/2, bwt_optarg, bwt_index, mapping_batch->mapping_lists[read_index]); num_cals = bwt_generate_cal_list_linked_list(mapping_batch->mapping_lists[mapping_batch->targets[i]], input->cal_optarg, &min_seeds, &max_seeds, num_chromosomes, list, read->length); }*/ /* for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); LOG_DEBUG_F("\tchr: %i, strand: %i, start: %lu, end: %lu, num_seeds = %i, num. regions = %lu\n", cal->chromosome_id, cal->strand, cal->start, cal->end, cal->num_seeds, cal->sr_list->size); } */ // printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", // min_seeds, max_seeds, min_limit, array_list_size(list)); // filter incoherent CALs int founds[num_cals], found = 0; for (size_t j = 0; j < num_cals; j++) { founds[j] = 0; cal = array_list_get(j, list); LOG_DEBUG_F("\tcal %i of %i: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n", j, num_cals, cal->sr_list->size, cal->num_seeds, cal->chromosome_id, cal->start, cal->end); if (cal->sr_list->size > 0) { int start = 0; for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) { seed_region_t *s = list_item->item; LOG_DEBUG_F("\t\t:: star %lu > %lu s->read_start\n", start, s->read_start); if (start > s->read_start) { LOG_DEBUG("\t\t\t:: remove\n"); found++; founds[j] = 1; } start = s->read_end + 1; } } else { found++; founds[j] = 1; } } if (found) { min_seeds = 100000; max_seeds = 0; cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < 
num_cals; j++) { if (!founds[j]) { cal = array_list_get(j, list); cal->num_seeds = cal->sr_list->size; if (cal->num_seeds > max_seeds) max_seeds = cal->num_seeds; if (cal->num_seeds < min_seeds) min_seeds = cal->num_seeds; array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_free(list, (void *) cal_free); num_cals = array_list_size(cal_list); list = cal_list; } // LOG_FATAL_F("num. cals = %i, min. seeds = %i, max. seeds = %i\n", num_cals, min_seeds, max_seeds); // filter CALs by the number of seeds cal_list = list; list = NULL; /* int min_limit = input->cal_optarg->min_num_seeds_in_cal; if (min_limit < 0) min_limit = max_seeds; // min_limit -= 3; if (min_seeds == max_seeds || min_limit <= min_seeds) { cal_list = list; list = NULL; } else { cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); if (cal->num_seeds >= min_limit) { array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_clear(list, (void *) cal_free); num_cals = array_list_size(cal_list); } */ if (num_cals > MAX_CALS) { for (size_t j = num_cals - 1; j >= MAX_CALS; j--) { cal = (cal_t *) array_list_remove_at(j, cal_list); cal_free(cal); } num_cals = array_list_size(cal_list); } // LOG_DEBUG_F("num. 
cals = %i, MAX_CALS = %i\n", num_cals, MAX_CALS); if (num_cals > 0 && num_cals <= MAX_CALS) { array_list_set_flag(2, cal_list); targets[new_num_targets++] = read_index; /* int count1 = 0, count2 = 0; // count number of sw to do // method #1 // printf("method #1\n"); seed_region_t *s, *prev_s; linked_list_iterator_t* itr; for (size_t j = 0; j < num_cals; j++) { prev_s = NULL; cal = array_list_get(j, cal_list); itr = linked_list_iterator_new(cal->sr_list); s = (seed_region_t *) linked_list_iterator_curr(itr); while (s != NULL) { if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) { // printf("\t\t\tcase 1\n"); count1++; } prev_s = s; linked_list_iterator_next(itr); s = linked_list_iterator_curr(itr); } if (prev_s != NULL && prev_s->read_end < read->length - 1) { count1++; // printf("\t\t\tcase 2 (%i < %i)\n", prev_s->read_end, read->length - 1); } linked_list_iterator_free(itr); } // method #2 printf("method #2\n"); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, cal_list); printf("\t: %i\n", j); if (cal->sr_list->size > 0) { int start = 0; for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) { seed_region_t *s = list_item->item; printf("\t\t[%i|%i - %i|%i]\n", s->genome_start, s->read_start, s->read_end, s->genome_end); if (s->read_start != start) { count2++; } start = s->read_end + 1; } if (start < read->length) { count2++; } } } printf("count #1 = %i, count #2 = %i\n", count1, count2); assert(count1 == count2); mapping_batch->num_to_do += count1; */ // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; } else { array_list_set_flag(0, mapping_batch->mapping_lists[read_index]); // we have to free the region list array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); if (cal_list) array_list_free(cal_list, (void *) cal_free); if (list) 
array_list_clear(list, (void *) cal_free); } /* cal_list = list; list = NULL; array_list_set_flag(2, cal_list); // mapping_batch->num_to_do += num_cals; targets[new_num_targets++] = read_index; // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; */ /* // filter CALs by the number of seeds int min_limit = input->cal_optarg->min_num_seeds_in_cal; if (min_limit < 0) min_limit = max_seeds; printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", min_seeds, max_seeds, min_limit, array_list_size(list)); if (min_seeds == max_seeds || min_limit <= min_seeds) { cal_list = list; list = NULL; } else { cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); if (cal->num_seeds >= min_limit) { array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_clear(list, (void *) cal_free); num_cals = array_list_size(cal_list); printf("************, num_cals = %i\n", num_cals); } if (num_cals > MAX_CALS) { for (size_t j = num_cals - 1; j >= MAX_CALS; j--) { cal = (cal_t *) array_list_remove_at(j, cal_list); cal_free(cal); } num_cals = array_list_size(cal_list); } if (num_cals > 0 && num_cals <= MAX_CALS) { array_list_set_flag(2, cal_list); mapping_batch->num_to_do += num_cals; targets[new_num_targets++] = read_index; // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; } else { array_list_set_flag(0, mapping_batch->mapping_lists[read_index]); // we have to free the region list array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); if (cal_list) array_list_free(cal_list, (void *) cal_free); if (list) array_list_clear(list, (void *) cal_free); } */ } // end for 0 ... 
num_targets // update batch mapping_batch->num_targets = new_num_targets; // LOG_DEBUG_F("num. SW to do: %i\n", mapping_batch->num_to_do); // exit(-1); // free memory if (list) array_list_free(list, NULL); if (batch->mapping_mode == RNA_MODE) { return RNA_STAGE; } if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) { return PRE_PAIR_STAGE; } else if (batch->mapping_batch->num_targets > 0) { return SW_STAGE; } return DNA_POST_PAIR_STAGE; }
/**
 * Chain the seeds stored in the suffix manager and turn every accepted
 * chain into a CAL.
 *
 * Steps:
 *   1. every seed of every per-chromosome suffix list becomes one fragment;
 *   2. the fragments are sorted with cmp_slmatch_qsort;
 *   3. each run of fragments with the same subject is clustered
 *      (bl_slClusterSop or bl_slClusterLin, depending on info.chainmode);
 *   4. every chain that reaches info.minscore and info.minfrag is converted
 *      into a seed_cal_t, extended with extend_seeds(), and appended to
 *      @p cal_list if its read_area is at least @p min_area (otherwise it
 *      is released);
 *   5. the chains are destroyed and the suffix manager is cleared.
 *
 * Nothing is done when @p p is NULL, has no suffix lists, or holds no seeds.
 *
 * @param read     Read the CALs belong to (stored in each CAL).
 * @param min_area Minimum read_area a CAL needs to be kept.
 * @param strand   Strand assigned to the created seeds and CALs.
 * @param sa_index Index passed on to extend_seeds().
 * @param cal_list Output list receiving the accepted CALs.
 * @param p        Suffix manager; cleared before returning.
 */
void suffix_mng_create_cals(fastq_read_t *read, int min_area, int strand,
                            sa_index3_t *sa_index, array_list_t *cal_list,
                            suffix_mng_t *p) {
  if (!p || !p->suffix_lists || p->num_seeds <= 0) {
    return;
  }

  claspinfo_t info;
  bl_claspinfoInit(&info);

  // The fragment container is owned by `info`; the subject container is
  // borrowed from the manager.
  info.fragments = (Container *) malloc(sizeof(Container));
  bl_containerInit(info.fragments, p->num_seeds, sizeof(slmatch_t));
  info.subject = p->subject;

  // 1) one fragment per stored seed
  slmatch_t fragment;
  for (unsigned int c = 0; c < p->num_chroms; c++) {
    linked_list_t *suffixes = p->suffix_lists[c];
    if (!suffixes) {
      continue;
    }

    linked_list_item_t *node = suffixes->first;
    while (node != NULL) {
      seed_t *src = node->item;

      bl_slmatchInit(&fragment, 0);
      fragment.i = src->read_start;
      fragment.j = src->read_end - src->read_start + 1;
      fragment.p = src->genome_start;
      fragment.q = src->genome_end - src->genome_start + 1;
      fragment.scr = src->genome_end - src->genome_start + 1;
      fragment.subject = src->chromosome_id;
      bl_containerAdd(info.fragments, &fragment);

      node = node->next;
    }
  }

  // 2) sort fragments
  qsort(info.fragments->contspace, bl_containerSize(info.fragments),
        sizeof(slmatch_t), cmp_slmatch_qsort);

  // 3) + 4) process the fragments in runs [first, last) sharing a subject
  int first = 0;
  for (int last = 1; last <= bl_containerSize(info.fragments); last++) {
    // A run ends at the end of the fragment list or where the subject
    // changes; anything else just extends the current run.
    if (last != bl_containerSize(info.fragments) &&
        ((slmatch_t *) bl_containerGet(info.fragments, first))->subject ==
        ((slmatch_t *) bl_containerGet(info.fragments, last))->subject) {
      continue;
    }

    slmatch_t *run = (slmatch_t *) info.fragments->contspace + first;
    if (info.chainmode == SOP) {
      // only use chaining without clustering if no ids are specified
      bl_slClusterSop(run, last - first, info.epsilon, info.lambda, info.maxgap);
    } else {
      bl_slClusterLin(run, last - first, info.epsilon, info.lambda, info.maxgap);
    }

    for (int m = first; m < last; m++) {
      slmatch_t *match = (slmatch_t *) bl_containerGet(info.fragments, m);
      if (!match->chain) {
        continue;
      }

      slchain_t *chain = (slchain_t *) match->chain;

      if (chain->scr >= info.minscore &&
          bl_containerSize(chain->matches) >= info.minfrag) {
        // The subject entry holds the chromosome index as decimal text.
        int chrom_id = atoi(*(char **) bl_containerGet(info.subject, chain->subject));
        // Accumulated but not read afterwards in this function.
        int area = 0;

        linked_list_t *chain_seeds = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);
        for (int k = 0; k < bl_containerSize(chain->matches); k++) {
          slmatch_t *link = *(slmatch_t **) bl_containerGet(chain->matches, k);

          seed_t *s = seed_new(link->i, link->i + link->j - 1,
                               link->p, link->p + link->q - 1);
          s->chromosome_id = chrom_id;
          s->strand = strand;
          area += link->j;
          cigar_append_op(link->j, '=', &s->cigar);

          linked_list_insert_last(s, chain_seeds);
        }

        // extend seeds
        seed_cal_t *new_cal = seed_cal_new(chrom_id, strand, chain->p,
                                           chain->p + chain->q - 1, chain_seeds);
        new_cal->read = read;
        extend_seeds(new_cal, sa_index);
        seed_cal_update_info(new_cal);

        if (new_cal->read_area >= min_area) {
          array_list_insert(new_cal, cal_list);
        } else {
          seed_cal_free(new_cal);
        }
      }

      // The chain is released whether or not it produced a CAL.
      bl_slchainDestruct(chain);
      free(chain);
      match->chain = NULL;
    }

    first = last;
  }

  // 5) destruct everything; the subject container is detached first so it
  // stays with the manager.
  info.subject = NULL;
  bl_claspinfoDestruct(&info);

  // finally, clear suffix manager
  suffix_mng_clear(p);
}