//{{{void test_init_file(void)
/*
 * Opens ../data/1k.sort.bed.gz through input_file_init(), immediately tears
 * it down again, and checks that input_file_destroy() left the caller's
 * handle set to NULL.
 */
void test_init_file(void)
{
    struct input_file *bed = input_file_init("../data/1k.sort.bed.gz");

    /* The destroy call receives the address of the handle. */
    input_file_destroy(&bed);

    TEST_ASSERT_EQUAL(NULL, bed);
}
//}}}
//{{{void test_bed_file_read(void)
/*
 * Reads ../data/1k.unsort.bed.gz one interval at a time and checks that
 *   - each of the first ten reads succeeds and yields the expected
 *     chromosome, start and end,
 *   - after the reader stops returning intervals, chrm/start/end hold
 *     chr1:25895359-25896171, and
 *   - 1000 intervals were read in total.
 */
void test_bed_file_read(void)
{
    struct input_file *i = input_file_init("../data/1k.unsort.bed.gz");

    /*
     * Scratch buffer for the chromosome name. It is handed to the reader by
     * address together with its length (presumably so the reader can resize
     * it -- confirm against input_file_get_next_interval).
     */
    int chrm_len = 10;
    char *chrm = malloc(chrm_len * sizeof(char));
    TEST_ASSERT_EQUAL(1, chrm != NULL);

    uint32_t start, end;
    long offset;

    const char *chrm_A[10] = {
        "chr11", "chr11", "chr10", "chr16", "chr15",
        "chr19", "chr19", "chr18", "chr21", "chr7"
    };
    uint32_t start_A[10] = {
        64691252, 129871988, 74031859, 3070038, 93346819,
        4374094, 42805980, 3602738, 9825304, 2393484
    };
    uint32_t end_A[10] = {
        64692359, 129873775, 74037598, 3072761, 93347932,
        4376369, 42807400, 3605403, 9827741, 2394629
    };

    int j;
    for (j = 0; j < 10; ++j) {
        int ret = input_file_get_next_interval(i,
                                               &chrm,
                                               &chrm_len,
                                               &start,
                                               &end,
                                               &offset);

        /*
         * A negative return is what ends the read loop below, so treat it
         * as failure here: comparing chrm/start/end after a failed read
         * would inspect values that were never written.
         */
        TEST_ASSERT_EQUAL(1, ret >= 0);

        TEST_ASSERT_EQUAL(0, strcmp(chrm_A[j], chrm));
        TEST_ASSERT_EQUAL(start_A[j], start);
        TEST_ASSERT_EQUAL(end_A[j], end);
    }

    /* j is 10 here; keep counting until the reader reports no more data. */
    while (input_file_get_next_interval(i,
                                        &chrm,
                                        &chrm_len,
                                        &start,
                                        &end,
                                        &offset) >= 0) {
        ++j;
    }

    /* The outputs are expected to still hold the last interval read. */
    TEST_ASSERT_EQUAL(0, strcmp("chr1", chrm));
    TEST_ASSERT_EQUAL(25895359, start);
    TEST_ASSERT_EQUAL(25896171, end);
    TEST_ASSERT_EQUAL(1000, j);

    /* The scratch buffer was never released in the original test. */
    free(chrm);
    input_file_destroy(&i);
}
//}}}
void test_get_file_stats(void) { struct input_file *i = input_file_init("../data/1k.unsort.bed.gz"); struct unordered_list *file_index = unordered_list_init(1); struct file_data *fd = (struct file_data *) calloc(1, sizeof(struct file_data)); uint32_t file_id = unordered_list_add(file_index, fd); fd->file_name = strdup("../data/1k.unsort.bed.gz"); fd->num_intervals = 0; fd->mean_interval_size = 0; int chrm_len = 10; char *chrm = (char *)malloc(chrm_len*sizeof(char)); uint32_t start, end; long offset; uint32_t j = 0; struct file_id_offset_pair *p; uint32_t intrv_id; while (input_file_get_next_interval(i, &chrm, &chrm_len, &start, &end, &offset) >= 0) { fd->mean_interval_size += end-start; fd->num_intervals += 1; } fd->mean_interval_size = fd->mean_interval_size/fd->num_intervals; input_file_destroy(&i); free(chrm); char *out_file_name = "test_file_data_read_write.tmp"; FILE *f = fopen(out_file_name, "wb"); unordered_list_store(file_index, f, out_file_name, file_data_store); fclose(f); f = fopen(out_file_name, "rb"); struct unordered_list *file_index_r = unordered_list_load(f, out_file_name, file_data_load); struct file_data *fd_r = (struct file_data *) unordered_list_get(file_index_r, file_id); TEST_ASSERT_EQUAL(0, strcmp(fd->file_name, fd_r->file_name)); TEST_ASSERT_EQUAL(fd->num_intervals, fd_r->num_intervals); TEST_ASSERT_EQUAL(fd->mean_interval_size, fd_r->mean_interval_size); unordered_list_destroy(&file_index, file_data_free); unordered_list_destroy(&file_index_r, file_data_free); remove(out_file_name); }
int main(int argc, char **argv) { WAH_SIZE = 32; WAH_MAX_FILL_WORDS = (1<<(WAH_SIZE-1)) - 1; uint32_t num_chrms = 100; if ((argc != 4)) { errx(1, "usage:\t%s <input file> <index dir> <w|i>", argv[0]); } double genome_size = 3095677412.0; char *input_file = argv[1]; char *index_dir = argv[2]; char *i_type = argv[3]; struct input_file *in_f = input_file_init(input_file); int chrm_len = 50; char *chrm = (char *)malloc(chrm_len*sizeof(char)); uint32_t start, end; long offset; struct giggle_index *gi; gi = giggle_load(index_dir, uint32_t_ll_giggle_set_data_handler); uint32_t *file_counts = (uint32_t *) calloc(gi->file_index->num, sizeof(uint32_t)); uint32_t num_intervals = 0; double mean_interval_size = 0.0; while ( in_f->input_file_get_next_interval(in_f, &chrm, &chrm_len, &start, &end, &offset) >= 0 ) { num_intervals += 1; mean_interval_size += end - start; struct uint32_t_ll *R = (struct uint32_t_ll *)giggle_query_region(gi, chrm, start, end); if (R != NULL) { struct uint32_t_ll_node *curr = R->head; while (curr != NULL) { /* struct file_id_offset_pair *fid_off = (struct file_id_offset_pair *) unordered_list_get(gi->offset_index, curr->val); */ struct file_id_offset_pair fid_off = gi->offset_index->vals[curr->val]; struct file_data *fd = (struct file_data *) unordered_list_get(gi->file_index, fid_off.file_id); file_counts[fid_off.file_id] += 1; curr = curr->next; } uint32_t_ll_free((void **)&R); } } mean_interval_size = mean_interval_size/num_intervals; struct doubles_uint32_t_tuple *sig = (struct doubles_uint32_t_tuple *) calloc(gi->file_index->num, sizeof(struct doubles_uint32_t_tuple)); uint32_t i; for (i = 0; i < gi->file_index->num; ++i) { struct file_data *fd = (struct file_data *) unordered_list_get(gi->file_index, i); long long n11 = (long long)(file_counts[i]); long long n12 = (long long)(MAX(0,num_intervals - file_counts[i])); long long n21 = (long long)(MAX(0,fd->num_intervals - file_counts[i])); double comp_mean = 
((fd->mean_interval_size+mean_interval_size)); long long n22_full = (long long) MAX(n11 + n12 + n21, genome_size/comp_mean); long long n22 = MAX(0, n22_full - (n11 + n12 + n21)); double left, right, two; double r = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two); double ratio = (((double)n11/(double)n12) / ((double)n21/(double)n22)); //fprintf(stderr, "%s\t%f\n", fd->file_name, two); sig[i].d1 = right; sig[i].d2 = ratio; sig[i].u1 = i; sig[i].u2 = file_counts[i]; } qsort(sig, gi->file_index->num, sizeof(struct doubles_uint32_t_tuple), doubles_uint32_t_tuple_cmp); for (i = 0; i < gi->file_index->num; ++i) { struct file_data *fd = (struct file_data *) unordered_list_get(gi->file_index, sig[i].u1); /* printf("%s\t" "right:%f\t" "%f\n", fd->file_name, sig[i].d1, sig[i].d2); */ printf( "sig:%f\t" "size:%u\t" "overlap:%u\t" "ratio:%f\t" "%s\n", sig[i].d1, fd->num_intervals, sig[i].u2, sig[i].d2, fd->file_name); } giggle_index_destroy(&gi); cache.destroy(); }