/**
 * Load a reference vector from "<directory>/<name>.vec".
 *
 * The file is read as two uint64_t header fields (n, then dollar) followed
 * by n bytes of payload. The payload is placed in a newly malloc'd buffer of
 * n + 1 bytes whose final byte is set to 0; the buffer is not freed here.
 *
 * Open, read and allocation failures are reported through the check_*
 * helpers. A path that does not fit the local buffer, or a length that
 * cannot be allocated, terminates the program with a message on stderr.
 */
void read_ref_vector(ref_vector *vector, const char *directory, const char *name) {

  size_t err = 0;
  FILE *fp;
  char path[500];

  /* Bounded formatting: the previous strcat chain could overrun path[]. */
  int path_len = snprintf(path, sizeof path, "%s/%s.vec", directory, name);
  if (path_len < 0 || (size_t) path_len >= sizeof path) {
    fprintf(stderr, "read_ref_vector: path too long: %s/%s.vec\n", directory, name);
    exit(EXIT_FAILURE);
  }

  fp = fopen(path, "rb+");
  check_file_open(fp, path);

  err = fread(&vector->n, sizeof(uint64_t), 1, fp);
  check_file_read(err, 1, path);

  err = fread(&vector->dollar, sizeof(uint64_t), 1, fp);
  check_file_read(err, 1, path);

  /* n comes from the file; make sure n + 1 cannot wrap the allocation size. */
  if (vector->n >= SIZE_MAX) {
    fprintf(stderr, "read_ref_vector: invalid vector length in %s\n", path);
    exit(EXIT_FAILURE);
  }

  /* One extra byte for the trailing 0 written below
   * (original note: "Valgrind errors on dbwt"). */
  vector->vector = (uint8_t *) malloc((vector->n + 1) * sizeof(uint8_t));
  check_malloc(vector->vector, path);

  err = fread(vector->vector, sizeof(uint8_t), vector->n, fp);
  check_file_read(err, vector->n, path);

  vector->vector[vector->n] = 0;

  fclose(fp);

}
/**
 * Load a compressed vector from "<directory>/<name>.vec".
 *
 * Reads three SA_TYPE header fields in file order (siz, n, ratio) and then
 * n SA_TYPE elements into a newly malloc'd array stored in vector->vector.
 * The array is not freed here.
 *
 * Open, read and allocation failures are reported through the check_*
 * helpers. A path that does not fit the local buffer terminates the program
 * with a message on stderr.
 */
void read_comp_vector(comp_vector *vector, const char *directory, const char *name) {

  size_t err = 0;
  FILE *fp;
  char path[500];

  /* Bounded formatting: the previous strcat chain could overrun path[]. */
  int path_len = snprintf(path, sizeof path, "%s/%s.vec", directory, name);
  if (path_len < 0 || (size_t) path_len >= sizeof path) {
    fprintf(stderr, "read_comp_vector: path too long: %s/%s.vec\n", directory, name);
    exit(EXIT_FAILURE);
  }

  fp = fopen(path, "rb+");
  check_file_open(fp, path);

  err = fread(&vector->siz, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  err = fread(&vector->n, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  err = fread(&vector->ratio, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  vector->vector = (SA_TYPE *) malloc(vector->n * sizeof(SA_TYPE));
  check_malloc(vector->vector, path);

  err = fread(vector->vector, sizeof(SA_TYPE), vector->n, fp);
  check_file_read(err, vector->n, path);

  fclose(fp);

}
/**
 * Read the BIM header that follows the generic GQT header in b->file.
 *
 * Seeks to sizeof(struct gqt_file_header), then reads u_size, c_size and
 * h_size (one uint64_t each) followed by b->gqt_header->num_variants
 * uint64_t values into a newly allocated md_line_lens array.
 *
 * Returns a newly malloc'd header; neither the header nor md_line_lens is
 * freed here. Seek and allocation failures exit through err(); short reads
 * are reported through check_file_read().
 */
struct bim_file_header *read_bim_file_header(struct bim_file *b)
{
    struct bim_file_header *h = (struct bim_file_header *)
            malloc(sizeof(struct bim_file_header));
    /* Previously unchecked: a failed allocation was written through below. */
    if (!h)
        err(EX_OSERR, "malloc error");

    if (fseek(b->file, sizeof(struct gqt_file_header), SEEK_SET))
        err(EX_IOERR,
            "Error seeking to header in BIM file '%s'.",
            b->file_name);

    size_t fr = fread(&(h->u_size), sizeof(uint64_t), 1, b->file);
    check_file_read(b->file_name, b->file, 1, fr);

    fr = fread(&(h->c_size), sizeof(uint64_t), 1, b->file);
    check_file_read(b->file_name, b->file, 1, fr);

    fr = fread(&(h->h_size), sizeof(uint64_t), 1, b->file);
    check_file_read(b->file_name, b->file, 1, fr);

    h->md_line_lens = (uint64_t *)
            malloc(b->gqt_header->num_variants*sizeof(uint64_t));
    if (!(h->md_line_lens))
        err(EX_OSERR, "malloc error");

    fr = fread(h->md_line_lens,
               sizeof(uint64_t),
               b->gqt_header->num_variants,
               b->file);
    check_file_read(b->file_name, b->file, b->gqt_header->num_variants, fr);

    return h;
}
/**
 * Load a vector from "<directory>/<name>.vec".
 *
 * Reads the element count n (one SA_TYPE) and then n SA_TYPE elements into
 * a newly malloc'd array stored in vector->vector. The array is not freed
 * here.
 *
 * Open, read and allocation failures are reported through the check_*
 * helpers. A path that does not fit the local buffer terminates the program
 * with a message on stderr instead of overflowing the buffer.
 */
void read_vector(vector *vector, const char *directory, const char *name) {

  size_t err = 0;
  FILE *fp;
  char path[500];

  /* Bounded formatting: the previous strcat chain could overrun path[]. */
  int path_len = snprintf(path, sizeof path, "%s/%s.vec", directory, name);
  if (path_len < 0 || (size_t) path_len >= sizeof path) {
    fprintf(stderr, "read_vector: path too long: %s/%s.vec\n", directory, name);
    exit(EXIT_FAILURE);
  }

  fp = fopen(path, "rb+");
  check_file_open(fp, path);

  err = fread(&vector->n, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  vector->vector = (SA_TYPE *) malloc(vector->n * sizeof(SA_TYPE));
  check_malloc(vector->vector, path);

  err = fread(vector->vector, sizeof(SA_TYPE), vector->n, fp);
  check_file_read(err, vector->n, path);

  fclose(fp);

}
//{{{ struct wah_file init_wah_file(char *file_name) struct wah_file init_wah_file(char *file_name) { struct wah_file wf; wf.file_name = strdup(file_name); wf.file = fopen(file_name, "rb"); if (!wf.file) err(EX_NOINPUT, "Cannot open file \"%s\"", file_name); // Jump to the begining of the file to grab the record size fseek(wf.file, 0, SEEK_SET); size_t fr = fread(&wf.num_fields,sizeof(uint32_t),1,wf.file); check_file_read(file_name, wf.file, 1, fr); fr = fread(&wf.num_records,sizeof(uint32_t),1,wf.file); check_file_read(file_name, wf.file, 1, fr); wf.record_offsets = (uint64_t *) malloc(sizeof (uint64_t)*wf.num_records); if (!wf.record_offsets) err(EX_OSERR, "malloc error"); uint32_t i; for (i = 0; i < wf.num_records; ++i) { fr = fread(&(wf.record_offsets[i]),sizeof(uint64_t),1,wf.file); check_file_read(file_name, wf.file, 1, fr); } wf.header_offset = ftell(wf.file); return wf; }
//{{{void load_vid_data(struct vid_file *v) void load_vid_data(struct vid_file *v) { if (v->vids != NULL) errx(EX_SOFTWARE, "VID data has already been loaded for file '%s'.", v->file_name); v->vids = (uint32_t *) malloc(v->gqt_header->num_variants*sizeof(uint32_t)); if (!v->vids) err(EX_OSERR, "malloc error"); if (v->type == VID_LOCAL) { if (fseek(v->file.local, sizeof(struct gqt_file_header), SEEK_SET)) err(EX_IOERR, "Error seeking to data in VID file '%s'.", v->file_name); size_t fr = fread(v->vids, sizeof(uint32_t), v->gqt_header->num_variants, v->file.local); check_file_read(v->file_name, v->file.local, v->gqt_header->num_variants, fr); } else { if (knet_seek(v->file.remote, sizeof(struct gqt_file_header), SEEK_SET) == -1) err(EX_IOERR, "Error seeking to data in remote VID file '%s'.", v->file_name); size_t fr = knet_read(v->file.remote, v->vids, v->gqt_header->num_variants*sizeof(uint32_t)); check_remote_file_read(v->file_name, v->gqt_header->num_variants*sizeof(uint32_t), fr); } }
/**
 * Open an OFF file, validate its GQT header and load the offset table.
 *
 * The header is read with read_gqt_file_header(); the file is rejected
 * unless the marker is "GQT" and the type byte is 'o'. After the header,
 * num_variants uint64_t offsets are read into a newly malloc'd array.
 *
 * Returns a newly malloc'd off_file holding the open FILE handle, a
 * strdup'd file name, the header and the offsets; nothing is released
 * here. Failures exit through err()/errx(); short reads are reported through
 * check_file_read().
 */
struct off_file *open_off_file(char *file_name)
{
    struct off_file *o = (struct off_file *) malloc(sizeof(struct off_file));
    if (!o)
        err(EX_OSERR, "malloc error");

    o->file_name = strdup(file_name);
    /* Previously unchecked: a NULL name would be passed on to the readers. */
    if (!(o->file_name))
        err(EX_OSERR, "strdup error");

    o->file = fopen(file_name, "rb+");
    if (!(o->file))
        err(EX_NOINPUT, "Cannot open OFF file '%s'", file_name);

    o->gqt_header = read_gqt_file_header(o->file_name, o->file);

    if ( !((o->gqt_header->marker[0] == 'G') &&
           (o->gqt_header->marker[1] == 'Q') &&
           (o->gqt_header->marker[2] == 'T')) )
        errx(EX_NOINPUT, "File '%s' is not a GQT file.", file_name);

    if (o->gqt_header->type != 'o')
        errx(EX_NOINPUT, "File '%s' is not a OFF file.", file_name);

    o->offsets = (uint64_t *)
            malloc((o->gqt_header->num_variants)*sizeof(uint64_t));
    if (!(o->offsets))
        err(EX_OSERR, "malloc error");

    size_t fr = fread(o->offsets,
                      sizeof(uint64_t),
                      o->gqt_header->num_variants,
                      o->file);
    check_file_read(o->file_name, o->file, o->gqt_header->num_variants, fr);

    return o;
}
//{{{ int query(int argc, char **argv, char *full_cmd) int query(int argc, char **argv, char *full_cmd) { if (argc < 2) return query_help(); int c; char *wahbm_file_name=NULL, *id_query=NULL, *gt_query=NULL, *db_file_name=NULL, *bim_file_name=NULL, *src_bcf_file_name=NULL, *vid_file_name=NULL; int i_is_set = 0, id_q_count = 0, gt_q_count = 0, d_is_set = 0, c_is_set = 0, v_is_set = 0, s_is_set = 0, b_is_set = 0, bcf_output = 0; char *id_query_list[100]; char *gt_query_list[100]; //{{{ parse cmd line opts while ((c = getopt (argc, argv, "chi:p:g:d:b:v:s:B")) != -1) { switch (c) { case 'c': c_is_set = 1; break; case 'i': i_is_set = 1; wahbm_file_name = optarg; break; case 'p': id_query_list[id_q_count] = optarg; id_q_count += 1; break; case 'g': gt_query_list[gt_q_count] = optarg; gt_q_count += 1; break; case 'd': d_is_set = 1; db_file_name = optarg; break; case 'b': b_is_set = 1; bim_file_name = optarg; break; case 'v': v_is_set = 1; vid_file_name = optarg; break; case 's': s_is_set = 1; src_bcf_file_name = optarg; break; case 'B': bcf_output = 1; break; case 'h': return query_help(); case '?': if ( (optopt == 'i') || (optopt == 'p') || (optopt == 'g') || (optopt == 'd') || (optopt == 'b') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); return query_help(); default: return query_help(); } } if (i_is_set == 0) { fprintf(stderr, "GQT file is not set\n"); return query_help(); } else { if ( access( wahbm_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing GQT file \"%s\"", wahbm_file_name); } if (d_is_set == 1) { if ( access( db_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing PED DB file \"%s\"", db_file_name); } // Try to auto-detect file names based on GQT if ( (i_is_set == 1) && (b_is_set == 0)) { int auto_bim_file_name_size = asprintf(&bim_file_name, "%s", wahbm_file_name); 
strcpy(bim_file_name + strlen(bim_file_name) - 3, "bim"); if ( access( bim_file_name, F_OK) != -1 ) { b_is_set = 1; } else { fprintf(stderr, "Auto detect failure: BIM file %s not found\n", bim_file_name); return query_help(); } } if ( (i_is_set == 1) && (v_is_set == 0)) { int auto_vid_file_name_size = asprintf(&vid_file_name, "%s", wahbm_file_name); strcpy(vid_file_name + strlen(vid_file_name) - 3, "vid"); if ( access( vid_file_name, F_OK) != -1 ) { v_is_set = 1; } else { fprintf(stderr, "Auto detect failure: VID file %s not found\n", vid_file_name); return query_help(); } } /////////////////////////////// if ( (i_is_set == 1) && (d_is_set == 0)) { int auto_db_file_name_size = asprintf(&db_file_name, "%s", wahbm_file_name); strcpy(db_file_name + strlen(db_file_name) - 3, "db\0"); if ( access( db_file_name, F_OK) != -1 ) { d_is_set = 1; } else { fprintf(stderr, "Auto detect failure: PED DB file %s not found\n", db_file_name); return query_help(); } } if (v_is_set == 0) { fprintf(stderr, "VID file is not set\n"); return query_help(); } if (b_is_set == 0) { fprintf(stderr, "BIM file is not set\n"); return query_help(); } if (d_is_set == 0) { fprintf(stderr, "PED database file is not set\n"); return query_help(); } if (gt_q_count != id_q_count) { fprintf(stderr, "Mismatched number of individual and genotype query strings\n"); return query_help(); } //}}} struct gqt_query q[100]; uint32_t *gt_mask[100]; uint32_t *counts[100]; uint32_t *mapped_counts[100]; uint32_t id_lens[100]; int r, i, j, k; for (i = 0; i < gt_q_count; ++i) { if (parse_q(gt_query_list[i], &(q[i]))) { fprintf(stderr, "in the %dth genotype query.\n", i+1); return 1; } } // open WAH/GQT file struct wah_file wf = init_wahbm_file(wahbm_file_name); // open VID file FILE *vid_f = fopen(vid_file_name, "rb"); if (!vid_f) err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name); uint32_t *vids = (uint32_t *) malloc(wf.num_fields*sizeof(uint32_t)); if (!vids) err(EX_OSERR, "malloc error"); size_t fr = 
fread(vids, sizeof(uint32_t), wf.num_fields, vid_f); check_file_read(vid_file_name, vid_f, wf.num_fields, fr); fclose(vid_f); uint32_t num_ints = (wf.num_fields + 32 - 1)/ 32; uint32_t len_ints; for (i = 0; i < gt_q_count; ++i) { uint32_t len_count_R; uint32_t *R; /* * Submit the population query to the PED database and get back both * the list of of ids in R and the length of R in id_lens[i] */ id_lens[i] = resolve_ind_query(&R, id_query_list[i], db_file_name); // Enforce that the offsets of the relevant samples is // within the number of samples in the GQT index. if (id_lens[i] > wf.num_records) { fprintf(stderr, "ERROR: there are more samples in the PED database (%d) " "that match this condition \nthan there are in the GQT " "index (%d). Perhaps your PED file is a superset of " "the\nsamples in your VCF/BCF file?\n", id_lens[i], wf.num_records); return 1; } uint32_t low_v, high_v; /* * q holds the parameters of each query, first determin the range of * bitmaps to pull */ if ( q[i].variant_op == p_maf ) { low_v = 1; high_v = 3; } else { if ( q[i].genotype_condition[0] == 1) low_v = 0; else if ( q[i].genotype_condition[1] == 1) low_v = 1; else if ( q[i].genotype_condition[2] == 1) low_v = 2; else if ( q[i].genotype_condition[3] == 1) low_v = 3; if ( q[i].genotype_condition[3] == 1) high_v = 4; else if ( q[i].genotype_condition[2] == 1) high_v = 3; else if ( q[i].genotype_condition[1] == 1) high_v = 2; else if ( q[i].genotype_condition[0] == 1) high_v = 1; } /* * The set of variants that are printed is stored in a mask for each * query, then those masks are combine to a final mask. Each mask is a * 32-bit packed int, where each bit correspons to one variant. How * those bits are set depends on the filter the user specifices. * * If they simply ask for a count or perecent, then there is not filter * and the mask is set to all 1s. 
* * If count is followed by a condition, then the count/pct is compared * to that condition and the bits are set for those that meet the * condition. * * If no funtion is used then we simply run the wahbm range query and * convert the wah results to packed ints for the mask * */ /* User asks for a count, percent, or maf */ if ( ( q[i].variant_op == p_count ) || ( q[i].variant_op == p_pct ) || ( q[i].variant_op == p_maf ) ) { if (q[i].variant_op == p_maf) { #ifdef __AVX2__ len_count_R = avx_sum_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #else len_count_R = sum_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #endif } else { #ifdef __AVX2__ len_count_R = avx_count_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #else len_count_R = count_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #endif } /* Since the variants are in allele freq order, we need to copy * the resulting value to an array that is back in the original * order */ mapped_counts[i] = (uint32_t *)calloc(len_count_R, sizeof(uint32_t)); for ( j = 0; j < len_count_R; ++j) mapped_counts[i][vids[j]] = counts[i][j]; gt_mask[i] = (uint32_t *) malloc(num_ints * sizeof(uint32_t)); if (!gt_mask[i]) err(EX_OSERR, "malloc error"); /* User specifies a condition */ if ( q[i].op_condition != -1) { /* Since we only find counts, when the user asks for a * perecent, just convert that back to the count that meets the * percent condition */ float condition_value = q[i].condition_value; if (q[i].variant_op == p_pct) condition_value *= id_lens[i]; else if (q[i].variant_op == p_maf) condition_value *= id_lens[i]*2; /* Test to see if each count meets the condition */ uint32_t v = 0, int_i = 0, bit_i = 0; for ( j = 0; j < len_count_R; ++j) { if ( query_cmp(counts[i][j], q[i].op_condition, condition_value) ) { v |= 1 << (31 - bit_i); } bit_i += 1; if ( bit_i == 32 ) { gt_mask[i][int_i] = v; int_i += 1; bit_i 
= 0; v = 0; } } if ( bit_i > 0) gt_mask[i][int_i] = v; } else { // if no op is set then let everything pass for (j = 0; j < num_ints; ++j) gt_mask[i][j] = -1; // set all the bits to 1 } /* User only gives genotype filters, no funtion/condition */ } else { uint32_t *gt_R; uint32_t len_wf_R = range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, >_R); len_ints = wah_to_ints(gt_R,len_wf_R,&(gt_mask[i])); free(gt_R); } free(R); } uint32_t *final_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t)); // combine all of the masks to see what we need to print for (i = 0; i < num_ints; ++i) { final_mask[i] = ~0; for (j = 0; j < gt_q_count; ++j) final_mask[i] &= gt_mask[j][i]; } if (c_is_set == 1) { uint32_t masked_vid_count = 0; for (i = 0; i < num_ints; ++i) masked_vid_count += popcount(final_mask[i]); if (masked_vid_count <= wf.num_fields) printf("%u\n", masked_vid_count); else printf("%u\n", wf.num_fields); } else if ((v_is_set == 1) && (s_is_set == 1)) { get_bcf_query_result(final_mask, num_ints, q, id_query_list, id_lens, gt_q_count, wf.num_fields, vid_file_name, src_bcf_file_name, bcf_output); } else if (b_is_set == 1){ uint32_t *mapped_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t)); uint32_t v,p,leading_zeros, hit; for (i = 0; i < num_ints; ++i) { if (final_mask[i] != 0) { v = final_mask[i]; p = popcount(v); for (j = 0; j < p; ++j) { leading_zeros = __builtin_clz(v); if (i*32 + leading_zeros + 1 > wf.num_fields) break; hit = vids[leading_zeros + i*32]; mapped_mask[hit/32] |= 1 << (31-hit%32); v &= ~(1 << (32 - leading_zeros - 1)); } } if (i*32 + leading_zeros + 1 > wf.num_fields) break; } print_query_result(mapped_mask, num_ints, vids, q, mapped_counts, id_lens, gt_q_count, wf.num_fields, bim_file_name, full_cmd); } for (j = 0; j < gt_q_count; ++j) { free(gt_mask[j]); if ( (q[j].variant_op == p_count) || (q[j].variant_op == p_pct) || (q[j].variant_op == p_maf) ) free(counts[j]); } destroy_wahbm_file(&wf); return 0; }
//{{{ void get_bcf_query_result(uint32_t *mask, void get_bcf_query_result(uint32_t *mask, uint32_t mask_len, struct gqt_query *q, char **id_query_list, uint32_t *id_lens, uint32_t num_qs, uint32_t num_fields, char *vid_file_name, char *src_bcf_file_name, int bcf_output) { /* The VID file contains the line numbers of the variants after they have * been sorted. To reach back into the BCF file to print the metadata * associated with the variants marked in the mask, we need to create a * sorted list of line numbers we want. So first we intersect the VID file * and the mask, then sort it. */ FILE *vid_f = fopen(vid_file_name, "rb"); if (!vid_f) err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name); uint32_t *vids = (uint32_t *) malloc(num_fields*sizeof(uint32_t)); if (!vids ) err(EX_OSERR, "malloc error"); size_t fr = fread(vids, sizeof(uint32_t), num_fields, vid_f); check_file_read(vid_file_name, vid_f, num_fields, fr); fclose(vid_f); uint32_t i, j, masked_vid_count = 0; for (i = 0; i < mask_len; ++i) masked_vid_count += popcount(mask[i]); uint32_t *masked_vids = (uint32_t *) malloc(masked_vid_count*sizeof(uint32_t)); if (!masked_vids ) err(EX_OSERR, "malloc error"); uint32_t masked_vid_i = 0; for (i = 0; i < mask_len; ++i) { uint32_t bytes = mask[i]; if (bytes == 0) continue; /* skip a bunch of ops if you can */ for (j = 0; j < 32; j++) { if (bytes & (1 << (31 - j))) { masked_vids[masked_vid_i] = vids[i*32 + j]; masked_vid_i+=1; } } if (masked_vid_i == masked_vid_count) break; } free(vids); qsort(masked_vids, masked_vid_count, sizeof(uint32_t), compare_uint32_t); htsFile *fp = hts_open(src_bcf_file_name,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *line = bcf_init1(); //bcf_hdr_set_samples(hdr, print_name_csv, 0); htsFile *out; if (!bcf_output) out = hts_open("-", "w"); else out = hts_open("-", "wb"); int r = bcf_hdr_write(out, hdr); uint32_t bcf_line_i = 0; masked_vid_i = 0; while ( bcf_read(fp, hdr, line) != -1) { if (masked_vids[masked_vid_i] == bcf_line_i) 
{ r = bcf_unpack(line, BCF_UN_ALL); r = bcf_write1(out, hdr, line); masked_vid_i+=1; } if (masked_vid_i == masked_vid_count) break; bcf_line_i += 1; } hts_close(out); hts_close(fp); }
/**
 * Load a compressed matrix from "<directory>/<name>.desp" and, when
 * FM_COMP_32 or FM_COMP_64 is defined, its counters from
 * "<directory>/<name>.count".
 *
 * The .desp file holds three SA_TYPE header fields (siz, n_desp, m_desp)
 * followed by n_desp rows of m_desp SA_TYPE values. The .count file holds
 * two SA_TYPE header fields (n_count, m_count) followed by n_count rows of
 * m_count FM_COMP_TYPE values. Each row is read into its own malloc'd
 * array; nothing is freed here.
 *
 * Open, read and allocation failures are reported through the check_*
 * helpers. A path that does not fit the local buffer terminates the program
 * with a message on stderr.
 */
void read_comp_matrix(comp_matrix *matrix, const char *directory, const char *name) {

  size_t err = 0;
  FILE *fp;
  char path[500];
  int path_len;

  /* Bounded formatting: the previous strcat chain could overrun path[]. */
  path_len = snprintf(path, sizeof path, "%s/%s.desp", directory, name);
  if (path_len < 0 || (size_t) path_len >= sizeof path) {
    fprintf(stderr, "read_comp_matrix: path too long: %s/%s.desp\n", directory, name);
    exit(EXIT_FAILURE);
  }

  fp = fopen(path, "rb+");
  check_file_open(fp, path);

  err = fread(&matrix->siz, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  err = fread(&matrix->n_desp, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  err = fread(&matrix->m_desp, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  matrix->desp = (SA_TYPE **) malloc(matrix->n_desp * sizeof(SA_TYPE *));
  check_malloc(matrix->desp, path);

  for (SA_TYPE i=0; i<matrix->n_desp; i++) {
    matrix->desp[i] = (SA_TYPE *) malloc(matrix->m_desp * sizeof(SA_TYPE));
    check_malloc(matrix->desp[i], path);

    err = fread(matrix->desp[i], sizeof(SA_TYPE), matrix->m_desp, fp);
    check_file_read(err, matrix->m_desp, path);
  }

  fclose(fp);

/* Both macros are tested with defined(): a bare FM_COMP_64 in the condition
 * is a preprocessor error when the macro is defined with no value. */
#if defined(FM_COMP_32) || defined(FM_COMP_64)

  path_len = snprintf(path, sizeof path, "%s/%s.count", directory, name);
  if (path_len < 0 || (size_t) path_len >= sizeof path) {
    fprintf(stderr, "read_comp_matrix: path too long: %s/%s.count\n", directory, name);
    exit(EXIT_FAILURE);
  }

  fp = fopen(path, "rb+");
  check_file_open(fp, path);

  err = fread(&matrix->n_count, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  err = fread(&matrix->m_count, sizeof(SA_TYPE), 1, fp);
  check_file_read(err, 1, path);

  matrix->count = (FM_COMP_TYPE **) malloc(matrix->n_count * sizeof(FM_COMP_TYPE *));
  check_malloc(matrix->count, path);

  for (SA_TYPE i=0; i<matrix->n_count; i++) {
    matrix->count[i] = (FM_COMP_TYPE *) malloc(matrix->m_count * sizeof(FM_COMP_TYPE));
    check_malloc(matrix->count[i], path);

    err = fread(matrix->count[i], sizeof(FM_COMP_TYPE), matrix->m_count, fp);
    check_file_read(err, matrix->m_count, path);
  }

  fclose(fp);

#endif

}