/* * query_usage -- * Query usage message. */ static int query_usage(void) { fprintf(stderr, "%s: query syntax error\n", progname); return (query_help(NULL)); }
//{{{ int query(int argc, char **argv, char *full_cmd) int query(int argc, char **argv, char *full_cmd) { if (argc < 2) return query_help(EX_USAGE); int c; char *input_file_name=NULL, *gqt_file_name=NULL, *ped_db_file_name=NULL, *bim_file_name=NULL, *off_file_name=NULL, *bcf_file_name=NULL, *vid_file_name=NULL, *tmp_dir_name=NULL; int c_is_set = 0, i_is_set = 0, id_q_count = 0, gt_q_count = 0, d_is_set = 0, v_is_set = 0, V_is_set = 0, G_is_set = 0, B_is_set = 0, O_is_set = 0, S_is_set = 0, t_is_set = 0, bcf_output = 0; char *id_query_list[100]; char *gt_query_list[100]; //{{{ parse cmd line opts while ((c = getopt (argc, argv, "chvi:p:g:d:B:V:G:O:S:t:")) != -1) { switch (c) { case 'v': v_is_set = 1; break; case 'c': c_is_set = 1; break; case 'i': i_is_set = 1; input_file_name = optarg; break; case 'p': id_query_list[id_q_count] = optarg; id_q_count += 1; break; case 'g': gt_query_list[gt_q_count] = optarg; gt_q_count += 1; break; case 'd': d_is_set = 1; ped_db_file_name = optarg; break; case 'B': B_is_set = 1; bim_file_name = optarg; break; case 'V': V_is_set = 1; vid_file_name = optarg; break; case 'G': G_is_set = 1; gqt_file_name = optarg; break; case 'S': S_is_set = 1; bcf_file_name = optarg; break; case 'O': O_is_set = 1; off_file_name = optarg; break; case 't': t_is_set = 1; tmp_dir_name = optarg; break; case 'h': return query_help(EX_OK); case '?': if ( (optopt == 'i') || (optopt == 'p') || (optopt == 'g') || (optopt == 'd') || (optopt == 'b') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); return query_help(EX_USAGE); default: return query_help(EX_OK); } } if (t_is_set == 0) { if (asprintf(&tmp_dir_name,"./") == -1) err(EX_OSERR, "asprintf error"); } if (i_is_set == 0) { fprintf (stderr, "No input file given (vcf.gz/bcf/gqt).\n"); return query_help(EX_NOINPUT); } if (strlen(input_file_name) < 4) { fprintf (stderr, "Cannot determine input file type for file '%s'.\n", input_file_name); fprintf (stderr, "NOTE: The file must have a supported extension " "(vcf.gz/bcf/gqt).\n"); return query_help(EX_NOINPUT); } // See if -i is a gqt file or a bcf file char *input_file_type = (char *)malloc(4*sizeof(char)); strncpy(input_file_type, input_file_name + ( strlen(input_file_name) - 3), 3 * sizeof(char)); input_file_type[3] = '\0'; if (strcmp(input_file_type, "gqt") == 0 ) { G_is_set = 1; gqt_file_name = input_file_name; } else if (strcmp(input_file_type, "bcf") == 0 ) { S_is_set = 1; bcf_file_name = input_file_name; } else { if (strlen(input_file_name) < 8) { fprintf (stderr, "Cannot determine input file type for file '%s'.\n", input_file_name); fprintf (stderr, "NOTE: The file must have a supported extension " "(vcf.gz/bcf/gqt).\n"); return query_help(EX_NOINPUT); } free(input_file_type); input_file_type = (char *)malloc(6*sizeof(char)); strncpy(input_file_type, input_file_name + ( strlen(input_file_name) - 6), 6 * sizeof(char)); if (strcmp(input_file_type, "vcf.gz") == 0 ) { S_is_set = 1; bcf_file_name = input_file_name; } else { fprintf (stderr, "Cannot determine input file type for file '%s'.\n", input_file_name); fprintf (stderr, "NOTE: The file must have a supported extension " "(vcf.gz/bcf/gqt).\n"); return query_help(EX_NOINPUT); } } // BCF/VCFGZ file is set if (S_is_set == 1) { /* if ( access( bcf_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing BCF file '%s'", bcf_file_name); */ // GQT is not, autodetect if (G_is_set == 0) { gqt_file_name = (char*)malloc(strlen(bcf_file_name) + 5); if (!gqt_file_name) err(EX_OSERR, "malloc error"); strcpy(gqt_file_name, bcf_file_name); strcat(gqt_file_name, ".gqt"); if ( ping_file(gqt_file_name) != 0 ) { G_is_set = 1; } else { fprintf(stderr, "Auto detect failure: GQT file '%s' not found\n", gqt_file_name); return query_help(EX_NOINPUT); } /* if ( access( gqt_file_name, F_OK) != -1 ) { G_is_set = 1; } else { fprintf(stderr, "Auto detect failure: GQT file '%s' not found\n", gqt_file_name); return query_help(EX_NOINPUT); } */ } /* else { if ( access( gqt_file_name, F_OK) == -1 ) { fprintf(stderr, "GQT file '%s' not found\n", gqt_file_name); return query_help(EX_NOINPUT); } } */ // PED DB is not, autodetect if (d_is_set == 0) { ped_db_file_name = (char*)malloc(strlen(bcf_file_name) + 4); if (!ped_db_file_name) err(EX_OSERR, "malloc error"); strcpy(ped_db_file_name, bcf_file_name); strcat(ped_db_file_name, ".db"); d_is_set = 1; /* if ( access( ped_db_file_name, F_OK) != -1 ) { d_is_set = 1; } else { fprintf(stderr, "Auto detect failure: DB file '%s' not found\n", ped_db_file_name); return query_help(EX_NOINPUT); } */ } // VID is not, autodetect if (V_is_set == 0) { vid_file_name = (char*)malloc(strlen(bcf_file_name) + 5); if (!vid_file_name) err(EX_OSERR, "malloc error"); strcpy(vid_file_name, bcf_file_name); strcat(vid_file_name, ".vid"); V_is_set = 1; /* if ( access( vid_file_name, F_OK) != -1 ) { V_is_set = 1; } else { fprintf(stderr, "Auto detect failure: VID file '%s' not found\n", vid_file_name); return query_help(EX_NOINPUT); } */ } // Try and find the BIM file, okay if not there (for now) if (B_is_set == 0) { bim_file_name = (char*)malloc(strlen(bcf_file_name) + 5); if (!bim_file_name) err(EX_OSERR, "malloc error"); strcpy(bim_file_name, bcf_file_name); strcat(bim_file_name, ".bim"); B_is_set = 1; /* if ( access( bim_file_name, F_OK) != -1 ) B_is_set = 1; */ } // Try and find the OFF file, okay if not there (for now) if (O_is_set == 0) { off_file_name = (char*)malloc(strlen(bcf_file_name) + 5); if (!off_file_name) err(EX_OSERR, "malloc error"); strcpy(off_file_name, bcf_file_name); strcat(off_file_name, ".off"); O_is_set = 1; /* if ( access( off_file_name, F_OK) != -1 ) O_is_set = 1; */ } } else if (G_is_set == 1) { /* if ( access( gqt_file_name, F_OK) == -1 ) { fprintf(stderr, "GQT file '%s' not found\n", gqt_file_name); return query_help(EX_NOINPUT); } */ // Try and find the BIM file, okay if not there (for now) if (B_is_set == 0) { bim_file_name = (char*) malloc((1+strlen(gqt_file_name))*sizeof(char)); if (!bim_file_name) err(EX_OSERR, "malloc error"); strcpy(bim_file_name, gqt_file_name); strcpy(bim_file_name + strlen(bim_file_name) - 3, "bim"); B_is_set = 1; /* if ( access( bim_file_name, F_OK) != -1 ) B_is_set = 1; */ } if (O_is_set == 0) { off_file_name = (char*) malloc((1+strlen(gqt_file_name))*sizeof(char)); if (!off_file_name) err(EX_OSERR, "malloc error"); strcpy(off_file_name, gqt_file_name); strcpy(off_file_name + strlen(off_file_name) - 3, "off"); O_is_set = 1; /* if ( access( off_file_name, F_OK) != -1 ) O_is_set = 1; */ } if (V_is_set == 0) { vid_file_name = (char*) malloc((1+strlen(gqt_file_name))*sizeof(char)); if (!vid_file_name) err(EX_OSERR, "malloc error"); strcpy(vid_file_name, gqt_file_name); strcpy(vid_file_name + strlen(vid_file_name) - 3, "vid"); V_is_set = 1; /* if ( access( vid_file_name, F_OK) != -1 ) { V_is_set = 1; } else { fprintf(stderr, "Auto detect failure: VID file '%s' not found\n", vid_file_name); return query_help(EX_NOINPUT); } */ } if (d_is_set == 0) { ped_db_file_name = (char*) malloc((1+strlen(gqt_file_name))*sizeof(char)); if (!ped_db_file_name) err(EX_OSERR, "malloc error"); strcpy(ped_db_file_name, gqt_file_name); strcpy(ped_db_file_name + strlen(gqt_file_name) - 3, "db\0"); d_is_set = 1; /* if ( access( ped_db_file_name, F_OK) != -1 ) { d_is_set = 1; } else { fprintf(stderr, "Auto detect failure: PED DB file '%s' not found\n", ped_db_file_name); return query_help(EX_NOINPUT); } */ } } else { fprintf(stderr, "Neither GQT or BCF/VCF.GZ file given.\n"); return query_help(EX_NOINPUT); } /* if (B_is_set == 1) { if ( access( bim_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing BIM file '%s'", bim_file_name); } if (O_is_set == 1) { if ( access( off_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing OFF file '%s'", bim_file_name); } */ if (V_is_set == 0) { fprintf(stderr, "VID file is not set\n"); return query_help(EX_NOINPUT); } if (d_is_set == 0) { fprintf(stderr, "DB file is not set\n"); return query_help(EX_NOINPUT); } if (gt_q_count != id_q_count) { fprintf(stderr, "Mismatched number of individual and genotype query strings\n"); return query_help(EX_USAGE); } if ((B_is_set == 0) && (O_is_set == 0) && (c_is_set == 0)) { fprintf(stderr, "Must set either BIM or OFF files when doing anything " "other than counting.\n"); return query_help(EX_USAGE); } if ( (v_is_set == 1) && ((O_is_set == 0) || (S_is_set == 0)) ) { fprintf(stderr, "To get genotypes source BCF/VCF.GZ and OFF files " "must be set.\n"); return query_help(EX_USAGE); } if ( (v_is_set == 0) && (B_is_set == 0) && (c_is_set == 0)) { fprintf(stderr, "To get variant data only BIM file must be set.\n"); return query_help(EX_USAGE); } //}}} struct gqt_query q[100]; uint32_t *gt_mask[100]; uint32_t *counts[100]; uint32_t *mapped_counts[100]; uint32_t id_lens[100]; int r, i, j, k; for (i = 0; i < gt_q_count; ++i) { if (parse_q(gt_query_list[i], &(q[i]))) { fprintf(stderr, "in the %dth genotype query.\n", i+1); return 1; } } struct wahbm_file *wf = open_wahbm_file(gqt_file_name); struct vid_file *vid_f = open_vid_file(vid_file_name); load_vid_data(vid_f); //uint32_t num_ints = (wf.num_fields + 32 - 1)/ 32; uint32_t num_ints = (wf->gqt_header->num_variants + 32 - 1)/ 32; uint32_t len_ints; uint32_t *U_R = NULL; uint32_t U_R_len = 0; for (i = 0; i < gt_q_count; ++i) { uint32_t len_count_R; uint32_t *R; /* * Submit the population query to the PED database and get back both * the list of of ids in R and the length of R in id_lens[i] */ id_lens[i] = resolve_ind_query(&R, id_query_list[i], ped_db_file_name, tmp_dir_name); uint32_t *tmp_U_R = (uint32_t *) realloc(U_R, (U_R_len + id_lens[i]) * sizeof(uint32_t)); if (!tmp_U_R) err(EX_OSERR, "malloc error"); else U_R = tmp_U_R; for (j = 0; j < id_lens[i]; ++j) { U_R[U_R_len] = R[j]; U_R_len += 1; } // Enforce that the offsets of the relevant samples is // within the number of samples in the GQT index. if (id_lens[i] > wf->gqt_header->num_samples) { fprintf(stderr, "ERROR: there are more samples in the PED database (%d) " "that match this condition \nthan there are in the GQT " "index (%d). Perhaps your PED file is a superset of " "the\nsamples in your VCF/BCF file?\n", id_lens[i], wf->gqt_header->num_samples); return 1; } uint32_t low_v = 0, high_v = 0; /* * q holds the parameters of each query, first determin the range of * bitmaps to pull */ if ( q[i].variant_op == p_maf ) { low_v = 1; high_v = 3; } else { if ( q[i].genotype_condition[0] == 1) low_v = 0; else if ( q[i].genotype_condition[1] == 1) low_v = 1; else if ( q[i].genotype_condition[2] == 1) low_v = 2; else if ( q[i].genotype_condition[3] == 1) low_v = 3; if ( q[i].genotype_condition[3] == 1) high_v = 4; else if ( q[i].genotype_condition[2] == 1) high_v = 3; else if ( q[i].genotype_condition[1] == 1) high_v = 2; else if ( q[i].genotype_condition[0] == 1) high_v = 1; } /* * The set of variants that are printed is stored in a mask for each * query, then those masks are combine to a final mask. Each mask is a * 32-bit packed int, where each bit correspons to one variant. How * those bits are set depends on the filter the user specifices. * * If they simply ask for a count or perecent, then there is not filter * and the mask is set to all 1s. * * If count is followed by a condition, then the count/pct is compared * to that condition and the bits are set for those that meet the * condition. * * If no funtion is used then we simply run the wahbm range query and * convert the wah results to packed ints for the mask * */ /* User asks for a count, percent, or maf */ if ( ( q[i].variant_op == p_count ) || ( q[i].variant_op == p_pct ) || ( q[i].variant_op == p_maf ) ) { if (q[i].variant_op == p_maf) { #ifdef __AVX2__ len_count_R = avx_sum_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #else len_count_R = sum_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #endif } else { #ifdef __AVX2__ len_count_R = avx_count_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #else len_count_R = count_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #endif } /* Since the variants are in allele freq order, we need to copy * the resulting value to an array that is back in the original * order */ mapped_counts[i] = (uint32_t *)calloc(len_count_R, sizeof(uint32_t)); for ( j = 0; j < len_count_R; ++j) mapped_counts[i][vid_f->vids[j]] = counts[i][j]; gt_mask[i] = (uint32_t *) malloc(num_ints * sizeof(uint32_t)); if (!gt_mask[i]) err(EX_OSERR, "malloc error"); /* User specifies a condition */ if ( q[i].op_condition != -1) { /* Since we only find counts, when the user asks for a * perecent, just convert that back to the count that meets the * percent condition */ float condition_value = q[i].condition_value; if (q[i].variant_op == p_pct) condition_value *= id_lens[i]; else if (q[i].variant_op == p_maf) condition_value *= id_lens[i]*2; /* Test to see if each count meets the condition */ uint32_t v = 0, int_i = 0, bit_i = 0; for ( j = 0; j < len_count_R; ++j) { if ( query_cmp(counts[i][j], q[i].op_condition, condition_value) ) { v |= 1 << (31 - bit_i); } bit_i += 1; if ( bit_i == 32 ) { gt_mask[i][int_i] = v; int_i += 1; bit_i = 0; v = 0; } } if ( bit_i > 0) gt_mask[i][int_i] = v; } else { // if no op is set then let everything pass for (j = 0; j < num_ints; ++j) gt_mask[i][j] = -1; // set all the bits to 1 } /* User only gives genotype filters, no funtion/condition */ } else { uint32_t *gt_R; uint32_t len_wf_R = range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, >_R); len_ints = wah_to_ints(gt_R,len_wf_R,&(gt_mask[i])); free(gt_R); } free(R); } if (U_R == NULL) { U_R_len= resolve_ind_query(&U_R, "", ped_db_file_name, tmp_dir_name); } // Get the uniq elements in place qsort(U_R, U_R_len, sizeof(uint32_t), compare_uint32_t); for (i = j = 0; i < U_R_len; i++) if (U_R[i] != U_R[j]) U_R[++j] = U_R[i]; U_R_len = j + 1; uint32_t *final_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t)); // combine all of the masks to see what we need to print for (i = 0; i < num_ints; ++i) { final_mask[i] = ~0; for (j = 0; j < gt_q_count; ++j) final_mask[i] &= gt_mask[j][i]; } if (c_is_set == 1) { uint32_t masked_vid_count = 0; for (i = 0; i < num_ints; ++i) masked_vid_count += popcount(final_mask[i]); if (masked_vid_count <= wf->gqt_header->num_variants) printf("%u\n", masked_vid_count); else printf("%u\n", wf->gqt_header->num_variants); } else if ( (B_is_set == 1) || (O_is_set == 1)){ uint32_t *mapped_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t)); uint32_t v,p,leading_zeros=32, hit; for (i = 0; i < num_ints; ++i) { if (final_mask[i] != 0) { v = final_mask[i]; p = popcount(v); for (j = 0; j < p; ++j) { leading_zeros = __builtin_clz(v); if (i*32 + leading_zeros + 1 > wf->gqt_header->num_variants) break; hit = vid_f->vids[leading_zeros + i*32]; mapped_mask[hit/32] |= 1 << (31-hit%32); v &= ~(1 << (32 - leading_zeros - 1)); } } if (i*32 + leading_zeros + 1 > wf->gqt_header->num_variants) break; } if (v_is_set == 0) print_query_result_bim(mapped_mask, num_ints, vid_f->vids, q, mapped_counts, id_lens, gt_q_count, wf->gqt_header->num_variants, bim_file_name, full_cmd); else print_query_result_offset(mapped_mask, num_ints, vid_f->vids, q, mapped_counts, id_lens, U_R, U_R_len, id_query_list, gt_query_list, gt_q_count, wf->gqt_header->num_variants, off_file_name, bcf_file_name, full_cmd); } for (j = 0; j < gt_q_count; ++j) { free(gt_mask[j]); if ( (q[j].variant_op == p_count) || (q[j].variant_op == p_pct) || (q[j].variant_op == p_maf) ) free(counts[j]); } destroy_vid_file(vid_f); destroy_wahbm_file(wf); return 0; }
//{{{ int query(int argc, char **argv, char *full_cmd) int query(int argc, char **argv, char *full_cmd) { if (argc < 2) return query_help(); int c; char *wahbm_file_name=NULL, *id_query=NULL, *gt_query=NULL, *db_file_name=NULL, *bim_file_name=NULL, *src_bcf_file_name=NULL, *vid_file_name=NULL; int i_is_set = 0, id_q_count = 0, gt_q_count = 0, d_is_set = 0, c_is_set = 0, v_is_set = 0, s_is_set = 0, b_is_set = 0, bcf_output = 0; char *id_query_list[100]; char *gt_query_list[100]; //{{{ parse cmd line opts while ((c = getopt (argc, argv, "chi:p:g:d:b:v:s:B")) != -1) { switch (c) { case 'c': c_is_set = 1; break; case 'i': i_is_set = 1; wahbm_file_name = optarg; break; case 'p': id_query_list[id_q_count] = optarg; id_q_count += 1; break; case 'g': gt_query_list[gt_q_count] = optarg; gt_q_count += 1; break; case 'd': d_is_set = 1; db_file_name = optarg; break; case 'b': b_is_set = 1; bim_file_name = optarg; break; case 'v': v_is_set = 1; vid_file_name = optarg; break; case 's': s_is_set = 1; src_bcf_file_name = optarg; break; case 'B': bcf_output = 1; break; case 'h': return query_help(); case '?': if ( (optopt == 'i') || (optopt == 'p') || (optopt == 'g') || (optopt == 'd') || (optopt == 'b') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); return query_help(); default: return query_help(); } } if (i_is_set == 0) { fprintf(stderr, "GQT file is not set\n"); return query_help(); } else { if ( access( wahbm_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing GQT file \"%s\"", wahbm_file_name); } if (d_is_set == 1) { if ( access( db_file_name, F_OK) == -1 ) err(EX_NOINPUT, "Error accessing PED DB file \"%s\"", db_file_name); } // Try to auto-detect file names based on GQT if ( (i_is_set == 1) && (b_is_set == 0)) { int auto_bim_file_name_size = asprintf(&bim_file_name, "%s", wahbm_file_name); strcpy(bim_file_name + strlen(bim_file_name) - 3, "bim"); if ( access( bim_file_name, F_OK) != -1 ) { b_is_set = 1; } else { fprintf(stderr, "Auto detect failure: BIM file %s not found\n", bim_file_name); return query_help(); } } if ( (i_is_set == 1) && (v_is_set == 0)) { int auto_vid_file_name_size = asprintf(&vid_file_name, "%s", wahbm_file_name); strcpy(vid_file_name + strlen(vid_file_name) - 3, "vid"); if ( access( vid_file_name, F_OK) != -1 ) { v_is_set = 1; } else { fprintf(stderr, "Auto detect failure: VID file %s not found\n", vid_file_name); return query_help(); } } /////////////////////////////// if ( (i_is_set == 1) && (d_is_set == 0)) { int auto_db_file_name_size = asprintf(&db_file_name, "%s", wahbm_file_name); strcpy(db_file_name + strlen(db_file_name) - 3, "db\0"); if ( access( db_file_name, F_OK) != -1 ) { d_is_set = 1; } else { fprintf(stderr, "Auto detect failure: PED DB file %s not found\n", db_file_name); return query_help(); } } if (v_is_set == 0) { fprintf(stderr, "VID file is not set\n"); return query_help(); } if (b_is_set == 0) { fprintf(stderr, "BIM file is not set\n"); return query_help(); } if (d_is_set == 0) { fprintf(stderr, "PED database file is not set\n"); return query_help(); } if (gt_q_count != id_q_count) { fprintf(stderr, "Mismatched number of individual and genotype query strings\n"); return query_help(); } //}}} struct gqt_query q[100]; uint32_t *gt_mask[100]; uint32_t *counts[100]; uint32_t *mapped_counts[100]; uint32_t id_lens[100]; int r, i, j, k; for (i = 0; i < gt_q_count; ++i) { if (parse_q(gt_query_list[i], &(q[i]))) { fprintf(stderr, "in the %dth genotype query.\n", i+1); return 1; } } // open WAH/GQT file struct wah_file wf = init_wahbm_file(wahbm_file_name); // open VID file FILE *vid_f = fopen(vid_file_name, "rb"); if (!vid_f) err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name); uint32_t *vids = (uint32_t *) malloc(wf.num_fields*sizeof(uint32_t)); if (!vids) err(EX_OSERR, "malloc error"); size_t fr = fread(vids, sizeof(uint32_t), wf.num_fields, vid_f); check_file_read(vid_file_name, vid_f, wf.num_fields, fr); fclose(vid_f); uint32_t num_ints = (wf.num_fields + 32 - 1)/ 32; uint32_t len_ints; for (i = 0; i < gt_q_count; ++i) { uint32_t len_count_R; uint32_t *R; /* * Submit the population query to the PED database and get back both * the list of of ids in R and the length of R in id_lens[i] */ id_lens[i] = resolve_ind_query(&R, id_query_list[i], db_file_name); // Enforce that the offsets of the relevant samples is // within the number of samples in the GQT index. if (id_lens[i] > wf.num_records) { fprintf(stderr, "ERROR: there are more samples in the PED database (%d) " "that match this condition \nthan there are in the GQT " "index (%d). Perhaps your PED file is a superset of " "the\nsamples in your VCF/BCF file?\n", id_lens[i], wf.num_records); return 1; } uint32_t low_v, high_v; /* * q holds the parameters of each query, first determin the range of * bitmaps to pull */ if ( q[i].variant_op == p_maf ) { low_v = 1; high_v = 3; } else { if ( q[i].genotype_condition[0] == 1) low_v = 0; else if ( q[i].genotype_condition[1] == 1) low_v = 1; else if ( q[i].genotype_condition[2] == 1) low_v = 2; else if ( q[i].genotype_condition[3] == 1) low_v = 3; if ( q[i].genotype_condition[3] == 1) high_v = 4; else if ( q[i].genotype_condition[2] == 1) high_v = 3; else if ( q[i].genotype_condition[1] == 1) high_v = 2; else if ( q[i].genotype_condition[0] == 1) high_v = 1; } /* * The set of variants that are printed is stored in a mask for each * query, then those masks are combine to a final mask. Each mask is a * 32-bit packed int, where each bit correspons to one variant. How * those bits are set depends on the filter the user specifices. * * If they simply ask for a count or perecent, then there is not filter * and the mask is set to all 1s. * * If count is followed by a condition, then the count/pct is compared * to that condition and the bits are set for those that meet the * condition. * * If no funtion is used then we simply run the wahbm range query and * convert the wah results to packed ints for the mask * */ /* User asks for a count, percent, or maf */ if ( ( q[i].variant_op == p_count ) || ( q[i].variant_op == p_pct ) || ( q[i].variant_op == p_maf ) ) { if (q[i].variant_op == p_maf) { #ifdef __AVX2__ len_count_R = avx_sum_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #else len_count_R = sum_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #endif } else { #ifdef __AVX2__ len_count_R = avx_count_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #else len_count_R = count_range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, &(counts[i])); #endif } /* Since the variants are in allele freq order, we need to copy * the resulting value to an array that is back in the original * order */ mapped_counts[i] = (uint32_t *)calloc(len_count_R, sizeof(uint32_t)); for ( j = 0; j < len_count_R; ++j) mapped_counts[i][vids[j]] = counts[i][j]; gt_mask[i] = (uint32_t *) malloc(num_ints * sizeof(uint32_t)); if (!gt_mask[i]) err(EX_OSERR, "malloc error"); /* User specifies a condition */ if ( q[i].op_condition != -1) { /* Since we only find counts, when the user asks for a * perecent, just convert that back to the count that meets the * percent condition */ float condition_value = q[i].condition_value; if (q[i].variant_op == p_pct) condition_value *= id_lens[i]; else if (q[i].variant_op == p_maf) condition_value *= id_lens[i]*2; /* Test to see if each count meets the condition */ uint32_t v = 0, int_i = 0, bit_i = 0; for ( j = 0; j < len_count_R; ++j) { if ( query_cmp(counts[i][j], q[i].op_condition, condition_value) ) { v |= 1 << (31 - bit_i); } bit_i += 1; if ( bit_i == 32 ) { gt_mask[i][int_i] = v; int_i += 1; bit_i = 0; v = 0; } } if ( bit_i > 0) gt_mask[i][int_i] = v; } else { // if no op is set then let everything pass for (j = 0; j < num_ints; ++j) gt_mask[i][j] = -1; // set all the bits to 1 } /* User only gives genotype filters, no funtion/condition */ } else { uint32_t *gt_R; uint32_t len_wf_R = range_records_in_place_wahbm(wf, R, id_lens[i], low_v, high_v, >_R); len_ints = wah_to_ints(gt_R,len_wf_R,&(gt_mask[i])); free(gt_R); } free(R); } uint32_t *final_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t)); // combine all of the masks to see what we need to print for (i = 0; i < num_ints; ++i) { final_mask[i] = ~0; for (j = 0; j < gt_q_count; ++j) final_mask[i] &= gt_mask[j][i]; } if (c_is_set == 1) { uint32_t masked_vid_count = 0; for (i = 0; i < num_ints; ++i) masked_vid_count += popcount(final_mask[i]); if (masked_vid_count <= wf.num_fields) printf("%u\n", masked_vid_count); else printf("%u\n", wf.num_fields); } else if ((v_is_set == 1) && (s_is_set == 1)) { get_bcf_query_result(final_mask, num_ints, q, id_query_list, id_lens, gt_q_count, wf.num_fields, vid_file_name, src_bcf_file_name, bcf_output); } else if (b_is_set == 1){ uint32_t *mapped_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t)); uint32_t v,p,leading_zeros, hit; for (i = 0; i < num_ints; ++i) { if (final_mask[i] != 0) { v = final_mask[i]; p = popcount(v); for (j = 0; j < p; ++j) { leading_zeros = __builtin_clz(v); if (i*32 + leading_zeros + 1 > wf.num_fields) break; hit = vids[leading_zeros + i*32]; mapped_mask[hit/32] |= 1 << (31-hit%32); v &= ~(1 << (32 - leading_zeros - 1)); } } if (i*32 + leading_zeros + 1 > wf.num_fields) break; } print_query_result(mapped_mask, num_ints, vids, q, mapped_counts, id_lens, gt_q_count, wf.num_fields, bim_file_name, full_cmd); } for (j = 0; j < gt_q_count; ++j) { free(gt_mask[j]); if ( (q[j].variant_op == p_count) || (q[j].variant_op == p_pct) || (q[j].variant_op == p_maf) ) free(counts[j]); } destroy_wahbm_file(&wf); return 0; }