示例#1
0
文件: query.c 项目: CoREse/gqt
/* Maximum number of -p/-g query pairs accepted in one invocation. */
enum { QUERY_MAX_CONDITIONS = 100 };

/* Return 1 when `name` ends with `suffix`, 0 otherwise. */
static int query_name_ends_with(const char *name, const char *suffix)
{
    size_t name_len = strlen(name);
    size_t suffix_len = strlen(suffix);

    if (suffix_len > name_len)
        return 0;

    return strcmp(name + (name_len - suffix_len), suffix) == 0;
}

/* Return a newly allocated "<base><suffix>" string; exits via err() on OOM. */
static char *query_name_with_suffix(const char *base, const char *suffix)
{
    char *name = malloc(strlen(base) + strlen(suffix) + 1);

    if (!name)
        err(EX_OSERR, "malloc error");

    strcpy(name, base);
    strcat(name, suffix);
    return name;
}

/*
 * Return a newly allocated copy of `base` whose last three characters are
 * overwritten by `ext`.  The caller guarantees strlen(base) >= 3 and
 * strlen(ext) <= 3.  Exits via err() on OOM.
 */
static char *query_name_with_ext(const char *base, const char *ext)
{
    size_t base_len = strlen(base);
    char *name = malloc(base_len + 1);

    if (!name)
        err(EX_OSERR, "malloc error");

    strcpy(name, base);
    strcpy(name + (base_len - 3), ext);
    return name;
}

//{{{ int query(int argc, char **argv, char *full_cmd)
/*
 * Entry point of the "query" sub-command.
 *
 * Command line options handled here:
 *   -i FILE  input file; its name must end in gqt, bcf or vcf.gz (required)
 *   -p STR   individual (PED database) query, repeatable
 *   -g STR   genotype query, repeatable; must be given as often as -p
 *   -d FILE  PED database        -B FILE  BIM file
 *   -V FILE  VID file            -G FILE  GQT file
 *   -S FILE  source BCF/VCF.GZ   -O FILE  OFF file
 *   -t DIR   temporary directory (defaults to "./")
 *   -c       only print the number of variants that pass every query
 *   -v       print through print_query_result_offset() instead of
 *            print_query_result_bim()
 *   -h       show the help text
 *
 * Companion file names that are not given explicitly are derived from the
 * input name: ".gqt", ".db", ".vid", ".bim" and ".off" are appended to a
 * BCF/VCF.GZ name, or the trailing "gqt" of a GQT name is replaced.  Only the
 * derived GQT name is probed (with ping_file()); the other derived names are
 * not checked for existence in this function.
 *
 * Parameters:
 *   argc, argv  argument vector of the sub-command, parsed with getopt()
 *   full_cmd    passed through unchanged to the result printers
 *
 * Returns 0 on success, 1 when a genotype query cannot be parsed or a
 * population query matches more samples than the GQT index holds, and the
 * value of query_help() for usage and input errors.  Allocation failures
 * terminate the process through err().
 */
int query(int argc, char **argv, char *full_cmd)
{
    if (argc < 2) return query_help(EX_USAGE);

    int c;
    char *input_file_name = NULL,
         *gqt_file_name = NULL,
         *ped_db_file_name = NULL,
         *bim_file_name = NULL,
         *off_file_name = NULL,
         *bcf_file_name = NULL,
         *vid_file_name = NULL,
         *tmp_dir_name = NULL;
    int c_is_set = 0,
        i_is_set = 0,
        id_q_count = 0,
        gt_q_count = 0,
        d_is_set = 0,
        v_is_set = 0,
        V_is_set = 0,
        G_is_set = 0,
        B_is_set = 0,
        O_is_set = 0,
        S_is_set = 0,
        t_is_set = 0;

    char *id_query_list[QUERY_MAX_CONDITIONS];
    char *gt_query_list[QUERY_MAX_CONDITIONS];

    //{{{ parse cmd line opts
    while ((c = getopt (argc, argv, "chvi:p:g:d:B:V:G:O:S:t:")) != -1) {
        switch (c) {
        case 'v':
            v_is_set = 1;
            break;
        case 'c':
            c_is_set = 1;
            break;
        case 'i':
            i_is_set = 1;
            input_file_name = optarg;
            break;
        case 'p':
            /* The query lists are fixed-size; refuse to write past them. */
            if (id_q_count >= QUERY_MAX_CONDITIONS) {
                fprintf (stderr,
                         "Too many -p queries (at most %d are supported).\n",
                         QUERY_MAX_CONDITIONS);
                return query_help(EX_USAGE);
            }
            id_query_list[id_q_count] = optarg;
            id_q_count += 1;
            break;
        case 'g':
            if (gt_q_count >= QUERY_MAX_CONDITIONS) {
                fprintf (stderr,
                         "Too many -g queries (at most %d are supported).\n",
                         QUERY_MAX_CONDITIONS);
                return query_help(EX_USAGE);
            }
            gt_query_list[gt_q_count] = optarg;
            gt_q_count += 1;
            break;
        case 'd':
            d_is_set = 1;
            ped_db_file_name = optarg;
            break;
        case 'B':
            B_is_set = 1;
            bim_file_name = optarg;
            break;
        case 'V':
            V_is_set = 1;
            vid_file_name = optarg;
            break;
        case 'G':
            G_is_set = 1;
            gqt_file_name = optarg;
            break;
        case 'S':
            S_is_set = 1;
            bcf_file_name = optarg;
            break;
        case 'O':
            O_is_set = 1;
            off_file_name = optarg;
            break;
        case 't':
            t_is_set = 1;
            tmp_dir_name = optarg;
            break;
        case 'h':
            return query_help(EX_OK);
        case '?':
            /* Every option that takes an argument in the getopt string. */
            if ( (optopt != 0) && (strchr("ipgdBVGOSt", optopt) != NULL) )
                fprintf (stderr,
                        "Option -%c requires an argument.\n",
                         optopt);
            else if (isprint (optopt))
                fprintf (stderr, "Unknown option `-%c'.\n", optopt);
            else
                fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
            return query_help(EX_USAGE);
        default:
            return query_help(EX_OK);
        }
    }

    if (t_is_set == 0) {
        if (asprintf(&tmp_dir_name,"./") == -1)
            err(EX_OSERR, "asprintf error");
    }

    if (i_is_set == 0) {
        fprintf (stderr, "No input file given (vcf.gz/bcf/gqt).\n");
        return query_help(EX_NOINPUT);
    }

    /*
     * Decide from the name whether -i is a GQT index or a BCF/VCF.GZ source.
     * The minimum lengths (4 for gqt/bcf, 8 for vcf.gz) are the thresholds
     * this function has always applied.
     */
    size_t input_name_len = strlen(input_file_name);

    if ( (input_name_len >= 4) &&
         query_name_ends_with(input_file_name, "gqt") ) {
        G_is_set = 1;
        gqt_file_name = input_file_name;
    } else if ( (input_name_len >= 4) &&
                query_name_ends_with(input_file_name, "bcf") ) {
        S_is_set = 1;
        bcf_file_name = input_file_name;
    } else if ( (input_name_len >= 8) &&
                query_name_ends_with(input_file_name, "vcf.gz") ) {
        S_is_set = 1;
        bcf_file_name = input_file_name;
    } else {
        fprintf (stderr,
                 "Cannot determine input file type for file '%s'.\n",
                 input_file_name);
        fprintf (stderr,
                 "NOTE: The file must have a supported extension "
                 "(vcf.gz/bcf/gqt).\n");
        return query_help(EX_NOINPUT);
    }

    if (S_is_set == 1) {
        // BCF/VCFGZ file is set: derive whatever was not given explicitly

        if (G_is_set == 0) {
            gqt_file_name = query_name_with_suffix(bcf_file_name, ".gqt");

            // a non-zero result from ping_file() is treated as "found"
            if ( ping_file(gqt_file_name) != 0 ) {
                G_is_set = 1;
            } else {
                fprintf(stderr,
                        "Auto detect failure: GQT file '%s' not found\n",
                        gqt_file_name);
                return query_help(EX_NOINPUT);
            }
        }

        if (d_is_set == 0) {
            ped_db_file_name = query_name_with_suffix(bcf_file_name, ".db");
            d_is_set = 1;
        }

        if (V_is_set == 0) {
            vid_file_name = query_name_with_suffix(bcf_file_name, ".vid");
            V_is_set = 1;
        }

        // BIM and OFF names are derived without checking that they exist
        if (B_is_set == 0) {
            bim_file_name = query_name_with_suffix(bcf_file_name, ".bim");
            B_is_set = 1;
        }

        if (O_is_set == 0) {
            off_file_name = query_name_with_suffix(bcf_file_name, ".off");
            O_is_set = 1;
        }
    } else if (G_is_set == 1) {
        /*
         * Only a GQT file is known.  On this path gqt_file_name is the -i
         * argument, which ends in "gqt" and is at least four characters
         * long, so replacing its last three characters is in bounds.
         */
        if (B_is_set == 0) {
            bim_file_name = query_name_with_ext(gqt_file_name, "bim");
            B_is_set = 1;
        }

        if (O_is_set == 0) {
            off_file_name = query_name_with_ext(gqt_file_name, "off");
            O_is_set = 1;
        }

        if (V_is_set == 0) {
            vid_file_name = query_name_with_ext(gqt_file_name, "vid");
            V_is_set = 1;
        }

        if (d_is_set == 0) {
            ped_db_file_name = query_name_with_ext(gqt_file_name, "db");
            d_is_set = 1;
        }
    } else {
        fprintf(stderr,
                "Neither GQT or BCF/VCF.GZ file given.\n");
        return query_help(EX_NOINPUT);
    }

    if (V_is_set == 0) {
        fprintf(stderr, "VID file is not set\n");
        return query_help(EX_NOINPUT);
    }

    if (d_is_set == 0) {
        fprintf(stderr, "DB file is not set\n");
        return query_help(EX_NOINPUT);
    }

    if (gt_q_count != id_q_count) {
        fprintf(stderr,
                "Mismatched number of individual and genotype query strings\n");
        return query_help(EX_USAGE);
    }

    if ((B_is_set == 0) && (O_is_set == 0) && (c_is_set == 0)) {
        fprintf(stderr,
                "Must set either BIM or OFF files when doing anything "
                "other than counting.\n");
        return query_help(EX_USAGE);
    }

    if ( (v_is_set == 1) && ((O_is_set == 0) || (S_is_set == 0)) ) {
        fprintf(stderr,
                "To get genotypes source BCF/VCF.GZ and OFF files "
                "must be set.\n");
        return query_help(EX_USAGE);
    }

    if ( (v_is_set == 0) && (B_is_set == 0) && (c_is_set == 0)) {
        fprintf(stderr,
                "To get variant data only BIM file must be set.\n");
        return query_help(EX_USAGE);
    }
    //}}}

    struct gqt_query q[QUERY_MAX_CONDITIONS];
    uint32_t *gt_mask[QUERY_MAX_CONDITIONS] = {0};
    uint32_t *counts[QUERY_MAX_CONDITIONS] = {0};
    uint32_t *mapped_counts[QUERY_MAX_CONDITIONS] = {0};
    uint32_t id_lens[QUERY_MAX_CONDITIONS];

    int i;

    for (i = 0; i < gt_q_count; ++i) {
        if (parse_q(gt_query_list[i], &(q[i]))) {
            fprintf(stderr, "in the %dth genotype query.\n", i+1);
            return 1;
        }
    }

    // NOTE(review): the results of the two open calls are used unchecked;
    // confirm that they report failures themselves.
    struct wahbm_file *wf = open_wahbm_file(gqt_file_name);
    struct vid_file *vid_f = open_vid_file(vid_file_name);
    load_vid_data(vid_f);

    // number of 32-bit words needed to hold one bit per variant
    uint32_t num_ints = (wf->gqt_header->num_variants + 32 - 1)/ 32;

    // U_R accumulates the sample ids returned by every population query
    uint32_t *U_R = NULL;
    uint32_t U_R_len = 0;

    for (i = 0; i < gt_q_count; ++i) {
        uint32_t len_count_R;
        uint32_t *R;

        /*
         * Submit the population query to the PED database and get back both
         * the list of ids in R and the length of R in id_lens[i]
         */
        id_lens[i] = resolve_ind_query(&R,
                                       id_query_list[i],
                                       ped_db_file_name,
                                       tmp_dir_name);

        /*
         * Grow through a temporary so U_R survives a failed realloc, and
         * never request zero bytes: a zero-size realloc may return NULL,
         * which must not be mistaken for an allocation failure.
         */
        size_t merged_len = (size_t)U_R_len + id_lens[i];
        uint32_t *tmp_U_R = realloc(U_R,
                                    (merged_len ? merged_len : 1) *
                                        sizeof *tmp_U_R);
        if (!tmp_U_R)
            err(EX_OSERR, "malloc error");
        U_R = tmp_U_R;

        for (uint32_t s = 0; s < id_lens[i]; ++s) {
            U_R[U_R_len] = R[s];
            U_R_len += 1;
        }

        // Enforce that the offsets of the relevant samples is
        // within the number of samples in the GQT index.
        if (id_lens[i] > wf->gqt_header->num_samples) {
            fprintf(stderr,
                    "ERROR: there are more samples in the PED database (%d) "
                    "that match this condition \nthan there are in the GQT "
                    "index (%d).  Perhaps your PED file is a superset of "
                    "the\nsamples in your VCF/BCF file?\n",
                    id_lens[i],
                    wf->gqt_header->num_samples);
            return 1;
        }

        uint32_t low_v = 0, high_v = 0;

        /*
         * q holds the parameters of each query, first determine the range of
         * bitmaps to pull: low_v is the lowest selected genotype index and
         * high_v is one past the highest selected one.
         */
        if ( q[i].variant_op == p_maf ) {
            low_v = 1;
            high_v = 3;
        } else {
            if ( q[i].genotype_condition[0] == 1)
                low_v = 0;
            else if ( q[i].genotype_condition[1] == 1)
                low_v = 1;
            else if ( q[i].genotype_condition[2] == 1)
                low_v = 2;
            else if ( q[i].genotype_condition[3] == 1)
                low_v = 3;

            if ( q[i].genotype_condition[3] == 1)
                high_v = 4;
            else if ( q[i].genotype_condition[2] == 1)
                high_v = 3;
            else if ( q[i].genotype_condition[1] == 1)
                high_v = 2;
            else if ( q[i].genotype_condition[0] == 1)
                high_v = 1;
        }

        /*
         * The set of variants that are printed is stored in a mask for each
         * query, then those masks are combined into a final mask.  Each mask
         * is an array of 32-bit packed ints, where each bit corresponds to
         * one variant.  How those bits are set depends on the filter the
         * user specifies.
         *
         * If they simply ask for a count or percent, then there is no filter
         * and the mask is set to all 1s.
         *
         * If count is followed by a condition, then the count/pct is compared
         * to that condition and the bits are set for those that meet the
         * condition.
         *
         * If no function is used then we simply run the wahbm range query and
         * convert the wah results to packed ints for the mask
         */

        /* User asks for a count, percent, or maf */
        if ( ( q[i].variant_op == p_count ) ||
             ( q[i].variant_op == p_pct ) ||
             ( q[i].variant_op == p_maf ) ) {

            if (q[i].variant_op == p_maf) {
#ifdef __AVX2__
                len_count_R =
                    avx_sum_range_records_in_place_wahbm(wf,
                                                         R,
                                                         id_lens[i],
                                                         low_v,
                                                         high_v,
                                                         &(counts[i]));
#else
                len_count_R =
                    sum_range_records_in_place_wahbm(wf,
                                                     R,
                                                     id_lens[i],
                                                     low_v,
                                                     high_v,
                                                     &(counts[i]));
#endif
            } else {
#ifdef __AVX2__
                len_count_R =
                    avx_count_range_records_in_place_wahbm(wf,
                                                           R,
                                                           id_lens[i],
                                                           low_v,
                                                           high_v,
                                                           &(counts[i]));
#else
                len_count_R =
                    count_range_records_in_place_wahbm(wf,
                                                       R,
                                                       id_lens[i],
                                                       low_v,
                                                       high_v,
                                                       &(counts[i]));
#endif
            }

            /* Since the variants are in allele freq order, we need to copy
             * the resulting value to an array that is back in the original
             * order
             */
            mapped_counts[i] = calloc(len_count_R, sizeof *mapped_counts[i]);
            if (!mapped_counts[i] && len_count_R > 0)
                err(EX_OSERR, "malloc error");
            for (uint32_t n = 0; n < len_count_R; ++n)
                mapped_counts[i][vid_f->vids[n]] = counts[i][n];

            // zero-filled so words the loop below never reaches are defined
            gt_mask[i] = calloc(num_ints, sizeof *gt_mask[i]);
            if (!gt_mask[i] && num_ints > 0)
                err(EX_OSERR, "malloc error");

            /* User specifies a condition */
            if ( q[i].op_condition != -1) {

                /* Since we only find counts, when the user asks for a
                 * percent, just convert that back to the count that meets the
                 * percent condition
                 */
                float condition_value = q[i].condition_value;
                if (q[i].variant_op == p_pct)
                    condition_value *= id_lens[i];
                else if (q[i].variant_op == p_maf)
                    condition_value *= id_lens[i]*2;

                /* Test to see if each count meets the condition; bits are
                 * packed most-significant first.
                 */
                uint32_t word = 0, int_i = 0, bit_i = 0;
                for (uint32_t n = 0; n < len_count_R; ++n) {
                    if ( query_cmp(counts[i][n],
                                   q[i].op_condition,
                                   condition_value) ) {
                        // unsigned shift: 1 << 31 on a signed int is UB
                        word |= UINT32_C(1) << (31 - bit_i);
                    }

                    bit_i += 1;
                    if ( bit_i == 32 ) {
                        gt_mask[i][int_i] = word;
                        int_i += 1;
                        bit_i = 0;
                        word = 0;
                    }
                }

                if ( bit_i > 0)
                    gt_mask[i][int_i] = word;
            } else {
                // if no op is set then let everything pass
                for (uint32_t w = 0; w < num_ints; ++w)
                    gt_mask[i][w] = UINT32_MAX; // set all the bits to 1
            }
        /* User only gives genotype filters, no function/condition */
        } else {
            uint32_t *gt_R;
            uint32_t len_wf_R = range_records_in_place_wahbm(wf,
                                                             R,
                                                             id_lens[i],
                                                             low_v,
                                                             high_v,
                                                             &gt_R);
            // the word count returned by wah_to_ints() is not needed here
            (void)wah_to_ints(gt_R, len_wf_R, &(gt_mask[i]));
            free(gt_R);
        }
        free(R);
    }

    // No population query at all: fall back to the empty query
    if (U_R == NULL) {
        U_R_len = resolve_ind_query(&U_R,
                                    "",
                                    ped_db_file_name,
                                    tmp_dir_name);
    }

    // Get the uniq elements in place.  An empty list is left untouched;
    // otherwise the "last kept index + 1" length below would report one
    // element that does not exist.
    if (U_R_len > 0) {
        uint32_t last_kept = 0;

        qsort(U_R, U_R_len, sizeof(uint32_t), compare_uint32_t);
        for (uint32_t n = 1; n < U_R_len; ++n)
            if (U_R[n] != U_R[last_kept])
                U_R[++last_kept] = U_R[n];
        U_R_len = last_kept + 1;
    }

    uint32_t *final_mask = calloc(num_ints, sizeof *final_mask);
    if (!final_mask && num_ints > 0)
        err(EX_OSERR, "malloc error");

    // combine all of the masks to see what we need to print
    for (uint32_t w = 0; w < num_ints; ++w) {
        final_mask[w] = UINT32_MAX;
        for (i = 0; i < gt_q_count; ++i)
            final_mask[w] &= gt_mask[i][w];
    }

    if (c_is_set == 1) {
        uint32_t masked_vid_count = 0;

        for (uint32_t w = 0; w < num_ints; ++w)
            masked_vid_count += popcount(final_mask[w]);

        // padding bits of the last word may be set, so cap at num_variants
        if (masked_vid_count <= wf->gqt_header->num_variants)
            printf("%u\n", masked_vid_count);
        else
            printf("%u\n", wf->gqt_header->num_variants);

    } else if ( (B_is_set == 1) || (O_is_set == 1)){

        uint32_t *mapped_mask = calloc(num_ints, sizeof *mapped_mask);
        if (!mapped_mask && num_ints > 0)
            err(EX_OSERR, "malloc error");

        /*
         * Translate every set bit of final_mask through vid_f->vids[] into
         * mapped_mask.  Bits at or beyond num_variants are padding in the
         * last word and are skipped.
         */
        for (uint32_t w = 0; w < num_ints; ++w) {
            uint32_t pending = final_mask[w];

            while (pending != 0) {
                uint32_t leading_zeros = (uint32_t)__builtin_clz(pending);
                uint32_t pos = w*32 + leading_zeros;

                if (pos >= wf->gqt_header->num_variants)
                    break;

                uint32_t hit = vid_f->vids[pos];

                mapped_mask[hit/32] |= UINT32_C(1) << (31 - hit%32);
                pending &= ~(UINT32_C(1) << (31 - leading_zeros));
            }
        }

        if (v_is_set == 0)
            print_query_result_bim(mapped_mask,
                                   num_ints,
                                   vid_f->vids,
                                   q,
                                   mapped_counts,
                                   id_lens,
                                   gt_q_count,
                                   wf->gqt_header->num_variants,
                                   bim_file_name,
                                   full_cmd);
        else
            print_query_result_offset(mapped_mask,
                                      num_ints,
                                      vid_f->vids,
                                      q,
                                      mapped_counts,
                                      id_lens,
                                      U_R,
                                      U_R_len,
                                      id_query_list,
                                      gt_query_list,
                                      gt_q_count,
                                      wf->gqt_header->num_variants,
                                      off_file_name,
                                      bcf_file_name,
                                      full_cmd);

        // NOTE(review): mapped_mask, mapped_counts[] and U_R are handed to
        // the printers above; they are not freed here because the printers'
        // ownership rules are not visible from this function.
    }

    // final_mask never leaves this function, so it is safe to release
    free(final_mask);

    for (i = 0; i < gt_q_count; ++i) {
        free(gt_mask[i]);
        if ( (q[i].variant_op == p_count) ||
             (q[i].variant_op == p_pct) ||
             (q[i].variant_op == p_maf) )
            free(counts[i]);
    }

    destroy_vid_file(vid_f);
    destroy_wahbm_file(wf);
    return 0;
}
示例#2
0
//{{{ int query(int argc, char **argv, char *full_cmd)
int query(int argc, char **argv, char *full_cmd)
{
    if (argc < 2) return query_help();

    int c;
    char *wahbm_file_name=NULL,
         *id_query=NULL,
         *gt_query=NULL,
         *db_file_name=NULL,
         *bim_file_name=NULL,
         *src_bcf_file_name=NULL,
         *vid_file_name=NULL;
    int i_is_set = 0,
        id_q_count = 0,
        gt_q_count = 0,
        d_is_set = 0,
        c_is_set = 0,
        v_is_set = 0,
        s_is_set = 0,
        b_is_set = 0,
        bcf_output = 0;

    char *id_query_list[100];
    char *gt_query_list[100];

    //{{{ parse cmd line opts
    while ((c = getopt (argc, argv, "chi:p:g:d:b:v:s:B")) != -1) {
        switch (c) {
        case 'c':
            c_is_set = 1;
            break;
        case 'i':
            i_is_set = 1;
            wahbm_file_name = optarg;
            break;
        case 'p':
            id_query_list[id_q_count] = optarg;
            id_q_count += 1;
            break;
        case 'g':
            gt_query_list[gt_q_count] = optarg;
            gt_q_count += 1;
            break;
        case 'd':
            d_is_set = 1;
            db_file_name = optarg;
            break;
        case 'b':
            b_is_set = 1;
            bim_file_name = optarg;
            break;
        case 'v':
            v_is_set = 1;
            vid_file_name = optarg;
            break;
        case 's':
            s_is_set = 1;
            src_bcf_file_name = optarg;
            break;
        case 'B':
            bcf_output = 1;
            break;
        case 'h':
            return query_help();
        case '?':
            if ( (optopt == 'i') ||
                    (optopt == 'p') ||
                    (optopt == 'g') ||
                    (optopt == 'd') ||
                    (optopt == 'b') )
                fprintf (stderr,
                        "Option -%c requires an argument.\n",
                         optopt);
            else if (isprint (optopt))
                fprintf (stderr, "Unknown option `-%c'.\n", optopt);
            else
                fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
            return query_help();
        default:
            return query_help();
        }
    }

    if (i_is_set == 0) {
        fprintf(stderr, "GQT file is not set\n");
        return query_help();
    } else {
        if ( access( wahbm_file_name, F_OK) == -1 )
            err(EX_NOINPUT, "Error accessing GQT file \"%s\"", wahbm_file_name);
    }

    if (d_is_set == 1) {
        if ( access( db_file_name, F_OK) == -1 )
            err(EX_NOINPUT,
                "Error accessing PED DB file \"%s\"",
                db_file_name);
    }


    // Try to auto-detect file names based on GQT
    if ( (i_is_set == 1) && (b_is_set == 0)) {

        int auto_bim_file_name_size = asprintf(&bim_file_name,
                                               "%s",
                                               wahbm_file_name);
        strcpy(bim_file_name + strlen(bim_file_name) - 3, "bim");

        if ( access( bim_file_name, F_OK) != -1 ) {
            b_is_set = 1;
        } else {
            fprintf(stderr,
                    "Auto detect failure: BIM file %s not found\n",
                    bim_file_name);
            return query_help();
        }
    }

    if ( (i_is_set == 1) && (v_is_set == 0)) {

        int auto_vid_file_name_size = asprintf(&vid_file_name,
                                               "%s",
                                               wahbm_file_name);
        strcpy(vid_file_name + strlen(vid_file_name) - 3, "vid");

        if ( access( vid_file_name, F_OK) != -1 ) {
            v_is_set = 1;
        } else {
            fprintf(stderr,
                    "Auto detect failure: VID file %s not found\n",
                    vid_file_name);
            return query_help();
        }
    }

    ///////////////////////////////
    if ( (i_is_set == 1) && (d_is_set == 0)) {

        int auto_db_file_name_size = asprintf(&db_file_name,
                                              "%s",
                                              wahbm_file_name);
        strcpy(db_file_name + strlen(db_file_name) - 3, "db\0");

        if ( access( db_file_name, F_OK) != -1 ) {
            d_is_set = 1;
        } else {
            fprintf(stderr,
                    "Auto detect failure: PED DB file %s not found\n",
                    db_file_name);
            return query_help();
        }
    }


    if (v_is_set == 0) {
        fprintf(stderr, "VID file is not set\n");
        return query_help();
    }

    if (b_is_set == 0) {
        fprintf(stderr, "BIM file is not set\n");
        return query_help();
    }

    if (d_is_set == 0) {
        fprintf(stderr, "PED database file is not set\n");
        return query_help();
    }

    if (gt_q_count != id_q_count) {
        fprintf(stderr, 
                "Mismatched number of individual and genotype query strings\n");
        return query_help();
    }
    //}}}

    struct gqt_query q[100];
    uint32_t *gt_mask[100];
    uint32_t *counts[100];
    uint32_t *mapped_counts[100];
    uint32_t id_lens[100];

    int r, i, j, k;

    for (i = 0; i < gt_q_count; ++i) {
        if (parse_q(gt_query_list[i], &(q[i]))) {
            fprintf(stderr, "in the %dth genotype query.\n", i+1);
            return 1;
        }
    }

    // open WAH/GQT file
    struct wah_file wf = init_wahbm_file(wahbm_file_name);

    // open VID file
    FILE *vid_f = fopen(vid_file_name, "rb");
    if (!vid_f)
        err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name);

    uint32_t *vids = (uint32_t *) malloc(wf.num_fields*sizeof(uint32_t));
    if (!vids)
        err(EX_OSERR, "malloc error");

    size_t fr = fread(vids, sizeof(uint32_t), wf.num_fields, vid_f);
    check_file_read(vid_file_name, vid_f, wf.num_fields, fr);

    fclose(vid_f);

    uint32_t num_ints = (wf.num_fields + 32 - 1)/ 32;
    uint32_t len_ints;

    for (i = 0; i < gt_q_count; ++i) {
        uint32_t len_count_R;
        uint32_t *R;
        /* 
         * Submit the population query to the PED database and get back both
         * the list of of ids in R and the length of R in id_lens[i]
         */
        id_lens[i] = resolve_ind_query(&R,
                                      id_query_list[i],
                                      db_file_name);

        // Enforce that the offsets of the relevant samples is 
        // within the number of samples in the GQT index.
        if (id_lens[i] > wf.num_records) {
            fprintf(stderr, 
                    "ERROR: there are more samples in the PED database (%d) "
                    "that match this condition \nthan there are in the GQT "
                    "index (%d).  Perhaps your PED file is a superset of "
                    "the\nsamples in your VCF/BCF file?\n", 
                    id_lens[i], 
                    wf.num_records);
            return 1;
        }

        uint32_t low_v, high_v;

        /*
         * q holds the parameters of each query, first determin the range of 
         * bitmaps to pull
         */
        if ( q[i].variant_op == p_maf ) {
            low_v = 1;
            high_v = 3;
        } else {
            if ( q[i].genotype_condition[0] == 1)
                low_v = 0;
            else if ( q[i].genotype_condition[1] == 1)
                low_v = 1;
            else if ( q[i].genotype_condition[2] == 1)
                low_v = 2;
            else if ( q[i].genotype_condition[3] == 1)
                low_v = 3;

            if ( q[i].genotype_condition[3] == 1)
                high_v = 4;
            else if ( q[i].genotype_condition[2] == 1)
                high_v = 3;
            else if ( q[i].genotype_condition[1] == 1)
                high_v = 2;
            else if ( q[i].genotype_condition[0] == 1)
                high_v = 1;
        }

        /*
         * The set of variants that are printed is stored in a mask for each
         * query, then those masks are combine to a final mask.  Each mask is a
         * 32-bit packed int, where each bit correspons to one variant.  How
         * those bits are set depends on the filter the user specifices.
         *
         * If they simply ask for a count or perecent, then there is not filter
         * and the mask is set to all 1s.
         *
         * If count is followed by a condition, then the count/pct is compared
         * to that condition and the bits are set for those that meet the
         * condition.
         *
         * If no funtion is used then we simply run the wahbm range query and
         * convert the wah results to packed ints for the mask
         *
         */

        /* User asks for a count, percent, or maf */
        if ( ( q[i].variant_op == p_count ) || 
             ( q[i].variant_op == p_pct ) ||
             ( q[i].variant_op == p_maf ) ) {

            if (q[i].variant_op == p_maf) {
#ifdef __AVX2__
            len_count_R = avx_sum_range_records_in_place_wahbm(wf,
                                                               R,
                                                               id_lens[i],
                                                               low_v,
                                                               high_v,
                                                               &(counts[i]));
#else
            len_count_R = sum_range_records_in_place_wahbm(wf,
                                                           R,
                                                           id_lens[i],
                                                           low_v,
                                                           high_v,
                                                           &(counts[i]));
                                                           
#endif
            } else {
#ifdef __AVX2__
                len_count_R = 
                    avx_count_range_records_in_place_wahbm(wf,
                                                           R,
                                                           id_lens[i],
                                                           low_v,
                                                           high_v,
                                                           &(counts[i]));
#else
                len_count_R = 
                    count_range_records_in_place_wahbm(wf,
                                                       R,
                                                       id_lens[i],
                                                       low_v,
                                                       high_v,
                                                       &(counts[i]));
#endif
            }

            /* Since the variants are in allele freq order, we need to copy
             * the resulting value to an array that is back in the original
             * order
             */
            mapped_counts[i] = (uint32_t *)calloc(len_count_R,
                                                  sizeof(uint32_t));
            for ( j = 0; j < len_count_R; ++j)
                mapped_counts[i][vids[j]] = counts[i][j];

            gt_mask[i] = (uint32_t *) malloc(num_ints * sizeof(uint32_t));
            if (!gt_mask[i])
                err(EX_OSERR, "malloc error");

            /* User specifies a condition */
            if ( q[i].op_condition != -1) { 

                /* Since we only find counts, when the user asks for a
                 * percent, just convert that back to the count that meets the
                 * percent condition
                 */
                float condition_value = q[i].condition_value;
                if (q[i].variant_op == p_pct) 
                    condition_value *= id_lens[i];
                else if (q[i].variant_op == p_maf)
                    condition_value *= id_lens[i]*2;


                /* Test to see if each count meets the condition */
                uint32_t v = 0, int_i = 0, bit_i = 0;
                for ( j = 0; j < len_count_R; ++j) {
                    if ( query_cmp(counts[i][j],
                                   q[i].op_condition,
                                   condition_value) ) {
                        v |= 1 << (31 - bit_i);
                    }

                    bit_i += 1;
                    if ( bit_i == 32 ) {
                        gt_mask[i][int_i] = v;
                        int_i += 1;
                        bit_i = 0;
                        v = 0;
                    }
                }
            
                if ( bit_i > 0)
                    gt_mask[i][int_i] = v;
            } else {
                // if no op is set then let everything pass
                for (j = 0; j < num_ints; ++j)
                    gt_mask[i][j] = -1; // set all the bits to 1
            }
        /* User only gives genotype filters, no function/condition */
        } else {
            uint32_t *gt_R;
            uint32_t len_wf_R = range_records_in_place_wahbm(wf,
                                                                 R,
                                                                 id_lens[i],
                                                                 low_v,
                                                                 high_v,
                                                                 &gt_R);
            len_ints = wah_to_ints(gt_R,len_wf_R,&(gt_mask[i]));
            free(gt_R);
        }
        free(R);
    }

    uint32_t *final_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t));

    // combine all of the masks to see what we need to print
    for (i = 0; i < num_ints; ++i) {
        final_mask[i] = ~0;
        for (j = 0; j < gt_q_count; ++j)
            final_mask[i] &= gt_mask[j][i];
    }

    if (c_is_set == 1) {
        uint32_t masked_vid_count = 0;

        for (i = 0; i < num_ints; ++i) 
            masked_vid_count += popcount(final_mask[i]);

        if (masked_vid_count <= wf.num_fields)
            printf("%u\n", masked_vid_count);
        else
            printf("%u\n", wf.num_fields);

    } else if ((v_is_set == 1) && (s_is_set == 1)) {
        get_bcf_query_result(final_mask,
                             num_ints, 
                             q,
                             id_query_list,
                             id_lens,
                             gt_q_count,
                             wf.num_fields,
                             vid_file_name,
                             src_bcf_file_name,
                             bcf_output);
    } else if (b_is_set == 1){

        uint32_t *mapped_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t));

        uint32_t v,p,leading_zeros, hit;
        for (i = 0; i < num_ints; ++i) {
            if (final_mask[i] != 0) {
                v = final_mask[i];
                p = popcount(v);
                for (j = 0; j < p; ++j) {
                    leading_zeros = __builtin_clz(v);

                    if (i*32 + leading_zeros + 1 > wf.num_fields)
                        break;

                    hit = vids[leading_zeros + i*32];

                    mapped_mask[hit/32] |= 1 << (31-hit%32);
                    v &= ~(1 << (32 - leading_zeros - 1));
                }
            }
            if (i*32 + leading_zeros + 1 > wf.num_fields)
                break;
        }

        print_query_result(mapped_mask,
                           num_ints,
                           vids,
                           q,
                           mapped_counts,
                           id_lens,
                           gt_q_count,
                           wf.num_fields,
                           bim_file_name,
                           full_cmd);
    }

    for (j = 0; j < gt_q_count; ++j) {
        free(gt_mask[j]);
        if ( (q[j].variant_op == p_count) || 
             (q[j].variant_op == p_pct) ||
             (q[j].variant_op == p_maf) )
            free(counts[j]);
    }

    destroy_wahbm_file(&wf);
    return 0;
}