예제 #1
0
파일: gt.c 프로젝트: arq5x/gqt
/*
 * Query the WAH bitmap file `in` through gt_records_wahbm() and optionally
 * print the outcome.
 *
 *   in           path handed to init_wahbm_file()
 *   query_value  value forwarded to gt_records_wahbm()
 *   R            array of num_records entries forwarded to gt_records_wahbm()
 *   num_records  number of entries in R
 *   time         when non-zero, the value of report() is written to stderr
 *   quiet        when zero, the converted result is passed to print_result()
 *   bim          forwarded untouched to print_result()
 *
 * The start()/stop() pair brackets opening the file, running the query and
 * converting the WAH result with wah_to_ints(); printing is outside of it.
 *
 * Always returns 0.
 */
int gt_wahbm(char *in,
             unsigned int query_value,
             unsigned int *R,
             unsigned int num_records,
             int time,
             int quiet,
             char *bim)
{
    unsigned int *wah_hits;
    unsigned int *packed_hits;

    start();

    struct wah_file wf = init_wahbm_file(in);
    unsigned int wah_hits_len = gt_records_wahbm(wf,
                                                 R,
                                                 num_records,
                                                 query_value,
                                                 &wah_hits);
    unsigned int packed_hits_len = wah_to_ints(wah_hits,
                                               wah_hits_len,
                                               &packed_hits);

    stop();

    if (time) {
        fprintf(stderr,"%lu\n", report());
    }

    if (!quiet) {
        print_result(packed_hits_len, packed_hits, wf.num_fields, bim);
    }

    /* Both buffers were handed back by the calls above; release them here. */
    free(packed_hits);
    free(wah_hits);
    fclose(wf.file);

    return 0;
}
예제 #2
0
파일: query.c 프로젝트: jeromekelleher/gqt
/*
 * Derive the name of a companion file of a GQT index and check that it exists.
 *
 * The last three characters of gqt_file_name are replaced by suffix (so
 * "x.gqt" with suffix "bim" yields "x.bim", and with suffix "db" yields
 * "x.db").  label is only used in the diagnostics written to stderr.
 *
 * Returns a heap-allocated name the caller must free(), or NULL (after
 * printing an "Auto detect failure" message) when the name cannot be derived
 * or the derived file is not accessible.  Exits through err() when the
 * allocation fails.
 */
static char *auto_detect_companion_file(const char *gqt_file_name,
                                        const char *suffix,
                                        const char *label)
{
    size_t gqt_len = strlen(gqt_file_name);
    char *name = NULL;

    /* Fewer than three characters: there is no extension to swap out. */
    if (gqt_len < 3) {
        fprintf(stderr,
                "Auto detect failure: cannot derive %s file name from %s\n",
                label,
                gqt_file_name);
        return NULL;
    }

    if (asprintf(&name,
                 "%.*s%s",
                 (int)(gqt_len - 3),
                 gqt_file_name,
                 suffix) == -1)
        err(EX_OSERR, "asprintf error");

    if ( access( name, F_OK) == -1 ) {
        fprintf(stderr,
                "Auto detect failure: %s file %s not found\n",
                label,
                name);
        free(name);
        return NULL;
    }

    return name;
}

//{{{ int query(int argc, char **argv, char *full_cmd)
/*
 * Entry point of the "query" sub-command.
 *
 * Options (parsed with getopt):
 *   -i FILE  GQT/WAH bitmap index (required)
 *   -p STR   individual query, may be repeated
 *   -g STR   genotype query, may be repeated; must be given as many times
 *            as -p, and the i-th -p is paired with the i-th -g
 *   -d FILE  PED database     (default: GQT name with its last 3 chars -> "db")
 *   -b FILE  BIM file         (default: GQT name with its last 3 chars -> "bim")
 *   -v FILE  VID file         (default: GQT name with its last 3 chars -> "vid")
 *   -s FILE  source BCF file; selects get_bcf_query_result() for output
 *   -B       sets the bcf_output flag passed to get_bcf_query_result()
 *   -c       only print the number of variants that pass all queries
 *   -h       print help
 *
 * For every -p/-g pair a bit mask over the variants is built (one bit per
 * variant, most significant bit first within each 32-bit word); the masks
 * are AND-ed together and the result is reported according to -c / -s.
 *
 * full_cmd is forwarded to print_query_result().
 *
 * Returns 0 on success, 1 when a query cannot be parsed or evaluated, and
 * the value of query_help() for usage errors.  Missing or unreadable files
 * terminate the process through err().
 */
int query(int argc, char **argv, char *full_cmd)
{
    /* Capacity of the fixed-size per-query arrays below. */
    enum { MAX_QUERIES = 100 };

    if (argc < 2) return query_help();

    int c;
    char *wahbm_file_name=NULL,
         *db_file_name=NULL,
         *bim_file_name=NULL,
         *src_bcf_file_name=NULL,
         *vid_file_name=NULL;
    /*
     * Names produced by auto detection live on the heap, names given on the
     * command line point into argv.  Only the former may be freed, so they
     * are tracked separately.
     */
    char *auto_bim_file_name=NULL,
         *auto_vid_file_name=NULL,
         *auto_db_file_name=NULL;
    int i_is_set = 0,
        id_q_count = 0,
        gt_q_count = 0,
        d_is_set = 0,
        c_is_set = 0,
        v_is_set = 0,
        s_is_set = 0,
        b_is_set = 0,
        bcf_output = 0;

    char *id_query_list[MAX_QUERIES];
    char *gt_query_list[MAX_QUERIES];

    struct gqt_query q[MAX_QUERIES];
    /* Zero-initialised so the cleanup code can free() every slot safely. */
    uint32_t *gt_mask[MAX_QUERIES] = {0};
    uint32_t *counts[MAX_QUERIES] = {0};
    uint32_t *mapped_counts[MAX_QUERIES] = {0};
    uint32_t id_lens[MAX_QUERIES];

    struct wah_file wf;
    int wf_is_open = 0;
    FILE *vid_f = NULL;
    size_t fr = 0;
    uint32_t *vids = NULL;
    uint32_t *final_mask = NULL;
    uint32_t *mapped_mask = NULL;
    uint32_t num_ints = 0;
    uint32_t w;

    int rc = 0;
    int i, j;

    //{{{ parse cmd line opts
    while ((c = getopt (argc, argv, "chi:p:g:d:b:v:s:B")) != -1) {
        switch (c) {
        case 'c':
            c_is_set = 1;
            break;
        case 'i':
            i_is_set = 1;
            wahbm_file_name = optarg;
            break;
        case 'p':
            if (id_q_count >= MAX_QUERIES) {
                fprintf(stderr,
                        "Too many individual queries (at most %d)\n",
                        MAX_QUERIES);
                return query_help();
            }
            id_query_list[id_q_count] = optarg;
            id_q_count += 1;
            break;
        case 'g':
            if (gt_q_count >= MAX_QUERIES) {
                fprintf(stderr,
                        "Too many genotype queries (at most %d)\n",
                        MAX_QUERIES);
                return query_help();
            }
            gt_query_list[gt_q_count] = optarg;
            gt_q_count += 1;
            break;
        case 'd':
            d_is_set = 1;
            db_file_name = optarg;
            break;
        case 'b':
            b_is_set = 1;
            bim_file_name = optarg;
            break;
        case 'v':
            v_is_set = 1;
            vid_file_name = optarg;
            break;
        case 's':
            s_is_set = 1;
            src_bcf_file_name = optarg;
            break;
        case 'B':
            bcf_output = 1;
            break;
        case 'h':
            return query_help();
        case '?':
            /* Every option declared with ':' in the getopt string. */
            if ( (optopt == 'i') ||
                    (optopt == 'p') ||
                    (optopt == 'g') ||
                    (optopt == 'd') ||
                    (optopt == 'b') ||
                    (optopt == 'v') ||
                    (optopt == 's') )
                fprintf (stderr,
                        "Option -%c requires an argument.\n",
                         optopt);
            else if (isprint (optopt))
                fprintf (stderr, "Unknown option `-%c'.\n", optopt);
            else
                fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
            return query_help();
        default:
            return query_help();
        }
    }

    if (i_is_set == 0) {
        fprintf(stderr, "GQT file is not set\n");
        return query_help();
    } else {
        if ( access( wahbm_file_name, F_OK) == -1 )
            err(EX_NOINPUT, "Error accessing GQT file \"%s\"", wahbm_file_name);
    }

    if (d_is_set == 1) {
        if ( access( db_file_name, F_OK) == -1 )
            err(EX_NOINPUT,
                "Error accessing PED DB file \"%s\"",
                db_file_name);
    }

    /*
     * Try to auto-detect file names based on GQT.  The GQT file is known to
     * be set at this point, so after this section BIM, VID and PED DB names
     * are all available or the function has bailed out.
     */
    if (b_is_set == 0) {
        auto_bim_file_name = auto_detect_companion_file(wahbm_file_name,
                                                        "bim",
                                                        "BIM");
        if (auto_bim_file_name == NULL) {
            rc = query_help();
            goto cleanup;
        }
        bim_file_name = auto_bim_file_name;
        b_is_set = 1;
    }

    if (v_is_set == 0) {
        auto_vid_file_name = auto_detect_companion_file(wahbm_file_name,
                                                        "vid",
                                                        "VID");
        if (auto_vid_file_name == NULL) {
            rc = query_help();
            goto cleanup;
        }
        vid_file_name = auto_vid_file_name;
        v_is_set = 1;
    }

    if (d_is_set == 0) {
        auto_db_file_name = auto_detect_companion_file(wahbm_file_name,
                                                       "db",
                                                       "PED DB");
        if (auto_db_file_name == NULL) {
            rc = query_help();
            goto cleanup;
        }
        db_file_name = auto_db_file_name;
        d_is_set = 1;
    }

    if (gt_q_count != id_q_count) {
        fprintf(stderr, 
                "Mismatched number of individual and genotype query strings\n");
        rc = query_help();
        goto cleanup;
    }
    //}}}

    for (i = 0; i < gt_q_count; ++i) {
        if (parse_q(gt_query_list[i], &(q[i]))) {
            fprintf(stderr, "in the %dth genotype query.\n", i+1);
            rc = 1;
            goto cleanup;
        }
    }

    // open WAH/GQT file
    wf = init_wahbm_file(wahbm_file_name);
    wf_is_open = 1;

    // open VID file
    vid_f = fopen(vid_file_name, "rb");
    if (!vid_f)
        err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name);

    vids = (uint32_t *) malloc(wf.num_fields*sizeof(uint32_t));
    if (!vids)
        err(EX_OSERR, "malloc error");

    fr = fread(vids, sizeof(uint32_t), wf.num_fields, vid_f);
    check_file_read(vid_file_name, vid_f, wf.num_fields, fr);

    fclose(vid_f);

    /* Number of 32-bit words needed to hold one bit per variant. */
    num_ints = (wf.num_fields + 32 - 1)/ 32;

    for (i = 0; i < gt_q_count; ++i) {
        uint32_t len_count_R;
        uint32_t *R = NULL;
        uint32_t n;
        /* 
         * Submit the population query to the PED database and get back both
         * the list of ids in R and the length of R in id_lens[i]
         */
        id_lens[i] = resolve_ind_query(&R,
                                      id_query_list[i],
                                      db_file_name);

        // Enforce that the offsets of the relevant samples is 
        // within the number of samples in the GQT index.
        if (id_lens[i] > wf.num_records) {
            fprintf(stderr, 
                    "ERROR: there are more samples in the PED database (%u) "
                    "that match this condition \nthan there are in the GQT "
                    "index (%u).  Perhaps your PED file is a superset of "
                    "the\nsamples in your VCF/BCF file?\n", 
                    (unsigned int) id_lens[i], 
                    (unsigned int) wf.num_records);
            free(R);
            rc = 1;
            goto cleanup;
        }

        uint32_t low_v = 0, high_v = 0;

        /*
         * q holds the parameters of each query, first determine the range of 
         * bitmaps to pull
         */
        if ( q[i].variant_op == p_maf ) {
            low_v = 1;
            high_v = 3;
        } else {
            /*
             * The range is [lowest selected genotype, highest selected
             * genotype + 1).
             */
            int g, lowest = -1, highest = -1;

            for (g = 0; g < 4; ++g) {
                if ( q[i].genotype_condition[g] == 1) {
                    if (lowest < 0)
                        lowest = g;
                    highest = g;
                }
            }

            /* Without any selected genotype there is no range to query. */
            if (lowest < 0) {
                fprintf(stderr,
                        "ERROR: no genotype is selected in the %dth "
                        "genotype query.\n",
                        i+1);
                free(R);
                rc = 1;
                goto cleanup;
            }

            low_v = (uint32_t) lowest;
            high_v = (uint32_t) highest + 1;
        }

        /*
         * The set of variants that are printed is stored in a mask for each
         * query, then those masks are combined to a final mask.  Each mask is
         * a 32-bit packed int, where each bit corresponds to one variant.  How
         * those bits are set depends on the filter the user specifies.
         *
         * If they simply ask for a count or percent, then there is no filter
         * and the mask is set to all 1s.
         *
         * If count is followed by a condition, then the count/pct is compared
         * to that condition and the bits are set for those that meet the
         * condition.
         *
         * If no function is used then we simply run the wahbm range query and
         * convert the wah results to packed ints for the mask
         *
         */

        /* User asks for a count, percent, or maf */
        if ( ( q[i].variant_op == p_count ) || 
             ( q[i].variant_op == p_pct ) ||
             ( q[i].variant_op == p_maf ) ) {

            if (q[i].variant_op == p_maf) {
#ifdef __AVX2__
            len_count_R = avx_sum_range_records_in_place_wahbm(wf,
                                                               R,
                                                               id_lens[i],
                                                               low_v,
                                                               high_v,
                                                               &(counts[i]));
#else
            len_count_R = sum_range_records_in_place_wahbm(wf,
                                                           R,
                                                           id_lens[i],
                                                           low_v,
                                                           high_v,
                                                           &(counts[i]));
                                                           
#endif
            } else {
#ifdef __AVX2__
                len_count_R = 
                    avx_count_range_records_in_place_wahbm(wf,
                                                           R,
                                                           id_lens[i],
                                                           low_v,
                                                           high_v,
                                                           &(counts[i]));
#else
                len_count_R = 
                    count_range_records_in_place_wahbm(wf,
                                                       R,
                                                       id_lens[i],
                                                       low_v,
                                                       high_v,
                                                       &(counts[i]));
#endif
            }

            /* Since the variants are in allele freq order, we need to copy
             * the resulting value to an array that is back in the original
             * order
             */
            mapped_counts[i] = (uint32_t *)calloc(len_count_R,
                                                  sizeof(uint32_t));
            if (!mapped_counts[i] && len_count_R > 0)
                err(EX_OSERR, "calloc error");

            for ( n = 0; n < len_count_R; ++n)
                mapped_counts[i][vids[n]] = counts[i][n];

            /* calloc: words past the last count must not hold garbage. */
            gt_mask[i] = (uint32_t *) calloc(num_ints, sizeof(uint32_t));
            if (!gt_mask[i])
                err(EX_OSERR, "malloc error");

            /* User specifies a condition */
            if ( q[i].op_condition != -1) { 

                /* Since we only find counts, when the user asks for a
                 * percent, just convert that back to the count that meets the
                 * percent condition
                 */
                float condition_value = q[i].condition_value;
                if (q[i].variant_op == p_pct) 
                    condition_value *= id_lens[i];
                else if (q[i].variant_op == p_maf)
                    condition_value *= id_lens[i]*2;


                /* Test to see if each count meets the condition */
                uint32_t v = 0, int_i = 0, bit_i = 0;
                for ( n = 0; n < len_count_R; ++n) {
                    if ( query_cmp(counts[i][n],
                                   q[i].op_condition,
                                   condition_value) ) {
                        /* unsigned literal: bit 31 must be reachable */
                        v |= 1u << (31 - bit_i);
                    }

                    bit_i += 1;
                    if ( bit_i == 32 ) {
                        gt_mask[i][int_i] = v;
                        int_i += 1;
                        bit_i = 0;
                        v = 0;
                    }
                }
            
                if ( bit_i > 0)
                    gt_mask[i][int_i] = v;
            } else {
                // if no op is set then let everything pass
                for (w = 0; w < num_ints; ++w)
                    gt_mask[i][w] = ~(uint32_t)0; // set all the bits to 1
            }
        /* User only gives genotype filters, no function/condition */
        } else {
            uint32_t *gt_R;
            uint32_t len_wf_R = range_records_in_place_wahbm(wf,
                                                                 R,
                                                                 id_lens[i],
                                                                 low_v,
                                                                 high_v,
                                                                 &gt_R);
            (void) wah_to_ints(gt_R,len_wf_R,&(gt_mask[i]));
            free(gt_R);
        }
        free(R);
    }

    final_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t));
    if (!final_mask && num_ints > 0)
        err(EX_OSERR, "calloc error");

    // combine all of the masks to see what we need to print
    for (w = 0; w < num_ints; ++w) {
        final_mask[w] = ~(uint32_t)0;
        for (j = 0; j < gt_q_count; ++j)
            final_mask[w] &= gt_mask[j][w];
    }

    if (c_is_set == 1) {
        uint32_t masked_vid_count = 0;

        for (w = 0; w < num_ints; ++w) 
            masked_vid_count += popcount(final_mask[w]);

        /* Padding bits of the last word may be set; cap at the real total. */
        if (masked_vid_count <= wf.num_fields)
            printf("%u\n", masked_vid_count);
        else
            printf("%u\n", wf.num_fields);

    } else if ((v_is_set == 1) && (s_is_set == 1)) {
        get_bcf_query_result(final_mask,
                             num_ints, 
                             q,
                             id_query_list,
                             id_lens,
                             gt_q_count,
                             wf.num_fields,
                             vid_file_name,
                             src_bcf_file_name,
                             bcf_output);
    } else if (b_is_set == 1){

        mapped_mask = (uint32_t *) calloc(num_ints,sizeof(uint32_t));
        if (!mapped_mask && num_ints > 0)
            err(EX_OSERR, "calloc error");

        /*
         * Walk the set bits of final_mask from the most significant bit
         * down, translate each position through vids and set the
         * corresponding bit in mapped_mask.  Positions at or beyond
         * num_fields are padding and end the walk.
         */
        int past_last_variant = 0;
        for (w = 0; w < num_ints && !past_last_variant; ++w) {
            uint32_t v = final_mask[w];

            while (v != 0) {
                uint32_t leading_zeros = __builtin_clz(v);
                uint32_t position = w*32 + leading_zeros;

                if (position >= wf.num_fields) {
                    past_last_variant = 1;
                    break;
                }

                uint32_t hit = vids[position];

                mapped_mask[hit/32] |= 1u << (31 - hit%32);
                v &= ~(1u << (31 - leading_zeros));
            }
        }

        print_query_result(mapped_mask,
                           num_ints,
                           vids,
                           q,
                           mapped_counts,
                           id_lens,
                           gt_q_count,
                           wf.num_fields,
                           bim_file_name,
                           full_cmd);
    }

cleanup:
    /* Unused slots are NULL, so every slot can be freed unconditionally. */
    for (j = 0; j < gt_q_count; ++j) {
        free(gt_mask[j]);
        free(counts[j]);
        free(mapped_counts[j]);
    }

    free(mapped_mask);
    free(final_mask);
    free(vids);

    if (wf_is_open)
        destroy_wahbm_file(&wf);

    free(auto_db_file_name);
    free(auto_vid_file_name);
    free(auto_bim_file_name);

    return rc;
}