示例#1
0
/*
 * Release a synced-reader set: every per-file reader, the shared sequence and
 * sample tables, the optional targets file, and finally the container itself.
 *
 * files: reader set to destroy; must not be NULL and must not be used after
 *        this call.
 */
void bcf_sr_destroy(readers_t *files)
{
    int i, j;

    if ( !files->nreaders )
    {
        // nreaders is zero: the early exit is kept, but the container is now
        // released here as well. Previously it was leaked on this path even
        // though the normal path below ends with free(files).
        free(files);
        return;
    }

    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];

        if ( reader->tbx ) tbx_destroy(reader->tbx);
        if ( reader->bcf ) hts_idx_destroy(reader->bcf);
        bcf_hdr_destroy(reader->header);
        hts_close(reader->file);
        if ( reader->itr ) tbx_itr_destroy(reader->itr);

        // All mbuffer slots are destroyed, not only the ones currently in use.
        for (j=0; j<reader->mbuffer; j++)
            bcf_destroy1(reader->buffer[j]);
        free(reader->buffer);
        free(reader->samples);      // free(NULL) is a no-op, no guard needed
    }
    free(files->readers);
    free(files->seqs);

    for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
    free(files->samples);

    if ( files->targets )
    {
        if ( files->targets->itr ) tbx_itr_destroy(files->targets->itr);
        tbx_destroy(files->targets->tbx);
        if ( files->targets->line.m ) free(files->targets->line.s);
        hts_close(files->targets->file);
        free(files->targets->seq_names);
        free(files->targets);
    }

    if ( files->tmps.m ) free(files->tmps.s);
    free(files);
}
示例#2
0
文件: tabix.c 项目: Illumina/akt
static int query_chroms(char *fname)
{
    const char **seq;
    int i, nseq, ftype = file_type(fname);
    if ( ftype & IS_TXT || !ftype )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
        seq = tbx_seqnames(tbx, &nseq);
        for (i=0; i<nseq; i++)
            printf("%s\n", seq[i]);
        free(seq);
        tbx_destroy(tbx);
    }
    else if ( ftype==IS_BCF )
    {
        htsFile *fp = hts_open(fname,"r");
        if ( !fp ) error("Could not read %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        hts_close(fp);
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        seq = bcf_index_seqnames(idx, hdr, &nseq);
        for (i=0; i<nseq; i++)
            printf("%s\n", seq[i]);
        free(seq);
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( ftype==IS_BAM )   // todo: BAM
        error("BAM: todo\n");
    return 0;
}
示例#3
0
// Releases whatever the streamer still holds, in this order: the tabix
// iterator, the tabix index, the open file handle, and the line buffer.
// Members that are null/unset are skipped.
hts_streamer::
~hts_streamer()
{
    if (_titr)
    {
        tbx_itr_destroy(_titr);
    }

    if (_tidx)
    {
        tbx_destroy(_tidx);
    }

    if (_hfp)
    {
        hts_close(_hfp);
    }

    if (_kstr.s)
    {
        free(_kstr.s);
    }
}
/*
 * Release everything owned by a single reader: its tabix/BCF index handles,
 * header, open file, iterator, record buffer, sample list and filter ids.
 * The reader structure itself is not freed here.
 */
static void bcf_sr_destroy1(bcf_sr_t *reader)
{
    int slot;

    if ( reader->tbx_idx )
        tbx_destroy(reader->tbx_idx);
    if ( reader->bcf_idx )
        hts_idx_destroy(reader->bcf_idx);

    bcf_hdr_destroy(reader->header);
    hts_close(reader->file);

    if ( reader->itr )
        tbx_itr_destroy(reader->itr);

    /* Every one of the mbuffer slots is destroyed before the array goes. */
    for (slot = 0; slot < reader->mbuffer; slot++)
    {
        bcf_destroy1(reader->buffer[slot]);
    }
    free(reader->buffer);

    free(reader->samples);
    free(reader->filter_ids);
}
示例#5
0
/*
 * Close an annotation BED file and release its index, column descriptors,
 * buffered records and file name. The struct itself is not freed.
 * Always returns 0.
 */
int beds_file_destroy(struct beds_anno_file *file)
{
    int k;

    hts_close(file->fp);
    tbx_destroy(file->idx);

    /* Column descriptors: each header key first, then the array. */
    for ( k = 0; k < file->n_cols; k++ )
    {
        free(file->cols[k].hdr_key);
    }
    free(file->cols);

    /* Buffered records: max slots are destroyed. */
    for ( k = 0; k < file->max; k++ )
    {
        beds_anno_tsv_destroy(file->buffer[k]);
    }

    if ( file->fname != NULL )
    {
        free(file->fname);
    }

    /* The buffer array is released only when max is non-zero. */
    if ( file->max != 0 )
    {
        free(file->buffer);
    }

    return 0;
}
示例#6
0
// Scan a tabix-indexed phenotype BED file and record, for every phenotype that
// passes the user filter, its id, chromosome, start (+1), end, optional group
// and strand. No quantification values are read here.
//
// Columns used (0-based): 0 = chromosome, 1 = start, 2 = end, 3 = phenotype id,
// 4 = group id, 5 = strand ("-" marks the negative strand).
//
// fbed: path of the BED file; its tabix index must be loadable as well.
//
// Failures (unopenable file/index, missing header, too few columns) are
// reported through vrb.error(); an empty result goes through vrb.leave().
void cis_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header
	//The read must succeed and yield a non-empty line starting with the meta
	//character; "<= 0" matches the check used by the other BED scanners.
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Scan file
	vector < string > tokens;
	while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
		if (str.l && str.s[0] != tbx->conf.meta_char) {
			stb.split(string(str.s), tokens);
			if (tokens.size() < 5) vrb.error("Incorrect number of columns!");
			//Without grouping the filter applies to the phenotype id, otherwise to the group id
			if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
				phenotype_id.push_back(tokens[3]);
				phenotype_chr.push_back(tokens[0]);
				phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
				phenotype_end.push_back(atoi(tokens[2].c_str()));
				if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
				if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
				//Only 5 columns are guaranteed by the check above, so the strand
				//column is read only when present; a missing strand counts as
				//"not negative". The previous unconditional tokens[5] access
				//read past the end of the vector for 5-column lines.
				phenotype_neg.push_back(tokens.size() > 5 && tokens[5] == "-");
				if (phenotype_neg.back()) n_negativeStrd ++;
				n_includedP++;
			} else n_excludedP ++;
		}
	}

	//Finalize & verbose
	free(str.s);	//line buffer grown by hts_getline; was never released
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
/*
 * Destroy a regions object: file name, tabix iterator/index, open file,
 * allele buffers, line buffer, per-sequence region lists, the name table,
 * the name hash, and finally the object itself.
 */
void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
{
    free(reg->fname);

    if ( reg->itr != NULL )
        tbx_itr_destroy(reg->itr);
    if ( reg->tbx != NULL )
        tbx_destroy(reg->tbx);
    if ( reg->file != NULL )
        hts_close(reg->file);

    /* free(NULL) is a no-op, so these need no guard */
    free(reg->als);
    free(reg->als_str.s);
    free(reg->line.s);

    if ( reg->regs != NULL )
    {
        /* Individual names are released only when regs is set: per the
         * original note, only in-memory names are owned here, whereas names
         * coming from the tabix index are const. */
        int k;
        for (k = 0; k < reg->nseqs; k++)
        {
            free(reg->seq_names[k]);
            free(reg->regs[k].regs);
        }
    }

    free(reg->regs);
    free(reg->seq_names);
    khash_str2int_destroy(reg->seq_hash);
    free(reg);
}
示例#8
0
/*
 * Entry point of the "vcfset" sub-command: set operations on VCF files.
 *
 * Variants are streamed from the first VCF (--vcf1). Depending on --action:
 *   intersect  - a variant is written when a matching variant exists in --vcf2
 *   complement - a variant is written when no matching variant exists in --vcf2
 *   concat     - variants of --vcf1 and of all extra positional files are
 *                written one after the other (no --vcf2 allowed)
 * The second VCF is accessed through its tabix index. A "match" requires the
 * same position and, unless --only-pos is given, identical ref and alt.
 *
 * argv[0] is expected to be the program name and argv[1] the sub-command;
 * option parsing therefore starts at argv+1.
 *
 * Returns 0 on success (and for --help), 1 on usage/open errors and -1 on
 * some parse errors.
 *
 * NOTE(review): several fatal error paths still return without closing the
 * already opened VCF handles; only the string arguments are released there.
 */
int
main_vcfset(int argc, char *argv[])
{
     vcfset_conf_t vcfset_conf;
     char *vcf_header = NULL;
     int rc = 0;
     char *vcf_in1, *vcf_in2, *vcf_out;
     long int num_vars_vcf1;
     long int num_vars_vcf1_ign, num_vars_out;
     static int only_passed = 0;
     static int only_pos = 0;
     static int only_snvs = 0;
     static int only_indels = 0;
     static int count_only = 0;
     tbx_t *vcf2_tbx = NULL; /* index for second vcf file */
     htsFile *vcf2_hts = NULL;
     char *add_info_field = NULL;
     int vcf_concat_findex = 0;
     vcf_in1 = vcf_in2 = vcf_out = NULL;
     num_vars_vcf1 = 0;
     num_vars_vcf1_ign = num_vars_out = 0;

     /* default vcfset options */
     memset(&vcfset_conf, 0, sizeof(vcfset_conf_t));
     /* vcfset_conf.vcf_in1 = NULL; */
     /* vcfset_conf.vcf_in2 = NULL; */
     /* vcfset_conf.vcf_out = stdout;*/


    /* keep in sync with long_opts_str and usage 
     *
     * getopt is a pain in the whole when it comes to syncing of long
     * and short args and usage. check out gopt, libcfu...
     */
    while (1) {
         int c;
         static struct option long_opts[] = {
              /* see usage sync */
              {"help", no_argument, NULL, 'h'},
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"only-passed", no_argument, &only_passed, 1},
              {"only-pos", no_argument, &only_pos, 1},
              {"only-indels", no_argument, &only_indels, 1},
              {"only-snvs", no_argument, &only_snvs, 1},
              {"count-only", no_argument, &count_only, 1},

              {"vcf1", required_argument, NULL, '1'},
              {"vcf2", required_argument, NULL, '2'},
              {"vcfout", required_argument, NULL, 'o'},
              {"action", required_argument, NULL, 'a'},
              {"add-info", required_argument, NULL, 'I'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "h1:2:o:a:I:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h': 
              usage(& vcfset_conf); 
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 0;

         case '1': 
              vcf_in1 = strdup(optarg);
              break;

         case '2': 
              vcf_in2 = strdup(optarg);
              break;

         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        free(vcf_in1); free(vcf_in2);
                        return 1;
                   }
              }
              vcf_out = strdup(optarg);
              break;

         case 'a': 
              if (0 == strcmp(optarg, "intersect")) {
                   vcfset_conf.vcf_setop = SETOP_INTERSECT;

              } else if (0 == strcmp(optarg, "complement")) {
                   vcfset_conf.vcf_setop = SETOP_COMPLEMENT;

              } else if (0 == strcmp(optarg, "concat")) {
                   vcfset_conf.vcf_setop = SETOP_CONCAT;

              } else {
                   LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg);
                   free(vcf_in1); free(vcf_in2); free(vcf_out);
                   return 1;
              }
              break;

         case 'I': 
              add_info_field = strdup(optarg);
              break;

         case '?': 
              LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n"); 
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;

         default:
              break;
         }
    }

    vcfset_conf.only_passed = only_passed;
    vcfset_conf.only_pos = only_pos;
    vcfset_conf.only_snvs = only_snvs;
    vcfset_conf.only_indels = only_indels;

    if (vcfset_conf.only_indels && vcfset_conf.only_snvs) {
         LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account");
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }

    /* Leftover positional arguments are only legal (and then required)
     * for concat, where they name the additional files to append. */
    if (0 != argc - optind - 1) {
         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              vcf_concat_findex = optind;
         } else {
              LOG_FATAL("%s\n", "Unrecognized arguments found\n");
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;
         }
    } else {
         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              LOG_FATAL("%s\n", "No extra files for concat given\n");
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;
         }
    }
#if 0
    int i; for (i=optind+1; i<argc; i++) {
         LOG_FIXME("argv[%d]=%s\n", i, argv[i]);
    }
#endif

    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& vcfset_conf);
        free(vcf_in1); free(vcf_in2); free(vcf_out);
        return 1;
    }

    if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) {
         LOG_FATAL("%s\n", "No set operation specified");
         usage(& vcfset_conf);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }

    if  (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) {
         LOG_FATAL("%s\n\n", "At least one vcf input file not specified");
         usage(& vcfset_conf);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }
    if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) {
         LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2");
         usage(& vcfset_conf);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;         
    }

    if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, 
                      HAS_GZIP_EXT(vcf_in1), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in1);
         free(vcf_in1); free(vcf_in2); free(vcf_out);
         return 1;
    }

    if (vcf_in2) {
         vcf2_hts = hts_open(vcf_in2, "r");
         if (!vcf2_hts) {
              LOG_FATAL("Couldn't load %s\n", vcf_in2);
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;
         }
         vcf2_tbx = tbx_index_load(vcf_in2);
         if (!vcf2_tbx) {
              LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2);
              hts_close(vcf2_hts);
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;
         }
    }

    /* vcf_out default if not set: stdout==- */
    if (! vcf_out) {
         vcf_out = strdup("-");
         if (! vcf_out) {
              LOG_FATAL("%s\n", "Couldn't allocate memory for output file name");
              free(vcf_in1); free(vcf_in2); free(add_info_field);
              return 1;
         }
    }

    if (! count_only) {
         if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, 
                           HAS_GZIP_EXT(vcf_out), 'w')) {
              LOG_ERROR("Couldn't open %s\n", vcf_out);
              free(vcf_in1); free(vcf_in2); free(vcf_out);
              return 1;
         }
    }

    /* use meta-data/header of vcf_in1 for output
     */
    LOG_DEBUG("Getting header from %s\n", vcf_in1);
    if (0 !=  vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) {
         LOG_WARN("%s\n", "vcf_parse_header() failed");
         if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              return -1;
         }
    } else {
         if (! count_only) {
              /* vcf_write_header would write *default* header */
              vcf_write_header(& vcfset_conf.vcf_out, vcf_header);
         }
         free(vcf_header);
    }

    
    /* parse first vcf file
     */
    LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1);
    while (1) {
         var_t *var1 = NULL;
         /* Per-variant parse status. Deliberately distinct from the
          * function-level rc (which it used to shadow): a non-zero value
          * here also signals plain end-of-input and must not leak into
          * the exit code. */
         int parse_rc;
         int is_indel;
         kstring_t var2_kstr = {0, 0, 0};
         hts_itr_t *var2_itr = NULL;
         char regbuf[1024];
         int var2_match = 0;

         vcf_new_var(&var1);
         parse_rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1);
         if (parse_rc) {
              /* NOTE(review): only the struct is freed here, unlike the
               * vcf_free_var() calls below - confirm nothing is allocated
               * inside var1 when parsing fails. */
              free(var1);
              
              if (vcfset_conf.vcf_setop != SETOP_CONCAT) {
                   break;
              } else {
                   vcf_concat_findex++;
                   if (vcf_concat_findex==argc) {
                        break;
                   }
                   /* set vcf1 up anew and simply continue as if nothing happened 
                    */
                   vcf_file_close(& vcfset_conf.vcf_in1);
                   free(vcf_in1);

                   vcf_in1 = strdup(argv[vcf_concat_findex]);
                   LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1);
                   if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, 
                                     HAS_GZIP_EXT(vcf_in1), 'r')) {
                        LOG_ERROR("Couldn't open %s\n", vcf_in1);
                        free(vcf_in1); free(vcf_in2); free(vcf_out);
                        return 1;
                   }
                   if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) {
                        LOG_WARN("skip header failed for %s\n", vcf_in1);
                   }
                   continue;
              }
         }

         /* Type filter. The variant was parsed successfully, so it is
          * released with vcf_free_var() like every other parsed variant
          * below (a bare free() would drop only the struct). */
         is_indel = vcf_var_is_indel(var1);
         if ((vcfset_conf.only_snvs && is_indel)
             || (vcfset_conf.only_indels && ! is_indel)) {
              vcf_free_var(& var1);
              continue;
         }

         if (! vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) {
              LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1");
              return -1;
         }
         if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) {
#ifdef TRACE
              LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif
              num_vars_vcf1_ign += 1;
              vcf_free_var(& var1);
              continue;
         }
         if (add_info_field) {
              vcf_var_add_to_info(var1, add_info_field);
         }
         num_vars_vcf1 += 1;
#ifdef TRACE
         LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif

         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              num_vars_out += 1;
              if (! count_only) {
                   vcf_write_var(& vcfset_conf.vcf_out, var1);
              }
              vcf_free_var(& var1);
              /* skip comparison against vcf2 */
              continue;
         }

         /* use index access to vcf2 */
         snprintf(regbuf, 1024, "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1);
         var2_itr = tbx_itr_querys(vcf2_tbx, regbuf);
         if (! var2_itr) {
              var2_match = 0;
         } else {
              var2_match = 0;
              while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) {
                   var_t *var2 = NULL;
                   int var2_is_indel = 0;

                   vcf_new_var(&var2);
                   parse_rc = vcf_parse_var_from_line(var2_kstr.s, var2);
                   /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", 
                             var1->pos+1, var1->ref, var1->alt,
                             var2->pos+1, var2->ref, var2->alt, regbuf); */
                   if (parse_rc) {
                        LOG_FATAL("%s\n", "Error while parsing variant returned from tabix");
                        return -1;
                   }

                   var2_is_indel = vcf_var_is_indel(var2);

                   /* iterator returns anything overlapping with that 
                    * position, i.e. this also includes up/downstream
                    * indels, so make sure actual position matches */
                   if (var1->pos != var2->pos) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_snvs && var2_is_indel) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_indels && ! var2_is_indel) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_pos) {
#ifdef TRACE
                        LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                        var2_match = 1;

                   } else {
                        if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) {
#ifdef TRACE
                             LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                             var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */                             
                        }
                   }
                   vcf_free_var(&var2);
                   if (var2_match) {
                        break;/* no need to continue */
                   }
              }
         }

         if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) {
              /* relative complement : elements in A but not B */
              if (!var2_match) {
                   num_vars_out += 1;
                   if (! count_only) {
                        vcf_write_var(& vcfset_conf.vcf_out, var1);
                   }
              }
         } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) {
              if (var2_match) {
                   num_vars_out += 1;
                   if (! count_only) {
                        vcf_write_var(& vcfset_conf.vcf_out, var1);
                   }
              }

         } else {
              LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop);
              return 1;
         }

         vcf_free_var(& var1);
         tbx_itr_destroy(var2_itr);
         /* The line buffer is declared per iteration and filled by
          * tbx_itr_next(); it was never released, leaking once per
          * vcf1 variant. */
         free(var2_kstr.s);
    }/* while (1) */

    vcf_file_close(& vcfset_conf.vcf_in1);
    if (vcf_in2) {
         hts_close(vcf2_hts);
         tbx_destroy(vcf2_tbx);
    }
    /* counters are long int, hence %ld (was %d) */
    LOG_VERBOSE("Parsed %ld variants from 1st vcf file (ignoring %ld non-passed of those)\n", 
                num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign);
    LOG_VERBOSE("Wrote %ld variants to output\n", 
                num_vars_out);
    if (! count_only) {
         vcf_file_close(& vcfset_conf.vcf_out);
    }

    if (0==rc) {
         if (count_only) {
              printf("%ld\n", num_vars_out);
         }

         LOG_VERBOSE("%s\n", "Successful exit.");
    }

    free(vcf_in1);
    free(vcf_in2);
    free(vcf_out);
    free(add_info_field);


    return rc;
}
示例#9
0
/*
 * Print per-contig record counts taken from the index of a VCF/BCF file.
 *
 * fname: path of the indexed VCF (tabix index) or BCF (BCF index) file.
 * stats: bit 2 set  -> print only the total number of records;
 *        bit 2 clear -> print "name <TAB> length <TAB> records" for every
 *        contig with at least one record; the length comes from the header's
 *        contig line, or "." when it has none.
 *
 * Returns 0 on success and 1 on any failure (unreadable file/header/index,
 * unknown format, or an index that holds no count metadata although the file
 * has records). Messages go to stderr.
 *
 * All handles are now released on the error paths as well; previously every
 * early return after hts_open() leaked them.
 */
int vcf_index_stats(char *fname, int stats)
{
    char *fn_out = NULL;
    FILE *out = fn_out ? fopen(fn_out, "w") : stdout;

    const char **seq = NULL;
    int i, nseq = 0;
    int ret = 1;                // pessimistic default, cleared once everything succeeded
    uint64_t sum = 0;
    tbx_t *tbx = NULL;
    hts_idx_t *idx = NULL;
    bcf_hdr_t *hdr = NULL;
    bcf1_t *rec = NULL;

    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); goto cleanup; }
    hdr = bcf_hdr_read(fp);
    if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); goto cleanup; }

    if ( hts_get_format(fp)->format==vcf )
    {
        tbx = tbx_index_load(fname);
        if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); goto cleanup; }
    }
    else if ( hts_get_format(fp)->format==bcf )
    {
        idx = bcf_index_load(fname);
        if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); goto cleanup; }
    }
    else
    {
        fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
        goto cleanup;
    }

    seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
    for (i=0; i<nseq; i++)
    {
        uint64_t records, v;
        hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
        sum+=records;
        if (stats&2 || !records) continue;
        bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
        int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
        fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
    }
    if (!sum)
    {
        // No counts found.
        // Is this because index version has no stored count data, or no records?
        // Reading one record tells the two cases apart.
        rec = bcf_init1();
        if (bcf_read1(fp, hdr, rec) >= 0)
        {
            fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
            goto cleanup;
        }
    }
    if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
    ret = 0;

cleanup:
    if (rec) bcf_destroy1(rec);
    free(seq);      // only the array; the names are not freed here
    // As before, stdout is closed on success only. A private output file
    // (never the case while fn_out stays NULL) is closed on every path.
    if (ret == 0 || out != stdout) fclose(out);
    if (fp) hts_close(fp);
    if (hdr) bcf_hdr_destroy(hdr);
    if (tbx)
        tbx_destroy(tbx);
    if (idx)
        hts_idx_destroy(idx);
    return ret;
}
示例#10
0
// Scan a tabix-indexed genotype BED file and register every variant that is
// not known yet and passes the user filter: id, chromosome, start (+1), end
// and its index in genotype_id_to_idx. No genotype values are read.
//
// When regionGenotype.chr is not "NA" only that region is visited through the
// tabix index, otherwise the whole file is read line by line.
//
// fbed: path of the BED file; its tabix index must be loadable as well.
//
// Failures are reported through vrb.error(); finding no new variant goes
// through vrb.leave().
void union_data::scanGenotypesBED(string fbed) {
	int n_includedG = 0;
	int n_excludedG_user = 0;

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Read genotype data
	vector < string > tokens;
	unsigned int linecount = 0;
	//Jump to interesting region
	if (regionGenotype.chr != "NA"){
		hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
		vrb.bullet("target region [" + regionGenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
			//Variants already registered are skipped silently
			if (genotype_id_to_idx.count(tokens[3])) continue;
			if (filter_genotype.check(tokens[3])) {
				genotype_id.push_back(tokens[3]);
				genotype_chr.push_back(tokens[0]);
				genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
				genotype_end.push_back(atoi(tokens[2].c_str()));
				pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
				genotype_id_to_idx.insert(temp);
				n_includedG++;
			} else n_excludedG_user ++;
		}
		tbx_itr_destroy(itr);
	}else{
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			//Empty lines and lines starting with the meta character are ignored
			if (str.l && str.s[0] != tbx->conf.meta_char) {
				if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
				if (genotype_id_to_idx.count(tokens[3])) continue;
				if (filter_genotype.check(tokens[3])) {
					genotype_id.push_back(tokens[3]);
					genotype_chr.push_back(tokens[0]);
					genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
					genotype_end.push_back(atoi(tokens[2].c_str()));
					pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
					genotype_id_to_idx.insert(temp);
					n_includedG++;
				} else n_excludedG_user ++;
			}
		}
	}

	//Finalize & verbose
	free(str.s);	//line buffer grown by hts_getline/tbx_itr_next; was never released
	tbx_destroy(tbx);
	genotype_count += n_includedG;
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	vrb.bullet(stb.str(n_includedG) + " new variants included");
	if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	if (n_includedG  == 0) vrb.leave("Cannot find variants in target region!");
}
示例#11
0
// Load the genotype values of one region from a tabix-indexed BED file.
//
// All previously stored genotype data is discarded first. The header line
// (columns 7 onwards are sample ids) is mapped onto the known samples via
// filter_sample and findSample(); then every line of `region` whose id passes
// filter_genotype is stored with its chromosome, start (+1), end and one float
// per sample ("NA" becomes bcf_float_missing).
//
// fbed:   path of the BED file; its tabix index must be loadable as well.
// region: region string handed to tbx_itr_querys().
//
// Failures are reported through vrb.error(). A region that cannot be queried
// is not checked for here (the check is commented out below).
void union_data::readGenotypesBED(string fbed,string region) {
	int n_includedG = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	int n_excludedS = 0;
	int n_missingS = 0;
	vector < int > mappingS;
	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();
	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	//mappingS[k] is the internal sample index of BED column 6+k, or -1 when the
	//sample is excluded by the user or unknown to findSample()
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t i0 = 6 ; i0 < tokens.size() ; i0 ++) {
		string sid = tokens[i0];
		if (filter_sample.check(sid)) {
			mappingS.push_back(findSample(sid));
			if (mappingS.back() >= 0) n_includedS ++;
			else n_missingS ++;
		} else {
			mappingS.push_back(-1);
			n_excludedS ++;
		}
	}
	//vrb.bullet(stb.str(n_includedS) + " samples included");
	//if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
	//if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
	//if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");

	unsigned int linecount = 0;

	//Jump to interesting region

	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionGenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			genotype_end.push_back(atoi(tokens[2].c_str()));
			genotype_val.push_back(vector < float > (sample_count, 0.0));
			//Columns beyond those named in the header have no mapping entry and
			//are ignored; indexing mappingS with them used to read out of bounds
			for (size_t t = 6 ; t < tokens.size() && t - 6 < mappingS.size() ; t ++) {
				if (mappingS[t-6] >= 0) {
					if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing;
					else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
				}
			}
			pair < string, int > temp (tokens[3],n_includedG);
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	}
	tbx_itr_destroy(itr);


	//Finalize & verbose
	free(str.s);	//line buffer grown by hts_getline/tbx_itr_next; was never released
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
}
示例#12
0
int main(int argc, char* argv[]) {
    namespace po = boost::program_options;

    std::string file;
    std::string output;

    try
    {
        // Declare the supported options.
        po::options_description desc("Allowed options");
        desc.add_options()
            ("help,h", "produce help message")
            ("version", "Show version")            
            ("input-file", po::value< std::string >(), "The input files")
            ("output-file", po::value<std::string>(), "The output file name.")
        ;

        po::positional_options_description popts;
        popts.add("input-file", 1);
        popts.add("output-file", 1);

        po::options_description cmdline_options;
        cmdline_options
            .add(desc)
        ;

        po::variables_map vm;
        
        po::store(po::command_line_parser(argc, argv).
                  options(cmdline_options).positional(popts).run(), vm);
        po::notify(vm); 

        if (vm.count("version")) 
        {
            std::cout << "vcfhdr2json version " << HAPLOTYPES_VERSION << "\n";
            return 0;
        }

        if (vm.count("help")) 
        {
            std::cout << desc << "\n";
            return 1;
        }

        if (vm.count("input-file"))
        {
            file = vm["input-file"].as< std::string > ();
        }

        if (vm.count("output-file"))
        {
            output = vm["output-file"].as< std::string >();
        }

        if(file.size() == 0)
        {
            std::cerr << "Please specify an input file.\n";
            return 1;
        }

        if (output == "")
        {
            std::cerr << "Please specify an output file.\n";
            return 1; 
        }
    } 
    catch (po::error & e)
    {
        std::cerr << e.what() << "\n";
        return 1;
    }

    try
    {
        Json::StyledWriter writer;
        htsFile * fp = bcf_open(file.c_str(), "r");
        bcf_hdr_t * hdr = bcf_hdr_read(fp);

        Json::Value root;
        Json::Value a;
        for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i)
        {
            a.append(hdr->samples[i]);
        }
        root["samples"] = a;

        Json::Value fields;
        for (int i = 0; i < hdr->nhrec; i++)
        {
            Json::Value field;
            field["key"] = hdr->hrec[i]->key;
            if (!hdr->hrec[i]->value)
            {
                Json::Value values;

                for (int j = 0; j < hdr->hrec[i]->nkeys; j++)
                {
                    values[hdr->hrec[i]->keys[j]] = hdr->hrec[i]->vals[j];
                }
                field["values"] = values;
            }
            else
            {
                field["value"] = hdr->hrec[i]->value;
            }
            fields.append(field);
        }
        root["fields"] = fields;

        tbx_t * tbx_idx = tbx_index_load(file.c_str());
        if ( !tbx_idx )
        {
            hts_idx_t * csi_idx = bcf_index_load(file.c_str());
            if(!csi_idx)
            {
                root["tabix"] = Json::Value::null;
            }
            else
            {
                root["tabix"] = Json::Value();
                root["tabix"]["chromosomes"] = Json::Value();

                int count = 0;
                const char ** tbx_names = bcf_index_seqnames(csi_idx, hdr, &count);

                for (int i = 0; i < count; ++i)
                {
                    root["tabix"]["chromosomes"].append(tbx_names[i]);
                }
                free(tbx_names);
                hts_idx_destroy(csi_idx);
            }
        }
        else
        {
            root["tabix"] = Json::Value();
            root["tabix"]["chromosomes"] = Json::Value();

            int count = 0;
            const char ** tbx_names = tbx_seqnames(tbx_idx, &count);

            for (int i = 0; i < count; ++i)
            {
                root["tabix"]["chromosomes"].append(tbx_names[i]);
            }

            free(tbx_names);
            tbx_destroy(tbx_idx);
        }


        std::ofstream out(output.c_str());
        out << writer.write(root);

        bcf_close(fp);
        bcf_hdr_destroy(hdr);
    } 
    catch(std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    return 0;
}
示例#13
0
文件: vcf.c 项目: goshng/cocoa
/**
 * Read the header of a VCF/BCF file.
 *
 * For binary input (fp->is_bin) the call is forwarded to bcf_hdr_read().
 * For text input, header lines are accumulated up to and including the
 * first line that starts with a single '#', then parsed. If fp->fn_aux is
 * set, "##contig" lines built from that file (name followed by length) are
 * inserted just before that line. Finally, if a tabix index can be loaded
 * for fp->fn, any sequence it lists that has no contig record in the header
 * is added with length "-1".
 *
 * @param fp  open file handle
 * @return    the parsed header; 0 if a non-header line is met before the
 *            sample line, if the auxiliary file cannot be opened, or if no
 *            header text could be read at all.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
	kstring_t txt, *s = &fp->line;
	bcf_hdr_t *h;

	if (fp->is_bin) return bcf_hdr_read((BGZF*)fp->fp);

	h = bcf_hdr_init();
	if (!h) return 0;
	txt.l = txt.m = 0; txt.s = 0;
	while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
		if (s->l == 0) continue;
		if (s->s[0] != '#') {
			if (hts_verbose >= 2)
				fprintf(stderr, "[E::%s] no sample line\n", __func__);
			free(txt.s);
			bcf_hdr_destroy(h);
			return 0;
		}
		if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
			int dret;
			gzFile f;
			kstream_t *ks;
			kstring_t tmp;
			tmp.l = tmp.m = 0; tmp.s = 0;
			f = gzopen(fp->fn_aux, "r");
			if (!f) {
				// Previously a NULL handle was passed on to ks_init()/gzclose().
				if (hts_verbose >= 2)
					fprintf(stderr, "[E::%s] fail to open '%s'\n", __func__, fp->fn_aux);
				free(txt.s);
				bcf_hdr_destroy(h);
				return 0;
			}
			ks = ks_init(f);
			// NOTE(review): delimiter 0 is presumably kseq's whitespace separator - confirm.
			while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
				int c;
				kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
				ks_getuntil(ks, 0, &tmp, &dret);
				kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
				kputsn(">\n", 2, &txt);
				if (dret != '\n')
					while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
			}
			free(tmp.s);
			ks_destroy(ks);
			gzclose(f);
		}
		kputsn(s->s, s->l, &txt);
		if (s->s[1] != '#') break; // the "#CHROM ..." line ends the header
		kputc('\n', &txt);
	}
	if (!txt.s) {
		// Empty input or only blank lines: there is nothing to parse.
		if (hts_verbose >= 2)
			fprintf(stderr, "[E::%s] could not read the header\n", __func__);
		bcf_hdr_destroy(h);
		return 0;
	}
	h->l_text = txt.l + 1; // including NULL
	h->text = txt.s;
	bcf_hdr_parse(h);

	// check tabix index, are all contigs listed in the header? add the missing ones
	tbx_t *idx = tbx_index_load(fp->fn);
	if ( idx )
	{
		int i, n, need_sync = 0;
		const char **names = tbx_seqnames(idx, &n);
		for (i=0; i<n; i++)
		{
			bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
			if ( hrec ) continue;
			hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
			if ( !hrec ) break; // out of memory: keep the header as parsed so far
			hrec->key = strdup("contig");
			bcf_hrec_add_key(hrec, "ID", strlen("ID"));
			bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
			bcf_hrec_add_key(hrec, "length", strlen("length"));
			bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
			bcf_hdr_add_hrec(h, hrec);
			need_sync = 1;
		}
		free(names);
		tbx_destroy(idx);
		if ( need_sync )
		{
			bcf_hdr_sync(h);
			bcf_hdr_fmt_text(h);
		}
	}
	return h;
}
示例#14
0
/**
 * tabix workhorse function.
 *
 * Apache content handler for GET requests on ".gz" files that have a ".tbi"
 * companion. Query parameters read here:
 *   format   - "xml", "json"/"jsonp" (with optional "callback"), "html";
 *              any other value selects the plain handler; absent => DECLINED
 *   limit    - maximum number of records to emit (-1 means no limit)
 *   region   - optional region string passed to tbx_itr_querys()
 *
 * The header lines (those starting with the index's meta character) are
 * emitted first, then the records: either those of the requested region or,
 * when no region is given, every record in file order.
 *
 * @param r  the Apache request
 * @return   OK on success, DECLINED when the request is not for this
 *           handler, HTTP_NOT_FOUND when the index or file is missing,
 *           HTTP_INTERNAL_SERVER_ERROR when the index cannot be loaded.
 */
static int tabix_handler(request_rec *r)
    {
    htsFile *fp=NULL;
    hts_itr_t *itr=NULL;
    kstring_t line = {0,0,0};
    int print_header=1;
    int print_body=1;
    struct tabix_callback_t handler;
    int http_status=OK;

    memset((void*)&handler,0,sizeof(struct tabix_callback_t));
    handler.r=r;
    handler.limit=DEFAULT_LIMIT_RECORDS;

    if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED);
    if (strcmp(r->method, "GET")!=0) return DECLINED;
    if(r->canonical_filename==NULL)  return DECLINED;
    /* file must be b-gzipped */
    if(!str_ends_with(r->canonical_filename,".gz")) return DECLINED;
    /* file must be indexed with tabix */
    if(!fileExtExists(r->canonical_filename,".tbi")) return HTTP_NOT_FOUND;

    handler.httParams = HttpParamParseGET(r);
    if(handler.httParams==NULL) return DECLINED;
    handler.file_format=E_FORMAT_UNDEFINED;
    if(str_ends_with(r->canonical_filename,".vcf.gz"))
        {
        handler.file_format=E_FORMAT_VCF;
        }
    else if(str_ends_with(r->canonical_filename,".bed.gz"))
        {
        handler.file_format=E_FORMAT_BED;
        }

    /* only one loop, we use this to cleanup the code, instead of using a goto statement */
    do  {
        const char* format=HttpParamGet(handler.httParams,"format");
        const char* limit=HttpParamGet(handler.httParams,"limit");
        const char* region=HttpParamGet(handler.httParams,"region");
        int iterator_was_requested=FALSE;
        /* TRUE when the header scan stopped on a line that is not a header:
           that line is already in 'line' and must not be lost */
        int first_record_pending=FALSE;

        if(limit!=NULL)
            {
            handler.limit=atol(limit);
            }

        if(format==NULL)
            {
            http_status=DECLINED;
            break;
            }
        else if(strcmp(format,"xml")==0)
            {
            SETUP_HANDLER(xml);
            }
        else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0)
            {
            handler.jsonp_callback=HttpParamGet(handler.httParams,"callback");
            SETUP_HANDLER(json);
            }
        else if(strcmp(format,"html")==0)
            {
            SETUP_HANDLER(html);
            }
        else
            {
            SETUP_HANDLER(plain);
            }

        fp=hts_open(r->canonical_filename,"r");
        if(fp==NULL)
            {
            http_status=HTTP_NOT_FOUND;
            break;
            }
        //read index
        handler.tbx = tbx_index_load(r->canonical_filename);
        if(handler.tbx==NULL)
            {
            http_status=HTTP_INTERNAL_SERVER_ERROR;
            break;
            }
        if(region!=NULL && !str_is_empty(region))
            {
            iterator_was_requested=TRUE;
            itr = tbx_itr_querys(handler.tbx,region);
            }

        handler.startdocument(&handler);
        if(print_header)
            {
            handler.startheader(&handler);
            while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 )
                {
                if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char )
                    {
                    first_record_pending=TRUE;
                    break;
                    }
                handler.header(&handler,&line);
                handler.count++;
                }
            handler.enddheader(&handler);
            }
        handler.count=0;//Reset
        if(print_body)
            {
            handler.startbody(&handler);
            if(iterator_was_requested)
                {
                /* a region that cannot be resolved yields an empty body */
                if(itr!=NULL)
                    {
                    while ((handler.limit==-1 || handler.count< handler.limit) && tbx_itr_next(fp, handler.tbx, itr, &line) >= 0)
                        {
                        if(handler.show(&handler,&line)<0) break;
                        handler.count++;
                        }
                    }
                }
            else
                {
                int keep_going=TRUE;
                /* emit the line consumed by the header scan before reading on;
                   previously this first record was silently dropped */
                if(first_record_pending && (handler.limit==-1 || handler.count< handler.limit))
                    {
                    if(handler.show(&handler,&line)<0) keep_going=FALSE;
                    else handler.count++;
                    }
                while (keep_going && (handler.limit==-1 || handler.count< handler.limit) &&
                    hts_getline(fp, KS_SEP_LINE, &line) >= 0)
                    {
                    if(handler.show(&handler,&line)<0) break;
                    handler.count++;
                    }
                }
            handler.endbody(&handler);
            }
        handler.enddocument(&handler);
        } while(0);/* always abort */

    //cleanup
    if(itr!=NULL) tbx_itr_destroy(itr);
    HttpParamFree(handler.httParams);
    free(line.s);
    if(fp!=NULL) hts_close(fp);
    if(handler.tbx!=NULL) tbx_destroy(handler.tbx);
    return http_status;
    }
示例#15
0
文件: convert.c 项目: srw6v/gqt
/* Return a newly malloc'd string holding base followed by ext, or NULL when
 * the allocation fails. The caller owns the result. */
static char *convert_append_ext(const char *base, const char *ext)
{
    size_t base_len = strlen(base);
    size_t ext_len = strlen(ext);
    char *joined = (char*)malloc(base_len + ext_len + 1);
    if (joined == NULL) return NULL;
    memcpy(joined, base, base_len);
    memcpy(joined + base_len, ext, ext_len + 1); /* copies the terminator too */
    return joined;
}

/* Derive the number of fields (header sample count) and the number of
 * records (sum of the per-sequence index statistics) for a VCF/BCF file.
 * Prints a diagnostic and returns 1 on failure, 0 on success. Everything
 * opened here is released on every path. */
static int convert_autodetect_sizes(const char *in,
                                    uint32_t *num_fields,
                                    uint32_t *num_records)
{
    int rc = 1;
    int i;
    int nseq = 0;
    tbx_t *tbx = NULL;
    hts_idx_t *idx = NULL;
    bcf_hdr_t *hdr = NULL;
    const char **seq = NULL;
    htsFile *fp = hts_open(in,"rb");

    if ( !fp ) {
        fprintf(stderr,"Could not read %s\n", in);
        return 1;
    }

    hdr = bcf_hdr_read(fp);
    if ( !hdr ) {
        fprintf(stderr,"Could not read the header: %s\n", in);
        goto cleanup;
    }

    if (hts_get_format(fp)->format==vcf) {
        tbx = tbx_index_load(in);
        if ( !tbx ) {
            fprintf(stderr,"Could not load TBI index: %s\n", in);
            goto cleanup;
        }
    } else if ( hts_get_format(fp)->format==bcf ) {
        idx = bcf_index_load(in);
        if ( !idx ) {
            fprintf(stderr,"Could not load CSI index: %s\n", in);
            goto cleanup;
        }
    } else {
        fprintf(stderr,
                "Could not detect the file type as VCF or BCF: %s\n",
                in);
        goto cleanup;
    }

    *num_fields = hdr->n[BCF_DT_SAMPLE];
    *num_records = 0;

    seq = tbx ? tbx_seqnames(tbx, &nseq) :
            bcf_index_seqnames(idx, hdr, &nseq);
    for (i = 0; i < nseq; ++i) {
        uint64_t records = 0, v = 0;
        hts_idx_get_stat(tbx ? tbx->idx: idx, i, &records, &v);
        *num_records += (uint32_t)records;
    }

    fprintf(stderr, "Number of records:%u\tNumber of fields:%u\n",
            *num_records, *num_fields);
    rc = 0;

cleanup:
    free(seq);
    hts_close(fp);
    if (hdr)
        bcf_hdr_destroy(hdr);
    if (idx)
        hts_idx_destroy(idx);
    if (tbx)
        tbx_destroy(tbx);
    return rc;
}

/**
 * "convert" sub-command: dispatches on argv[0] ("bcf" or "ped").
 *
 * Options: -i input, -o output, -b bim file, -v vid file, -t temp dir,
 *          -p ped file, -c column (default 2), -f number of fields,
 *          -r number of records, -h help.
 *
 * bcf: unless both -f and -r are given, the sizes are autodetected from the
 *      header and index of the input; missing output names default to the
 *      input name plus ".gqt"/".bim"/".vid", the temp dir to "./"; the
 *      result of bcf_wahbm() is returned.
 * ped: the output defaults to the ped file (or the input) plus ".db"; the
 *      result of ped_ped() is returned.
 *
 * @return the value of convert_help() when arguments are missing or the
 *         type is unknown; 1 on option or autodetect errors.
 */
int convert(int argc, char **argv)
{
    if (argc < 2) return convert_help();

    int c;
    char *in=NULL, *out=NULL, *bim=NULL, *vid=NULL, *tmp_dir=NULL, *ped=NULL;
    uint32_t num_fields = 0, num_records = 0, col = 2;
    int i_is_set = 0,
        o_is_set = 0,
        f_is_set = 0,
        b_is_set = 0,
        v_is_set = 0,
        t_is_set = 0,
        p_is_set = 0,
        r_is_set = 0;

    while((c = getopt (argc, argv, "hi:o:f:r:b:v:t:p:c:")) != -1) {
        switch (c) {
            case 'c':
                col = atoi(optarg);
                break;
            case 'p':
                p_is_set = 1;
                ped = optarg;
                break;
            case 't':
                t_is_set = 1;
                tmp_dir = optarg;
                break;
            case 'v':
                v_is_set = 1;
                vid = optarg;
                break;
            case 'b':
                b_is_set = 1;
                bim = optarg;
                break;
            case 'i':
                i_is_set = 1;
                in = optarg;
                break;
            case 'o':
                o_is_set = 1;
                out = optarg;
                break;
            case 'f':
                f_is_set = 1;
                num_fields = atoi(optarg);
                break;
            case 'r':
                r_is_set = 1;
                num_records = atoi(optarg);
                break;
            case 'h':
                convert_help();
                return 1;
            case '?':
                /* every option that takes an argument, as declared in the
                   getopt string above */
                if ( (optopt == 'i') ||
                     (optopt == 'o') ||
                     (optopt == 'f') ||
                     (optopt == 'r') ||
                     (optopt == 'b') ||
                     (optopt == 'v') ||
                     (optopt == 't') ||
                     (optopt == 'p') ||
                     (optopt == 'c') )
                    fprintf (stderr, "Option -%c requires an argument.\n",
                            optopt);
                else if (isprint (optopt))
                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                else
                    fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
                /* fallthrough */
            default:
                convert_help();
                return 1;
        }
    }

    char *type = argv[0];

    if (i_is_set == 0) {
        printf("Input file is not set\n");
        return convert_help();
    }

    if (strcmp(type, "bcf") == 0) {
        if ( (f_is_set == 0) || (r_is_set == 0) ) {
            fprintf(stderr,"Attempting to autodetect num of records "
                    "and fields from %s\n", in);
            //Try and auto detect the sizes, need the index
            if (convert_autodetect_sizes(in, &num_fields, &num_records) != 0)
                return 1;
        }

        if (o_is_set == 0)
            out = convert_append_ext(in, ".gqt");
        if (b_is_set == 0)
            bim = convert_append_ext(in, ".bim");
        if (v_is_set == 0)
            vid = convert_append_ext(in, ".vid");
        if (t_is_set == 0)
            tmp_dir = convert_append_ext("./", "");

        if (!out || !bim || !vid || !tmp_dir) {
            fprintf(stderr, "Out of memory\n");
            return 1;
        }

        int r = bcf_wahbm(in, out, bim, vid, tmp_dir, num_fields, num_records);

        return r;
    }

    if (strcmp(type, "ped") == 0)  {
        if (o_is_set == 0) {
            out = convert_append_ext((p_is_set == 1) ? ped : in, ".db");
            if (!out) {
                fprintf(stderr, "Out of memory\n");
                return 1;
            }
        }

        fprintf(stderr, "Creating sample database %s\n", out);
        return ped_ped(in, ped, col, out);
    }
    return convert_help();
}
示例#16
0
文件: tabix.c 项目: Bratdaking/pysam
int main_tabix(int argc, char *argv[])
{
    int c, min_shift = -1, is_force = 0, is_all = 0;
    tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
    while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
        if (c == '0') conf.preset |= TBX_UCSC;
        else if (c == 'f') is_force = 1;
        else if (c == 'a') is_all = 1;
        else if (c == 'm') min_shift = atoi(optarg);
        else if (c == 's') conf.sc = atoi(optarg);
        else if (c == 'b') conf.bc = atoi(optarg);
        else if (c == 'e') conf.ec = atoi(optarg);
        else if (c == 'c') conf.meta_char = *optarg;
        else if (c == 'S') conf.line_skip = atoi(optarg);
        else if (c == 'p') {
            if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
            else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
            else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
            else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
            else {
                fprintf(stderr, "The type '%s' not recognised\n", optarg);
                return 1;
            }

        }
    if (optind == argc) {
        fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n");
        fprintf(stderr, "Options: -p STR    preset: gff, bed, sam or vcf [gff]\n");
        fprintf(stderr, "         -s INT    column number for sequence names (suppressed by -p) [1]\n");
        fprintf(stderr, "         -b INT    column number for region start [4]\n");
        fprintf(stderr, "         -e INT    column number for region end (if no end, set INT to -b) [5]\n");
        fprintf(stderr, "         -0        specify coordinates are zero-based\n");
        fprintf(stderr, "         -S INT    skip first INT lines [0]\n");
        fprintf(stderr, "         -c CHAR   skip lines starting with CHAR [null]\n");
        fprintf(stderr, "         -a        print all records\n");
        fprintf(stderr, "         -f        force to overwrite existing index\n");
        fprintf(stderr, "         -m INT    set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
        fprintf(stderr, "\n");
        return 1;
    }
    if (is_all) { // read without random access
        kstring_t s;
        BGZF *fp;
        s.l = s.m = 0; s.s = 0;
        fp = bgzf_open(argv[optind], "r");
        while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s);
        bgzf_close(fp);
        free(s.s);
    } else if (optind + 2 > argc) { // create index
        if ( !conf_ptr )
        {
            // auto-detect file type by file name
            int l = strlen(argv[optind]);
            int strcasecmp(const char *s1, const char *s2);
            if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
            else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
            else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
            else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
        }
        if ( conf_ptr ) conf = *conf_ptr;

        if (!is_force) {
            char *fn;
            FILE *fp;
            fn = (char*)alloca(strlen(argv[optind]) + 5);
            strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
            if ((fp = fopen(fn, "rb")) != 0) {
                fclose(fp);
                fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
                return 1;
            }
        }
        if ( tbx_index_build(argv[optind], min_shift, &conf) )
        {
            fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n");
            return 1;
        }
    } else { // read with random access
        tbx_t *tbx;
        BGZF *fp;
        kstring_t s;
        int i;
        if ((tbx = tbx_index_load(argv[optind])) == 0) return 1;
        if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1;
        s.s = 0; s.l = s.m = 0;
        for (i = optind + 1; i < argc; ++i) {
            hts_itr_t *itr;
            if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue;
            while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s);
            tbx_itr_destroy(itr);
        }
        free(s.s);
        bgzf_close(fp);
        tbx_destroy(tbx);
    }
    return 0;
}
示例#17
0
// Scan a tabix-indexed BED file and register the phenotypes it contains.
//
// Only the first six columns are used: chromosome, start, end and the
// identifier in the fourth column. Identifiers already present in
// phenotype_id_to_idx are skipped; the others are appended to phenotype_id,
// phenotype_chr, phenotype_start (start + 1), phenotype_end and
// phenotype_id_to_idx when filter_phenotype accepts them.
// When regionPhenotype.chr is "NA" the whole file is read sequentially,
// otherwise only the records of regionPhenotype are visited.
//
// fbed: path of the BED file; its index is loaded from the same path.
void union_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header
	//hts_getline reports failure with a negative value, so test "<= 0" as the
	//other readers of this format do, rather than "!result"
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Scan file
	vector < string > tokens;
	unsigned int linecount = 0;

	//Handle the record currently held in str (shared by both reading modes)
	auto scanRecord = [&] () {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (phenotype_id_to_idx.count(tokens[3])) return;
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	};

	if (regionPhenotype.chr != "NA") {
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		//Read data
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			scanRecord();
		}
		tbx_itr_destroy(itr);
	} else {
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			//Skip empty lines and further meta lines
			if (str.l && str.s[0] != tbx->conf.meta_char) scanRecord();
		}
	}

	//Finalize & verbose
	free(str.s);
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
示例#18
0
// Load the phenotypes of one region from a tabix-indexed BED file.
//
// All phenotype containers are cleared first. The header line gives the
// sample names (columns 7 onwards); each is resolved with findSample() and
// columns whose sample is not found (negative index) are ignored. For every
// record of the region accepted by filter_phenotype, the identifier,
// chromosome, start + 1, end and one value per mapped sample are stored;
// "NA" is stored as bcf_float_missing.
//
// fbed:   path of the BED file; its index is loaded from the same path.
// region: region string handed to tbx_itr_querys(). No error is raised when
//         it cannot be resolved (that check is deliberately disabled), in
//         which case no phenotype is loaded.
void union_data::readPhenotypes(string fbed, string region) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	vector < int > mappingS;
	phenotype_id.clear();
	phenotype_chr.clear();
	phenotype_start.clear();
	phenotype_end.clear();
	phenotype_val.clear();
	phenotype_count=0;
	phenotype_id_to_idx.clear();

	//Open BED file
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}
	unsigned int linecount = 0;

	//Read phenotypes
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		//A record wider than the header would index past the end of mappingS
		if (tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			phenotype_val.push_back(vector < float > (sample_count, 0.0));
			for (size_t t = 6 ; t < tokens.size() ; t ++) {
				const int target = mappingS[t-6];
				if (target >= 0) {
					if (tokens[t] == "NA") phenotype_val.back()[target] = bcf_float_missing;
					else phenotype_val.back()[target] = stof(tokens[t]);
				}
			}
			pair < string, int > temp (tokens[3],n_includedP);
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	}
	tbx_itr_destroy(itr);

	//Finalize
	free(str.s);
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
}
示例#19
0
// Destructor: hands the iterator member to tbx_itr_destroy() and then the
// index member to tbx_destroy(), in that order.
Tabix::~Tabix()
{
    tbx_itr_destroy(iter);
    tbx_destroy(tbx);
}
示例#20
0
// Load phenotypes from a tabix-indexed BED file.
//
// The header line gives the sample names (columns 7 onwards); each is
// resolved with findSample() and columns whose sample is not found
// (negative index) are ignored. A record is kept when filter_phenotype
// accepts its identifier (4th column), or its group (5th column) when
// grp_mode is not GRP_NONE. For each kept record the identifier, chromosome,
// start + 1, end, strand flag (6th column equal to "-"), optional group and
// one value per mapped sample are stored; "NA" is stored as
// bcf_float_missing.
// When regionPhenotype.chr is "NA" the whole file is read sequentially,
// otherwise only the records of regionPhenotype are visited.
//
// fbed: path of the BED file; its index is loaded from the same path.
void cis_data::readPhenotypes(string fbed) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;
	vector < int > mappingS;

	//Open BED file
	vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}

	unsigned int linecount = 0;

	//Handle the record currently held in str (shared by both reading modes)
	auto loadRecord = [&] () {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		//A record wider than the header would index past the end of mappingS
		if (tokens.size() > mappingS.size() + 6) vrb.error("Incorrect number of columns!");

		//Filter on the phenotype ID, or on the group ID when grouping is active
		const bool keep = (grp_mode == GRP_NONE) ? filter_phenotype.check(tokens[3]) : filter_phenotype.check(tokens[4]);
		if (!keep) {
			n_excludedP ++;
			return;
		}

		phenotype_id.push_back(tokens[3]);
		phenotype_chr.push_back(tokens[0]);
		phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
		phenotype_end.push_back(atoi(tokens[2].c_str()));
		if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
		if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
		phenotype_neg.push_back(tokens[5] == "-");
		if (phenotype_neg.back()) n_negativeStrd ++;
		phenotype_val.push_back(vector < float > (sample_count, 0.0));
		for (size_t t = 6 ; t < tokens.size() ; t ++) {
			const int target = mappingS[t-6];
			if (target >= 0) {
				if (tokens[t] == "NA") phenotype_val.back()[target] = bcf_float_missing;
				else phenotype_val.back()[target] = stof(tokens[t]);
			}
		}
		n_includedP++;
	};

	//Read phenotypes
	if (regionPhenotype.chr != "NA") {
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		//Read data
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			loadRecord();
		}
		tbx_itr_destroy(itr);
	} else {
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			//Skip empty lines and further meta lines
			if (str.l && str.s[0] != tbx->conf.meta_char) loadRecord();
		}
	}

	//Finalize & verbose
	free(str.s);
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
示例#21
0
文件: tabix.c 项目: Illumina/akt
/*
 * Print to stdout the records of an indexed file that fall in the given regions.
 *
 *   args   - options read here: targets_fname (optional regions file used as
 *            an additional overlap filter), print_header and header_only
 *   fname  - path of the indexed data file to query
 *   regs   - array of nregs region strings; every element and the array
 *            itself are free()d before returning, so the caller must not
 *            use or free them afterwards
 *   nregs  - number of entries in regs
 *
 * BCF input is read through the index returned by bcf_index_load(); VCF, SAM
 * and files of unknown format are read through the index returned by
 * tbx_index_load(). BAM input is rejected with a message pointing to
 * "samtools view". Any other detected format produces no output.
 *
 * Returns 0. Every failure is reported through error().
 * NOTE(review): the code relies on error() not returning (e.g. fp is
 * dereferenced right after the NULL check) - confirm against its definition.
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    /* Optional targets file: records not overlapping it are dropped. */
    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
        {
            /* A failed write must not go unnoticed: the output would be truncated. */
            if ( bcf_hdr_write(out,hdr) < 0 ) error("Could not write the header to stdout\n");
        }
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    if ( bcf_write(out,hdr,rec) < 0 ) error("Could not write to stdout\n");
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            /* The header is the leading run of lines starting with the meta character. */
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                if ( puts(str.s) == EOF ) error("Could not write to stdout\n");
            }
        }
        if ( !args->header_only )
        {
            int nseq = 0;
            const char **seq = NULL;
            if ( reg_idx )
            {
                /* Sequence names are only needed to look records up in reg_idx. */
                seq = tbx_seqnames(tbx, &nseq);
                if ( !seq ) error("Could not get the sequence names from the index of %s\n", fname);
            }
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    if ( puts(str.s) == EOF ) error("Could not write to stdout\n");
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    /* The region strings are owned by this function from here on. */
    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}