Example #1
0
/**
 * Interleave two FASTA/FASTQ files record by record and write the result to
 * stdout.
 *
 * For every pair of records a trailing "/<digit>" is trimmed from the first
 * record's name and the trimmed name is copied onto the second record, so
 * both ends carry the same name. Reading stops as soon as either input runs
 * out of records.
 *
 * @param argc  number of entries in argv; at least 3 is required
 * @param argv  argv[1] and argv[2] are the two inputs ("-" means stdin)
 * @return 0 on success, 1 on a usage error or if an input cannot be opened
 */
int main_interleave(int argc, char *argv[])
{
	gzFile fp1, fp2;
	kseq_t *seq[2];
	kstring_t str;

	if (argc < 3) {
		fprintf(stderr, "Usage: fermi interleave <in1.fq> <in2.fq>\n");
		return 1;
	}
	str.l = str.m = 0; str.s = 0;
	fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
	if (fp1 == 0) {
		fprintf(stderr, "[main_interleave] fail to open '%s'\n", argv[1]);
		return 1;
	}
	fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
	if (fp2 == 0) {
		fprintf(stderr, "[main_interleave] fail to open '%s'\n", argv[2]);
		gzclose(fp1);
		return 1;
	}
	seq[0] = kseq_init(fp1);
	seq[1] = kseq_init(fp2);
	while (kseq_read(seq[0]) >= 0) {
		if (kseq_read(seq[1]) < 0) break; // one file ends
		str.l = 0;
		// isdigit() requires a value representable as unsigned char
		if (seq[0]->name.l > 2 && seq[0]->name.s[seq[0]->name.l-2] == '/'
			&& isdigit((unsigned char)seq[0]->name.s[seq[0]->name.l-1]))
			seq[0]->name.s[(seq[0]->name.l -= 2)] = 0; // trim trailing "/[0-9]$"
		seq[1]->name.l = 0;
		kputsn(seq[0]->name.s, seq[0]->name.l, &seq[1]->name); // make sure two ends have the same name
		write_seq(seq[0], &str);
		write_seq(seq[1], &str);
		fputs(str.s, stdout);
	}
	kseq_destroy(seq[0]); gzclose(fp1);
	kseq_destroy(seq[1]); gzclose(fp2);
	free(str.s);
	return 0;
}
Example #2
0
/**
 * Fill bases_for_snps[i][k] with the upper-cased base found at
 * snp_locations[i] in the k-th sequence of the file.
 *
 * 'N' is stored as '-' so gaps and unknowns are presented identically.
 * A location that falls outside the current sequence is also stored as '-'
 * instead of reading past the end of the sequence buffer.
 *
 * @param filename          FASTA/FASTQ file (optionally gzip-compressed)
 * @param snp_locations     0-based positions, number_of_snps entries
 * @param bases_for_snps    output matrix indexed [snp][sequence]
 * @param length_of_genome  unused; kept for interface compatibility
 * @param number_of_snps    number of entries in snp_locations
 *
 * If the file cannot be opened the function returns without touching
 * bases_for_snps.
 */
void get_bases_for_each_snp(char filename[], int snp_locations[], char ** bases_for_snps, int length_of_genome, int number_of_snps)
{
	int i;
	int sequence_number = 0;

	gzFile fp;
	kseq_t *seq;

	(void) length_of_genome;

	fp = gzopen(filename, "r");
	if (fp == 0)
	{
		return;
	}
	seq = kseq_init(fp);

	while (kseq_read(seq) >= 0)
	{
		for (i = 0; i < number_of_snps; i++)
		{
			int location = snp_locations[i];
			char base = '-';

			if (location >= 0 && (size_t) location < seq->seq.l)
			{
				base = toupper((unsigned char) seq->seq.s[location]);
				// Present gaps and unknowns in the same way to Gubbins
				if (base == 'N')
				{
					base = '-';
				}
			}
			bases_for_snps[i][sequence_number] = base;
		}
		sequence_number++;
	}

	kseq_destroy(seq);
	gzclose(fp);
}
Example #3
0
/**
 * Copy the first sequence of the file into reference_sequence, upper-cased
 * and with 'N' replaced by '-', and NUL-terminate it.
 *
 * @param reference_sequence  output buffer; the caller must provide room for
 *                            the whole first sequence plus the terminator
 * @param filename            FASTA/FASTQ file (optionally gzip-compressed)
 * @return 1 when a sequence was read; 0 when the file could not be opened or
 *         holds no sequence (reference_sequence is then the empty string)
 */
int build_reference_sequence(char reference_sequence[], char filename[])
{
	size_t i;
	int sequence_found = 0;

	gzFile fp;
	kseq_t *seq;

	reference_sequence[0] = '\0';

	fp = gzopen(filename, "r");
	if (fp == 0)
	{
		return 0;
	}
	seq = kseq_init(fp);

	if (kseq_read(seq) >= 0)
	{
		for (i = 0; i < seq->seq.l; i++)
		{
			char base = toupper((unsigned char) seq->seq.s[i]);
			reference_sequence[i] = (base == 'N') ? '-' : base;
		}
		// Terminate unconditionally; the buffer content past the copied
		// bases is not guaranteed to be initialised.
		reference_sequence[seq->seq.l] = '\0';
		sequence_found = 1;
	}

	kseq_destroy(seq);
	gzclose(fp);
	return sequence_found;
}
Example #4
0
/**
 * Stream every sequence of manager->inputFileName through
 * streamOneStringUnchanged().
 *
 * @param manager  counter manager; its inputFileName names the file to read
 * @return 0 when every sequence was processed, 1 when the file could not be
 *         opened or streamOneStringUnchanged() reported a failure
 */
int streamAndCountOneFile(KWTCounterManager *manager)
{
	gzFile inputFP;
	kseq_t* seq;
	int status = 0;

	//open file to read lines
	if(!( inputFP = gzopen ( manager->inputFileName , "r" )))
	{
		printf("Could not open input file \"%s\" for reading\n", manager->inputFileName);
		return 1;
	}

	// initialize reader
	seq = kseq_init(inputFP);

	// read sequences; stop at the first failure but fall through to the
	// shared cleanup so the reader is released on every path
	while(kseq_read(seq) >= 0)
	{
		if(streamOneStringUnchanged(manager, seq->seq.s, seq->seq.l) != 0)
		{
			status = 1;
			break;
		}
	}

	kseq_destroy(seq);
	gzclose(inputFP);
	return status;
}
Example #5
0
/* Close the gzip stream behind bs, release its kseq reader and free bs
 * itself. A null bs is accepted and ignored. */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (bs != 0) {
		kseq_t *reader = bs->ks;
		gzclose(reader->f->f);
		kseq_destroy(reader);
		free(bs);
	}
}
Example #6
0
/**
 * Read FASTA/FASTQ records from stdin and print the number of reads, the
 * total number of bases and the longest, shortest and average read length.
 *
 * With no reads at all, shortest and average are reported as 0 rather than
 * the internal sentinel and a NaN.
 *
 * @param argc  must be 1; any extra argument prints the usage and exits
 * @param argv  argv[0] is used in the usage message
 * @return 0 on success (exits with status 1 on usage or open errors)
 */
int main_read_stat(int argc, char **argv) {
	if (argc > 1) {
		fprintf(stderr, "Usage: cat *.fq | %s\n", argv[0]);
		exit(1);
	}

	gzFile fp = gzdopen(fileno(stdin), "r");
	if (!fp) {
		fprintf(stderr, "Failed to open stdin for reading\n");
		exit(1);
	}
	kseq_t *seq = kseq_init(fp); // kseq to read files
	int max_len = 0;
	int min_len = 999999999;
	long long total_len = 0;
	long long num_reads = 0;

	while (kseq_read(seq) >= 0) {
		++num_reads;
		total_len += seq->seq.l;
		max_len = static_cast<int>(std::max(seq->seq.l, (size_t)max_len));
		min_len = static_cast<int>(std::min(seq->seq.l, (size_t)min_len));
	}

	// Empty input: no shortest read exists and the average is undefined.
	if (num_reads == 0) min_len = 0;
	double avg_len = num_reads > 0 ? total_len * 1.0 / num_reads : 0.0;

	printf("number reads: %lld\ntotal size:%lld\nlongest: %d\nshortest: %d\navg: %lf\n", num_reads, total_len, max_len, min_len, avg_len);

	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Example #7
0
bseq1_t *bseq_read(const char *fn, int *n_)
{
	FILE *fp;
	bseq1_t *seqs;
	kseq_t *ks;
	int m, n;
	uint64_t size = 0;

	*n_ = 0;
	fp = fopen(fn, "rb");
	if (fp == 0) return 0;
	ks = kseq_init(fp);

	m = n = 0; seqs = 0;
	while (kseq_read(ks) >= 0) {
		bseq1_t *s;
		if (n >= m) {
			m = m? m<<1 : 256;
			seqs = realloc(seqs, m * sizeof(bseq1_t));
		}
		s = &seqs[n];
		s->seq = strdup(ks->seq.s);
		s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
		s->l_seq = ks->seq.l;
		size += seqs[n++].l_seq;
	}
	*n_ = n;

	kseq_destroy(ks);
	fclose(fp);
	return seqs;
}
Example #8
0
static seqs_t *load_seqs(const char *fn)
{
	seqs_t *s;
	seq1_t *p;
	gzFile fp;
	int l;
	kseq_t *seq;

	fp = xzopen(fn, "r");
	seq = kseq_init(fp);
	s = (seqs_t*)calloc(1, sizeof(seqs_t));
	s->m_seqs = 256;
	s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
	while ((l = kseq_read(seq)) >= 0) {
		if (s->n_seqs == s->m_seqs) {
			s->m_seqs <<= 1;
			s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
		}
		p = s->seqs + (s->n_seqs++);
		p->l = seq->seq.l;
		p->s = (unsigned char*)malloc(p->l + 1);
		memcpy(p->s, seq->seq.s, p->l);
		p->s[p->l] = 0;
		p->n = strdup((const char*)seq->name.s);
	}
	kseq_destroy(seq);
	gzclose(fp);
	fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
	return s;
}
Example #9
0
/*
 * Build a name -> mask table from a FASTA-like mask file.
 *
 * For every record a bit array with one bit per position is allocated; bit i
 * is set when the i-th character of the record is '3'. The original record
 * length is stored in ori_len. Keys are duplicated strings owned by the
 * table.
 *
 * Returns the (possibly empty) table; it is empty when `fn` cannot be opened.
 * Duplicated record names trip the assert; when asserts are compiled out the
 * later record replaces the earlier one without leaking.
 */
static khash_t(s) *load_mask(const char *fn)
{
	kseq_t *seq;
	gzFile fp;
	khash_t(s) *h;
	h = kh_init(s);
	fp = gzopen(fn, "r");
	if (fp == 0) return h;
	seq = kseq_init(fp);
	while (kseq_read(seq) >= 0) {
		khint_t k;
		int ret;
		size_t i;
		mask32_t *p;
		char *key = strdup(seq->name.s);
		k = kh_put(s, h, key, &ret);
		assert(ret); // duplicated name
		p = &kh_val(h, k);
		if (ret == 0) { // key already present: the table keeps its old key
			free(key);
			free(p->mask);
		}
		p->ori_len = seq->seq.l;
		p->mask = (uint32_t*)calloc((seq->seq.l+31)/32, 4);
		for (i = 0; i < seq->seq.l; ++i)
			if (seq->seq.s[i] == '3')
				p->mask[i/32] |= 1u << (i % 32);
	}
	kseq_destroy(seq);
	gzclose(fp);
	return h;
}
// Close the read file at index file_indx: destroy its kseq reader, close the
// gzip stream and mark the entry as closed by nulling fp.
// An entry whose fp is already NULL is left alone, matching close(), so the
// reader is never destroyed twice.
void reads_parsing::close_file(int file_indx)
{
     if (read_files[file_indx]->fp == NULL)
         return;

     kseq_destroy(read_files[file_indx]->seq);
     gzclose(read_files[file_indx]->fp);
     read_files[file_indx]->fp= NULL;
 }
Example #11
0
/*
 * Write a sequence dictionary (@SQ header lines) for the FASTA file `fn`.
 *
 * For each record, printable characters ('!'..'~') are kept and upper-cased
 * in place, and their MD5 digest and count become the M5 and LN fields.
 * UR is args->uri when given, otherwise the resolved path of `fn` (the
 * literal `fn` if resolution fails); no UR is written when reading stdin.
 * Output goes to args->output_fname when set, otherwise to stdout.
 *
 * Exits with status 1 if the input or output cannot be opened or the MD5
 * context cannot be created.
 */
static void write_dict(const char *fn, args_t *args)
{
    hts_md5_context *md5;
    int l, i, k;
    gzFile fp;
    kseq_t *seq;
    unsigned char digest[16];
    char hex[33];

    fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
    if (fp == 0) {
        fprintf(stderr, "dict: %s: No such file or directory\n", fn);
        exit(1);
    }
    FILE *out = stdout;
    if (args->output_fname) {
        out = fopen(args->output_fname, "w");
        if (out == NULL) {
          fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
          exit(1);
        }
    }

    if (!(md5 = hts_md5_init()))
        exit(1);

    seq = kseq_init(fp);
    if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
    while ((l = kseq_read(seq)) >= 0) {
        // compact the sequence in place: drop non-printable bytes, upper-case the rest
        for (i = k = 0; i < seq->seq.l; ++i) {
            if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~')
                seq->seq.s[k++] = toupper(seq->seq.s[i]);
        }
        hts_md5_reset(md5);
        hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
        hts_md5_final(digest, md5);
        hts_md5_hex(hex, digest);
        fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
        if (args->uri)
            fprintf(out, "\tUR:%s", args->uri);
        else if (strcmp(fn, "-") != 0) {
#ifdef _WIN32
            char *real_path = _fullpath(NULL, fn, PATH_MAX);
#else
            char *real_path = realpath(fn, NULL);
#endif
            // path resolution can fail and return NULL; never pass NULL to %s
            fprintf(out, "\tUR:file://%s", real_path ? real_path : fn);
            free(real_path);
        }
        if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
        if (args->species) fprintf(out, "\tSP:%s", args->species);
        fprintf(out, "\n");
    }
    kseq_destroy(seq);
    gzclose(fp); // kseq_destroy() does not close the underlying stream
    hts_md5_destroy(md5);

    if (args->output_fname) fclose(out);
}
Example #12
0
/**
 * seqtk hety: report windowed counts derived from the per-base
 * seq_nt16_table / bitcnt_table classification of each input sequence.
 *
 * Each base is classified as 0 (bit count > 2, ignored), 2 (bit count == 2)
 * or 1 (anything else). A circular buffer of win_size classifications keeps
 * the counts for the current window; a line is printed every win_step bases
 * once the first window is full, and once more at the end of the sequence.
 *
 * Options: -w window size, -t number of start positions per window,
 *          -m treat lower-case bases as 'N'.
 *
 * @return 0 on success, 1 on usage errors, invalid option values, or when
 *         the input cannot be opened
 */
int stk_hety(int argc, char *argv[])
{
	gzFile fp;
	kseq_t *seq;
	int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
	char *buf;
	uint32_t cnt[3];
	if (argc == 1) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage:   seqtk hety [options] <in.fa>\n\n");
		fprintf(stderr, "Options: -w INT   window size [%d]\n", win_size);
		fprintf(stderr, "         -t INT   # start positions in a window [%d]\n", n_start);
		fprintf(stderr, "         -m       treat lowercases as masked\n");
		fprintf(stderr, "\n");
		return 1;
	}
	while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
		switch (c) {
		case 'w': win_size = atoi(optarg); break;
		case 't': n_start = atoi(optarg); break;
		case 'm': is_lower_mask = 1; break;
		}
	}
	if (optind >= argc) { // only options were given
		fprintf(stderr, "[stk_hety] missing input file\n");
		return 1;
	}
	// win_step = win_size / n_start is used as a modulus below, so it must be >= 1
	if (win_size <= 0 || n_start <= 0 || n_start > win_size) {
		fprintf(stderr, "[stk_hety] -w and -t must be positive and -t must not exceed -w\n");
		return 1;
	}
	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
	if (fp == 0) {
		fprintf(stderr, "[stk_hety] fail to open '%s'\n", argv[optind]);
		return 1;
	}
	seq = kseq_init(fp);
	win_step = win_size / n_start;
	buf = (char*)calloc(win_size, 1);
	if (buf == 0) {
		fprintf(stderr, "[stk_hety] out of memory\n");
		kseq_destroy(seq);
		gzclose(fp);
		return 1;
	}
	while ((l = kseq_read(seq)) >= 0) {
		int x, i, y, z, next = 0;
		cnt[0] = cnt[1] = cnt[2] = 0;
		for (i = 0; i <= l; ++i) {
			if ((i >= win_size && i % win_step == 0) || i == l) {
				if (i == l && l >= win_size) {
					// final window: drop the entries that precede the last win_size bases
					for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
				}
				if (cnt[1] + cnt[2] > 0)
					printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
						   (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
				next = i;
			}
			if (i < l) {
				y = i % win_size;
				// go through unsigned char so bytes >= 0x80 cannot index the tables negatively
				c = (unsigned char)seq->seq.s[i];
				if (is_lower_mask && islower(c)) c = 'N';
				c = seq_nt16_table[c];
				x = bitcnt_table[c];
				if (i >= win_size) --cnt[(int)buf[y]]; // evict the entry being overwritten
				buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
				++cnt[z];
			}
		}
	}
	free(buf);
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Example #13
0
/**
 * Scan every sequence after the first one and mark each column that differs
 * from reference_sequence with '*', counting how many columns get marked.
 *
 * With exclude_gaps set, a '-' or 'N' in the reference is first replaced by
 * the upper-cased base of the current sequence when that base is not itself
 * '-' / 'N', and gaps or Ns in the current sequence never count as a
 * difference. Without it, 'N' in the current sequence is treated as '-' and
 * any mismatch counts.
 *
 * @param reference_sequence  reference buffer, modified in place
 * @param filename            alignment file; its first record is skipped
 * @param length_of_genome    number of columns examined per sequence
 * @param exclude_gaps        non-zero to ignore gaps/unknowns as described
 * @return number of columns newly marked with '*'
 */
int detect_snps(char reference_sequence[], char filename[], int length_of_genome, int exclude_gaps)
{
  gzFile alignment = gzopen(filename, "r");
  kseq_t *record = kseq_init(alignment);
  int snp_count = 0;
  int column;

  // The first record is the reference itself, so consume and ignore it
  kseq_read(record);

  while (kseq_read(record) >= 0)
  {
    for (column = 0; column < length_of_genome; column++)
    {
      const char raw = record->seq.s[column];

      if (!exclude_gaps)
      {
        char input_base = toupper(raw);
        if (input_base == 'N')
        {
          input_base = '-';
        }

        if (reference_sequence[column] != '*' && reference_sequence[column] != input_base)
        {
          reference_sequence[column] = '*';
          snp_count++;
        }
        continue;
      }

      // Fill a gap/unknown in the reference with the first proper base seen
      if ((reference_sequence[column] == '-' && raw != '-')
          || (toupper(reference_sequence[column]) == 'N' && raw != 'N'))
      {
        reference_sequence[column] = toupper(raw);
      }

      if (reference_sequence[column] != '*'
          && raw != '-'
          && toupper(raw) != 'N'
          && reference_sequence[column] != toupper(raw))
      {
        reference_sequence[column] = '*';
        snp_count++;
      }
    }
  }

  kseq_destroy(record);
  gzclose(alignment);

  return snp_count;
}
Example #14
0
/* Release a sequence reader. BAM-backed readers are closed with bam_close();
 * otherwise the gzip stream is closed and the kseq reader destroyed. The
 * handle itself is freed in both cases. A null bs is ignored. */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (bs == 0) return;
	if (!bs->is_bam) {
		gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else {
		bam_close(bs->fp);
	}
	free(bs);
}
Example #15
0
/* Free what a ta_opt_t owns: every adapter sequence, the kseq reader with
 * its gzip stream, and the adapter array. The struct itself is not freed. */
void ta_opt_free(ta_opt_t *opt)
{
	/* grab the stream first: it is reached through the reader destroyed below */
	gzFile stream = opt->ks->f->f;
	int idx = 0;

	while (idx < opt->n_adaps) {
		free(opt->adaps[idx].seq);
		++idx;
	}
	kseq_destroy(opt->ks);
	gzclose(stream);
	free(opt->adaps);
}
Example #16
0
/**
 * Read every sequence of `filename` into two parallel, growable arrays.
 *
 * @param filename  FASTA/FASTQ file to read
 * @param strings_  receives an array of duplicated sequence strings
 * @param sizes_    receives the matching sequence lengths
 * @param count_    receives the number of sequences read
 *
 * Any open or allocation failure is reported with perror() and terminates
 * the process with status 1. The arrays start with room for 1000 entries
 * and double whenever they fill up.
 */
static inline void parse_sequences(
        const char *filename,
        char ***strings_,
        unsigned long **sizes_,
        unsigned long *count_)
{
    FILE* fp;
    kseq_t *seq = NULL;
    char **strings = NULL;
    unsigned long *sizes = NULL;
    unsigned long count = 0;
    unsigned long memory = 1000;

    fp = fopen(filename, "r");
    if(fp == NULL) {
        perror("fopen");
        exit(1);
    }
    /* check the initial allocations the same way as the later reallocs */
    strings = malloc(sizeof(char*) * memory);
    if (NULL == strings) {
        perror("malloc");
        exit(1);
    }
    sizes = malloc(sizeof(unsigned long) * memory);
    if (NULL == sizes) {
        perror("malloc");
        exit(1);
    }
    seq = kseq_init(fileno(fp));
    while (kseq_read(seq) >= 0) {
        strings[count] = strdup(seq->seq.s);
        if (NULL == strings[count]) {
            perror("strdup");
            exit(1);
        }
        sizes[count] = seq->seq.l;
        ++count;
        /* grow after the store so the next slot always exists */
        if (count >= memory) {
            char **new_strings = NULL;
            unsigned long *new_sizes = NULL;
            memory *= 2;
            new_strings = realloc(strings, sizeof(char*) * memory);
            if (NULL == new_strings) {
                perror("realloc");
                exit(1);
            }
            strings = new_strings;
            new_sizes = realloc(sizes, sizeof(unsigned long) * memory);
            if (NULL == new_sizes) {
                perror("realloc");
                exit(1);
            }
            sizes = new_sizes;
        }
    }
    kseq_destroy(seq);
    fclose(fp);

    *strings_ = strings;
    *sizes_ = sizes;
    *count_ = count;
}
Example #17
0
/**
 * Select records whose name or comment matches (or, with `exclude`, does not
 * match) a compiled regular expression and hand them to size_filter().
 *
 * The first record is read once to decide between FASTA and FASTQ (no
 * quality buffer means FASTA), then the input is rewound for the main pass.
 * A record without a comment is matched against an empty comment: kseq only
 * resets the comment length between records, so the buffer itself may be
 * NULL or still hold the previous record's comment.
 *
 * @return the sum of the values returned by size_filter() for the selected
 *         records; exits if the input file cannot be opened
 */
int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count) {
	gzFile fp;
	int count=0;
	int excluded = 0;
	int is_fasta = 0; /* assume fastq */
	char no_comment[1] = { '\0' };
	kseq_t *seq;

	/* open fasta file */
	fp = gzopen(input_file,"r");
	if (!fp) {
		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
		exit(EXIT_FAILURE);
	}

	seq = kseq_init(fp);

	/* determine file type */
	kseq_read(seq); /* read the first sequence */
	is_fasta = seq->qual.s == NULL ? 1 : 0;
	gzrewind(fp);
	kseq_rewind(seq); /* rewind to beginning for main loop */

    if (verbose_flag) {
        if (is_fasta)
            fprintf(stderr, "Input is FASTA format\n");
        else
            fprintf(stderr, "Input is FASTQ format\n");
    }

	/* search through list and see if this header matches */
	while(kseq_read(seq) >= 0) {
		/* only trust the comment buffer when this record really has a comment */
		char *comment = seq->comment.l > 0 ? seq->comment.s : no_comment;
		int matched = search_header(re, re_extra, seq->name.s) || search_header(re, re_extra, comment);

		if (exclude ? !matched : matched) {
			/* record is selected, so check size/print */
			count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
		} else {
			excluded++;
		}
	} /* end of seq traversal */
	kseq_destroy(seq);
	gzclose(fp); /* done reading file so close */

	if (just_count) {
		fprintf(stdout, "Total output: %i\n", count);
		fprintf(stdout, "Total excluded: %i\n", excluded);
	}
	return count;
}
Example #18
0
/* Release a sequence reader and free its handle. A BAM-backed reader is
 * closed with bam_close() and a non-zero result is fatal; otherwise the gzip
 * stream is closed through err_gzclose() and the kseq reader destroyed.
 * A null bs is ignored. */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (bs == 0) return;
	if (!bs->is_bam) {
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else if (bam_close(bs->fp) != 0) {
		err_fatal_simple("Error closing bam file");
	}
	free(bs);
}
Example #19
0
// Close the current input, if any: destroy the kseq reader, close the gzip
// stream, clear the reader pointer and move the file-name iterator to the
// end of the list. Does nothing when no reader is open.
void FastqFile::close() {

    if (kseq == NULL) {
        return;
    }

    kseq_destroy(kseq);
    gzclose(fp);

    kseq = NULL;
    fnit = fnames.end();
}
Example #20
0
File: bntseq.c Project: a113n/bwa
/*
 * Read every sequence from fp_fa, pack it through add1() and write the
 * packed buffer to "<prefix>.pac"; the accompanying bntseq_t is written
 * with bns_dump(bns, prefix).
 *
 * fp_fa     input stream; it is wrapped in a kseq reader here and is NOT
 *           closed by this function
 * prefix    output file prefix; NOTE(review): it is copied into a 1024-byte
 *           buffer with strcpy/strcat and no length check
 * for_only  when zero, the complemented sequence is appended in reverse
 *           order after the forward sequence
 *
 * Returns bns->l_pac as it stands after the optional reverse-complement step.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	// initial capacities; they are passed by address to add1() below
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	// m_pac counts bases; the buffer holds m_pac/4 bytes
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	strcpy(name, prefix); strcat(name, ".pac");
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	if (!for_only) { // add the reverse complemented sequence
		// room for twice the current length, rounded up to a multiple of 4 bases
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		pac = realloc(pac, m_pac/4);
		// zero the bytes that follow the forward sequence
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		// walk the forward sequence backwards, appending 3-code at the growing end;
		// note that bns->l_pac is advanced by the loop itself
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		// write ceil(l_pac/4) bytes of packed bases
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		// last byte of the file is l_pac modulo 4
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
Example #21
0
/**
 * Release everything owned by a configuration: input file names, input
 * readers (closing their streams), option strings, barcodes (with their
 * per-input file names and output streams) and the leftover output streams.
 *
 * The fdb_config_t struct itself is not freed.
 *
 * @param cfg  configuration to tear down; NULL is accepted and ignored
 * @return always 0
 */
int
fdb_config_destroy (fdb_config_t *cfg)
{
    if (cfg == NULL) return 0;

    if (cfg->infns != NULL){
        for (int iii = 0; iii < cfg->n_infs; iii++) {
            free(cfg->infns[iii]);
        }
        free(cfg->infns);
    }
    if (cfg->in_kseqs != NULL){
        for (int iii = 0; iii < cfg->n_infs; iii++) {
            if (cfg->in_kseqs[iii] != NULL) {
                /* close the stream before the reader that points to it goes away */
                FDB_FP_CLOSE(cfg->in_kseqs[iii]->f->f);
                kseq_destroy(cfg->in_kseqs[iii]);
            }
        }
        free(cfg->in_kseqs);
    }
    /* free(NULL) is a no-op, so these need no guards */
    free(cfg->barcode_file);
    free(cfg->buffer_seq);
    free(cfg->out_dir);
    free(cfg->out_suffix);
    free(cfg->leftover_suffix);
    if (cfg->barcodes != NULL) {
        for (int iii = 0; iii < cfg->n_barcodes; iii++) {
            if (cfg->barcodes[iii] != NULL) {
                free(cfg->barcodes[iii]->name.s);
                free(cfg->barcodes[iii]->seq.s);
                for (int jjj = 0; jjj < cfg->n_infs; jjj++) {
                    free(cfg->barcodes[iii]->fns[jjj]);
                    if (cfg->barcodes[iii]->fps[jjj] != NULL) {
                        FDB_FP_CLOSE(cfg->barcodes[iii]->fps[jjj]);
                    }
                }
                free(cfg->barcodes[iii]);
            }
        }
        free(cfg->barcodes);
    }
    if (cfg->leftover_outfps != NULL) {
        for (int iii = 0; iii <  cfg->n_infs; iii++) {
            if (cfg->leftover_outfps[iii] != NULL) {
                FDB_FP_CLOSE(cfg->leftover_outfps[iii]);
            }
        }
        free(cfg->leftover_outfps);
    }
    /* the function is declared int, so it must return a value on every path */
    return 0;
}
Example #22
0
/**
 * gen_mask: turn a raw mask FASTA into a per-position mask of digits 0-3.
 *
 * A sliding window of g_len positions keeps three counters: positions with a
 * non-zero high count (n_all), those passing is_good() (n_good) and those
 * whose high count is exactly 1 (n_mid). Each output digit is 0 when the
 * window holds no counted position, 3 when n_good/n_all >= g_ratio, 2 when
 * n_mid/n_all >= g_ratio, and 1 otherwise. Totals are reported on stderr.
 *
 * Options: -l window length (g_len), -r ratio threshold (g_ratio).
 *
 * @return 0 on success, 1 on usage errors or if the input cannot be opened
 */
int main(int argc, char *argv[])
{
	int l, i, c;
	long long cnt[5], tot;
	kseq_t *seq;
	gzFile fp;
	cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0;
	while ((c = getopt(argc, argv, "l:r:")) >= 0) {
		switch (c) {
		case 'l': g_len = atoi(optarg); break;
		case 'r': g_ratio = atof(optarg); break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: gen_mask [-l %d] [-r %.2lf] <in.rawMask.fa>\n", g_len, g_ratio);
		return 1;
	}
	fp = gzopen(argv[optind], "r");
	if (fp == 0) {
		fprintf(stderr, "gen_mask: fail to open '%s'\n", argv[optind]);
		return 1;
	}
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		int n_good = 0, n_all = 0, n_mid = 0;
		printf(">%s %d %.3lf", seq->name.s, g_len, g_ratio);
		// run g_len-1 positions past the end so the window drains completely
		for (i = 0; i < l + g_len - 1; ++i) {
			int c1, c2;
			// position entering the window
			unsigned x = i < l? get_cnt(seq->seq.s[i]) : 0;
			c1 = x>>16; c2 = x&0xffff;
			if (c1 == 1) ++cnt[4];
			if (c1) {
				++n_all;
				if (is_good(c1, c2)) ++n_good;
				if (c1 == 1) ++n_mid;
			}
			// position leaving the window
			x = i >= g_len? get_cnt(seq->seq.s[i - g_len]) : 0;
			c1 = x>>16; c2 = x&0xffff;
			if (c1) {
				--n_all;
				if (is_good(c1, c2)) --n_good;
				if (c1 == 1) --n_mid;
			}
			assert(n_all <= g_len && n_good <= n_all);
			if (i % 60 == 0) putchar('\n');
			x = n_all == 0? 0 : (double)n_good/n_all >= g_ratio? 3 : (double)n_mid/n_all >= g_ratio? 2 : 1;
			putchar(x + '0');
			cnt[x]++;
		}
		putchar('\n');
	}
	tot = cnt[1] + cnt[2] + cnt[3];
	fprintf(stderr, "%lld, %lld, %lld, %lld, %lld\n", cnt[0], cnt[1], cnt[2], cnt[3], cnt[4]);
	fprintf(stderr, "%lf, %lf, %lf\n", (double)cnt[3] / tot, (double)(cnt[2] + cnt[3]) / tot, (double)cnt[4] / tot);
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
/// Populate read_to_path_map from a FASTA/FASTQ file whose record comments
/// end with the path of the read's FAST5 file, then cache the mapping with
/// write_to_fofn(fasta_filename + FOFN_SUFFIX).
///
/// The file is opened directly with gzopen(), so the descriptor has a single
/// owner and is closed exactly once.
///
/// Exits with EXIT_FAILURE when the file cannot be opened, when a record
/// carries no path, or when the first mapped path cannot be stat()'ed.
void Fast5Map::load_from_fasta(std::string fasta_filename)
{
    gzFile gz_fp = gzopen(fasta_filename.c_str(), "r");
    if(gz_fp == NULL) {
        fprintf(stderr, "error: could not open %s for read\n", fasta_filename.c_str());
        exit(EXIT_FAILURE);
    }

    kseq_t* seq = kseq_init(gz_fp);

    while(kseq_read(seq) >= 0) {
        if(seq->comment.l == 0) {
            fprintf(stderr, "error: no path associated with read %s\n", seq->name.s);
            exit(EXIT_FAILURE);
        }

        // This splitting code implicitly handles both the 2 and 3 field
        // fasta format that poretools will output. The FAST5 path
        // is always the last field.
        std::vector<std::string> fields = split(seq->comment.s, ' ');
        if(fields.empty()) {
            // back() on an empty vector is undefined behaviour
            fprintf(stderr, "error: no path associated with read %s\n", seq->name.s);
            exit(EXIT_FAILURE);
        }
        read_to_path_map[seq->name.s] = fields.back();
    }

    kseq_destroy(seq);
    gzclose(gz_fp);

    // Sanity check that the first path actually points to a file
    if(read_to_path_map.size() > 0) {
        std::string first_read = read_to_path_map.begin()->first;
        std::string first_path = read_to_path_map.begin()->second;
        struct stat file_s;
        int ret = stat(first_path.c_str(), &file_s);
        if(ret != 0) {
            fprintf(stderr, "Error: could not find path to FAST5 for read %s\n", first_read.c_str());
            fprintf(stderr, "Please make sure that this path is accessible: %s\n", first_path.c_str());
            exit(EXIT_FAILURE);
        }
    }

    // Write the map as a fofn file so next time we don't have to parse
    // the entire fasta
    write_to_fofn(fasta_filename + FOFN_SUFFIX);
}
// Close every read file that is still open: destroy its kseq reader, close
// the gzip stream and null fp so the entry is recognised as closed.
// The read_files entries themselves are not freed here.
void reads_parsing::close()
 {
     for (int idx = 0; idx < nb_files; ++idx)
       {
          // already closed entries are skipped
          if (read_files[idx]->fp == NULL)
              continue;

          kseq_destroy(read_files[idx]->seq);
          gzclose(read_files[idx]->fp);
          read_files[idx]->fp = NULL;
       }
   }
Example #25
0
/**
 * Read FASTA/FASTQ records from stdin and print one tab-separated line per
 * record: name, comment, sequence, quality.
 *
 * kseq_read() only resets the comment and quality lengths between records,
 * so their buffers may be NULL (never allocated) or still hold text from an
 * earlier record. Each field is therefore printed as empty when its length
 * is zero.
 *
 * @return 0 on success, 1 if stdin cannot be wrapped in a gzip stream
 */
int main(void)
{
	gzFile fp;
	kseq_t *seq;
	fp = gzdopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "failed to open stdin\n");
		return 1;
	}
	seq = kseq_init(fp);
	while (kseq_read(seq) >= 0)
		printf("%s\t%s\t%s\t%s\n", seq->name.s,
			   seq->comment.l? seq->comment.s : "",
			   seq->seq.l? seq->seq.s : "",
			   seq->qual.l? seq->qual.s : "");
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Example #26
0
/*
 * Read every sequence from fp_fa, pack it through add1() and write the
 * packed forward sequence to "<prefix>.bis.pac". No reverse-complement
 * strand is appended and the bntseq_t built along the way is destroyed
 * without being dumped (the dump call below is commented out).
 *
 * fp_fa   input stream; it is wrapped in a kseq reader here and is NOT
 *         closed by this function
 * prefix  output file prefix; NOTE(review): it is copied into a 1024-byte
 *         buffer with strcpy/strcat and no length check
 *
 * Returns bns->l_pac after all sequences were added.
 */
int64_t dump_forward_pac(gzFile fp_fa, const char *prefix)
{
	extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac;
	bntamb1_t *q;
	FILE *fp;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	// initial capacities; they are passed by address to add1() below
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	// m_pac counts bases; the buffer holds m_pac/4 bytes
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	strcpy(name, prefix); strcat(name, ".bis.pac");
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		// write ceil(l_pac/4) bytes of packed bases
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		// last byte of the file is l_pac modulo 4
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
  /* Disabled: re-dump of the forward bis bns; per the original note, without it */
  /* the .bis.ann and .bis.amb describe a pac twice as long. */
  /* strcpy(name, prefix); strcat(name, ".bis"); */
  /* bis_bns_dump(bns, prefix); */

  bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
Example #27
0
int main(int argc, char *argv[])
{
	bwaidx_t *idx;
	gzFile fp;
	kseq_t *ks;
	mem_opt_t *opt;

	if (argc < 3) {
		fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
		return 1;
	}

	idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
	if (NULL == idx) {
		fprintf(stderr, "Index load failed.\n");
		exit(EXIT_FAILURE);
	}
	fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
	if (NULL == fp) {
		fprintf(stderr, "Couldn't open %s : %s\n",
				strcmp(argv[2], "-") ? argv[2] : "stdin",
				errno ? strerror(errno) : "Out of memory");
		exit(EXIT_FAILURE);
	}
	ks = kseq_init(fp); // initialize the FASTA/Q parser
	opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values

	while (kseq_read(ks) >= 0) { // read one sequence
		mem_alnreg_v ar;
		int i, k;
		ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
		for (i = 0; i < ar.n; ++i) { // traverse each hit
			mem_aln_t a;
			if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
			a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
			// print alignment
			err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
			for (k = 0; k < a.n_cigar; ++k) // print CIGAR
				err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
			err_printf("\t%d\n", a.NM); // print edit distance
			free(a.cigar); // don't forget to deallocate CIGAR
		}
		free(ar.a); // and deallocate the hit list
	}

	free(opt);
	kseq_destroy(ks);
	err_gzclose(fp);
	bwa_idx_destroy(idx);
	return 0;
}
Example #28
0
/*
 * Print MD5 digests for the sequences of a FASTA file.
 *
 * For each record only letters are kept (lower case is upper-cased) and the
 * digest of the result is printed together with `fn` and the record name.
 * Two summary lines follow: ">ordered", the digest over all filtered
 * sequences in file order, and ">unordered", the XOR of the per-record
 * digests.
 *
 * Exits with status 1 if the file cannot be opened or an MD5 context cannot
 * be created.
 */
static void md5_one(const char *fn)
{
    hts_md5_context *md5_one, *md5_all;
    int l, i, k;
    gzFile fp;
    kseq_t *seq;
    unsigned char unordered[16], digest[16];
    char hex[33];

    for (l = 0; l < 16; ++l) unordered[l] = 0;
    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
    if (fp == 0) {
        fprintf(stderr, "md5fa: %s: No such file or directory\n", fn);
        exit(1);
    }

    if (!(md5_all = hts_md5_init()))
        exit(1);

    if (!(md5_one = hts_md5_init())) {
        hts_md5_destroy(md5_all);
        exit(1);
    }

    seq = kseq_init(fp);
    while ((l = kseq_read(seq)) >= 0) {
        // compact in place, keeping letters only; the ctype functions need
        // an unsigned char value
        for (i = k = 0; i < seq->seq.l; ++i) {
            unsigned char base = (unsigned char)seq->seq.s[i];
            if (islower(base)) seq->seq.s[k++] = toupper(base);
            else if (isupper(base)) seq->seq.s[k++] = seq->seq.s[i];
        }
        hts_md5_reset(md5_one);
        hts_md5_update(md5_one, (unsigned char*)seq->seq.s, k);
        hts_md5_final(digest, md5_one);
        hts_md5_hex(hex, digest);
        for (l = 0; l < 16; ++l)
            unordered[l] ^= digest[l];
        printf("%s  %s  %s\n", hex, fn, seq->name.s);
        hts_md5_update(md5_all, (unsigned char*)seq->seq.s, k);
    }
    hts_md5_final(digest, md5_all);
    kseq_destroy(seq);
    gzclose(fp); // kseq_destroy() does not close the underlying stream

    hts_md5_hex(hex, digest);
    printf("%s  %s  >ordered\n", hex, fn);
    hts_md5_hex(hex, unordered);
    printf("%s  %s  >unordered\n", hex, fn);

    hts_md5_destroy(md5_all);
    hts_md5_destroy(md5_one);
}
Example #29
0
/* Release a sequence reader: close the BAM handle or the gzip stream plus
 * kseq reader, close each of the three sai streams that is open, and free
 * the handle. A null bs is ignored. */
void bwa_seq_close(bwa_seqio_t *bs)
{
    int slot = 0;

    if (bs == 0) return;

    if (!bs->is_bam) {
        gzclose(bs->ks->f->f);
        kseq_destroy(bs->ks);
    } else {
        bam_close(bs->fp);
    }

    while (slot != 3) {
        if (bs->sai[slot])
            fclose(bs->sai[slot]);
        ++slot;
    }
    free(bs);
}
Example #30
0
/// Append the name and sequence length of every record in a FASTA/FASTQ
/// file to the two output vectors.
///
/// @param fastq_filename  path of the (optionally gzip-compressed) file
/// @param fastq_names     receives one name per record
/// @param fastq_lengths   receives the matching sequence lengths
///
/// If the file cannot be opened, both vectors are left untouched.
void read_fastq(std::string fastq_filename, std::vector<std::string>* fastq_names,
                std::vector<size_t>* fastq_lengths)
{
  gzFile fastq_file = gzopen(fastq_filename.c_str(), "r");
  if (!fastq_file) {
    return;
  }

  kseq_t* seq = kseq_init(fastq_file);
  while (kseq_read(seq) >= 0) {
    fastq_names->push_back(std::string(seq->name.s));
    fastq_lengths->push_back(seq->seq.l);
  }
  kseq_destroy(seq);
  gzclose(fastq_file);
}