Exemple #1
0
/*
 * Randomly subsample the input: each record (or record pair, when `paired`
 * is non-zero) is written out when drand48() draws a value below f.
 *
 * ks1 -> fs_out1 is always used; ks2 -> fs_out2 only in paired mode.
 * Returns 0 once reading stops, or -1 when kseq_read() reported -2 on
 * either stream.
 */
int
sample_f(double f, kseq_t *ks1, kseq_t *ks2, FILE *fs_out1, FILE *fs_out2, int paired)
{
	int n_records = 0;
	int rc1 = 0, rc2 = 0;

	srand48((unsigned long)ks1 + time(NULL));

	for (;;) {
		rc1 = kseq_read(ks1);
		if (rc1 < 0)
			break;
		if (paired) {
			/* The mate is only read after the first stream delivered a record. */
			rc2 = kseq_read(ks2);
			if (rc2 < 0)
				break;
		}

		if (drand48() < f) {
			/* XXX: Do a consistency check between the reads? */
			kseq_write(ks1, fs_out1);
			if (paired)
				kseq_write(ks2, fs_out2);
		}
		n_records++;
	}

	if (rc1 == -2 || rc2 == -2) {
		/* The line estimate assumes four lines per record. */
		fprintf(stderr, "invalid fastq entry at line %d\n", 4*n_records+1);
		return -1;
	}

	return 0;
}
Exemple #2
0
/*
 * seqtk famask: print <src.fa> with the per-base mask from <mask.fa> applied.
 *
 * Records are consumed pairwise from the two inputs ("-" selects stdin).
 * For each position a mask character 'x' lowercases the source base, 'X'
 * keeps it unchanged, and any other mask character replaces it.  Output is
 * FASTA on stdout with a line break every 60 bases; only the first
 * min(source length, mask length) bases are written.  Name or length
 * mismatches are reported on stderr but do not stop processing.
 *
 * Returns 0 on success; 1 on a usage error, when an input cannot be opened,
 * or when the mask input runs out of records before the source input.
 */
int stk_famask(int argc, char *argv[])
{
	gzFile fp[2];
	kseq_t *seq[2];
	int i, l;
	if (argc < 3) {
		fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
		return 1;
	}
	for (i = 0; i < 2; ++i) {
		fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
		if (fp[i] == 0) {
			fprintf(stderr, "[%s] failed to open '%s'\n", __func__, argv[optind+i]);
			return 1;
		}
		seq[i] = kseq_init(fp[i]);
	}
	while (kseq_read(seq[0]) >= 0) {
		int min_l, c[2];
		/* Without this check a short mask file would leave seq[1] holding the
		 * previous record (or nothing at all if the mask file is empty). */
		if (kseq_read(seq[1]) < 0) {
			fprintf(stderr, "[%s] no mask record for sequence %s\n", __func__, seq[0]->name.s);
			return 1;
		}
		if (strcmp(seq[0]->name.s, seq[1]->name.s))
			fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
		if (seq[0]->seq.l != seq[1]->seq.l)
			fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, (long)seq[0]->seq.l, (long)seq[1]->seq.l);
		min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
		printf(">%s", seq[0]->name.s);
		for (l = 0; l < min_l; ++l) {
			/* unsigned char keeps tolower() well-defined for bytes >= 0x80 */
			c[0] = (unsigned char)seq[0]->seq.s[l]; c[1] = (unsigned char)seq[1]->seq.s[l];
			if (c[1] == 'x') c[0] = tolower(c[0]);
			else if (c[1] != 'X') c[0] = c[1];
			if (l%60 == 0) putchar('\n');
			putchar(c[0]);
		}
		putchar('\n');
	}
	/* NOTE(review): neither the kseq readers nor the gz handles are released
	 * before returning; confirm whether that is intentional for this command. */
	return 0;
}
Exemple #3
0
/*
 * fermi interleave: merge two FASTQ inputs into one interleaved stream on
 * stdout.
 *
 * Records are read pairwise from <in1.fq> and <in2.fq> ("-" selects stdin);
 * processing stops as soon as either input is exhausted.  A trailing
 * "/<digit>" is stripped from the first read's name and the second read is
 * given the same name before both are formatted with write_seq().
 *
 * Returns 0 on success, 1 on a usage error or when an input cannot be opened.
 */
int main_interleave(int argc, char *argv[])
{
	gzFile fp1, fp2;
	kseq_t *seq[2];
	kstring_t str;

	if (argc < 3) {
		fprintf(stderr, "Usage: fermi interleave <in1.fq> <in2.fq>\n");
		return 1;
	}
	str.l = str.m = 0; str.s = 0;
	fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
	if (fp1 == 0) {
		fprintf(stderr, "[main_interleave] failed to open '%s'\n", argv[1]);
		return 1;
	}
	fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
	if (fp2 == 0) {
		fprintf(stderr, "[main_interleave] failed to open '%s'\n", argv[2]);
		gzclose(fp1);
		return 1;
	}
	seq[0] = kseq_init(fp1);
	seq[1] = kseq_init(fp2);
	while (kseq_read(seq[0]) >= 0) {
		if (kseq_read(seq[1]) < 0) break; // one file ends
		str.l = 0;
		// the unsigned char cast keeps isdigit() well-defined for bytes >= 0x80
		if (seq[0]->name.l > 2 && seq[0]->name.s[seq[0]->name.l-2] == '/' && isdigit((unsigned char)seq[0]->name.s[seq[0]->name.l-1]))
			seq[0]->name.s[(seq[0]->name.l -= 2)] = 0; // trim trailing "/[0-9]$"
		seq[1]->name.l = 0;
		kputsn(seq[0]->name.s, seq[0]->name.l, &seq[1]->name); // make sure two ends having the same name
		write_seq(seq[0], &str);
		write_seq(seq[1], &str);
		fputs(str.s, stdout);
	}
	kseq_destroy(seq[0]); gzclose(fp1);
	kseq_destroy(seq[1]); gzclose(fp2);
	free(str.s);
	return 0;
}
Exemple #4
0
// Scan every record after the first one in `filename` and mark, in
// reference_sequence, each column at which a record disagrees with the
// reference by overwriting that column with '*'.
//
// reference_sequence  in/out buffer, modified in place
// filename            alignment file; its first record is skipped
// length_of_genome    number of columns to compare
// exclude_gaps        non-zero: '-' and 'N' in a record never count as a
//                     difference, and a '-'/'N' reference column is replaced
//                     by the first other base seen there;
//                     zero: 'N' in a record is compared as '-'
//
// Returns the number of columns newly marked '*'.  Returns 0 (after printing
// a message to stderr) when the file cannot be opened.
int detect_snps(char reference_sequence[], char filename[], int length_of_genome, int exclude_gaps)
{
  int i;
  int number_of_snps = 0;
  int l;
  int columns;
  char base;
  char upper_base;

  gzFile fp;
  kseq_t *seq;

  fp = gzopen(filename, "r");
  if (fp == NULL)
  {
    fprintf(stderr, "Could not open %s for reading\n", filename);
    return 0;
  }
  seq = kseq_init(fp);
  // First sequence is the reference sequence so skip it
  kseq_read(seq);

  while ((l = kseq_read(seq)) >= 0) {
    // Never index past the end of a record that is shorter than the reference
    columns = length_of_genome;
    if (columns > 0 && seq->seq.l < (size_t) columns)
    {
      columns = (int) seq->seq.l;
    }

    for(i = 0; i < columns; i++)
    {
      base = seq->seq.s[i];
      // unsigned char keeps toupper() well-defined for bytes >= 0x80
      upper_base = (char) toupper((unsigned char) base);

      if(exclude_gaps)
      {
        // If there is an indel in the reference sequence, replace with the first proper base you find
        if((reference_sequence[i] == '-' && base != '-' ) || (toupper((unsigned char) reference_sequence[i]) == 'N' && base != 'N' ))
        {
          reference_sequence[i] = upper_base;
        }

        if(reference_sequence[i] != '*' && base != '-' && upper_base != 'N' && reference_sequence[i] != upper_base)
        {
          reference_sequence[i] = '*';
          number_of_snps++;
        }
      }
      else
      {
        // Unknown bases are compared as gaps
        char input_base = upper_base;
        if(input_base == 'N')
        {
          input_base = '-';
        }

        if(reference_sequence[i] != '*' && reference_sequence[i] != input_base)
        {
          reference_sequence[i] = '*';
          number_of_snps++;
        }
      }
    }
  }

  kseq_destroy(seq);
  gzclose(fp);

  return number_of_snps;
}
Exemple #5
0
/*
 * Walk every record of input_file and hand the selected ones to size_filter().
 *
 * A record is selected when its name or comment matches the regex (via
 * search_header()); with `exclude` non-zero the selection is inverted.
 * The format flag passed to size_filter() is taken from the first record:
 * FASTA when no quality buffer exists after reading it, FASTQ otherwise.
 *
 * Returns the sum of the values returned by size_filter().  With just_count
 * set, the totals are also printed to stdout.  Exits if the file cannot be
 * opened.
 */
int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count) {
	gzFile in;
	kseq_t *rec;
	int n_kept = 0;
	int n_skipped = 0;
	int fasta_input;
	int status;

	/* open fasta file */
	in = gzopen(input_file,"r");
	if (!in) {
		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
		exit(EXIT_FAILURE);
	}

	rec = kseq_init(in);

	/* peek at the first record to determine the file type, then start over */
	status = kseq_read(rec);
	fasta_input = (rec->qual.s == NULL);
	gzrewind(in);
	kseq_rewind(rec);

	if (verbose_flag) {
		if (fasta_input)
			fprintf(stderr, "Input is FASTA format\n");
		else
			fprintf(stderr, "Input is FASTQ format\n");
	}

	while ((status = kseq_read(rec)) >= 0) {
		/* the comment is only searched when the name did not match */
		const int matched = search_header(re, re_extra, rec->name.s) || search_header(re, re_extra, rec->comment.s);
		const int selected = exclude ? !matched : matched;

		if (selected)
			n_kept += size_filter(rec, fasta_input, min, max, length, convert, just_count);
		else
			n_skipped++;
	}
	kseq_destroy(rec);
	gzclose(in); /* done reading file so close */

	if (just_count) {
		fprintf(stdout, "Total output: %i\n", n_kept);
		fprintf(stdout, "Total excluded: %i\n", n_skipped);
	}
	return n_kept;
}
Exemple #6
0
// Read one sequence from the fastq stream and store it reversed in
// half_byte_array (i.e. 4 bits per base).
//
// bs               reader whose kseq stream supplies the next record
// half_byte_array  destination; written from start_index onwards
// start_index      first half-byte slot to write
// length           out: number of bases written (0 when nothing is stored)
// mid              number of leading bases (the MID tag) that are skipped
//
// The read is first capped at MAX_READ_LENGTH; bases mid..len-1 of the
// capped read are then written in reverse order.
// Returns the (possibly capped) read length, or the kseq_read() result when
// the record is missing or not longer than the MID tag.
int bwa_read_seq_one_half_byte (bwa_seqio_t *bs, unsigned char * half_byte_array, unsigned int start_index, unsigned short * length, int mid)
{
	kseq_t *seq = bs->ks;
	int len, i, mided_len;

	*length = 0;

	len = kseq_read(seq);
	// process only when a record was read and it is longer than the mid tag
	if (len < 0 || len <= mid) return len;

	//To cut the length of the sequence
	if ( len > MAX_READ_LENGTH) len = MAX_READ_LENGTH;

	mided_len = len - mid;
	// capping can leave nothing beyond the mid tag; avoid storing a wrapped length
	if (mided_len <= 0) return len;

	for (i = 0; i < mided_len; i++)
	{
		// index the table with an unsigned byte so bytes >= 0x80 cannot go negative
		write_to_half_byte_array(half_byte_array,start_index+i,nst_nt4_table[(unsigned char)seq->seq.s[len-i-1]]);
	}

	*length = mided_len;

	return len;
}
Exemple #7
0
// Print length statistics for the sequences read from stdin: number of
// reads, total bases, longest, shortest and average length.
//
// Takes no arguments besides the program name; prints usage and exits with
// status 1 otherwise.  For empty input the shortest length and the average
// are reported as 0 rather than the initial sentinel and a 0/0 division.
// Returns 0 on success, 1 when stdin cannot be wrapped by gzdopen().
int main_read_stat(int argc, char **argv) {
	if (argc > 1) {
		fprintf(stderr, "Usage: cat *.fq | %s\n", argv[0]);
		exit(1);
	}

	gzFile fp = gzdopen(fileno(stdin), "r");
	if (fp == NULL) {
		fprintf(stderr, "%s: failed to open stdin\n", argv[0]);
		return 1;
	}
	kseq_t *seq = kseq_init(fp); // kseq to read files
	int max_len = 0;
	int min_len = 999999999;
	long long total_len = 0;
	long long num_reads = 0;

	while (kseq_read(seq) >= 0) {
		++num_reads;
		total_len += seq->seq.l;
		max_len = std::max(seq->seq.l, (size_t)max_len);
		min_len = std::min(seq->seq.l, (size_t)min_len);
	}

	double avg_len = 0.0;
	if (num_reads > 0) {
		avg_len = total_len * 1.0 / num_reads;
	} else {
		min_len = 0; // no read ever lowered the sentinel
	}

	printf("number reads: %lld\ntotal size:%lld\nlongest: %d\nshortest: %d\navg: %lf\n", num_reads, total_len, max_len, min_len, avg_len);

	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Exemple #8
0
// Read the next record through kseq_read() and publish it on seqObj:
// name, comment (NULL when the comment is empty), sequence, quality (NULL
// unless bit 2 of the return value is set), plus the packed buffers filled
// by base2dbit().
// Returns the sequence length when kseq_read() returned a positive value,
// otherwise that return value unchanged.
// NOTE(review): the closing brace of this function is not present in this
// excerpt.
// NOTE(review): the return value is treated as a small type code here
// ("1 or 3"), not as a sequence length -- confirm against the kseq variant
// this project uses.
ssize_t read_kseq_with2bit(SeqFileObj * const seqObj) {
    size_t seqlen;	// in fact size_t, but negative values are meaningful.
    int_fast8_t rvalue = kseq_read(seqObj->fobj);
    if (rvalue>0) {
        uint_fast8_t type = rvalue; // 1 or 3. No need to &3 now.
        kseq_t *kseq;
        kseq = seqObj->fobj;
        seqlen = kseq->seq.l;
		seqObj->name = kseq->name.s;
		if (! kseq->comment.l) seqObj->comment = NULL;
		 else seqObj->comment = kseq->comment.s;
		seqObj->seq = kseq->seq.s;
        if (rvalue&2) { // withQ
            //encodeQ;
            type |= 8u;
            seqObj->qual = kseq->qual.s;
        } else {
            seqObj->qual = NULL;
        }
        // One unit per 32 bases, rounded up.
        size_t needtomallocQQW = (seqlen+31u)>>5;  // 1 "QQWord" = 4 QWord = 32 bp. Well, I say there is QQW.
        // Grow the packed buffers only when the current record needs more units than allocated so far.
        if (needtomallocQQW > seqObj->binMallocedQQWord) {
            KROUNDUP32(needtomallocQQW);	// presumably rounds up to the next power of two (klib macro) -- TODO confirm
            seqObj->binMallocedQQWord = needtomallocQQW;
            seqObj->diBseq = realloc(seqObj->diBseq,needtomallocQQW<<3);	// 2^3=8 bytes per unit
            seqObj->hexBQ = realloc(seqObj->hexBQ,needtomallocQQW<<5);	// 4*2^3=32 bytes per unit
        }
        // base2dbit() sees the sequence before NormalizeChrSeq() rewrites it below.
        seqObj->binNcount = base2dbit(seqlen, kseq->seq.s, seqObj->qual, seqObj->diBseq, seqObj->hexBQ);
// printf("-[%s]<%s><%zx>[%s]-\n",kseq->seq.s, qstr, seqObj->diBseq[0], unit2basechr(seqObj->diBseq[0]));
    // Well, how to deal with lowercase masking ? Not using this information yet.
        NormalizeChrSeq(kseq->seq.s);   // to /[ATCGN]*/
        seqObj->readlength = seqlen;
        seqObj->type = type;
        return seqlen;
    } else return rvalue;
Exemple #9
0
// Read the next record through kseq_read() and expose its fields on seqObj
// without building any packed representation.
// The sequence is passed through NormalizeChrSeq(); comment is NULL when
// empty; qual is NULL unless bit 2 of the kseq_read() result is set, in
// which case bit 8 is added to the stored type.
// Returns the sequence length for a positive kseq_read() result, otherwise
// that result unchanged.
ssize_t read_kseq_no2bit(SeqFileObj * const seqObj) {
    size_t n_bases;	// returned through ssize_t
    int_fast8_t status = kseq_read(seqObj->fobj);
    if (status <= 0) return status;

    uint_fast8_t flags = status; // 1 or 3
    kseq_t *record;
    record = seqObj->fobj;
    n_bases = record->seq.l;

    seqObj->name = record->name.s;
    if (record->comment.l) {
        seqObj->comment = record->comment.s;
    } else {
        seqObj->comment = NULL;
    }
    seqObj->seq = record->seq.s;
    NormalizeChrSeq(record->seq.s);   // to /[ATCGN]*/

    if (status&2) { // record carries qualities
        flags |= 8u;
        seqObj->qual = record->qual.s;
    } else {
        seqObj->qual = NULL;
    }

    seqObj->readlength = n_bases;
    seqObj->type = flags;
    return n_bases;
}
Exemple #10
0
// For every record in `filename`, copy the base found at each SNP location
// into bases_for_snps[snp][record], upper-cased, with 'N' stored as '-'.
//
// filename          alignment file to read
// snp_locations     column index of each SNP (number_of_snps entries)
// bases_for_snps    output matrix indexed [snp][record number]
// length_of_genome  not used by this function
// number_of_snps    number of entries in snp_locations
//
// A location that lies outside a record is stored as '-' instead of reading
// past the end of that record.  If the file cannot be opened a message is
// printed to stderr and nothing is written.
void get_bases_for_each_snp(char filename[], int snp_locations[], char ** bases_for_snps, int length_of_genome, int number_of_snps)
{
  int l;
  int i = 0;
  int sequence_number = 0;

  gzFile fp;
  kseq_t *seq;

  fp = gzopen(filename, "r");
  if (fp == NULL)
  {
    fprintf(stderr, "Could not open %s for reading\n", filename);
    return;
  }
  seq = kseq_init(fp);

  while ((l = kseq_read(seq)) >= 0)
  {
    for(i = 0; i < number_of_snps; i++)
    {
      int location = snp_locations[i];
      char base = '-';

      if(location >= 0 && (size_t) location < seq->seq.l)
      {
        // unsigned char keeps toupper() well-defined for bytes >= 0x80
        base = (char) toupper((unsigned char) seq->seq.s[location]);
      }
      // Present gaps and unknowns in the same way to Gubbins
      if(base == 'N')
      {
        base = '-';
      }
      bases_for_snps[i][sequence_number] = base;
    }
    sequence_number++;
  }

  kseq_destroy(seq);
  gzclose(fp);
}
/*
 * Load a mask file into a hash keyed by sequence name.
 *
 * Each record becomes one entry holding the record length (ori_len) and a
 * bit array with one bit per position; bit i is set when character i of the
 * record is '3'.  Names are duplicated with strdup(); a repeated name trips
 * the assertion.
 */
static khash_t(s) *load_mask(const char *fn)
{
	khash_t(s) *table = kh_init(s);
	gzFile in = gzopen(fn, "r");
	kseq_t *rec = kseq_init(in);

	while (kseq_read(rec) >= 0) {
		int is_new;
		size_t pos;
		mask32_t *entry;
		khint_t slot = kh_put(s, table, strdup(rec->name.s), &is_new);

		assert(is_new); // duplicated name
		entry = &kh_val(table, slot);
		entry->ori_len = rec->seq.l;
		/* one 32-bit word per 32 positions, rounded up and zero-filled */
		entry->mask = (uint32_t*)calloc((rec->seq.l+31)/32, 4);
		for (pos = 0; pos < rec->seq.l; ++pos) {
			if (rec->seq.s[pos] != '3') continue;
			entry->mask[pos/32] |= 1u << (pos%32);
		}
	}
	kseq_destroy(rec);
	gzclose(in);
	return table;
}
Exemple #12
0
// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file
//
// passPrefix   path of the nucleotide FASTA; only its first record is used
// passProName  path of the protein FASTA to write
// passAnnName  path of the annotation file scanned by getNextCDS()
//
// One output record is written per CDS returned by getNextCDS(); the record
// header is the current annotation line counter.
// Returns 0 on success, 1 when a file cannot be opened or the FASTA holds no
// readable record.
int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) {
	struct CDS currentCDS;
	gzFile  inputSeqPtr;
	FILE * inputAnnPtr, * outputPtr;
	char * outputBuffer;
	kseq_t * seq;
	unsigned long outputSize, currentLine;

	// Prepare file handles
	inputSeqPtr = xzopen(passPrefix, "r");
	inputAnnPtr = fopen(passAnnName, "r");
	if (inputAnnPtr == NULL) {
		fprintf(stderr, "[writeIndexProtein] fail to open annotation file '%s'\n", passAnnName);
		err_gzclose(inputSeqPtr);
		return 1;
	}
	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		fprintf(stderr, "[writeIndexProtein] fail to open output file '%s'\n", passProName);
		err_fclose(inputAnnPtr);
		err_gzclose(inputSeqPtr);
		return 1;
	}

	// Read in 1st sequence data
	// NOTE(review): seq is never released in this function; confirm whether
	// the reader should be destroyed before returning.
	seq = kseq_init(inputSeqPtr);
	if (kseq_read(seq) < 0) {
		fprintf(stderr, "[writeIndexProtein] no sequence could be read from '%s'\n", passPrefix);
		err_gzclose(inputSeqPtr);
		err_fclose(inputAnnPtr);
		err_fclose(outputPtr);
		return 1;
	}

	// Iterate through each CDS sequence in the annotation file
	currentLine = 0;
	while (getNextCDS(inputAnnPtr, &currentCDS, &currentLine)) {
		convertToAA(seq->seq.s, &currentCDS, &outputBuffer, &outputSize);
		// %lu matches the unsigned long counter; the '*' precision must be an int
		fprintf(outputPtr, ">%lu\n%.*s\n", currentLine, (int) outputSize, outputBuffer);
		free(outputBuffer);
	}

	// Close files
	fflush(outputPtr);
	err_gzclose(inputSeqPtr);
	err_fclose(inputAnnPtr);
	err_fclose(outputPtr);

	return 0;
}
Exemple #13
0
// Fill reference_sequence from the first record of `filename`: every base is
// upper-cased and 'N' is stored as '-'.  The buffer is NUL-terminated right
// after the last base.  Always returns 1.
int build_reference_sequence(char reference_sequence[], char filename[])
{
	gzFile alignment = gzopen(filename, "r");
	kseq_t *first_record = kseq_init(alignment);
	int pos;

	kseq_read(first_record);

	for(pos = 0; pos < first_record->seq.l; pos++)
	{
		const char upper = toupper(first_record->seq.s[pos]);
		// unknown bases are represented the same way as gaps
		reference_sequence[pos] = (upper == 'N') ? '-' : upper;
	}

	// terminate the copy unless a terminator is already in place
	if(reference_sequence[first_record->seq.l] != '\0')
	{
		reference_sequence[first_record->seq.l] = '\0';
	}

	kseq_destroy(first_record);
	gzclose(alignment);
	return 1;
}
Exemple #14
0
// Stream every sequence of manager->inputFileName through
// streamOneStringUnchanged().
// Returns 0 when the whole file was processed, 1 when the file cannot be
// opened or when streamOneStringUnchanged() reports a non-zero status.
// The reader and the file handle are released on both paths.
int streamAndCountOneFile(KWTCounterManager *manager)
{
	gzFile inputFP;
	kseq_t* seq;
	int status = 0;

	//open file to read lines
	if(!( inputFP = gzopen ( manager->inputFileName , "r" )))
	{
		printf("Could not open input file \"%s\" for reading\n", manager->inputFileName);
		return 1;
	}

	// initialize reader
	seq = kseq_init(inputFP);

	// read sequences
	while(kseq_read(seq) >= 0)
	{
		if(streamOneStringUnchanged(manager, seq->seq.s, seq->seq.l) != 0)
		{
			// stop at the first failure, but fall through to the shared
			// cleanup so the reader is not leaked
			status = 1;
			break;
		}
	}

	kseq_destroy(seq);
	gzclose(inputFP);
	return status;
}
Exemple #15
0
static seqs_t *load_seqs(const char *fn)
{
	seqs_t *s;
	seq1_t *p;
	gzFile fp;
	int l;
	kseq_t *seq;

	fp = xzopen(fn, "r");
	seq = kseq_init(fp);
	s = (seqs_t*)calloc(1, sizeof(seqs_t));
	s->m_seqs = 256;
	s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
	while ((l = kseq_read(seq)) >= 0) {
		if (s->n_seqs == s->m_seqs) {
			s->m_seqs <<= 1;
			s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
		}
		p = s->seqs + (s->n_seqs++);
		p->l = seq->seq.l;
		p->s = (unsigned char*)malloc(p->l + 1);
		memcpy(p->s, seq->seq.s, p->l);
		p->s[p->l] = 0;
		p->n = strdup((const char*)seq->name.s);
	}
	kseq_destroy(seq);
	gzclose(fp);
	fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
	return s;
}
Exemple #16
0
/*
 * Read every record of `fn` into a newly allocated array of bseq1_t.
 *
 * For each record the sequence is duplicated, the quality string is
 * duplicated when present (0 otherwise) and l_seq is set to the sequence
 * length.  The number of records is stored in *n_.
 * Returns the array, or 0 with *n_ == 0 when the file cannot be opened.
 */
bseq1_t *bseq_read(const char *fn, int *n_)
{
	FILE *in;
	kseq_t *rec;
	bseq1_t *reads = 0;
	int capacity = 0, count = 0;

	*n_ = 0;
	in = fopen(fn, "rb");
	if (in == 0) return 0;
	rec = kseq_init(in);

	while (kseq_read(rec) >= 0) {
		bseq1_t *cur;
		if (count >= capacity) { /* start at 256 entries, then double */
			capacity = capacity? capacity<<1 : 256;
			reads = realloc(reads, capacity * sizeof(bseq1_t));
		}
		cur = reads + count;
		cur->seq = strdup(rec->seq.s);
		cur->qual = rec->qual.l? strdup(rec->qual.s) : 0;
		cur->l_seq = rec->seq.l;
		++count;
	}
	*n_ = count;

	kseq_destroy(rec);
	fclose(in);
	return reads;
}
Exemple #17
0
/*
 * Load the sequences of `seqfile` into reflist->sequences, upper-casing
 * every base, and print a short preview of the first ten entries to stderr.
 *
 * At most reflist->ns records are stored, so a file with more records than
 * the list was sized for cannot write past reflist->sequences.
 * Returns 1 on success, -1 when the file cannot be opened.
 *
 * NOTE(review): assumes reflist->sequences[i] has room for
 * reflist->lengths[i] + 1 bytes and that lengths[i] matches the record
 * length -- confirm against the code that fills reflist.
 */
int read_fasta(char* seqfile, REFLIST* reflist) {
    clock_t t;
    kseq_t *seq;
    int i = 0, j = 0;
    gzFile fp = gzopen(seqfile, "r");
    /* check the handle before a reader is created on top of it */
    if (fp == NULL) {
        fprintf(stderr, "file %s not found \n", seqfile);
        return -1;
    }
    /* NOTE(review): seq is never released in this function; confirm whether
     * the reader should be destroyed before returning. */
    seq = kseq_init(fp);
    fprintf(stderr, "reading reference sequence file %s with %d sequences\n", seqfile, reflist->ns);
    t = clock();
    while (i < reflist->ns && kseq_read(seq) >= 0) {
        memcpy(reflist->sequences[i], seq->seq.s, seq->seq.l);
        for (j = 0; j < seq->seq.l; j++) {
            /* unsigned char keeps toupper() well-defined for bytes >= 0x80 */
            reflist->sequences[i][j] = toupper((unsigned char)reflist->sequences[i][j]);
        }
        i++;
    }
    gzclose(fp); fp=NULL;
    for (i = 0; i < reflist->ns; i++) {
        reflist->sequences[i][reflist->lengths[i]] = '\0';
        if (i < 10) {
            fprintf(stderr, "%s %d ", reflist->names[i], reflist->lengths[i]);
            /* preview at most 30 bases, never past the end of a short sequence */
            for (j = 0; j < 30 && j < reflist->lengths[i]; j++) fprintf(stderr, "%c", reflist->sequences[i][j]);
            fprintf(stderr, "\n");
        }
    }
    fprintf(stderr, "read reference sequence file in %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
    return 1;
}
Exemple #18
0
/*
 * Write one @SQ header line per sequence of `fn` ("-" reads stdin) to
 * args->output_fname, or to stdout when no output name is given.
 *
 * For each record the sequence is reduced in place to its characters in the
 * range '!'..'~', upper-cased; LN is the number of kept characters and M5
 * the MD5 hex digest of them.  UR is args->uri when set, otherwise a file://
 * URI built from the resolved path of `fn` (not emitted for stdin, and
 * skipped with a warning when the path cannot be resolved).  AS and SP are
 * added when the corresponding args fields are set.  An @HD line is written
 * first when args->header is set.
 *
 * Exits with status 1 when the input or output cannot be opened or the MD5
 * context cannot be created.
 */
static void write_dict(const char *fn, args_t *args)
{
    hts_md5_context *md5;
    int l, i, k;
    gzFile fp;
    kseq_t *seq;
    unsigned char digest[16];
    char hex[33];

    fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
    if (fp == 0) {
        fprintf(stderr, "dict: %s: No such file or directory\n", fn);
        exit(1);
    }
    FILE *out = stdout;
    if (args->output_fname) {
        out = fopen(args->output_fname, "w");
        if (out == NULL) {
          fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
          exit(1);
        }
    }

    if (!(md5 = hts_md5_init()))
        exit(1);

    seq = kseq_init(fp);
    if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
    while ((l = kseq_read(seq)) >= 0) {
        /* compact the sequence in place: keep printable characters only */
        for (i = k = 0; i < seq->seq.l; ++i) {
            if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~')
                seq->seq.s[k++] = toupper(seq->seq.s[i]);
        }
        hts_md5_reset(md5);
        hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
        hts_md5_final(digest, md5);
        hts_md5_hex(hex, digest);
        fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
        if (args->uri)
            fprintf(out, "\tUR:%s", args->uri);
        else if (strcmp(fn, "-") != 0) {
#ifdef _WIN32
            char *real_path = _fullpath(NULL, fn, PATH_MAX);
#else
            char *real_path = realpath(fn, NULL);
#endif
            /* path resolution can fail; never hand NULL to %s */
            if (real_path) {
                fprintf(out, "\tUR:file://%s", real_path);
                free(real_path);
            } else {
                fprintf(stderr, "dict: %s: cannot resolve absolute path, UR tag omitted\n", fn);
            }
        }
        if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
        if (args->species) fprintf(out, "\tSP:%s", args->species);
        fprintf(out, "\n");
    }
    kseq_destroy(seq);
    hts_md5_destroy(md5);
    /* NOTE(review): fp is not closed in this function; confirm whether the
     * gz handle should be released here. */

    if (args->output_fname) fclose(out);
}
/*
 * Parser, first pass over an alignment.
 *
 * The first record defines the column layout: inds[c] is 1 when character c
 * is not '.' and is equal to its own toupper() value, 0 otherwise; N is the
 * number of columns with inds[c] == 1.  Every later record must have the
 * same length and the same layout, otherwise mexErrMsgIdAndTxt() is called.
 *
 * For record k, zinds[k] is 1 when (number of '-' characters) / N is
 * <= max_gap_fraction and 0 otherwise; M counts the records flagged 1.
 *
 * Outputs (ownership passes to the caller): *inds_ptr, *zinds_ptr, *N_ptr,
 * *M_ptr.
 */
void parse_seq_pass1(kseq_t * seq, int ** inds_ptr, int ** zinds_ptr, int * N_ptr, int * M_ptr, double max_gap_fraction)
{
    int t;
    int l0 = 0;
    int N = 0;
    int M = 0;
    int * inds = NULL;
    int * zinds = NULL;
    int zl = 1000;
    int c;
    int ngaps;
    int seqn = 0;

    zinds = malloc(zl * sizeof(int));

    while ((t = kseq_read(seq)) >= 0) {
        char * s = seq->seq.s;
        int l = strlen(s);
        ngaps = 0;
        /* Key the layout setup on the record index, not on M: M stays 0 while
         * leading records are rejected, which used to re-allocate inds and
         * add to N again for each of them. */
        if (seqn == 0) {
            inds = malloc(l * sizeof(int));
            for (c = 0; c < l; ++c) {
                inds[c] = (s[c] != '.' && s[c] == toupper(s[c]));
                ngaps += (s[c] == '-');
            }
            l0 = l;
            for (c = 0; c < l0; ++c) {
                N += inds[c];
            }
        } else {
            if (l != l0) {
                mexErrMsgIdAndTxt("read_alignment_fasta:input", "input data is unaligned");
            }
            for (c = 0; c < l; ++c) {
                if (inds[c] != (s[c] != '.' && s[c] == toupper(s[c]))) {
                    mexErrMsgIdAndTxt("read_alignment_fasta:input", "input data is unaligned?");
                }
                ngaps += (s[c] == '-');
            }
        }
        /* grow before writing slot seqn; valid indices are 0..zl-1 */
        if (seqn >= zl) {
            zl *= 2;
            zinds = realloc(zinds, zl * sizeof(int));
        }
        if ((double) ngaps / N <= max_gap_fraction) {
            zinds[seqn] = 1;
            ++M;
        } else {
            zinds[seqn] = 0;
        }
        ++seqn;
    }
    *inds_ptr = inds;
    *zinds_ptr = zinds;
    *N_ptr = N;
    *M_ptr = M;
    return;
}
Exemple #20
0
/*
 * seqtk hety: report per-window counts over each sequence of <in.fa>.
 *
 * Each base is classified through seq_nt16_table/bitcnt_table into class 0
 * (more than two bits set), class 2 (exactly two bits set) or class 1
 * (everything else); with -m, lowercase bases are treated as 'N' first.
 * A circular buffer of win_size classes keeps the counts of the most recent
 * win_size bases.  A line is printed every win_step = win_size / n_start
 * bases once win_size bases have been seen, and once more at the end of the
 * sequence, whenever the window holds at least one class-1 or class-2 base.
 *
 * Options: -w window size, -t start positions per window, -m lowercase mask.
 * Returns 0 on success, 1 on a usage error, invalid -w/-t values or an
 * unopenable input.
 */
int stk_hety(int argc, char *argv[])
{
	gzFile fp;
	kseq_t *seq;
	int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
	char *buf;
	uint32_t cnt[3];
	if (argc == 1) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage:   seqtk hety [options] <in.fa>\n\n");
		fprintf(stderr, "Options: -w INT   window size [%d]\n", win_size);
		fprintf(stderr, "         -t INT   # start positions in a window [%d]\n", n_start);
		fprintf(stderr, "         -m       treat lowercases as masked\n");
		fprintf(stderr, "\n");
		return 1;
	}
	while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
		switch (c) {
		case 'w': win_size = atoi(optarg); break;
		case 't': n_start = atoi(optarg); break;
		case 'm': is_lower_mask = 1; break;
		}
	}
	if (optind >= argc) { // only options were given
		fprintf(stderr, "[stk_hety] missing input file\n");
		return 1;
	}
	// win_step is used as a modulus below, so it must end up >= 1
	if (win_size <= 0 || n_start <= 0 || n_start > win_size) {
		fprintf(stderr, "[stk_hety] -w and -t must be positive and -t must not exceed -w\n");
		return 1;
	}
	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
	if (fp == 0) {
		fprintf(stderr, "[stk_hety] failed to open '%s'\n", argv[optind]);
		return 1;
	}
	seq = kseq_init(fp);
	win_step = win_size / n_start;
	buf = calloc(win_size, 1);
	while ((l = kseq_read(seq)) >= 0) {
		int x, i, y, z, next = 0;
		cnt[0] = cnt[1] = cnt[2] = 0;
		for (i = 0; i <= l; ++i) {
			if ((i >= win_size && i % win_step == 0) || i == l) {
				if (i == l && l >= win_size) {
					// final window: drop the bases before the last reported position
					for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
				}
				if (cnt[1] + cnt[2] > 0)
					printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
						   (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
				next = i;
			}
			if (i < l) {
				y = i % win_size;
				// unsigned char keeps islower() and the table index valid for bytes >= 0x80
				c = (unsigned char)seq->seq.s[i];
				if (is_lower_mask && islower(c)) c = 'N';
				c = seq_nt16_table[c];
				x = bitcnt_table[c];
				if (i >= win_size) --cnt[(int)buf[y]]; // evict the base leaving the window
				buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
				++cnt[z];
			}
		}
	}
	free(buf);
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Exemple #21
0
/*
 * Read every sequence of `filename` into freshly allocated arrays.
 *
 * strings_  out: array of strdup()'d sequence strings
 * sizes_    out: array holding the length of each sequence
 * count_    out: number of sequences read
 *
 * Both arrays start with room for 1000 entries and double whenever they
 * fill up.  Any failure to open the file or to allocate memory is reported
 * with perror() and terminates the process with status 1.
 */
static inline void parse_sequences(
        const char *filename,
        char ***strings_,
        unsigned long **sizes_,
        unsigned long *count_)
{
    FILE* fp;
    kseq_t *seq = NULL;
    int l = 0;
    char **strings = NULL;
    unsigned long *sizes = NULL;
    unsigned long count = 0;
    unsigned long memory = 1000;

    fp = fopen(filename, "r");
    if(fp == NULL) {
        perror("fopen");
        exit(1);
    }
    /* the initial allocations are checked the same way as the reallocs below */
    strings = malloc(sizeof(char*) * memory);
    if (NULL == strings) {
        perror("malloc");
        exit(1);
    }
    sizes = malloc(sizeof(unsigned long) * memory);
    if (NULL == sizes) {
        perror("malloc");
        exit(1);
    }
    seq = kseq_init(fileno(fp));
    while ((l = kseq_read(seq)) >= 0) {
        strings[count] = strdup(seq->seq.s);
        if (NULL == strings[count]) {
            perror("strdup");
            exit(1);
        }
        sizes[count] = seq->seq.l;
        ++count;
        /* grow after storing so the next slot is always available */
        if (count >= memory) {
            char **new_strings = NULL;
            unsigned long *new_sizes = NULL;
            memory *= 2;
            new_strings = realloc(strings, sizeof(char*) * memory);
            if (NULL == new_strings) {
                perror("realloc");
                exit(1);
            }
            strings = new_strings;
            new_sizes = realloc(sizes, sizeof(unsigned long) * memory);
            if (NULL == new_sizes) {
                perror("realloc");
                exit(1);
            }
            sizes = new_sizes;
        }
    }
    kseq_destroy(seq);
    fclose(fp);

    *strings_ = strings;
    *sizes_ = sizes;
    *count_ = count;
}
Exemple #22
0
/*
 * Rebuild the two global hashes from `seqid_taxid_file`:
 *   seqid_taxid_rel  seqid -> taxid
 *   tax_to_seqs      taxid -> growable list of seqids
 *
 * Each record supplies the seqid as its name and the taxid as its sequence
 * text (parsed with strtoull, base 10).  A seqid that was already loaded is
 * reported and skipped, so it is not filed a second time (previously it was
 * appended to the list of taxid 0).  A seqid that does not fit, including
 * its terminating NUL, into SEQID_SIZE bytes terminates the program.
 */
void load_seqid_taxid_rel(char * seqid_taxid_file){
    seqid_taxid_rel = NULL;
    tax_to_seqs = NULL;
    gzFile seq_tax = gzopen(seqid_taxid_file, "r");
    if (seq_tax == NULL) {
        printf("could not open %s.  Exiting...", seqid_taxid_file);
        exit(1);
    }
    /* NOTE(review): seq_t is never released in this function; confirm whether
     * the reader should be destroyed before returning. */
    kseq_t * seq_t = kseq_init(seq_tax);
    struct seqid_taxid_single * tstr;
    struct taxid_seqid * tts;
    int l = 0;

    while ((l = kseq_read(seq_t)) >= 0) {
        //add taxid related to seqid
        uint64_t tmp_taxid = 0;
        HASH_FIND_STR(seqid_taxid_rel, seq_t->name.s, tstr);
        if (tstr != NULL) {
            printf("%s already seen in hash",seq_t->name.s);
            continue;
        }
        // >= leaves room for the terminating NUL in the SEQID_SIZE buffers
        if (seq_t->name.l >= SEQID_SIZE) {
            printf("seqid %s is too long, must be less than 100 characters.  Exiting...", seq_t->name.s);
            exit(1);
        }
        tstr = (struct seqid_taxid_single *)malloc(sizeof(struct seqid_taxid_single));
        memset(tstr->seqid, '\0', SEQID_SIZE*sizeof(char));
        strncpy(tstr->seqid, seq_t->name.s, seq_t->name.l);
        tmp_taxid = strtoull(seq_t->seq.s, NULL, 10);
        tstr->taxid = tmp_taxid;
        HASH_ADD_STR(seqid_taxid_rel, seqid, tstr);

        //add seqid(s) related to taxid
        HASH_FIND(hh, tax_to_seqs, &tmp_taxid, sizeof(uint64_t), tts);
        if (tts == NULL) {
            // first seqid for this taxid: create the list with room for 10 entries
            tts = (struct taxid_seqid *)malloc(sizeof(struct taxid_seqid));
            tts->taxid = tmp_taxid;
            tts->num_seqids = 0;
            tts->max_seqs = 10;
            tts->seqids = (char **)malloc(sizeof(char *)*10);
            HASH_ADD(hh, tax_to_seqs, taxid, sizeof(uint64_t), tts);
        }
        else if (tts->num_seqids >= tts->max_seqs) {
            // list is full: double its capacity
            tts->max_seqs = 2 * tts->max_seqs;
            tts->seqids = (char **)realloc(tts->seqids, sizeof(char *)*tts->max_seqs);
        }
        tts->seqids[tts->num_seqids] = (char *)calloc(SEQID_SIZE,sizeof(char));
        strncpy(tts->seqids[tts->num_seqids], seq_t->name.s, seq_t->name.l);
        tts->num_seqids++;
    }
    gzclose(seq_tax);
}
Exemple #23
0
/*
 * gen_mask: turn a raw mask FASTA into a mask of digits '0'..'3'.
 *
 * For every sequence a sliding window covering the last g_len input
 * positions keeps three counters: n_all (positions whose high 16 bits of
 * get_cnt() are non-zero), n_good (those accepted by is_good()) and n_mid
 * (those whose high 16 bits equal 1).  l + g_len - 1 window positions are
 * evaluated per sequence and each one emits:
 *   '0' when n_all is 0,
 *   '3' when n_good/n_all >= g_ratio,
 *   '2' when n_mid/n_all >= g_ratio,
 *   '1' otherwise.
 * Output is wrapped at 60 characters.  Totals per digit and derived ratios
 * are written to stderr at the end.
 *
 * Options: -l sets g_len, -r sets g_ratio.
 * Returns 0 on success, 1 on a usage error or an unopenable input.
 */
int main(int argc, char *argv[])
{
	int l, i, c;
	long long cnt[5], tot;
	kseq_t *seq;
	gzFile fp;
	cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0;
	while ((c = getopt(argc, argv, "l:r:")) >= 0) {
		switch (c) {
		case 'l': g_len = atoi(optarg); break;
		case 'r': g_ratio = atof(optarg); break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: gen_mask [-l %d] [-r %.2lf] <in.rawMask.fa>\n", g_len, g_ratio);
		return 1;
	}
	fp = gzopen(argv[optind], "r");
	if (fp == 0) {
		fprintf(stderr, "gen_mask: failed to open '%s'\n", argv[optind]);
		return 1;
	}
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		int n_good = 0, n_all = 0, n_mid = 0;
		printf(">%s %d %.3lf", seq->name.s, g_len, g_ratio);
		for (i = 0; i < l + g_len - 1; ++i) {
			int c1, c2;
			// position entering the window (nothing enters past the sequence end)
			unsigned x = i < l? get_cnt(seq->seq.s[i]) : 0;
			c1 = x>>16; c2 = x&0xffff;
			if (c1 == 1) ++cnt[4];
			if (c1) {
				++n_all;
				if (is_good(c1, c2)) ++n_good;
				if (c1 == 1) ++n_mid;
			}
			// position leaving the window, g_len behind the one entering
			x = i >= g_len? get_cnt(seq->seq.s[i - g_len]) : 0;
			c1 = x>>16; c2 = x&0xffff;
			if (c1) {
				--n_all;
				if (is_good(c1, c2)) --n_good;
				if (c1 == 1) --n_mid;
			}
			assert(n_all <= g_len && n_good <= n_all);
			if (i % 60 == 0) putchar('\n');
			x = n_all == 0? 0 : (double)n_good/n_all >= g_ratio? 3 : (double)n_mid/n_all >= g_ratio? 2 : 1;
			putchar(x + '0');
			cnt[x]++;
		}
		putchar('\n');
	}
	tot = cnt[1] + cnt[2] + cnt[3];
	fprintf(stderr, "%lld, %lld, %lld, %lld, %lld\n", cnt[0], cnt[1], cnt[2], cnt[3], cnt[4]);
	fprintf(stderr, "%lf, %lf, %lf\n", (double)cnt[3] / tot, (double)(cnt[2] + cnt[3]) / tot, (double)cnt[4] / tot);
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Exemple #24
0
/*
 * Pack the sequences of an open FASTA stream into "<prefix>.pac" and dump
 * the accompanying annotation through bns_dump().
 *
 * fp_fa     open FASTA input
 * prefix    output prefix; ".pac" is appended for the packed file
 * for_only  non-zero: store the forward strand only; zero: also append the
 *           reverse-complemented sequence
 *
 * The random generator is seeded with a fixed value (11) before reading.
 * Returns the total packed length (bns->l_pac), or -1 when the prefix is too
 * long for the internal file-name buffer.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;

	// refuse a prefix that would overflow name[] once ".pac" is appended;
	// checked first so that nothing has been allocated yet
	if (strlen(prefix) + sizeof(".pac") > sizeof(name)) {
		fprintf(stderr, "[bns_fasta2bntseq] prefix is too long: %s\n", prefix);
		return ret;
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	strcpy(name, prefix); strcat(name, ".pac");
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	if (!for_only) { // add the reverse complemented sequence
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		pac = realloc(pac, m_pac/4);
		// clear the bytes that will receive the reverse strand
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
/*
 * Return the sequence of the first record of `filename`.
 *
 * The returned pointer is the reader's own sequence buffer, which is why the
 * reader is deliberately not destroyed here.  Opening and reading are
 * guarded by assertions only.
 */
static char* read_text(char* filename) {
        gzFile fasta = gzopen(filename, "r");
        assert(fasta != NULL && "Could not open fasta file\n");

        kseq_t *record = kseq_init(fasta);
        int status = kseq_read(record);
        assert(status >= 0);

        gzclose(fasta);
        /* kseq_destroy(record) is intentionally omitted, see above */
        return record->seq.s;
}
Exemple #26
0
// Collect copies of the next records from `seq` into a vector.
// Reads until n_wanted records were collected (n_wanted == 0 means no limit)
// or until kseq_read() returns a value <= 0.
static kseq_v read_seqs(kseq_t *seq, size_t n_wanted) {
  kseq_v batch;
  size_t n_read = 0;

  kv_init(batch);
  while (n_wanted == 0 || n_read < n_wanted) {
    if (kseq_read(seq) <= 0)
      break;
    kseq_t copy;
    kseq_copy(&copy, seq);
    kv_push(kseq_t, batch, copy);
    n_read++;
  }
  return batch;
}
Exemple #27
0
// Cut every sequence read from `seq` into consecutive pieces of trunk_size
// and print each piece with print_seq(), numbering pieces from 1.
// When fewer than 1.5 * trunk_size characters remain, the whole remainder is
// printed as the final piece.
void split_psmcfa(int trunk_size, kseq_t *seq)
{
    while (kseq_read(seq) >= 0) {
        int start = 0, piece = 0;
        while (start < seq->seq.l) {
            if (seq->seq.l - start < trunk_size * 3 / 2) { // use the full length
                print_seq(seq, start, seq->seq.l, ++piece);
                break;
            }
            print_seq(seq, start, (start+trunk_size < seq->seq.l)? start+trunk_size : seq->seq.l, ++piece);
            start += trunk_size;
        }
    }
}
/*
 * Parser, second pass: convert every record whose zinds flag is non-zero.
 *
 * Records are numbered in reading order; record k is skipped when zinds[k]
 * is 0.  Kept records are passed to convert_seq() with consecutive output
 * indices starting at 0.
 */
void parse_seq_pass2(kseq_t * seq, double ** Z, int * inds, int * zinds)
{
    int record = 0;
    int row = 0;

    for (; kseq_read(seq) >= 0; ++record) {
        if (zinds[record]) {
            convert_seq(seq->seq.s, inds, Z, row);
            ++row;
        }
    }
}
void Fast5Map::load_from_fasta(std::string fasta_filename)
{
    gzFile gz_fp;

    FILE* fp = fopen(fasta_filename.c_str(), "r");
    if(fp == NULL) {
        fprintf(stderr, "error: could not open %s for read\n", fasta_filename.c_str());
        exit(EXIT_FAILURE);
    }

    gz_fp = gzdopen(fileno(fp), "r");
    if(gz_fp == NULL) {
        fprintf(stderr, "error: could not open %s using gzdopen\n", fasta_filename.c_str());
        exit(EXIT_FAILURE);
    }

    kseq_t* seq = kseq_init(gz_fp);
    
    while(kseq_read(seq) >= 0) {
        if(seq->comment.l == 0) {
            fprintf(stderr, "error: no path associated with read %s\n", seq->name.s);
            exit(EXIT_FAILURE);
        }

        // This splitting code implicitly handles both the 2 and 3 field
        // fasta format that poretools will output. The FAST5 path
        // is always the last field.
        std::vector<std::string> fields = split(seq->comment.s, ' ');
        read_to_path_map[seq->name.s] = fields.back();
    }

    kseq_destroy(seq);
    gzclose(gz_fp);
    fclose(fp);
    
    // Sanity check that the first path actually points to a file
    if(read_to_path_map.size() > 0) {
        std::string first_read = read_to_path_map.begin()->first;
        std::string first_path = read_to_path_map.begin()->second;
        struct stat file_s;
        int ret = stat(first_path.c_str(), &file_s);
        if(ret != 0) {
            fprintf(stderr, "Error: could not find path to FAST5 for read %s\n", first_read.c_str());
            fprintf(stderr, "Please make sure that this path is accessible: %s\n", first_path.c_str());
            exit(EXIT_FAILURE);
        }
    }

    // Write the map as a fofn file so next time we don't have to parse
    // the entire fasta
    write_to_fofn(fasta_filename + FOFN_SUFFIX);
}
// Advance the reader to the next record.
// Throws GenericException when the file is not open or the end of file was
// already reached; sets the end-of-file flag when kseq_read() reports a
// negative value.
void FastaReader::readNext() {
	if(!is_open()) {
		throw GenericException("File is not opened");
	}
	if(eof()) {
		throw GenericException("End of file reached");
	}

	const bool no_more_records = kseq_read(seq) < 0;
	if (no_more_records) {
		is_eof_ = true;
	}
}