/*
 * Randomly keep a fraction of the input records.
 *
 * Each record read from ks1 (and, when `paired` is non-zero, the record read
 * in lock-step from ks2) is written to fs_out1 (and fs_out2) whenever a fresh
 * drand48() draw is below `f`.  The generator is re-seeded on every call from
 * the address of ks1 plus the current time.
 *
 * Reading stops at the first negative kseq_read() result.  Returns -1, after
 * printing a message to stderr, when either reader reported -2; otherwise 0.
 */
int sample_f(double f, kseq_t *ks1, kseq_t *ks2, FILE *fs_out1, FILE *fs_out2, int paired)
{
	int n_records = 0;
	int rc1 = 0, rc2 = 0;

	srand48((unsigned long)ks1 + time(NULL));

	for (;;) {
		rc1 = kseq_read(ks1);
		if (rc1 < 0)
			break;
		if (paired) {
			/* the mate is only consumed after the first read succeeded */
			rc2 = kseq_read(ks2);
			if (rc2 < 0)
				break;
		}

		if (drand48() < f) {
			/* XXX: Do a consistency check between the reads? */
			kseq_write(ks1, fs_out1);
			if (paired)
				kseq_write(ks2, fs_out2);
		}
		n_records++;
	}

	if (rc1 == -2 || rc2 == -2) {
		/* the reported line assumes four lines per record */
		fprintf(stderr, "invalid fastq entry at line %d\n", 4*n_records+1);
		return -1;
	}
	return 0;
}
int stk_famask(int argc, char *argv[]) { gzFile fp[2]; kseq_t *seq[2]; int i, l; if (argc < 3) { fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n"); return 1; } for (i = 0; i < 2; ++i) { fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); seq[i] = kseq_init(fp[i]); } while (kseq_read(seq[0]) >= 0) { int min_l, c[2]; kseq_read(seq[1]); if (strcmp(seq[0]->name.s, seq[1]->name.s)) fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); if (seq[0]->seq.l != seq[1]->seq.l) fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; printf(">%s", seq[0]->name.s); for (l = 0; l < min_l; ++l) { c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; if (c[1] == 'x') c[0] = tolower(c[0]); else if (c[1] != 'X') c[0] = c[1]; if (l%60 == 0) putchar('\n'); putchar(c[0]); } putchar('\n'); } return 0; }
int main_interleave(int argc, char *argv[]) { gzFile fp1, fp2; kseq_t *seq[2]; kstring_t str; if (argc < 3) { fprintf(stderr, "Usage: fermi interleave <in1.fq> <in2.fq>\n"); return 1; } str.l = str.m = 0; str.s = 0; fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); seq[0] = kseq_init(fp1); seq[1] = kseq_init(fp2); while (kseq_read(seq[0]) >= 0) { if (kseq_read(seq[1]) < 0) break; // one file ends str.l = 0; if (seq[0]->name.l > 2 && seq[0]->name.s[seq[0]->name.l-2] == '/' && isdigit(seq[0]->name.s[seq[0]->name.l-1])) seq[0]->name.s[(seq[0]->name.l -= 2)] = 0; // trim tailing "/[0-9]$" seq[1]->name.l = 0; kputsn(seq[0]->name.s, seq[0]->name.l, &seq[1]->name); // make sure two ends having the same name write_seq(seq[0], &str); write_seq(seq[1], &str); fputs(str.s, stdout); } kseq_destroy(seq[0]); gzclose(fp1); kseq_destroy(seq[1]); gzclose(fp2); free(str.s); return 0; }
int detect_snps(char reference_sequence[], char filename[], int length_of_genome, int exclude_gaps) { int i; int number_of_snps = 0; int l; gzFile fp; kseq_t *seq; fp = gzopen(filename, "r"); seq = kseq_init(fp); // First sequence is the reference sequence so skip it kseq_read(seq); while ((l = kseq_read(seq)) >= 0) { for(i = 0; i < length_of_genome; i++) { if(exclude_gaps) { // If there is an indel in the reference sequence, replace with the first proper base you find if((reference_sequence[i] == '-' && seq->seq.s[i] != '-' ) || (toupper(reference_sequence[i]) == 'N' && seq->seq.s[i] != 'N' )) { reference_sequence[i] = toupper(seq->seq.s[i]); } if(reference_sequence[i] != '*' && seq->seq.s[i] != '-' && toupper(seq->seq.s[i]) != 'N' && reference_sequence[i] != toupper(seq->seq.s[i])) { reference_sequence[i] = '*'; number_of_snps++; } } else { char input_base = toupper(seq->seq.s[i]); if(input_base == 'N') { input_base = '-'; } if(reference_sequence[i] != '*' && reference_sequence[i] != input_base) { reference_sequence[i] = '*'; number_of_snps++; } } } } kseq_destroy(seq); gzclose(fp); return number_of_snps; }
/*
 * Filter the records of input_file by a regular expression on the header.
 *
 * The first record is read once to decide between FASTA (no quality buffer)
 * and FASTQ, then the file is rewound.  For each record the regex is tried on
 * the name and, only if that fails, on the comment.  Records whose match
 * state agrees with the mode (match when `exclude` is 0, no match when it is
 * set) are handed to size_filter(); all others are counted as excluded.
 *
 * Exits the process if the file cannot be opened.  When just_count is set the
 * two totals are printed to stdout.  Returns the sum of the size_filter()
 * results.
 */
int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count)
{
	gzFile fp;
	kseq_t *seq;
	int l;
	int count = 0;
	int excluded = 0;
	int is_fasta;

	/* open fasta file */
	fp = gzopen(input_file, "r");
	if (!fp) {
		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
		exit(EXIT_FAILURE);
	}
	seq = kseq_init(fp);

	/* peek at the first record to determine the file type, then start over */
	l = kseq_read(seq);
	is_fasta = (seq->qual.s == NULL);
	gzrewind(fp);
	kseq_rewind(seq);

	if (verbose_flag)
		fputs(is_fasta ? "Input is FASTA format\n" : "Input is FASTQ format\n", stderr);

	for (l = kseq_read(seq); l >= 0; l = kseq_read(seq)) {
		/* the comment is only searched when the name did not match */
		int hit = search_header(re, re_extra, seq->name.s) ||
		          search_header(re, re_extra, seq->comment.s);
		int keep = exclude ? !hit : hit;

		if (keep)
			count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
		else
			excluded++;
	}

	kseq_destroy(seq);
	gzclose(fp);

	if (just_count) {
		fprintf(stdout, "Total output: %i\n", count);
		fprintf(stdout, "Total excluded: %i\n", excluded);
	}
	return count;
}
// read one sequence (reversed) from fastq file to half_byte_array (i.e. 4bit for one base pair ) int bwa_read_seq_one_half_byte (bwa_seqio_t *bs, unsigned char * half_byte_array, unsigned int start_index, unsigned short * length, int mid) { kseq_t *seq = bs->ks; int len, i, mided_len; if (((len = kseq_read(seq)) >= 0) && (len > mid)) // added to process only when len is longer than mid tag { //To cut the length of the sequence if ( len > MAX_READ_LENGTH) len = MAX_READ_LENGTH; mided_len = len - mid; for (i = 0; i < mided_len; i++) { write_to_half_byte_array(half_byte_array,start_index+i,nst_nt4_table[(int)seq->seq.s[len-i-1]]); } *length = mided_len; } else { *length = 0; } return len; }
int main_read_stat(int argc, char **argv) { if (argc > 1) { fprintf(stderr, "Usage: cat *.fq | %s\n", argv[0]); exit(1); } gzFile fp = gzdopen(fileno(stdin), "r"); kseq_t *seq = kseq_init(fp); // kseq to read files int max_len = 0; int min_len = 999999999; long long total_len = 0; long long num_reads = 0; while (kseq_read(seq) >= 0) { ++num_reads; total_len += seq->seq.l; max_len = std::max(seq->seq.l, (size_t)max_len); min_len = std::min(seq->seq.l, (size_t)min_len); } double avg_len = total_len * 1.0 / num_reads; printf("number reads: %lld\ntotal size:%lld\nlongest: %d\nshortest: %d\navg: %lf\n", num_reads, total_len, max_len, min_len, avg_len); kseq_destroy(seq); gzclose(fp); return 0; }
/*
 * Read the next record through seqObj->fobj and additionally build its packed
 * representation via base2dbit().
 *
 * On a positive kseq_read() result the name/comment/seq/qual pointers of
 * seqObj are pointed at the reader's buffers (comment is NULL when empty,
 * qual is NULL unless bit 1 of the result is set), the diBseq/hexBQ buffers
 * are grown when the record needs more 32-base words than currently
 * allocated, the sequence is normalised in place, and the sequence length is
 * returned.  Bit 3 is added to seqObj->type when qualities are present.
 *
 * A non-positive kseq_read() result is returned unchanged.  Running out of
 * memory while growing the buffers aborts the process rather than continuing
 * with a NULL buffer.
 */
ssize_t read_kseq_with2bit(SeqFileObj * const seqObj)
{
	size_t seqlen;	// in fact size_t, but negative return values are meaningful.
	int_fast8_t rvalue = kseq_read(seqObj->fobj);
	if (rvalue>0) {
		uint_fast8_t type = rvalue;	// 1 or 3. No need to &3 now.
		kseq_t *kseq;
		kseq = seqObj->fobj;
		seqlen = kseq->seq.l;
		seqObj->name = kseq->name.s;
		if (! kseq->comment.l) seqObj->comment = NULL;
		else seqObj->comment = kseq->comment.s;
		seqObj->seq = kseq->seq.s;
		if (rvalue&2) {	// withQ
			//encodeQ;
			type |= 8u;
			seqObj->qual = kseq->qual.s;
		} else {
			seqObj->qual = NULL;
		}
		size_t needtomallocQQW = (seqlen+31u)>>5;	// 1 "QQWord" = 4 QWord = 32 bp.
		if (needtomallocQQW > seqObj->binMallocedQQWord) {
			void *grown;
			KROUNDUP32(needtomallocQQW);
			// Grow through a temporary so a failed realloc neither loses the
			// old buffer nor hands a NULL pointer to base2dbit().
			grown = realloc(seqObj->diBseq,needtomallocQQW<<3);	// 2^3=8
			if (grown == NULL) abort();
			seqObj->diBseq = grown;
			grown = realloc(seqObj->hexBQ,needtomallocQQW<<5);	// 4*2^3=32
			if (grown == NULL) abort();
			seqObj->hexBQ = grown;
			seqObj->binMallocedQQWord = needtomallocQQW;
		}
		seqObj->binNcount = base2dbit(seqlen, kseq->seq.s, seqObj->qual, seqObj->diBseq, seqObj->hexBQ);
		// Well, how to deal with smallcase masking ? Not using this information yet.
		NormalizeChrSeq(kseq->seq.s);	// to /[ATCGN]*/
		seqObj->readlength = seqlen;
		seqObj->type = type;
		return seqlen;
	} else return rvalue;
}
/*
 * Read the next record through seqObj->fobj without building the packed
 * representation.
 *
 * A non-positive kseq_read() result is returned unchanged.  Otherwise the
 * name/comment/seq/qual pointers of seqObj are pointed at the reader's
 * buffers (comment is NULL when empty, qual is NULL unless bit 1 of the
 * result is set), the sequence is normalised in place, readlength and type
 * are stored, and the sequence length is returned.  Bit 3 is added to the
 * stored type when qualities are present.
 */
ssize_t read_kseq_no2bit(SeqFileObj * const seqObj)
{
	int_fast8_t rvalue = kseq_read(seqObj->fobj);
	kseq_t *rec;
	uint_fast8_t type;
	size_t seqlen;

	if (rvalue <= 0)
		return rvalue;

	rec = seqObj->fobj;
	type = rvalue;	// 1 or 3
	seqlen = rec->seq.l;

	seqObj->name = rec->name.s;
	seqObj->comment = rec->comment.l ? rec->comment.s : NULL;
	seqObj->seq = rec->seq.s;
	NormalizeChrSeq(rec->seq.s);	// to /[ATCGN]*/

	if (rvalue & 2) {	// record carries qualities
		type |= 8u;
		seqObj->qual = rec->qual.s;
	} else {
		seqObj->qual = NULL;
	}

	seqObj->readlength = seqlen;
	seqObj->type = type;
	return seqlen;
}
/*
 * Collect, for every record in `filename`, the base found at each SNP column.
 *
 * bases_for_snps[i][k] receives the upper-cased character at column
 * snp_locations[i] of the k-th record; 'N' is stored as '-' so that gaps and
 * unknowns look the same to Gubbins.  length_of_genome is not used here.
 */
void get_bases_for_each_snp(char filename[], int snp_locations[], char ** bases_for_snps, int length_of_genome, int number_of_snps)
{
	gzFile fp = gzopen(filename, "r");
	kseq_t *seq = kseq_init(fp);
	int sequence_number;
	int snp;

	for (sequence_number = 0; kseq_read(seq) >= 0; sequence_number++) {
		for (snp = 0; snp < number_of_snps; snp++) {
			char base = toupper(((char *) seq->seq.s)[snp_locations[snp]]);

			// Present gaps and unknowns in the same way to Gubbins
			if (base == 'N') {
				base = '-';
			}
			bases_for_snps[snp][sequence_number] = base;
		}
	}

	kseq_destroy(seq);
	gzclose(fp);
}
static khash_t(s) *load_mask(const char *fn) { kseq_t *seq; gzFile fp; khash_t(s) *h; h = kh_init(s); fp = gzopen(fn, "r"); seq = kseq_init(fp); while (kseq_read(seq) >= 0) { khint_t k; int ret, i; mask32_t *p; k = kh_put(s, h, strdup(seq->name.s), &ret); assert(ret); // duplicated name p = &kh_val(h, k); p->ori_len = seq->seq.l; p->mask = (uint32_t*)calloc((seq->seq.l+31)/32, 4); for (i = 0; i < seq->seq.l; ++i) if (seq->seq.s[i] == '3') p->mask[i/32] |= 1u<<i%32; } kseq_destroy(seq); gzclose(fp); return h; }
// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) { struct CDS currentCDS; gzFile inputSeqPtr; FILE * inputAnnPtr, * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long outputSize, currentLine; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); inputAnnPtr = fopen(passAnnName, "r"); outputPtr = fopen(passProName, "w"); // Read in 1st sequence data seq = kseq_init(inputSeqPtr); kseq_read(seq); // Iterate through each CDS sequence in the annotation file currentLine = 0; while (getNextCDS(inputAnnPtr, ¤tCDS, ¤tLine)) { convertToAA(seq->seq.s, ¤tCDS, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d\n%.*s\n", currentLine, outputSize, outputBuffer); free(outputBuffer); } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(inputAnnPtr); err_fclose(outputPtr); return 0; }
/*
 * Fill reference_sequence from the first record of `filename`.
 *
 * Every base is upper-cased and 'N' is stored as '-'.  The result is always
 * NUL-terminated at the record's length, so the caller's buffer must hold at
 * least that many characters plus one.
 *
 * Always returns 1.
 */
int build_reference_sequence(char reference_sequence[], char filename[])
{
	int i;

	gzFile fp;
	kseq_t *seq;

	fp = gzopen(filename, "r");
	seq = kseq_init(fp);
	kseq_read(seq);

	for (i = 0; i < seq->seq.l; i++) {
		/* go through unsigned char so toupper() never sees a negative value */
		reference_sequence[i] = toupper((unsigned char) seq->seq.s[i]);
		if (reference_sequence[i] == 'N') {
			reference_sequence[i] = '-';
		}
	}
	/* write the terminator unconditionally instead of first reading a byte
	   that may never have been initialised */
	reference_sequence[seq->seq.l] = '\0';

	kseq_destroy(seq);
	gzclose(fp);
	return 1;
}
/*
 * Feed every sequence of manager->inputFileName to streamOneStringUnchanged().
 *
 * Returns 0 when all records were processed, 1 when the file cannot be opened
 * or when streamOneStringUnchanged() reports a non-zero status.  The reader
 * and the file handle are released on every path after a successful open.
 */
int streamAndCountOneFile(KWTCounterManager *manager)
{
	gzFile inputFP;
	kseq_t* seq;
	int status = 0;

	//open file to read lines
	inputFP = gzopen(manager->inputFileName, "r");
	if (!inputFP) {
		printf("Could not open input file \"%s\" for reading\n", manager->inputFileName);
		return 1;
	}

	// initialize reader
	seq = kseq_init(inputFP);

	// read sequences; stop at the first record the consumer rejects
	while (kseq_read(seq) >= 0) {
		if (streamOneStringUnchanged(manager, seq->seq.s, seq->seq.l) != 0) {
			status = 1;
			break;
		}
	}

	// single exit so the reader is destroyed on the failure path as well
	kseq_destroy(seq);
	gzclose(inputFP);
	return status;
}
static seqs_t *load_seqs(const char *fn) { seqs_t *s; seq1_t *p; gzFile fp; int l; kseq_t *seq; fp = xzopen(fn, "r"); seq = kseq_init(fp); s = (seqs_t*)calloc(1, sizeof(seqs_t)); s->m_seqs = 256; s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); while ((l = kseq_read(seq)) >= 0) { if (s->n_seqs == s->m_seqs) { s->m_seqs <<= 1; s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); } p = s->seqs + (s->n_seqs++); p->l = seq->seq.l; p->s = (unsigned char*)malloc(p->l + 1); memcpy(p->s, seq->seq.s, p->l); p->s[p->l] = 0; p->n = strdup((const char*)seq->name.s); } kseq_destroy(seq); gzclose(fp); fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); return s; }
bseq1_t *bseq_read(const char *fn, int *n_) { FILE *fp; bseq1_t *seqs; kseq_t *ks; int m, n; uint64_t size = 0; *n_ = 0; fp = fopen(fn, "rb"); if (fp == 0) return 0; ks = kseq_init(fp); m = n = 0; seqs = 0; while (kseq_read(ks) >= 0) { bseq1_t *s; if (n >= m) { m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); } s = &seqs[n]; s->seq = strdup(ks->seq.s); s->qual = ks->qual.l? strdup(ks->qual.s) : 0; s->l_seq = ks->seq.l; size += seqs[n++].l_seq; } *n_ = n; kseq_destroy(ks); fclose(fp); return seqs; }
/*
 * Load the reference sequences of `seqfile` into reflist->sequences.
 *
 * Records are copied in file order into reflist->sequences[0..ns-1] and
 * upper-cased; at most reflist->ns records are consumed so that a file with
 * extra records cannot write past the end of the sequences array.  Each
 * sequence is then terminated at reflist->lengths[i], and a short preview of
 * the first ten is printed to stderr together with the elapsed time.
 *
 * NOTE(review): each destination buffer is assumed to hold at least
 * lengths[i] + 1 bytes and at least the record's length - confirm against
 * the code that allocates reflist.
 *
 * Returns 1 on success, -1 if the file cannot be opened.
 */
int read_fasta(char* seqfile, REFLIST* reflist)
{
	clock_t t;
	kseq_t *seq;
	int i = 0, j = 0;
	gzFile fp = gzopen(seqfile, "r");

	/* check the handle before a reader is built on top of it */
	if (fp == NULL) {
		fprintf(stderr, "file %s not found \n", seqfile);
		return -1;
	}
	seq = kseq_init(fp);

	fprintf(stderr, "reading reference sequence file %s with %d sequences\n", seqfile, reflist->ns);
	t = clock();

	while (i < reflist->ns && kseq_read(seq) >= 0) {
		memcpy(reflist->sequences[i], seq->seq.s, seq->seq.l);
		for (j = 0; j < (int)seq->seq.l; j++) {
			reflist->sequences[i][j] = toupper((unsigned char)reflist->sequences[i][j]);
		}
		i++;
	}
	kseq_destroy(seq);
	gzclose(fp);
	fp = NULL;

	for (i = 0; i < reflist->ns; i++) {
		reflist->sequences[i][reflist->lengths[i]] = '\0';
		if (i < 10) {
			fprintf(stderr, "%s %d ", reflist->names[i], reflist->lengths[i]);
			/* preview up to 30 bases, but never beyond the sequence itself */
			for (j = 0; j < 30 && j < reflist->lengths[i]; j++)
				fprintf(stderr, "%c", reflist->sequences[i][j]);
			fprintf(stderr, "\n");
		}
	}
	fprintf(stderr, "read reference sequence file in %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
	return 1;
}
/*
 * Write a sequence dictionary (one @SQ line per record of `fn`) to
 * args->output_fname, or to stdout when no output name is set.
 *
 * For every record the characters in the range '!'..'~' are kept and
 * upper-cased in place; the MD5 of that cleaned sequence and its length go
 * into the M5 and LN fields.  UR is args->uri when given, otherwise the
 * resolved path of `fn` (skipped when reading stdin via "-").  AS and SP are
 * added when present in args, and an @HD line is written first when
 * args->header is set.
 *
 * Exits with status 1 when the input or output cannot be opened or the MD5
 * context cannot be created.
 */
static void write_dict(const char *fn, args_t *args)
{
	hts_md5_context *md5;
	int l, i, k;
	gzFile fp;
	kseq_t *seq;
	unsigned char digest[16];
	char hex[33];

	fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "dict: %s: No such file or directory\n", fn);
		exit(1);
	}
	FILE *out = stdout;
	if (args->output_fname) {
		out = fopen(args->output_fname, "w");
		if (out == NULL) {
			fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
			exit(1);
		}
	}

	if (!(md5 = hts_md5_init()))
		exit(1);

	seq = kseq_init(fp);
	if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
	while ((l = kseq_read(seq)) >= 0) {
		/* compact the sequence in place: printable characters only, upper-cased */
		for (i = k = 0; i < seq->seq.l; ++i) {
			if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~')
				seq->seq.s[k++] = toupper(seq->seq.s[i]);
		}
		hts_md5_reset(md5);
		hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
		hts_md5_final(digest, md5);
		hts_md5_hex(hex, digest);
		fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
		if (args->uri)
			fprintf(out, "\tUR:%s", args->uri);
		else if (strcmp(fn, "-") != 0) {
#ifdef _WIN32
			char *real_path = _fullpath(NULL, fn, PATH_MAX);
#else
			char *real_path = realpath(fn, NULL);
#endif
			/* path resolution can fail; fall back to the name as given
			   rather than passing NULL to %s */
			fprintf(out, "\tUR:file://%s", real_path ? real_path : fn);
			free(real_path);
		}
		if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
		if (args->species) fprintf(out, "\tSP:%s", args->species);
		fprintf(out, "\n");
	}
	kseq_destroy(seq);
	gzclose(fp); /* the reader does not own the handle */
	hts_md5_destroy(md5);

	if (args->output_fname) fclose(out);
}
/*
 * Parser, first pass: determine the alignment layout and which sequences to
 * keep.
 *
 * A column counts as a match column when its character is not '.' and is not
 * lower case.  The layout (inds, one flag per column) is taken from the
 * records seen before the first one is accepted; every later record must
 * have the same length and the same layout, otherwise a MEX error is raised.
 * A record is accepted when its fraction of '-' characters relative to the
 * number of match columns does not exceed max_gap_fraction.
 *
 * Outputs:
 *   *inds_ptr   per-column match flags (malloc'ed)
 *   *zinds_ptr  per-record accept flags (malloc'ed, grown on demand)
 *   *N_ptr      number of match columns
 *   *M_ptr      number of accepted records
 */
void parse_seq_pass1(kseq_t * seq, int ** inds_ptr, int ** zinds_ptr, int * N_ptr, int * M_ptr, double max_gap_fraction)
{
	int t;
	int l0 = 0;
	int N = 0;
	int M = 0;
	int * inds = NULL;
	int * zinds = NULL;
	int zl = 1000;
	int c;
	int ngaps;
	int seqn = 0;

	zinds = malloc(zl * sizeof(int));

	while ((t = kseq_read(seq)) >= 0) {
		char * s = seq->seq.s;
		int l = strlen(s);
		ngaps = 0;
		if (M == 0) {
			/* This branch runs again when earlier records were rejected:
			   drop the previous layout and restart the column count so
			   that nothing leaks and N is not accumulated twice. */
			free(inds);
			N = 0;
			inds = malloc(l * sizeof(int));
			for (c = 0; c < l; ++c) {
				inds[c] = (s[c] != '.' && s[c] == toupper(s[c]));
				ngaps += (s[c] == '-');
			}
			l0 = l;
			for (c = 0; c < l0; ++c) {
				N += inds[c];
			}
		} else {
			if (l != l0) {
				mexErrMsgIdAndTxt("read_alignment_fasta:input", "input data is unaligned");
			}
			for (c = 0; c < l; ++c) {
				if (inds[c] != (s[c] != '.' && s[c] == toupper(s[c]))) {
					mexErrMsgIdAndTxt("read_alignment_fasta:input", "input data is unaligned?");
				}
				ngaps += (s[c] == '-');
			}
		}
		/* grow before index zl itself is written (was '>', one slot too late) */
		if (seqn >= zl) {
			zl *= 2;
			zinds = realloc(zinds, zl * sizeof(int));
		}
		if ((double) ngaps / N <= max_gap_fraction) {
			zinds[seqn] = 1;
			++M;
		} else {
			zinds[seqn] = 0;
		}
		++seqn;
	}

	*inds_ptr = inds;
	*zinds_ptr = zinds;
	*N_ptr = N;
	*M_ptr = M;
	return;
}
int stk_hety(int argc, char *argv[]) { gzFile fp; kseq_t *seq; int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; char *buf; uint32_t cnt[3]; if (argc == 1) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk hety [options] <in.fa>\n\n"); fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); fprintf(stderr, " -m treat lowercases as masked\n"); fprintf(stderr, "\n"); return 1; } while ((c = getopt(argc, argv, "w:t:m")) >= 0) { switch (c) { case 'w': win_size = atoi(optarg); break; case 't': n_start = atoi(optarg); break; case 'm': is_lower_mask = 1; break; } } fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); seq = kseq_init(fp); win_step = win_size / n_start; buf = calloc(win_size, 1); while ((l = kseq_read(seq)) >= 0) { int x, i, y, z, next = 0; cnt[0] = cnt[1] = cnt[2] = 0; for (i = 0; i <= l; ++i) { if ((i >= win_size && i % win_step == 0) || i == l) { if (i == l && l >= win_size) { for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; } if (cnt[1] + cnt[2] > 0) printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); next = i; } if (i < l) { y = i % win_size; c = seq->seq.s[i]; if (is_lower_mask && islower(c)) c = 'N'; c = seq_nt16_table[c]; x = bitcnt_table[c]; if (i >= win_size) --cnt[(int)buf[y]]; buf[y] = z = x > 2? 0 : x == 2? 2 : 1; ++cnt[z]; } } } free(buf); kseq_destroy(seq); gzclose(fp); return 0; }
static inline void parse_sequences( const char *filename, char ***strings_, unsigned long **sizes_, unsigned long *count_) { FILE* fp; kseq_t *seq = NULL; int l = 0; char **strings = NULL; unsigned long *sizes = NULL; unsigned long count = 0; unsigned long memory = 1000; fp = fopen(filename, "r"); if(fp == NULL) { perror("fopen"); exit(1); } strings = malloc(sizeof(char*) * memory); sizes = malloc(sizeof(unsigned long) * memory); seq = kseq_init(fileno(fp)); while ((l = kseq_read(seq)) >= 0) { strings[count] = strdup(seq->seq.s); if (NULL == strings[count]) { perror("strdup"); exit(1); } sizes[count] = seq->seq.l; ++count; if (count >= memory) { char **new_strings = NULL; unsigned long *new_sizes = NULL; memory *= 2; new_strings = realloc(strings, sizeof(char*) * memory); if (NULL == new_strings) { perror("realloc"); exit(1); } strings = new_strings; new_sizes = realloc(sizes, sizeof(unsigned long) * memory); if (NULL == new_sizes) { perror("realloc"); exit(1); } sizes = new_sizes; } } kseq_destroy(seq); fclose(fp); *strings_ = strings; *sizes_ = sizes; *count_ = count; }
void load_seqid_taxid_rel(char * seqid_taxid_file){ seqid_taxid_rel = NULL; tax_to_seqs = NULL; gzFile seq_tax = gzopen(seqid_taxid_file, "r"); kseq_t * seq_t = kseq_init(seq_tax); struct seqid_taxid_single * tstr; struct taxid_seqid * tts; int l = 0; while ((l = kseq_read(seq_t)) >= 0) { //add taxid related to seqid uint64_t tmp_taxid = 0; HASH_FIND_STR(seqid_taxid_rel, seq_t->name.s, tstr); if (tstr == NULL) { if (seq_t->name.l > SEQID_SIZE) { printf("seqid %s is too long, must be less than 100 characters. Exiting...", seq_t->name.s); exit(1); } tstr = (struct seqid_taxid_single *)malloc(sizeof(struct seqid_taxid_single)); memset(tstr->seqid, '\0', SEQID_SIZE*sizeof(char)); strncpy(tstr->seqid, seq_t->name.s, seq_t->name.l); tmp_taxid = strtoull(seq_t->seq.s, NULL, 10); tstr->taxid = tmp_taxid; HASH_ADD_STR(seqid_taxid_rel, seqid, tstr); } else { printf("%s already seen in hash",seq_t->name.s); } //add seqid(s) related to taxid HASH_FIND(hh, tax_to_seqs, &tmp_taxid, sizeof(uint64_t), tts); if (tts == NULL) { tts = (struct taxid_seqid *)malloc(sizeof(struct taxid_seqid)); tts->taxid = tmp_taxid; tts->num_seqids = 0; tts->max_seqs = 10; tts->seqids = (char **)malloc(sizeof(char *)*10); tts->seqids[tts->num_seqids] = (char *)calloc(SEQID_SIZE,sizeof(char)); strncpy(tts->seqids[tts->num_seqids], seq_t->name.s, seq_t->name.l); tts->num_seqids++; HASH_ADD(hh, tax_to_seqs, taxid, sizeof(uint64_t), tts); } else { if (tts->num_seqids >= tts->max_seqs) { tts->max_seqs = 2 * tts->max_seqs; tts->seqids = (char **)realloc(tts->seqids, sizeof(char *)*tts->max_seqs); } tts->seqids[tts->num_seqids] = (char *)calloc(SEQID_SIZE,sizeof(char)); strncpy(tts->seqids[tts->num_seqids], seq_t->name.s, seq_t->name.l); tts->num_seqids++; } } gzclose(seq_tax); }
int main(int argc, char *argv[]) { int l, i, c; long long cnt[5], tot; kseq_t *seq; gzFile fp; cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; while ((c = getopt(argc, argv, "l:r:")) >= 0) { switch (c) { case 'l': g_len = atoi(optarg); break; case 'r': g_ratio = atof(optarg); break; } } if (argc == optind) { fprintf(stderr, "Usage: gen_mask [-l %d] [-r %.2lf] <in.rawMask.fa>\n", g_len, g_ratio); return 1; } fp = gzopen(argv[optind], "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { int n_good = 0, n_all = 0, n_mid = 0; printf(">%s %d %.3lf", seq->name.s, g_len, g_ratio); for (i = 0; i < l + g_len - 1; ++i) { int c1, c2; unsigned x = i < l? get_cnt(seq->seq.s[i]) : 0; c1 = x>>16; c2 = x&0xffff; if (c1 == 1) ++cnt[4]; if (c1) { ++n_all; if (is_good(c1, c2)) ++n_good; if (c1 == 1) ++n_mid; } x = i >= g_len? get_cnt(seq->seq.s[i - g_len]) : 0; c1 = x>>16; c2 = x&0xffff; if (c1) { --n_all; if (is_good(c1, c2)) --n_good; if (c1 == 1) --n_mid; } assert(n_all <= g_len && n_good <= n_all); if (i % 60 == 0) putchar('\n'); x = n_all == 0? 0 : (double)n_good/n_all >= g_ratio? 3 : (double)n_mid/n_all >= g_ratio? 2 : 1; putchar(x + '0'); cnt[x]++; } putchar('\n'); } tot = cnt[1] + cnt[2] + cnt[3]; fprintf(stderr, "%lld, %lld, %lld, %lld, %lld\n", cnt[0], cnt[1], cnt[2], cnt[3], cnt[4]); fprintf(stderr, "%lf, %lf, %lf\n", (double)cnt[3] / tot, (double)(cnt[2] + cnt[3]) / tot, (double)cnt[4] / tot); kseq_destroy(seq); gzclose(fp); return 0; }
/*
 * Pack the sequences of a FASTA stream into "<prefix>.pac" and dump the
 * accompanying annotation through bns_dump().
 *
 * Every record is appended via add1().  Unless for_only is set, the reverse
 * complement of the whole packed sequence is appended afterwards.  The .pac
 * file always ends with one byte holding l_pac % 4, preceded by an extra zero
 * byte when l_pac is a multiple of four.
 *
 * Returns the packed length, or -1 when `prefix` is too long for the internal
 * file-name buffer (nothing is allocated or written in that case).
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;

	// sizeof(".pac") includes the terminating NUL; reject names that cannot fit
	if (strlen(prefix) + sizeof(".pac") > sizeof(name)) {
		fprintf(stderr, "[%s] prefix is too long\n", __func__);
		return ret;
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	strcpy(name, prefix); strcat(name, ".pac");
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	if (!for_only) { // add the reverse complemented sequence
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		pac = realloc(pac, m_pac/4);
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
/*
 * Return the sequence of the first record in `filename`.
 *
 * The returned buffer is the one the reader allocated for the sequence; it is
 * detached from the reader before the reader is destroyed, so ownership
 * passes to the caller, who may release it with free().  Opening and reading
 * are guarded by assertions only.
 */
static char* read_text(char* filename)
{
	gzFile fp;
	kseq_t *seq;
	char *text;
	int res;

	fp = gzopen(filename, "r");
	assert(fp != NULL && "Could not open fasta file\n");
	seq = kseq_init(fp);
	res = kseq_read(seq);
	assert(res >= 0);
	(void)res; /* only inspected by the assertion */

	/* Take the buffer out of the reader so that destroying the reader frees
	   everything else but leaves the text alive. */
	text = seq->seq.s;
	seq->seq.s = NULL;
	kseq_destroy(seq);
	gzclose(fp);
	return text;
}
/*
 * Collect copies of up to n_wanted records from `seq`.
 *
 * n_wanted == 0 means "no limit".  Reading stops early as soon as
 * kseq_read() returns zero or a negative value.  Each collected element is
 * produced by kseq_copy().
 */
static kseq_v read_seqs(kseq_t *seq, size_t n_wanted)
{
	kseq_v batch;
	size_t n_read = 0;

	kv_init(batch);
	while (n_wanted == 0 || n_read < n_wanted) {
		kseq_t copy;

		if (kseq_read(seq) <= 0)
			break;
		kseq_copy(&copy, seq);
		kv_push(kseq_t, batch, copy);
		++n_read;
	}
	return batch;
}
/*
 * Cut every record of `seq` into pieces of trunk_size and print them.
 *
 * Pieces are numbered from 1 within each record.  Once the remainder of a
 * record is shorter than 1.5 * trunk_size it is printed as one final piece
 * instead of being split again.
 */
void split_psmcfa(int trunk_size, kseq_t *seq)
{
	while (kseq_read(seq) >= 0) {
		int start = 0, piece = 0;

		while (start < seq->seq.l) {
			if (seq->seq.l - start < trunk_size * 3 / 2) { // use the full remainder
				print_seq(seq, start, seq->seq.l, ++piece);
				break;
			}
			print_seq(seq, start, (start+trunk_size < seq->seq.l)? start+trunk_size : seq->seq.l, ++piece);
			start += trunk_size;
		}
	}
}
/*
 * Parser, second pass: convert the records that are flagged in zinds.
 *
 * zinds is consulted once per record, in file order.  Flagged records are
 * passed to convert_seq() together with a running output index that only
 * advances for converted records; unflagged records are skipped.
 */
void parse_seq_pass2(kseq_t * seq, double ** Z, int * inds, int * zinds)
{
	int record;
	int row = 0;

	for (record = 0; kseq_read(seq) >= 0; ++record) {
		if (zinds[record]) {
			convert_seq(seq->seq.s, inds, Z, row++);
		}
	}
}
void Fast5Map::load_from_fasta(std::string fasta_filename) { gzFile gz_fp; FILE* fp = fopen(fasta_filename.c_str(), "r"); if(fp == NULL) { fprintf(stderr, "error: could not open %s for read\n", fasta_filename.c_str()); exit(EXIT_FAILURE); } gz_fp = gzdopen(fileno(fp), "r"); if(gz_fp == NULL) { fprintf(stderr, "error: could not open %s using gzdopen\n", fasta_filename.c_str()); exit(EXIT_FAILURE); } kseq_t* seq = kseq_init(gz_fp); while(kseq_read(seq) >= 0) { if(seq->comment.l == 0) { fprintf(stderr, "error: no path associated with read %s\n", seq->name.s); exit(EXIT_FAILURE); } // This splitting code implicitly handles both the 2 and 3 field // fasta format that poretools will output. The FAST5 path // is always the last field. std::vector<std::string> fields = split(seq->comment.s, ' '); read_to_path_map[seq->name.s] = fields.back(); } kseq_destroy(seq); gzclose(gz_fp); fclose(fp); // Sanity check that the first path actually points to a file if(read_to_path_map.size() > 0) { std::string first_read = read_to_path_map.begin()->first; std::string first_path = read_to_path_map.begin()->second; struct stat file_s; int ret = stat(first_path.c_str(), &file_s); if(ret != 0) { fprintf(stderr, "Error: could not find path to FAST5 for read %s\n", first_read.c_str()); fprintf(stderr, "Please make sure that this path is accessible: %s\n", first_path.c_str()); exit(EXIT_FAILURE); } } // Write the map as a fofn file so next time we don't have to parse // the entire fasta write_to_fofn(fasta_filename + FOFN_SUFFIX); }
// Advance the reader to the next record.
//
// Throws GenericException when the file is not open or the end of the file
// was already reached.  When no further record can be read, the end-of-file
// flag is set instead of throwing.
void FastaReader::readNext()
{
    if (!is_open())
        throw GenericException("File is not opened");
    if (eof())
        throw GenericException("End of file reached");

    const bool got_record = kseq_read(seq) >= 0;
    if (!got_record)
        is_eof_ = true;
}