/*
 * pull_by_re - select records from a (possibly gzip-compressed) FASTA/FASTQ
 * file according to whether their header matches a compiled regex.
 *
 *   input_file   path handed to gzopen()
 *   re, re_extra compiled pattern passed through to search_header()
 *   min, max,
 *   length,
 *   convert,
 *   just_count   forwarded untouched to size_filter()
 *   exclude      zero: keep records whose name or comment matches;
 *                non-zero: keep records where neither matches
 *
 * Returns the sum of size_filter() results over all kept records.  When
 * just_count is set, the kept and rejected totals are written to stdout.
 * Exits the process if the input file cannot be opened.
 */
int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count)
{
    gzFile in;
    kseq_t *rec;
    int status;
    int kept = 0;
    int rejected = 0;
    int fasta_input;

    in = gzopen(input_file, "r");
    if (!in) {
        fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
        exit(EXIT_FAILURE);
    }
    rec = kseq_init(in);

    /*
     * Sniff the format from the first record: no quality buffer after the
     * read means FASTA.  Then rewind both the stream and the parser so the
     * main loop starts from the first record again.
     */
    status = kseq_read(rec);
    fasta_input = (rec->qual.s == NULL);
    gzrewind(in);
    kseq_rewind(rec);

    if (verbose_flag) {
        if (fasta_input) {
            fprintf(stderr, "Input is FASTA format\n");
        } else {
            fprintf(stderr, "Input is FASTQ format\n");
        }
    }

    while ((status = kseq_read(rec)) >= 0) {
        /* name is tested first; comment is only consulted if name misses */
        int hit = search_header(re, re_extra, rec->name.s)
               || search_header(re, re_extra, rec->comment.s);
        int keep = exclude ? !hit : hit;

        if (keep) {
            kept += size_filter(rec, fasta_input, min, max, length, convert, just_count);
        } else {
            rejected++;
        }
    }

    kseq_destroy(rec);
    gzclose(in);

    if (just_count) {
        fprintf(stdout, "Total output: %i\n", kept);
        fprintf(stdout, "Total excluded: %i\n", rejected);
    }

    return kept;
}
/*
 * The gateway function.
 *
 * Inputs  (prhs): [0] filename (string), [1] max_gap_fraction (scalar).
 * Outputs (plhs): [0] N and [1] M as 1x1 doubles, [2] Z as an M-by-N double
 *                 matrix.
 *
 * The file is read twice: parse_seq_pass1() produces N, M and the index
 * arrays inds/zinds; after rewinding, parse_seq_pass2() fills Z.
 *
 * Errors are raised through mexErrMsgIdAndTxt().  The identifier prefix
 * "read_alignemnt_fasta" (sic) is kept as-is so existing callers that match
 * on the identifier keep working.
 *
 * NOTE(review): zinds is filled in by parse_seq_pass1() but is never released
 * here, while inds is free()d.  If zinds is a separate heap allocation this
 * leaks it -- confirm against parse_seq_pass1() before adding free(zinds).
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    char *filename;
    double max_gap_fraction;
    gzFile fp;
    kseq_t *seq;
    int N, M;
    double *N_ptr;
    double *M_ptr;
    double *Z_ptr;
    double **Z;
    int *inds;
    int *zinds;
    int i;

    /* check for proper number of arguments */
    if (nrhs != 2) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:nrhs", "Two inputs required: filename, max_gap_fraction.");
    }
    if (nlhs != 3) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:nlhs", "Three outputs required: N, M, Z.");
    }

    /* get the file name; mxArrayToString() yields NULL for non-string input */
    filename = mxArrayToString(prhs[0]);
    if (filename == NULL) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:filename", "First input must be a string (filename).");
    }

    fp = gzopen(filename, "r");
    if (fp == Z_NULL) {
        mxFree(filename);
        mexErrMsgIdAndTxt("read_alignemnt_fasta:open_file", "Error opening file");
    }
    seq = kseq_init(fp);

    /* get the max_gap_fraction value */
    max_gap_fraction = mxGetScalar(prhs[1]);

    /* create the outputs N, M */
    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
    plhs[1] = mxCreateDoubleMatrix(1, 1, mxREAL);
    N_ptr = mxGetPr(plhs[0]);
    M_ptr = mxGetPr(plhs[1]);

    parse_seq_pass1(seq, &inds, &zinds, &N, &M, max_gap_fraction);

    *N_ptr = (double) N;
    *M_ptr = (double) M;

    /* create the output matrix Z */
    plhs[2] = mxCreateDoubleMatrix(M, N, mxREAL);
    Z_ptr = mxGetPr(plhs[2]);

    /*
     * Z is a table of N pointers into the output buffer, one per run of M
     * consecutive doubles.  The element type is double *, so size the
     * allocation from *Z rather than from double.
     */
    Z = malloc((size_t) N * sizeof *Z);
    if (Z == NULL && N > 0) {
        /* malloc(0) may legitimately return NULL, hence the N > 0 guard */
        kseq_destroy(seq);
        gzclose(fp);
        mxFree(filename);
        free(inds);
        mexErrMsgIdAndTxt("read_alignemnt_fasta:alloc", "Out of memory");
    }
    for (i = 0; i < N; ++i) {
        Z[i] = Z_ptr;
        Z_ptr += M;
    }

    /* second pass starts again from the beginning of the file */
    gzrewind(fp);
    kseq_rewind(seq);

    parse_seq_pass2(seq, Z, inds, zinds);

    /* release memory */
    kseq_destroy(seq);
    gzclose(fp);
    mxFree(filename);
    free(Z);
    free(inds);
}
int pull_by_name(char *input_file, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count) { gzFile fp; int i,l,capacity=80; int count=0,excluded=0; int is_fasta = -1; char *fasta_name; char *line; kseq_t *seq; /* get some space for the line */ line = malloc(sizeof(char) * capacity); /* get memory allocated */ if (!line) { fprintf(stderr, "%s - line malloc: %s\n",progname, strerror(errno)); exit(EXIT_FAILURE); } while((i = getl(&line,names_fp)) != -1) { fasta_name = parse_name(line); if (fasta_name) { add_name(fasta_name); /* add fasta_name to hash */ } } free(line); /* free up line */ if (verbose_flag) { fprintf(stderr,"\n"); fprintf(stderr,"done reading from input (%d entries)\n", hash_key_count()); } /*print_hash();*/ /* open fasta file */ fp = gzopen(input_file,"r"); if (!fp) { fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file); exit(EXIT_FAILURE); } seq = kseq_init(fp); /* initialize kseq */ /* determine file type */ l = kseq_read(seq); /* read the first sequence */ is_fasta = seq->qual.s == NULL ? 1 : 0; gzrewind(fp); /* rewind to beginning for main loop */ kseq_rewind(seq); if (verbose_flag) { if (is_fasta) fprintf(stderr, "Input is FASTA format\n"); else fprintf(stderr, "Input is FASTQ format\n"); } /* search through list and see if this header matches */ while((l = kseq_read(seq)) >= 0) { if (exclude == 0) { /* INCLUDE names from names file */ if (find_name(seq->name.s)) { /* found name in list */ if (min > 0 && max > 0) { /* got a min and max */ if (seq->seq.l >= min && seq->seq.l <= max) { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } } else if (min > 0 || max > 0) { /* either min or max is 0 */ if (min > 0 && seq->seq.l >= min) { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? 
print_fasta_seq(seq,length) : print_fastq_seq(seq); } } else if (max > 0 && seq->seq.l <= max) { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } } else { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } } } else { /* EXCLUDE names from names file */ if (find_name(seq->name.s)) { /* found name in list */ excluded++; } else { if (min > 0 && max > 0) { /* got a min and max */ if (seq->seq.l >= min && seq->seq.l <= max) { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } } else if (min > 0 || max > 0) { /* either min or max is 0 */ if (min > 0 && seq->seq.l >= min) { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } else if (max > 0 && seq->seq.l <= max) { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } } else { count++; if (!just_count) { if (convert) is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); else is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); } } } } } kseq_destroy(seq); gzclose(fp); /* done reading file */ delete_hash(); /* free the list nodes */ if (just_count) { fprintf(stdout, "Total output: %i\n", count); if (exclude) fprintf(stdout, "Total excluded: %i\n", excluded); } if (verbose_flag) { fprintf(stderr,"Processed %i entries\n",count); if (exclude) fprintf(stderr,"Excluded %i entries\n",excluded); } return count; }