Example #1
0
/*
 * pull_by_re - select records from a (possibly gzipped) FASTA/FASTQ file
 * by matching a compiled regular expression against each header.
 *
 * The first record is read once to decide the file type (a record without a
 * quality string means FASTA), then the stream is rewound and every record's
 * name, and comment when present, is tested with search_header().
 *
 *   exclude == 0: matching records are handed to size_filter(); the others
 *                 are tallied as excluded.
 *   exclude != 0: matching records are tallied as excluded; the others are
 *                 handed to size_filter().
 *
 * min, max, length, convert and just_count are forwarded unchanged to
 * size_filter(), whose return value is added to the running count.
 *
 * Returns the accumulated count.  Exits the process when input_file cannot
 * be opened.  When just_count is set, both totals are printed to stdout.
 */
int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count) {
	gzFile fp;
	int count = 0;
	int excluded = 0;
	int is_fasta = 0; /* assume fastq */
	kseq_t *seq;

	/* open fasta file */
	fp = gzopen(input_file,"r");
	if (!fp) {
		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
		exit(EXIT_FAILURE);
	}

	seq = kseq_init(fp);

	/* determine file type: peek at the first record, then start over */
	(void) kseq_read(seq);
	is_fasta = seq->qual.s == NULL ? 1 : 0;
	gzrewind(fp);
	kseq_rewind(seq); /* rewind to beginning for main loop */

	if (verbose_flag) {
		if (is_fasta)
			fprintf(stderr, "Input is FASTA format\n");
		else
			fprintf(stderr, "Input is FASTQ format\n");
	}

	/* walk every record and test its header against the regex */
	while (kseq_read(seq) >= 0) {
		int matched = search_header(re, re_extra, seq->name.s) ? 1 : 0;

		/*
		 * Only look at the comment when this record actually has one.
		 * kseq_read() resets comment.l to 0 for each record but leaves
		 * comment.s untouched, so for a comment-less record the buffer is
		 * either NULL or still holds the previous record's comment.
		 */
		if (!matched && seq->comment.l > 0 &&
		    search_header(re, re_extra, seq->comment.s))
			matched = 1;

		/* include mode keeps matches; exclude mode keeps non-matches */
		if (exclude ? !matched : matched)
			count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
		else
			excluded++;
	} /* end of seq traversal */

	kseq_destroy(seq);
	gzclose(fp); /* done reading file so close */

	if (just_count) {
		fprintf(stdout, "Total output: %i\n", count);
		fprintf(stdout, "Total excluded: %i\n", excluded);
	}
	return count;
}
/*
 * The gateway function.
 *
 * Inputs  (prhs): [0] file name (char array), [1] max_gap_fraction (scalar).
 * Outputs (plhs): [0] N, [1] M (both 1x1 doubles), [2] Z, an M-by-N double
 *                 matrix.
 *
 * The file is opened through zlib and scanned twice with kseq:
 * parse_seq_pass1() determines N, M and the index arrays, then, after a
 * rewind, parse_seq_pass2() fills Z.  Exactly two inputs and three outputs
 * are required; any violation raises a MEX error.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    char * filename;
    double max_gap_fraction;
    gzFile fp;
    kseq_t *seq;
    int N, M;
    double * N_ptr;
    double * M_ptr;
    double * Z_ptr;
    double ** Z;
    int * inds;
    int * zinds;
    int i;

    /* check for proper number of arguments */
    if (nrhs != 2) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:nrhs", "Two inputs required: filename, max_gap_fraction.");
    }
    if (nlhs != 3) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:nlhs", "Three outputs required: N, M, Z.");
    }

    /* get the input file name; mxArrayToString yields NULL for non-char input */
    filename = mxArrayToString(prhs[0]);
    if (filename == NULL) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:filename", "First input must be a character array (file name).");
    }

    fp = gzopen(filename, "r");

    if (fp == Z_NULL) {
        mexErrMsgIdAndTxt("read_alignemnt_fasta:open_file", "Error opening file");
    }

    seq = kseq_init(fp);

    /* get the max_gap_fraction value */
    max_gap_fraction = mxGetScalar(prhs[1]);

    /* create the outputs N, M */
    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
    plhs[1] = mxCreateDoubleMatrix(1, 1, mxREAL);

    N_ptr = mxGetPr(plhs[0]);
    M_ptr = mxGetPr(plhs[1]);

    /* first pass: sizes and index arrays */
    parse_seq_pass1(seq, &inds, &zinds, &N, &M, max_gap_fraction);

    *N_ptr = (double) N;
    *M_ptr = (double) M;

    /* create the output matrix Z */
    plhs[2] = mxCreateDoubleMatrix(M, N, mxREAL);

    /*
     * Build a table of N pointers, one per block of M consecutive doubles
     * in the output buffer, so that pass 2 can address Z as Z[i][j].
     * The table holds pointers, hence sizeof *Z rather than sizeof(double).
     */
    Z_ptr = mxGetPr(plhs[2]);
    Z = malloc((size_t) N * sizeof *Z);
    if (Z == NULL && N > 0) {
        /* mexErrMsgIdAndTxt does not return: release non-MATLAB resources first */
        kseq_destroy(seq);
        gzclose(fp);
        free(inds);
        mexErrMsgIdAndTxt("read_alignemnt_fasta:alloc", "Out of memory");
    }
    for (i = 0; i < N; ++i) {
        Z[i] = Z_ptr;
        Z_ptr += M;
    }

    /* second pass starts again from the top of the file */
    gzrewind(fp);
    kseq_rewind(seq);

    parse_seq_pass2(seq, Z, inds, zinds);

    /* release memory */
    kseq_destroy(seq);
    gzclose(fp);

    mxFree(filename);

    free(Z);

    free(inds);
    /* NOTE(review): zinds is filled in by parse_seq_pass1 alongside inds but
     * is never released here -- confirm whether it needs a matching free(). */
}
Example #3
0
/*
 * Return 1 when len satisfies the optional length bounds, 0 otherwise.
 * A bound that is <= 0 is treated as "not set" and is ignored.
 */
static int pbn_length_ok(size_t len, int min, int max) {
	if (min > 0 && len < (size_t) min)
		return 0;
	if (max > 0 && len > (size_t) max)
		return 0;
	return 1;
}

/*
 * Print one record.  Without convert the record is written in the input's
 * own format; with convert the opposite printer is used.
 */
static void pbn_print_record(kseq_t *seq, int is_fasta, int length, int convert) {
	int as_fasta = convert ? !is_fasta : is_fasta;

	if (as_fasta)
		print_fasta_seq(seq, length);
	else
		print_fastq_seq(seq);
}

/*
 * pull_by_name - select records from a (possibly gzipped) FASTA/FASTQ file
 * by looking each record name up in a set of names read from names_fp.
 *
 * Every line of names_fp is run through parse_name() and, when that yields a
 * name, stored with add_name().  The sequence file is then scanned:
 *
 *   exclude == 0: records whose name is found are candidates for output;
 *                 records not found are skipped silently.
 *   exclude != 0: records whose name is found are tallied as excluded;
 *                 all other records are candidates for output.
 *
 * A candidate is counted (and, unless just_count is set, printed) when its
 * sequence length passes the min/max bounds; a bound <= 0 is ignored.
 * length is forwarded to print_fasta_seq(); convert swaps the printer.
 *
 * Returns the number of records counted.  Exits the process on allocation
 * failure or when input_file cannot be opened.  The name table is released
 * with delete_hash() before returning.
 */
int pull_by_name(char *input_file, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count) {
	gzFile fp;
	int capacity = 80;
	int count = 0, excluded = 0;
	int is_fasta = -1;
	char *fasta_name;
	char *line;
	kseq_t *seq;

	/* initial line buffer handed to getl() */
	line = malloc(sizeof(char) * capacity);
	if (!line) {
		fprintf(stderr, "%s - line malloc: %s\n",progname, strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* load every parsable name into the lookup table */
	while (getl(&line,names_fp) != -1) {
		fasta_name = parse_name(line);
		if (fasta_name)
			add_name(fasta_name);
	}

	free(line); /* free up line */

	if (verbose_flag) {
		fprintf(stderr,"\n");
		fprintf(stderr,"done reading from input (%d entries)\n", hash_key_count());
	}

	/* open fasta file */
	fp = gzopen(input_file,"r");
	if (!fp) {
		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
		exit(EXIT_FAILURE);
	}

	seq = kseq_init(fp); /* initialize kseq */

	/* determine file type: peek at the first record, then start over */
	(void) kseq_read(seq);
	is_fasta = seq->qual.s == NULL ? 1 : 0;
	gzrewind(fp); /* rewind to beginning for main loop */
	kseq_rewind(seq);

	if (verbose_flag) {
		if (is_fasta)
			fprintf(stderr, "Input is FASTA format\n");
		else
			fprintf(stderr, "Input is FASTQ format\n");
	}

	/* walk every record and decide whether it is output */
	while (kseq_read(seq) >= 0) {
		if (find_name(seq->name.s)) {
			if (exclude) { /* listed names are dropped in exclude mode */
				excluded++;
				continue;
			}
		} else if (!exclude) {
			/* include mode: unlisted names are skipped without being tallied */
			continue;
		}

		if (!pbn_length_ok(seq->seq.l, min, max))
			continue;

		count++;
		if (!just_count)
			pbn_print_record(seq, is_fasta, length, convert);
	}
	kseq_destroy(seq);
	gzclose(fp); /* done reading file */

	delete_hash(); /* free the list nodes */

	if (just_count) {
		fprintf(stdout, "Total output: %i\n", count);
		if (exclude)
			fprintf(stdout, "Total excluded: %i\n", excluded);
	}

	if (verbose_flag) {
		fprintf(stderr,"Processed %i entries\n",count);
		if (exclude)
			fprintf(stderr,"Excluded %i entries\n",excluded);
	}
	return count;
}