Example #1
0
/*
 * size_filter - apply the optional length bounds to one record and print it.
 *
 * seq        record whose seq->seq.l is compared against the bounds
 * is_fasta   non-zero when the input was detected as FASTA
 * min, max   length bounds; a bound that is not > 0 is ignored
 * length     forwarded unchanged to print_fasta_seq()
 * convert    non-zero swaps the output format (FASTA <-> FASTQ)
 * just_count non-zero suppresses printing; the record is only counted
 *
 * Returns 1 when the record satisfies every active bound, 0 otherwise.
 */
int size_filter(kseq_t *seq, int is_fasta, int min, int max, int length, int convert, int just_count) {
	int want_fasta;

	/* Reject as soon as an active bound is violated. */
	if (min > 0 && seq->seq.l < min) {
		return 0;
	}
	if (max > 0 && seq->seq.l > max) {
		return 0;
	}

	/* The record passed; callers asking only for a tally stop here. */
	if (just_count) {
		return 1;
	}

	/* Without convert the output format follows the input; with it, the
	 * opposite format is written. */
	want_fasta = convert ? !is_fasta : is_fasta;
	if (want_fasta) {
		print_fasta_seq(seq, length);
	} else {
		print_fastq_seq(seq);
	}
	return 1;
}
Example #2
0
/*
 * Return non-zero when seq_len satisfies the optional length bounds.
 * A bound that is not > 0 is treated as "not set".
 */
static int pbn_length_in_range(size_t seq_len, int min, int max) {
	if (min > 0 && seq_len < (size_t)min) {
		return 0;
	}
	if (max > 0 && seq_len > (size_t)max) {
		return 0;
	}
	return 1;
}

/*
 * Apply the length filter to one record and, unless just_count is set,
 * print it. Returns 1 when the record passed the filter, 0 otherwise.
 */
static int pbn_filter_and_emit(kseq_t *seq, int is_fasta, int min, int max, int length, int convert, int just_count) {
	int want_fasta;

	if (!pbn_length_in_range(seq->seq.l, min, max)) {
		return 0;
	}
	if (just_count) {
		return 1;
	}

	/* Without convert the output format follows the input; with it, the
	 * opposite format is written. */
	want_fasta = convert ? !is_fasta : is_fasta;
	if (want_fasta) {
		print_fasta_seq(seq, length);
	} else {
		print_fastq_seq(seq);
	}
	return 1;
}

/*
 * pull_by_name - select records from a sequence file by name.
 *
 * Every line read from names_fp is passed through parse_name() and the
 * result registered with add_name(). input_file is then opened with
 * gzopen() and scanned with kseq_read(); each record name is looked up
 * with find_name().
 *
 * input_file  path handed to gzopen()
 * names_fp    open stream providing the names, one getl() call per entry
 * min, max    length bounds on seq->seq.l; a bound that is not > 0 is ignored
 * length      forwarded unchanged to print_fasta_seq()
 * exclude     0: keep records whose name was listed;
 *             non-zero: keep records whose name was NOT listed
 * convert     non-zero swaps the output format (FASTA <-> FASTQ)
 * just_count  non-zero suppresses record output and prints totals to stdout
 *
 * Returns the number of records that passed both the name and length
 * filters. Exits with EXIT_FAILURE when the line buffer cannot be
 * allocated, the input cannot be opened, or the input cannot be rewound
 * after the format probe.
 */
int pull_by_name(char *input_file, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count) {
	gzFile fp;
	int l;
	int listed;
	int capacity = 80;
	int count = 0, excluded = 0;
	int is_fasta;
	char *fasta_name;
	char *line;
	kseq_t *seq;

	/* Initial line buffer; getl() receives &line, so it presumably grows
	 * or replaces the buffer as needed (TODO confirm against getl). */
	line = malloc(sizeof(char) * capacity);
	if (!line) {
		fprintf(stderr, "%s - line malloc: %s\n",progname, strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* Load every usable name into the lookup table. */
	while (getl(&line, names_fp) != -1) {
		fasta_name = parse_name(line);
		if (fasta_name) {
			add_name(fasta_name);
		}
	}

	free(line);

	if (verbose_flag) {
		fprintf(stderr,"\n");
		fprintf(stderr,"done reading from input (%d entries)\n", hash_key_count());
	}

	/* open sequence file */
	fp = gzopen(input_file,"r");
	if (!fp) {
		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
		exit(EXIT_FAILURE);
	}

	seq = kseq_init(fp);

	/* Probe the first record: a record without a quality buffer is
	 * taken to mean FASTA input. */
	l = kseq_read(seq);
	is_fasta = seq->qual.s == NULL ? 1 : 0;

	/* Rewind so the main loop sees the first record again. If the rewind
	 * fails (e.g. non-seekable input) the probed data would be silently
	 * dropped, so treat that as fatal instead of producing partial output. */
	if (gzrewind(fp) == -1) {
		fprintf(stderr,"%s - Couldn't rewind %s after format detection\n",progname,input_file);
		exit(EXIT_FAILURE);
	}
	kseq_rewind(seq);

	if (verbose_flag) {
		if (is_fasta)
			fprintf(stderr, "Input is FASTA format\n");
		else
			fprintf(stderr, "Input is FASTQ format\n");
	}

	/* Walk every record and apply the name filter, then the length filter. */
	while ((l = kseq_read(seq)) >= 0) {
		listed = find_name(seq->name.s) != 0;

		if (exclude == 0) {
			/* INCLUDE mode: only listed names are candidates. */
			if (listed) {
				count += pbn_filter_and_emit(seq, is_fasta, min, max, length, convert, just_count);
			}
		} else if (listed) {
			/* EXCLUDE mode: listed names are dropped and tallied. */
			excluded++;
		} else {
			count += pbn_filter_and_emit(seq, is_fasta, min, max, length, convert, just_count);
		}
	}

	/* kseq_read() reports end-of-file as -1; anything lower is a read or
	 * format error (e.g. truncated quality string), so surface it. */
	if (l < -1) {
		fprintf(stderr,"%s - error reading %s (kseq_read returned %d); input may be truncated\n",progname,input_file,l);
	}

	kseq_destroy(seq);
	gzclose(fp);

	delete_hash(); /* release the name table */

	if (just_count) {
		fprintf(stdout, "Total output: %i\n", count);
		if (exclude)
			fprintf(stdout, "Total excluded: %i\n", excluded);
	}

	if (verbose_flag) {
		fprintf(stderr,"Processed %i entries\n",count);
		if (exclude)
			fprintf(stderr,"Excluded %i entries\n",excluded);
	}
	return count;
}