示例#1
0
/*
 * Compute the 5' and 3' trimming positions of one FASTQ record with a
 * sliding window over its quality string.
 *
 * The window spans 10% of the read (the whole read when it is shorter than
 * 10 bases). The 5' cut is the first base at or above qual_threshold inside
 * the first window whose average reaches the threshold; the 3' cut is the
 * first base below qual_threshold inside the first subsequent window whose
 * average falls below it.
 *
 * Parameters:
 *   fqrec            record to examine; seq and qual are read, nothing is
 *                    modified here
 *   qualtype         forwarded unchanged to get_quality_num()
 *   length_threshold minimum number of bases that must remain after trimming
 *   qual_threshold   quality value compared against window averages and
 *                    individual bases
 *   no_fiveprime     non-zero disables the 5' search (5' cut stays at 0)
 *   discard_n        non-zero discards any read containing 'N' or 'n'
 *
 * Returns a malloc'ed cutsites holding the two cut positions, or -1 in both
 * fields when the read should be discarded. Returns NULL if the allocation
 * fails. This function never frees the result; presumably the caller does.
 *
 * NOTE(review): get_quality_num() is defined elsewhere; it is assumed to map
 * a quality character to its numeric score -- confirm against its definition.
 */
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int discard_n) {

	/* Work with signed lengths so comparisons against the int parameters */
	/* and the subtraction in the loop bound cannot wrap around. */
	const int seq_len = (int) fqrec->seq.l;
	const int qual_len = (int) fqrec->qual.l;
	int window_size = (int) (0.1 * seq_len);
	int window_total = 0;
	int three_prime_cut = seq_len;
	int five_prime_cut = 0;
	int found_five_prime = 0;
	int i, j;
	double window_avg;
	cutsites *retvals;

	/* Reads shorter than 10 bases get a single window covering the read. */
	if (window_size == 0) window_size = seq_len;

	/* Discard up front when: */
	/*  - the read is empty (the window average would be 0/0), */
	/*  - the read is already shorter than the length threshold, */
	/*  - the quality string is shorter than one window (priming the window */
	/*    below would read past its end), */
	/*  - N-discarding is requested and the sequence contains an N. */
	if (window_size == 0 ||
			seq_len < length_threshold ||
			qual_len < window_size ||
			(discard_n && strpbrk (fqrec->seq.s, "Nn") != NULL)) {
		three_prime_cut = -1;
		five_prime_cut = -1;
		goto done;
	}

	/* Prime the running total with the first window. */
	for (i = 0; i < window_size; i++) {
		window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
	}

	/* i is the index of the first base of the current window. */
	for (i = 0; i <= qual_len - window_size; i++) {

		window_avg = (double) window_total / (double) window_size;

		/* A first window that is already good means the 5' cut is at 0. */
		if (i == 0 && window_avg >= qual_threshold) {
			found_five_prime = 1;
		}

		/* 5' cutoff: first window whose average reaches the threshold. */
		if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) {

			/* Cut at the first base in the window that meets the threshold. */
			for (j = i; j < i + window_size; j++) {
				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) {
					five_prime_cut = j;
					break;
				}
			}

			found_five_prime = 1;
		}

		/* 3' cutoff: first window whose average drops below the threshold. */
		/* When 5' trimming is disabled the search is active from the very */
		/* first window, so a poor start trims the read instead of making */
		/* the 3' search unreachable. */
		if (window_avg < qual_threshold && (found_five_prime == 1 || no_fiveprime)) {

			/* An average below the threshold guarantees at least one base */
			/* below it, so this always finds a position. */
			for (j = i; j < i + window_size; j++) {
				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) {
					three_prime_cut = j;
					break;
				}
			}

			break;
		}

		/* Slide by one base: drop the leading quality, add the next one. */
		window_total -= get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
		if (i + window_size < qual_len) {
			window_total += get_quality_num (fqrec->qual.s[i + window_size], qualtype, fqrec, i + window_size);
		}
	}

	/* Discard when no 5' cut was found although one was wanted, or when */
	/* the surviving span is too short. The length test runs on every path, */
	/* including reads whose quality never dropped again after a late 5' cut. */
	if ((found_five_prime == 0 && !no_fiveprime) ||
			(three_prime_cut - five_prime_cut < length_threshold)) {
		three_prime_cut = -1;
		five_prime_cut = -1;
	}

done:
	retvals = malloc (sizeof *retvals);
	if (retvals == NULL) {
		return NULL;
	}
	retvals->three_prime_cut = three_prime_cut;
	retvals->five_prime_cut = five_prime_cut;
	return (retvals);
}
示例#2
0
文件: sliding.c 项目: jdidion/sickle
/*
 * Compute the 5' and 3' trimming positions of one FASTQ record with a
 * sliding window over its quality string, optionally truncating at the
 * first N and to a fixed maximum length.
 *
 * The window spans 10% of the read (the whole read when it is shorter than
 * 10 bases). The 5' cut is the first base at or above qual_threshold inside
 * the first window whose average reaches the threshold; the 3' cut is the
 * first base below qual_threshold inside the first subsequent window whose
 * average falls below it.
 *
 * Parameters:
 *   fqrec            record to examine; seq, qual and (in debug mode) name
 *                    are read, nothing is modified here
 *   qualtype         forwarded unchanged to get_quality_num()
 *   length_threshold minimum number of bases that must remain after trimming
 *   qual_threshold   quality value compared against window averages and
 *                    individual bases
 *   no_fiveprime     non-zero disables the 5' search (5' cut stays at 0)
 *   trunc_n          non-zero moves the 3' cut to just before the first
 *                    'N'/'n' when that lies before the quality-based cut
 *   trunc_size       maximum kept length; negative disables truncation,
 *                    values below length_threshold are raised to it
 *   debug            non-zero prints trace output to stdout
 *
 * Returns a malloc'ed cutsites holding the two cut positions, or -1 in both
 * fields when the read should be discarded. Returns NULL if the allocation
 * fails. This function never frees the result; presumably the caller does.
 *
 * NOTE(review): get_quality_num() is defined elsewhere; it is assumed to map
 * a quality character to its numeric score -- confirm against its definition.
 */
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug) {

	/* Work with signed lengths so comparisons against the int parameters */
	/* and the subtraction in the loop bound cannot wrap around. */
	const int seq_len = (int) fqrec->seq.l;
	const int qual_len = (int) fqrec->qual.l;
	int window_size = (int) (0.1 * seq_len);
	int window_total = 0;
	int three_prime_cut = seq_len;
	int five_prime_cut = 0;
	int found_five_prime = 0;
	int i, j;
	double window_avg;
	cutsites *retvals;
	char *npos;

	/* Never truncate below the minimum acceptable read length. */
	if (trunc_size >= 0 && trunc_size < length_threshold) {
		trunc_size = length_threshold;
	}

	/* Reads shorter than 10 bases get a single window covering the read. */
	if (window_size == 0) window_size = seq_len;

	/* Discard up front when: */
	/*  - the read is empty (the window average would be 0/0), */
	/*  - the read is already shorter than the length threshold, */
	/*  - the quality string is shorter than one window (priming the window */
	/*    below would read past its end). */
	if (window_size == 0 ||
			seq_len < length_threshold ||
			qual_len < window_size) {
		three_prime_cut = -1;
		five_prime_cut = -1;
		goto done;
	}

	/* Prime the running total with the first window. */
	for (i = 0; i < window_size; i++) {
		window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
	}

	/* i is the index of the first base of the current window. */
	for (i = 0; i <= qual_len - window_size; i++) {

		window_avg = (double) window_total / (double) window_size;

		if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg);

		/* 5' cutoff: first window whose average reaches the threshold. */
		if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) {

			if (debug) printf ("inside 5-prime cut\n");

			/* Cut at the first base in the window that meets the threshold. */
			for (j = i; j < i + window_size; j++) {
				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) {
					five_prime_cut = j;
					break;
				}
			}

			if (debug) printf ("five_prime_cut: %d\n", five_prime_cut);

			found_five_prime = 1;
		}

		/* 3' cutoff: first window whose average drops below the threshold, */
		/* considered once the 5' cut is known or 5' trimming is disabled. */
		if (window_avg < qual_threshold && (found_five_prime == 1 || no_fiveprime)) {

			/* An average below the threshold guarantees at least one base */
			/* below it, so this always finds a position. */
			for (j = i; j < i + window_size; j++) {
				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) {
					three_prime_cut = j;
					break;
				}
			}

			break;
		}

		/* Slide by one base: drop the leading quality, add the next one. */
		window_total -= get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
		if (i + window_size < qual_len) {
			window_total += get_quality_num (fqrec->qual.s[i + window_size], qualtype, fqrec, i + window_size);
		}
	}

	/* With N-truncation enabled, end the read just before the first N of */
	/* either case. The cut is only ever moved towards the 5' end: an N */
	/* lying beyond the quality-based cut must not extend the read back */
	/* into low-quality bases. */
	if (trunc_n) {
		npos = strpbrk (fqrec->seq.s, "Nn");
		if (npos != NULL) {
			int n_index = (int) (npos - fqrec->seq.s);
			if (n_index < three_prime_cut) {
				three_prime_cut = n_index;
			}
		}
	}

	/* Discard when no 5' cut was found although one was wanted, or when */
	/* the surviving span is shorter than the length threshold. */
	if ((found_five_prime == 0 && !no_fiveprime) || (three_prime_cut - five_prime_cut < length_threshold)) {
		three_prime_cut = -1;
		five_prime_cut = -1;

		if (debug) printf("%s\n", fqrec->name.s);
	}

	if (debug) printf ("\n\n");

	/* Shrink an over-long read to trunc_size by removing bases alternately */
	/* from the 5' and 3' ends, 5' first; an odd excess therefore costs the */
	/* 5' end one base more. A discarded read (-1/-1) has no excess. */
	/* TODO: this should actually remove bases based on quality */
	if (trunc_size >= 0) {
		int excess = (three_prime_cut - five_prime_cut) - trunc_size;
		if (excess > 0) {
			five_prime_cut += (excess + 1) / 2;
			three_prime_cut -= excess / 2;
		}
	}

done:
	retvals = malloc (sizeof *retvals);
	if (retvals == NULL) {
		return NULL;
	}
	retvals->three_prime_cut = three_prime_cut;
	retvals->five_prime_cut = five_prime_cut;
	return (retvals);
}