void testRevcompRepresentative() {
  list<Sequence> reads = Fasta("../../data/representative_revcomp.fa").getAll();

  KmerRepresentativeComputer krc(reads, "##############");
  krc.setOptions(false, 3, 0.5);
  krc.setCoverageReferenceLength(50);
  krc.compute();
  Sequence representative = krc.getRepresentative();

  // Computing reads revcomp
  for (list <Sequence>::iterator it = reads.begin(); it != reads.end(); it++) {
    it->sequence = revcomp(it->sequence);
  }

  KmerRepresentativeComputer krc2(reads, "##############");
  krc2.setOptions(false, 3, 0.5);
  krc2.setCoverageReferenceLength(50);
  krc2.compute();
  Sequence representative2 = krc2.getRepresentative();

  // Check position of [ in label, so that we remove that part, and then we
  // can compare the labels
  size_t pos1 = representative.label.find_first_of('[');
  size_t pos2 = representative2.label.find_first_of('[');

  TAP_TEST(representative.label.substr(0, pos1) == representative2.label.substr(0, pos2), TEST_KMER_REPRESENTATIVE_REVCOMP,
           "The two representatives should have the same label");

  TAP_TEST(revcomp(representative.sequence) == representative2.sequence, TEST_KMER_REPRESENTATIVE_REVCOMP,
           "The two representatives should have the same sequence (but revcomp-ed)");

}
Exemple #2
0
/**
 * Returns the label associated with a window, or "" when there is none.
 *
 * Each entry of windows_labels is tried in iteration order. An entry matches
 * when the shorter of (entry sequence, window) occurs inside the longer one,
 * either directly or as its reverse complement. The label of the first
 * matching entry is returned.
 *
 * @param window the window sequence to look up
 * @return the label of the first matching entry, or an empty string
 */
string WindowsStorage::getLabel(junction window) {
  // Iterate by const reference: the entries are only read, so copying each
  // pair (and its key a second time) on every iteration is unnecessary.
  for (const auto &entry : windows_labels) {
    const string &sequence_of_interest = entry.first;
    bool found;
    if (sequence_of_interest.size() < window.size()) {
      // The stored sequence is shorter: look for it inside the window.
      found = window.find(sequence_of_interest) != string::npos
        || window.find(revcomp(sequence_of_interest)) != string::npos;
    } else {
      // The window is shorter (or of equal size): look for it inside the
      // stored sequence.
      found = sequence_of_interest.find(window) != string::npos
        || sequence_of_interest.find(revcomp(window)) != string::npos;
    }
    if (found)
      return entry.second;
  }
  return "";
}
Exemple #3
0
/**
 * Tests whether the gap between two exons is flanked by consensus splice
 * dinucleotides.
 *
 * The left dinucleotide is read at genome_seq[prev_exon_rend] and
 * genome_seq[prev_exon_rend + 1]; the right one at
 * genome_seq[next_exon_lend - 3] and genome_seq[next_exon_lend - 2].
 * For strand '-', the two dinucleotides are swapped and reverse-complemented
 * before the comparison.
 *
 * @param prev_exon_rend  right end coordinate of the upstream exon
 * @param next_exon_lend  left end coordinate of the downstream exon
 * @param genome_seq      genome sequence the coordinates refer to
 * @param strand          '-' triggers the reverse-complement handling
 * @return 'N' when the boundaries are GT/AG, GC/AG or CT/AC, 'D' otherwise
 *         (including when the gap is too close to an end of the sequence for
 *         both dinucleotides to be read)
 * @throws std::string when a coordinate is greater than the sequence length
 */
char check_intron_consensus(unsigned long prev_exon_rend, unsigned long next_exon_lend,
                            const string &genome_seq, char strand)
{
    // size_t rather than unsigned int: avoids truncating the length of very
    // long sequences.
    const size_t genome_seq_length = genome_seq.length();

    if (prev_exon_rend > genome_seq_length || next_exon_lend > genome_seq_length) {
        stringstream errmsg;
        errmsg << "Error, coordinates: " << prev_exon_rend << " and " << next_exon_lend
            << " are not entirely within genome sequence length: " << genome_seq_length;

        throw (errmsg.str());
    }

    // The check above does not cover every index read below:
    // prev_exon_rend + 1 must be a valid index, and next_exon_lend - 3 must
    // not wrap around. Without both dinucleotides there cannot be a
    // consensus boundary, so report 'D' instead of reading out of range.
    if (prev_exon_rend + 2 > genome_seq_length || next_exon_lend < 3) {
        return ('D');
    }

    string left_dinuc = genome_seq.substr(prev_exon_rend, 2);
    string right_dinuc = genome_seq.substr(next_exon_lend - 3, 2);

    if (strand == '-') {
        // On the minus strand the roles are exchanged: the new left
        // dinucleotide is the reverse complement of the old right one, and
        // vice versa.
        string left_dinuc_copy = left_dinuc;
        string right_dinuc_copy = right_dinuc;

        left_dinuc = revcomp(right_dinuc_copy);
        right_dinuc = revcomp(left_dinuc_copy);
    }

    const bool has_consensus =
        ((left_dinuc == "GT" || left_dinuc == "GC") && right_dinuc == "AG")
        || (left_dinuc == "CT" && right_dinuc == "AC");

    if (has_consensus) {
        return ('N'); // has proper splice boundaries.
    }
    return ('D');
}
Exemple #4
0
/* init_aa0: prepares the query sequence buffers before searching.
 *
 *  aa0          array of 6 query pointers; aa0[0] must already hold the
 *               query and have room for n0+SEQ_PAD bytes (the padding is
 *               written here).  aa0[1..5] are pointed at aa0[0]; when
 *               qframe == 2, aa0[1] is replaced by a newly allocated
 *               reverse-complement copy.
 *  n0           query length
 *  nm0          passed through to qshuffle()
 *  aa0s         receives a newly allocated shuffled copy of the query when
 *               qshuffle_flg is set
 *  aa1s         always receives a newly allocated buffer of max_tot+1 bytes;
 *               the returned pointer is advanced past a leading 0 byte
 *  qframe       2 requests the reverse-complement copy
 *  ppst, f_str, qf_str   passed to init_work()
 *  my_rand_state         passed to qshuffle()
 *
 * Every allocation failure prints a message to stderr and calls exit(1).
 */
void init_aa0(unsigned char **aa0, int n0, int nm0,
	      unsigned char **aa0s, unsigned char **aa1s, 
	      int qframe, int qshuffle_flg, int max_tot,
	      struct pstruct *ppst, void **f_str, void **qf_str,
	      void *my_rand_state) {
  int id;

  /* note that aa[5,4,3,2] are never used, but are provided so that frame
     can range from 0 .. 5; likewise for f_str[5..2] */

  aa0[5] = aa0[4] = aa0[3] = aa0[2] = aa0[1] = aa0[0];

  /* zero out for SSE2/ALTIVEC -- make sure this is ALWAYS done */
  for (id=0; id < SEQ_PAD; id++) aa0[0][n0+id] = '\0';

  init_work (aa0[0], n0, ppst, &f_str[0]);
  f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0];

  if (qframe == 2) {
    if ((aa0[1]=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(unsigned char)))==NULL) {
      fprintf(stderr," cannot allocate aa01[%d]\n", n0);
      /* aa0[1] is dereferenced immediately below, so continuing with a
         NULL pointer is not an option */
      exit(1);
    }
    /* keep a 0 byte in front of the sequence, then step past it */
    *aa0[1]='\0';
    aa0[1]++;
    memcpy(aa0[1],aa0[0],n0+1);
    /* for ALTIVEC/SSE2, must pad with 16 NULL's */
    for (id=0; id<SEQ_PAD; id++) {aa0[1][n0+id]=0;}
    revcomp(aa0[1],n0,ppst->c_nt);
    init_work (aa0[1], n0, ppst, &f_str[1]);
  }

  if (qshuffle_flg) {
    if ((*aa0s=(unsigned char *)calloc(n0+2+SEQ_PAD,sizeof(char)))==NULL) {
      fprintf(stderr,"cannot allocate aa0s[%d]\n",n0+2);
      exit(1);
    }
    /* same layout as aa0[1]: leading 0 byte, then the sequence */
    **aa0s='\0';
    (*aa0s)++;
    memcpy(*aa0s,aa0[0],n0);
    qshuffle(*aa0s,n0,nm0, my_rand_state);
    /* for SSE2/ALTIVEC, must pad with 16 NULL's */
    for (id=0; id<SEQ_PAD; id++) {(*aa0s)[n0+id]=0;}
    init_work (*aa0s, n0, ppst, qf_str);
  }

  /* always allocate shuffle space */
  if((*aa1s=(unsigned char *)calloc(max_tot+1,sizeof(char))) == NULL) {
    fprintf(stderr,"unable to allocate shuffled library sequence [%d]\n", max_tot);
    exit(1);
  }
  else {
    **aa1s=0;
    (*aa1s)++;
  }
}
Exemple #5
0
// Builds a Sequence from this segmenter. label_full is always set from info.
// When the read is segmented, the label gets a " +" or " -" suffix depending
// on `reversed`, and the sequence goes through revcomp(sequence, reversed).
// Otherwise the label is left untouched and the sequence is copied as is.
Sequence Segmenter::getSequence() const {
  Sequence result;
  result.label_full = info;

  if (!segmented) {
    result.sequence = sequence;
    return result;
  }

  const char *strand_mark = reversed ? "-" : "+";
  result.label = label + " " + strand_mark;
  result.sequence = revcomp(sequence, reversed);
  return result;
}
Exemple #6
0
/**
 * @brief Builds the string `revcomp(s) # s`: the reverse complement of the
 * master string, a `#` separator, then the master string itself together
 * with the byte found at `s[len]`.
 * @param s The master string; `len + 1` bytes are read from it.
 * @param len Its length.
 * @return A buffer of `2 * len + 2` bytes holding the concatenation, or NULL
 * if `s` is NULL.
 */
char *catcomp(char *s, size_t len) {
	if (s == NULL) {
		return NULL;
	}

	/* Grow the reverse-complement buffer so that the separator and a copy
	 * of the master string fit behind it. */
	char *joined = realloc(revcomp(s, len), 2 * len + 2);
	CHECK_MALLOC(joined);

	joined[len] = '#';
	memcpy(joined + len + 1, s, len + 1);

	return joined;
}
Exemple #7
0
/* prepareSeq: prepares sequence string for analysis by shustring-type programs.
 * Does the following: 1) set all residues to upper case
 *                     2) generate reverse complement
 *                     3) concatenate reverse complement to end of forward strand
 * e.g. if the string of the original seq. is ACCGZ\0, (Z for the border)
 * then the new one which includes the reversed complement seq. looks like this: ACCGZCGGTZ\0
 *
 * On return sequence->len is twice its previous value, sequence->borders
 * holds 2 * numSeq entries, every freqTab count for T, C, A, G (upper and
 * lower case) is doubled, numNuc is twice the sum of those counts before
 * doubling, and numSbjctNuc is doubled.
 */
void prepareSeq(Sequence *sequence){
  Sequence *rstrand;
  Int64 i, j;
  const char *nuc = "TCAGtcag";

  strtoupper(sequence->seq);
  /* take care of reverse strand */
  rstrand = revcomp(sequence); /* reverse and complement a sequence */
  /* give rstrand minimal headers/borders allocations before it is handed
     to freeSequence() below */
  rstrand->headers = (char **)emalloc(sizeof(char *));
  rstrand->headers[0] = (char *)emalloc(sizeof(char));
  rstrand->borders = (Int64 *)emalloc(sizeof(Int64));
  rstrand->freqTab = NULL;
  rstrand->numSeq = 1;
  sequence->seq[sequence->len] = '\0';
  sequence->len += sequence->len; /* new seq. length = 2 x original size */
  sequence->seq = (char *)erealloc(sequence->seq,(size_t)(sequence->len+1)*sizeof(char));
  /* number of borders = 2 x original size */
  sequence->borders = (Int64 *)erealloc(sequence->borders, 2*(size_t)sequence->numSeq * sizeof(Int64));
  /* adjust the border values */
  for(i=1;i<sequence->numSeq;i++){
    /* seq. looks like this: F1 F2 .. Fn Rn .. R2 R1 */
    sequence->borders[2*sequence->numSeq-i-1] = sequence->len - sequence->borders[i-1] - 2;
  }
  sequence->borders[2*sequence->numSeq-1] = sequence->len - 1;
  /* move first border of reverted sequences to the end */
  rstrand->seq++;  /* since the last border of the original seq is the first char of the reversed seq */
  //strncat(sequence->seq,rstrand->seq,(size_t)sequence->len); ??
  strncat(sequence->seq,rstrand->seq,(size_t)sequence->len / 2);
  rstrand->seq--; /* return the pointer */
  sequence->seq[sequence->len-1] = BORDER;
  sequence->seq[sequence->len] = '\0';
  freeSequence(rstrand);
  sequence->numNuc = 0;
  for(i = 0; i < 8; i++) {
    //sequence->numNuc += sequence->freqTab[(int)nuc[i]];
    /* Iterate over the per-sequence tables. The bound used to be
       sequence->numNuc, which has just been reset to 0, so the loop never
       executed and neither the total nor the tables were updated. */
    for (j = 0; j < sequence->numSeq; j ++) {
      sequence->numNuc += sequence->freqTab[j][(Int64)nuc[i]];
      sequence->freqTab[j][(Int64)nuc[i]] *= 2; /* fwd and rev strand */
    }
  }
  sequence->numNuc *= 2;
  sequence->numSbjctNuc *= 2;
}
Exemple #8
0
/**
 * @brief Indexes the first sequence as the subject and compares every other
 * sequence against it, printing a header line per comparison to stdout.
 * Depending on FLAGS, the forward strand, the reverse complement, or both
 * are compared.
 * @param sequences - An array of pointers to the sequences.
 * @param n - The number of sequences.
 */
void run(seq_t *sequences, size_t n) {
	const size_t subject_idx = 0;
	seq_t *subject = &sequences[subject_idx];
	esa_s E;

	// Build the subject data first, then the index on top of it.
	if (seq_subject_init(subject) || esa_init(&E, subject)) {
		errx(1, "Failed to create index for %s.", subject->name);
	}

	// Every sequence except the subject itself acts as a query.
	for (size_t q = 0; q < n; q++) {
		if (q == subject_idx) {
			continue;
		}

		// TODO: Provide a nicer progress indicator.
		if (FLAGS & F_EXTRA_VERBOSE) {
#pragma omp critical
			{ fprintf(stderr, "comparing %zu and %zu\n", subject_idx, q); }
		}

		seq_t *query = &sequences[q];
		size_t query_len = query->len;

		if (FLAGS & F_FORWARD) {
			printf("> %s\n", query->name);
			dist_anchor(&E, query->S, query_len, subject->gc);
		}

		if (FLAGS & F_REVCOMP) {
			// The reverse complement is a temporary buffer released right
			// after the comparison.
			char *reverse = revcomp(query->S, query_len);

			printf("> %s Reverse\n", query->name);
			dist_anchor(&E, reverse, query_len, subject->gc);
			free(reverse);
		}
	}

	esa_free(&E);
	seq_subject_free(subject);
}
Exemple #9
0
// Unit test for complement() and revcomp() on literal nucleotide strings.
void testRevcomp() {
  // Mixed-case input ending with 'n': the expected complement keeps the
  // input order and is entirely upper case.
  TAP_TEST(complement("AATCAGactgactagATCGAn") == "TTAGTCTGACTGATCTAGCTN", TEST_REVCOMP, "");
  // Same input: the expected reverse complement is the string above read
  // backwards.
  TAP_TEST(revcomp("AATCAGactgactagATCGAn") == "NTCGATCTAGTCAGTCTGATT", TEST_REVCOMP, "");
  // An empty input is expected to give an empty output.
  TAP_TEST(revcomp("") == "", TEST_REVCOMP, "");
  // Lower-case input is expected to give an upper-case result.
  TAP_TEST(revcomp("aaaaaa") == "TTTTTT", TEST_REVCOMP, "");
}
Exemple #10
0
/*
 * Entry point: joins paired fastq reads on their overlapping ends.
 *
 * Positional arguments are the input files (two, or three when a mate file
 * is supplied). Outputs are given with -o: either one prefix (a '%' in it is
 * replaced by the suffix, otherwise the suffix is appended), or one explicit
 * name per output (3 for two inputs, 5 for three inputs).
 *
 * For each record, the second read is reverse-complemented (unless -R) and
 * compared with the end of the first read for every overlap length from
 * -m up to the shorter read length; an overlap is accepted when its hamming
 * distance is at most -p percent of its length, and the best-scoring one is
 * used. Joined reads go to the "join" output, the others to "un1"/"un2".
 *
 * Returns 0 on success and 1 on usage or I/O errors.
 */
int main (int argc, char **argv) {
	// getopt() returns an int; storing it in a char breaks the comparison
	// with -1 on platforms where char is unsigned.
	int c;
	int mismatch = 0;
	char *in[3] = {0,0,0};
	char *out[5];
	char *orep=NULL;
	int out_n = 0;
	int in_n = 0;
	int threads = 1;				// not really necessary
	char verify='\0';

	int i;
	int mino = 6;
	int pctdiff = 8;				// this number tested well on exome data... tweak for best results
	bool omode = false;
	char *bfil = NULL;
	bool norevcomp = false;
	bool allow_ex = false;

	while (	(c = getopt (argc, argv, "-dRnbeo:t:v:m:p:r:xV")) != -1) {
		switch (c) {
		case '\1':
			// positional argument: next free input slot
			if (!in[0])
				in[0]=optarg;
			else if (!in[1])
				in[1]=optarg;
			else if (!in[2])
				in[2]=optarg;
			else {
				usage(stderr); return 1;
			}
			++in_n;
			break;
		case 'o':
			// out[] has 5 slots and three-input mode needs all of them;
			// the count is validated against the number of inputs below.
			if (out_n == 5) {
				usage(stderr); return 1;
			}
			out[out_n++] = optarg;
			break;
		case 'r': orep = optarg; break;
		case 't': threads = atoi(optarg); break;
		case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); return 0; break;
		case 'm': mino = atoi(optarg); break;
		case 'x': allow_ex = true; break;
		case 'p': pctdiff = atoi(optarg); break;
		case 'R': norevcomp = true; break;
		case 'd': ++debug; break;
		case 'v':
			if (strlen(optarg)>1) {
				fprintf(stderr, "Option -v requires a single character argument");
				exit(1);
			}
			verify = *optarg; break;
		case '?':
			if (strchr("otvmpr", optopt))
				fprintf (stderr, "Option -%c requires an argument.\n", optopt);
			else if (isprint(optopt))
				fprintf (stderr, "Unknown option `-%c'.\n", optopt);
			else
				fprintf (stderr,
					"Unknown option character `\\x%x'.\n",
					optopt);
			usage(stderr);
			return 1;
		}
	}

	// two inputs: 1 (prefix) or 3 outputs; three inputs: 1 (prefix) or 5 outputs
	if (argc < 3 || !in[1] || (!in[2] && out_n != 1 && out_n != 3) || (in[2] && out_n != 1 && out_n != 5)) {
		usage(stderr);
		return 1;
	}

	// Sized for the maximum of three inputs (in[] has three slots and in_n
	// can reach 3); two-element arrays were overrun by the mate file.
	FILE *fin[3];
	bool gzin[3]; meminit(gzin);
	for (i = 0; i < in_n; ++i) {
		fin[i] = gzopen(in[i], "r",&gzin[i]);
		if (!fin[i]) {
			fprintf(stderr, "Error opening file '%s': %s\n",in[i], strerror(errno));
			return 1;
		}
	}

	const char *suffix[5]={"un1", "un2", "join", "un3", "join2"};
	FILE *fout[5]; meminit(fout);
	bool gzout[5]; meminit(gzout);
	char *pre = out[0];
	for (i = 0; i < (in[2] ? 5 : 3); ++i) {
		// prefix out
		if (out_n == 1) {
			out[i]=(char *)malloc(strlen(pre)+10);
			strcpy(out[i], pre);
			char *p;
			if ((p=strchr(out[i], '%')) != NULL) {
				// substitute instead of append
				strcpy(p, suffix[i]);
				strcpy(p+strlen(suffix[i]), pre+(p-out[i])+1);
			} else {
				strcat(out[i], suffix[i]);
			}
		} // else explicit
		fout[i] = gzopen(out[i], "w",&gzout[i]);
		if (!fout[i]) {
			fprintf(stderr, "Error opening output file '%s': %s\n",out[i], strerror(errno));
			return 1;
		}
	}

//printf("in_n:%d in:%x fo:%x", in_n, in[3], fout[4]);
//return 1;

	FILE *frep = NULL;
	if (orep) {
		frep = fopen(orep, "w");
		// Test the handle that was just opened (orep is known to be
		// non-NULL here) and report the report file's own name.
		if (!frep) {
			fprintf(stderr, "Error opening report file '%s': %s\n",orep, strerror(errno));
			return 1;
		}
	}


	// some basic validation of the file formats
	{
		for (i=0;i<in_n;++i) {
			char c=getc(fin[i]);
			if (c != '@')  {
				fprintf(stderr, "%s doesn't appear to be a fastq file (%c)\n", in[i], c);
				return 1;
			}
			ungetc(c, fin[i]);
		}
	}

	struct fq fq[3];
	meminit(fq);

	int nrec=0;
	int nerr=0;
	int nok=0;
	int joincnt=0;
	double tlen=0;
	double tlensq=0;
	int read_ok;

	struct fq rc;
	meminit(rc);

	// read in 1 record from each file
	while ((read_ok=read_fq(fin[0], nrec, &fq[0]))) {
		for (i=1;i<in_n;++i) {
			int mate_ok=read_fq(fin[i], nrec, &fq[i]);
			if (read_ok != mate_ok) {
				fprintf(stderr, "# of rows in mate file '%s' doesn't match primary file, quitting!\n", in[i]);
				return 1;
			}
			if (verify) {
				// verify 1 in 100
				if (0 == (nrec % 100)) {
					char *p=strchr(fq[i].id.s,verify);
					if (!p) {
						fprintf(stderr, "File %s is missing id verification char %c at line %d", in[i], verify, nrec*4+1);
						return 1;
					}
					int l = p-fq[i].id.s;
					if (strncmp(fq[0].id.s, fq[i].id.s, l)) {
						fprintf(stderr, "File %s, id doesn't match file %s at line %d", in[0], in[i], nrec*4+1);
						return 1;
					}
				}
			}
		}

		++nrec;
		if (read_ok < 0) continue;

		if (debug) fprintf(stderr, "seq: %s %d\n", fq[0].seq.s, fq[0].seq.n);

		if (!norevcomp) {
			revcomp(&rc, &fq[1]);
		} else {
			rc=fq[1];
		}

		if (debug) fprintf(stderr, "comp: %s %d\n", rc.seq.s, rc.seq.n);

		// try every overlap length from mino up to the shorter read length
		int maxo = min(fq[0].seq.n, rc.seq.n);
		int bestscore=INT_MAX;
		int besto=-1;
		for (i=mino; i <= maxo; ++i) {
			int mind = (pctdiff * i) / 100;
			int d;
			d=hd(fq[0].seq.s+fq[0].seq.n-i, rc.seq.s, i);
			if (debug) fprintf(stderr, "hd: %d, %d\n", i, d);
			if (d <= mind) {
				// squared-distance over length, probably can be proven better (like pearson's)
				int score = (1000*(d*d+1))/i;
				if (score < bestscore) {
					bestscore=score;
					besto=i;
				}
			}
		}

		// with -x, also slide the shorter read along the longer one
		int hasex=0;
		if (allow_ex && besto<maxo) {
			if (fq[0].seq.n > rc.seq.n) {
				int mind = (pctdiff * maxo) / 100;
				for (i=0; i < fq[0].seq.n-maxo; ++i ) {
					int d;
					d=hd(fq[0].seq.s+fq[0].seq.n-rc.seq.n-i-1, rc.seq.s, maxo);
					if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d);
					if (d <= mind) {
						// squared-distance over length, probably can be proven better (like pearson's)
						int score = (1000*(d*d+1))/maxo;
						if (score < bestscore) {
							bestscore=score;
							// negative overlap!
							hasex=-i;
							besto=maxo;
						}
					}
				}
			} else if (fq[0].seq.n < rc.seq.n) {
				int mind = (pctdiff * maxo) / 100;
				for (i=0; i < rc.seq.n-maxo; ++i ) {
					int d;
					d=hd(fq[0].seq.s, rc.seq.s+i, maxo);
					if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d);
					if (d <= mind) {
						// squared-distance over length, probably can be proven better (like pearson's)
						int score = (1000*(d*d+1))/maxo;
						if (score < bestscore) {
							bestscore=score;
							// negative overlap!
							hasex=-i;
							besto=maxo;
						}
					}
				}
			}
		}

		if (debug) {
			fprintf(stderr, "best: %d %d\n", besto-hasex, bestscore);
		}

		FILE *fmate = NULL;
		int olen = besto-hasex;

		if (besto > 0) {
			++joincnt;

			tlen+=olen;
			tlensq+=olen*olen;

			char *sav_fqs=NULL, *sav_rcs;
			char *sav_fqq, *sav_rcq;

			if (hasex) {
				// remember the original buffers; they are restored after
				// the joined record has been written
				sav_fqs=fq[0].seq.s;
				sav_fqq=fq[0].qual.s;
				sav_rcs=rc.seq.s;
				sav_rcq=rc.qual.s;
				if (fq[0].seq.n < rc.seq.n) {
					rc.seq.s=rc.seq.s-hasex;
					rc.qual.s=rc.qual.s-hasex;
					rc.seq.n=maxo;
					rc.qual.n=maxo;
				} else {
					// fprintf(stderr, "rc negative overlap: %s %d\n", rc.seq.s, hasex);
					fq[0].seq.s=fq[0].seq.s+fq[0].seq.n-maxo+hasex-1;
					fq[0].qual.s=fq[0].qual.s+fq[0].seq.n-maxo+hasex-1;
					fq[0].seq.n=maxo;
					fq[0].qual.n=maxo;
					// fprintf(stderr, "negative overlap: %s -> %s, %d\n", fq[0].seq.s, rc.seq.s, maxo);
				}
				// ok now pretend everythings normal, 100% overlap
				//if (debug)
			}

			FILE *f=fout[2];

			if (verify) {
				// cut the id at the verification character
				char *p=strchr(fq[0].id.s,verify);
				if (p) {
					*p++ = '\n';
					*p = '\0';
				}
			}
			fputs(fq[0].id.s,f);
			// merge the overlapping region into fq[0], base by base
			for (i = 0; i < besto; ++i ) {
				int li = fq[0].seq.n-besto+i;
				int ri = i;
				if (debug>=2) printf("%c %c / %c %c / ", fq[0].seq.s[li], rc.seq.s[ri], fq[0].qual.s[li], rc.qual.s[ri]);
				if (fq[0].seq.s[li] == rc.seq.s[ri]) {
					fq[0].qual.s[li] = max(fq[0].qual.s[li], rc.qual.s[ri]);
					// bounded improvement in quality, since there's no independence
					// fq[0].qual.s[ri] = max(fq[0].qual.s[li], rc.qual.s[ri])+min(3,min(fq[0].qual.s[li],rc.qual.s[ri])-33);
				} else {
					// use the better-quality read
					// this approximates the formula: E = min(0.5,[(1-e2/2) * e1] / [(1-e1) * e2/2 + (1-e2/2) * e1])
					if (fq[0].qual.s[li] > rc.qual.s[ri]) {
						// reduction in quality, based on phred-difference
						fq[0].qual.s[li] = 33+min(fq[0].qual.s[li],max(fq[0].qual.s[li]-rc.qual.s[ri],3));
					} else {
						fq[0].seq.s[li] = rc.seq.s[ri];
						// reduction in quality, based on phred-difference
						fq[0].qual.s[li] = 33+min(rc.qual.s[ri],max(rc.qual.s[ri]-fq[0].qual.s[li],3));
					}
				}
				if (debug>=2) printf("%c %c\n", fq[0].seq.s[li], fq[0].qual.s[li]);
			}

			// first read (with merged overlap), then the rest of the second read
			fwrite(fq[0].seq.s,1,fq[0].seq.n,f);
			fputs(rc.seq.s+besto,f);
			fputc('\n',f);
			fputs(fq[0].com.s,f);
			fwrite(fq[0].qual.s,1,fq[0].qual.n,f);
			fputs(rc.qual.s+besto,f);
			fputc('\n',f);
			fmate=fout[4];

			if (sav_fqs) {
				fq[0].seq.s=sav_fqs;
				fq[0].qual.s=sav_fqq;
				rc.seq.s=sav_rcs;
				rc.qual.s=sav_rcq;
			}

			if (frep) {
				fprintf(frep, "%d\n", besto);
			}
		} else {
			// no acceptable overlap: write both reads unchanged
			for (i=0;i<2;++i) {
				FILE *f=fout[i];
				fputs(fq[i].id.s,f);
				fputs(fq[i].seq.s,f);
				fputc('\n',f);
				fputs(fq[i].com.s,f);
				fputs(fq[i].qual.s,f);
				fputc('\n',f);
			}
			fmate=fout[3];
		}

		// fmate is only non-NULL when the corresponding mate output was opened
		if (fmate) {
			fputs(fq[2].id.s,fmate);
			fputs(fq[2].seq.s,fmate);
			fputc('\n',fmate);
			fputs(fq[2].com.s,fmate);
			fputs(fq[2].qual.s,fmate);
			fputc('\n',fmate);
		}
	}


	double dev = sqrt((((double)joincnt)*tlensq-pow((double)tlen,2)) / ((double)joincnt*((double)joincnt-1)) );
	printf("Total reads: %d\n", nrec);
	printf("Total joined: %d\n", joincnt);
	printf("Average join len: %.2f\n", (double) tlen / (double) joincnt);
	printf("Stdev join len: %.2f\n", dev);
	printf("Version: %s.%d\n", VERSION, SVNREV);

	return 0;
}
Exemple #11
0
// Reports whether the graph contains the given k-mer. The lookup uses the
// smaller of the k-mer and its reverse complement.
bool IFindObserver<span>::contains(KmerType kmer)
{
    const KmerType reverse = revcomp(kmer, this->_find->kmer_size());
    const KmerType canonical = std::min(kmer, reverse);

    Node node = Node(Node::Value(canonical));
    return this->_find->graph_contains(node);
}
Exemple #12
0
int
main(int argc, char **argv)
{
    char          *cmfile;
    ESL_ALPHABET  *abc;
    char          *seqfile;
    ESL_SQFILE    *sqfp;
    int            format;
    CM_FILE       *cmfp;
    CM_t          *cm;
    ESL_SQ        *seq;
    float          sc, rev_sc;
    Parsetree_t   *tr;
    Fancyali_t    *fali;
    Fancyali_t    *rev_fali;

    int do_local;

    /* int status;    */
    /* char *optname; */
    /* char *optarg; */
    int   optind;

    int status;
    char errbuf[eslERRBUFSIZE];

    cmfile = seqfile = NULL;
    abc = NULL;
    sqfp = NULL;
    cmfp = NULL;
    cm = NULL;
    seq = NULL;
    tr = NULL;
    fali = NULL;
    rev_fali = NULL;
    format = eslSQFILE_UNKNOWN;
    do_local = TRUE;

    /* Should process options, but for now assume none and set optind */
    optind = 1;

    if ( argc - optind != 2 ) cm_Die("Incorrect number of arguments\n");
    cmfile = argv[optind++];
    seqfile = argv[optind++];

    if((status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf)) != eslOK)
        cm_Die("Failed to open covariance model save file\n");
    if ((status = cm_file_Read(cmfp, TRUE, &abc, &cm)) != eslOK)
        cm_Die("Failed to read a CM from cm file\n");
    if (cm == NULL)
        cm_Die("CM file empty?\n");
    cm_file_Close(cmfp);

    if ( esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK )
        cm_Die("Failed to open sequence database file\n");

    if (do_local) cm->config_opts |= CM_CONFIG_LOCAL;

    if((status = cm_Configure(cm, errbuf, -1)) != eslOK) cm_Die(errbuf);
    /*SetMarginalScores_reproduce_bug_i27(cm);*/

    seq = esl_sq_Create();
    while ( esl_sqio_Read(sqfp, seq) == eslOK )
    {
        if (seq->n == 0) continue;

        int i0 = 1;
        int j0 = seq->n;

        if (seq->dsq == NULL)
            esl_sq_Digitize(abc, seq);
        sc = TrCYK_DnC(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */
        /* sc = TrCYK_Inside(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, FALSE, &tr); */
        fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons, seq->dsq, FALSE, NULL);
        /* float sc, struct_sc;
         * ParsetreeScore(cm, NULL, NULL, tr, seq->dsq, FALSE, &sc, &struct_sc, NULL, NULL, NULL);
         * printf("Parsetree score: %.4f\n", sc);
         * ParsetreeDump(stdout, tr, cm, seq->dsq);
         */
        FreeParsetree(tr);

        revcomp(abc, seq, seq);
        rev_sc = TrCYK_DnC(cm,seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */
        rev_fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons,seq->dsq, FALSE, NULL);
        /*ParsetreeDump(stdout, tr, cm, seq->dsq);*/
        FreeParsetree(tr);

        if (sc > rev_sc)
        {
            printf("sequence: %s\n", seq->name);
            printf("score:    %.2f\n",sc);
            PrintFancyAli(stdout, fali, 0, FALSE, FALSE, 60);
        }
        else
        {
            printf("sequence: %s (reversed)\n", seq->name);
            printf("score:    %.2f\n",rev_sc);
            PrintFancyAli(stdout, fali, seq->n, TRUE, FALSE, 60);
        }

        FreeFancyAli(fali);
        FreeFancyAli(rev_fali);

        esl_sq_Destroy(seq);
        seq = esl_sq_Create();

    }
    esl_sq_Destroy(seq);

    FreeCM(cm);
    esl_sqfile_Close(sqfp);

    return EXIT_SUCCESS;
}
Exemple #13
0
/**
 * Counts the k-mers of every sequence in a fasta file and reports the counts.
 *
 * Usage: <prog> file.fasta kmer_length [DS_mode]
 *
 * Each k-mer made only of G/A/T/C (as decided by contains_non_gatc) is added
 * to a Ktree. When the optional third argument is present, the reverse
 * complement of each k-mer is added as well. A progress counter is written
 * to stderr every 1000 reads.
 *
 * @return 0 on success, 1 when too few arguments are given
 */
int run (int argc, char* argv[]) {

    if (argc < 3) {
        cerr << "Usage: " << argv[0] << " file.fasta kmer_length [DS_mode]" << endl << endl;
        return(1);
    }

    string fasta_filename (argv[1]);
    unsigned int kmer_length = atoi(argv[2]);

    // DS_mode is the optional argument after kmer_length. The previous test
    // (argc >= 3) was always true at this point, which made the mode
    // impossible to turn off.
    const bool DS_mode = (argc > 3);

    Fasta_reader fasta_reader(fasta_filename);

    Ktree ktree;

    long read_counter = 0;

    while (fasta_reader.hasNext()) {

        read_counter++;
        if (read_counter % 1000 == 0) {
            cerr << "\rread[" << read_counter << "]   ";
        }

        Fasta_entry fe = fasta_reader.getNext();

        string accession = fe.get_accession();

        string sequence = fe.get_sequence();

        // cerr << "Processing: " << sequence << endl;

        // This guard also keeps the unsigned subtraction in the loop bound
        // below from wrapping around.
        // NOTE(review): it skips sequences whose length is exactly
        // kmer_length as well -- confirm that this is intended.
        if (sequence.length() < kmer_length + 1) {
            continue;
        }

        for (unsigned int i = 0; i <= sequence.length() - kmer_length; i++) {

            string kmer = sequence.substr(i, kmer_length);

            if (contains_non_gatc(kmer)) {
                continue;
            }

            ktree.add_kmer(kmer);

            if (DS_mode) {
                kmer = revcomp(kmer);
                ktree.add_kmer(kmer);
            }
        }
    }

    ktree.report_kmer_counts();

    return(0);
}
Exemple #14
0
/**
 * Restricts the candidate targets in u to those the read s matches.
 *
 * The first mapping k-mer of v gives, through index.findPosition(), a
 * position of the read on each candidate target. The read (or its reverse
 * complement when findPosition() reports false in its second member) is then
 * compared base by base with the target, leaving out maxSoftclip bases at
 * the far end (the last bases of s, or the first bases of the reverse
 * complement). A candidate is kept when the read fits inside the target and
 * at most maxMismatch bases differ.
 *
 * @param index  k-mer index providing findPosition() and target_seqs_
 * @param s      read sequence
 * @param v      k-mer hits of the read
 * @param u      candidate target ids; replaced by the surviving ids only
 *               when some, but not all, candidates are rejected
 * @return false when v is empty or no candidate survives, true otherwise
 */
bool checkMapability(const KmerIndex& index, const std::string &s, const std::vector<std::pair<KmerEntry,int>>& v, std::vector<int> &u) {
  const int maxMismatch = 2;
  const int maxSoftclip = 5;
    
  Kmer km;
  KmerEntry val;
  int p;

  if (!v.empty()) {
    p = findFirstMappingKmer(v,val);
    km = Kmer(s.c_str()+p);
  } else {
    return false;
  }
  
  std::vector<int> vtmp; vtmp.reserve(u.size());
  
  for (auto tr : u) {
    auto trpos = index.findPosition(tr, km, val, p);
    int tpos = (int)trpos.first;
    int sz = (int)s.size();
    bool add = true; 
    if (trpos.second) {
      // read compared as is, starting at target offset tpos-1
      if (tpos < 1 || tpos + sz - 1 > index.target_seqs_[tr].size()) {
        add = false;
      } else {
        //std::cout << index.target_seqs_[tr].substr(tpos,sz) << std::endl;
        //std::cout << s << std::endl;
        int mis = 0;
        for (int i = 0; i < sz - maxSoftclip; i++) {
          if (index.target_seqs_[tr][tpos-1 + i] != s[i]) {
            ++mis;
            if (mis > maxMismatch) {
              break;
            }
          }
        }
        add = (mis <= maxMismatch);
      }
    }  else {
      // reverse complement of the read compared, ending at target offset tpos-1
      if (tpos > index.target_seqs_[tr].size() || tpos - sz < 1) {
        add = false;
      } else {      
        std::string rs = revcomp(s);
        //std::cout << index.target_seqs_[tr].substr(tpos - sz, sz) << std::endl;
        //std::cout << rs << std::endl;
        int mis = 0;
        for (int i = sz-1; i >= maxSoftclip; i--) {
          // Compare against rs[i]. The index used to be rs[sz], i.e. the
          // string terminator, so every position counted as a mismatch and
          // reverse-strand candidates were always rejected.
          if (index.target_seqs_[tr][tpos-sz+i] != rs[i]) {
            ++mis;
            if (mis > maxMismatch) {
              break;
            }
          }
        }
        add = (mis <= maxMismatch);
      }
    }
    
    if (add) {
      vtmp.push_back(tr);
    }
    
  }
 
  
  if (vtmp.empty()) {
    return false;
  }
  
  if (vtmp.size() < u.size()) {
    u = vtmp; // copy
  }
  
  return true;
  
}
// main k-mer counting function, shared between minia and dsk
// verbose == 0 : stderr progress bar
// verbose >= 1 : print basic status
// verbose >= 2 : print extra partition information
// write_count == True: include kmer count in results file, in that form:
//           - save kmer count for each kmer in the resulting binary file
//           - the very first four bytes of the result file are the kmer length
void sorting_count(Bank *Sequences, char *prefix, int max_memory, int max_disk_space, bool write_count, int verbose)
{

    // create a temp dir from the prefix
    char temp_dir[1024];
    sprintf(temp_dir,"%s_temp",prefix);

    // clear the temp folder (needs to be done before estimating disk space)
    DIR*            dp;
    struct dirent*  ep;
    char            p_buf[512] = {0};
    dp = opendir(temp_dir);
    while ( (dp != NULL) && ((ep = readdir(dp)) != NULL)) {
        sprintf(p_buf, "%s/%s", temp_dir, ep->d_name);
        remove(p_buf);
    }
    if(dp != NULL)
        closedir(dp);

    if (max_disk_space == 0)
    {
        // default max disk space
        struct statvfs buffer ;
        char current_path[1000];
        getcwd(current_path,sizeof(current_path));
        // int ret =
        statvfs(current_path, &buffer);
        int available = (int)(((double)buffer.f_bavail * (double)buffer.f_bsize) / 1024 / 1024);
	uint32_t tt_new_temp = (uint32_t) (((double)Sequences->filesizes)/(1024*1024));
        printf("Available disk space in %s: %d  %u %llu MB\n",current_path,available,tt_new_temp,Sequences->filesizes); // not working in osx (is that a TODO then?)
        max_disk_space = min((uint32_t)available/2, tt_new_temp);
    } 
    if (max_disk_space <= 0) // still 0?
        max_disk_space = 10000; // = default for osx

    // estimate number of iterations TODO Check if multiplication with totalKmers is actually required or not. It may be just increasing number of partitions for no reason
    //uint64_t volume = totalKmers*Sequences->estimate_kmers_volume(smallestKmer);  //Since there are totalKmers no of kmers and an upper bound can be estimated by using the smallest size of kmer. Added by Raunaq
    uint64_t volume = Sequences->estimate_kmers_volume(smallestKmer);  //Since there are totalKmers no of kmers and an upper bound can be estimated by using the smallest size of kmer. Added by Raunaq
    uint32_t nb_passes = ( volume / max_disk_space ) + 1;
    int passes_hash ;
    
    int nb_threads=1;
    
#if OMP
    use_compressed_reads =true;
    nb_threads = 8;
    max_memory /= nb_threads;
    max_memory = max (max_memory,1);
#endif
    
    // temp bugfix: don't use compressed reads for long reads
    if (Sequences->estimate_max_readlen() > 1000000)
        use_compressed_reads = false;
    
    
    uint64_t volume_per_pass,volume_per_partition;
    uint32_t nb_partitions;
    int partitions_hash;

    // loop to lower the number of partitions below the maximum number of simulatenously open files
    do
    {
        volume_per_pass = volume / nb_passes;
        nb_partitions = ( volume_per_pass * totalKmers / max_memory ) + 1; 
	//printf("volume per pass and total volume %llu %llu \n",volume_per_pass,(unsigned long long)volume);
        // if partitions are hashed instead of sorted, adjust for load factor
        // (as in the worst case, all kmers in the partition are distinct and partition may be slightly bigger due to hash-repartition)
        if (use_hashing)
        {
            nb_partitions = (uint32_t) ceil((float) nb_partitions / load_factor);
            nb_partitions = ((nb_partitions * OAHash::size_entry() ) + sizeof(key_type)-1) / sizeof(key_type); // also adjust for hash overhead
        }

        struct rlimit lim;
        int max_open_files = 1000;
        int err = getrlimit(RLIMIT_NOFILE, &lim);
        if (err == 0)
            max_open_files = lim.rlim_cur / 2;
        if (nb_partitions >= max_open_files)
            nb_passes++;
        else
            break;
    }
    while (1);
    volume_per_partition= volume_per_pass/nb_partitions;
    passes_hash = ceil(log(nb_passes)/log(4));
    partitions_hash = ceil(log(nb_partitions)/log(4));
    int size_for_reestimation = ceil((passes_hash + partitions_hash)*1.8);
    double * lmer_counts = (double * ) malloc(sizeof(long)*pow(4,size_for_reestimation));
    long * lmers_for_hash = (long * ) malloc(sizeof(long)*pow(4,size_for_reestimation));
    int * partitions_for_lmers =(int * ) malloc(sizeof(int)*pow(4,size_for_reestimation));
    Sequences->count_kmers_for_small_value(size_for_reestimation,lmer_counts);
    int temp_partition=reestimate_partitions(size_for_reestimation,volume_per_partition,lmer_counts,lmers_for_hash,partitions_for_lmers);
    unordered_map<long,int> part_hash;
    int total_lmers=pow(4,size_for_reestimation);
    for(int it=0;it<total_lmers;it++)
    {
	pair<long,int> temp_pair(lmers_for_hash[it],partitions_for_lmers[it]);
        part_hash.insert (temp_pair); // Add element to the hash 
    }
    //uint64_t up_passes_size = volume_per_pass;
      	do
	{
		//recompute the number of partitions based on updated partitions estimate
		nb_partitions = ceil(temp_partition*1.0/nb_passes);
		struct rlimit lim;
	        int max_open_files = 1000;
	        int err = getrlimit(RLIMIT_NOFILE, &lim);
	        if (err == 0)
        	    max_open_files = lim.rlim_cur / 2;
	        if (nb_partitions >= max_open_files)
        	    nb_passes++;
	        else
        	    break;
	}while(1);
    	printf("no of partitions before %lu and after %d passes %lu \n",nb_partitions*nb_passes,temp_partition,nb_passes);
    uint64_t total_IO =   volume * 2LL * 1024LL*1024LL   ;// in bytes  +   nb_passes * ( volume / (sizeof(kmer_type)*4) )    ; // in bytes
    uint64_t temp_IO = 0;
    BinaryBankConcurrent * redundant_partitions_file[nb_partitions]; 
    char redundant_filename[nb_partitions][256];
    kmer_type kmer;
    int max_read_length = KMERSBUFFER_MAX_READLEN;
    kmer_type * kmer_table_seq = (kmer_type * ) malloc(sizeof(kmer_type)*max_read_length); ;
    kmer_type * kmer_length_table_seq = (kmer_type * ) malloc(sizeof(kmer_type)*max_read_length);

    BinaryReads *  binread = NULL;
    if(use_compressed_reads)
        binread = new BinaryReads(return_file_name(binary_read_file),true);

    fprintf(stderr,"Sequentially counting ~%llu MB of kmers with %d partition(s) and %d passes using %d thread(s), ~%d MB of memory and ~%d MB of disk space\n", (unsigned long long)volume, nb_partitions,nb_passes, nb_threads, max_memory * nb_threads, max_disk_space);

    STARTWALL(count);

    mkdir(temp_dir, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
    
    // Open totalKmers files to store counts of totalKmers different k's
    BinaryBankConcurrent * SolidKmers[totalKmers];
    for (int s=0;s<totalKmers;s++) 
    {	
		char temp[1024];
		sprintf(temp,"%s.%d",return_file_name(solid_kmers_file),Kmerlist[s]);
		uint64_t exp = (((uint64_t)1)<<(Kmerlist[s]*2))-1;
		SolidKmers[s] = new BinaryBankConcurrent(temp,sizeof(kmer),true,nb_threads);
		//printf("kmer is %d exp is %llu \n",Kmerlist[s],exp);
		//BinaryBankConcurrent * SolidKmers = new BinaryBankConcurrent(return_file_name(solid_kmers_file),sizeof(kmer),true,nb_threads);

	    if (write_count)
	    {
        	// write k-mer nbits as the first 4 bytes; and actual k-mer size as the next 4 bits
      		  uint32_t kmer_nbits = sizeof(kmer) * 8;
   	     	SolidKmers[s]->write_buffered(&kmer_nbits, 4,0);
        	SolidKmers[s]->write_buffered(&Kmerlist[s], 4,0);
        	SolidKmers[s]->flush(0);
        }
   }

    int64_t estimated_NbReads = Sequences->estimate_nb_reads();
    char * rseq;
    int readlen;
    int64_t NbSolid = 0;
    int64_t * NbSolid_omp = (int64_t  *) calloc(nb_threads,sizeof(int64_t));
    //long total_kmers_per_partition[nb_partitions]; //guillaume probably commented it because updating this variable would require synchronization
    long distinct_kmers_per_partition[nb_partitions];
    uint64_t  * histo_count = (uint64_t  *) calloc(10001,sizeof(uint64_t));


#if OMP
    uint64_t  **  histo_count_omp = (uint64_t  **) calloc(nb_threads,sizeof(uint64_t *));
    for(int ii=0;ii<nb_threads;ii++)
    {
        histo_count_omp[ii]= (uint64_t  *) calloc(10001,sizeof(uint64_t));
    }
#endif
    

    
   
    //start by the conversion of the file to binary format

    if(use_compressed_reads)
    {
        char * pt_begin;
        int idx =0 ;
        int64_t NbRead = 0;
        Progress progress_conversion;
       // progress_conversion.timer_mode=1; // to switch to timer mode (show elapsed and estimated remaining time)
        progress_conversion.init(estimated_NbReads,"First step: Converting input file into Binary format");
        
        Sequences->rewind_all();
        while(1)
        {
            if(! Sequences->get_next_seq(&rseq,&readlen)) break; // read  original fasta file
            if(readlen > max_read_length) // realloc kmer_table_seq if needed
            {
                max_read_length = 2*readlen;
                kmer_table_seq = (kmer_type * ) realloc(kmer_table_seq,sizeof(kmer_type)*max_read_length);
            	kmer_length_table_seq = (kmer_type * ) realloc(kmer_length_table_seq,sizeof(kmer_type)*max_read_length);
	    }
            
            pt_begin = rseq;
            //should be ok
            while (pt_begin < (rseq+ readlen))
            {
                idx=0; // start a new read

                //skips NN
                while (*pt_begin =='N' && pt_begin < (rseq+ readlen))
                {
                    pt_begin ++;
                }
                // goes to next N or end of seq
                while ( (pt_begin[idx] !='N') &&  ((pt_begin +idx) < (rseq+ readlen))  )
                {
                    idx++;
                }
                
                //we have a seq beginning at  pt_begin of size idx  ,without any N, will be treated as a read:
                binread->write_read(pt_begin,idx);
		revcomp_sequence(pt_begin,idx); // reverse complement the string 
		binread->write_read(pt_begin,idx); // write reverse complement string 
		revcomp_sequence(pt_begin,idx); // restore the string 

		pt_begin += idx;
            }
            
            // binread->write_read(rseq,readlen);
            
            
            NbRead++;
            if ((NbRead%10000)==0)
            {
                progress_conversion.inc(10000);
            }
        }
	//printf("Number of reads converted to binary %d \n",NbRead);
        progress_conversion.finish();
        binread->close();

    }
    ///fin conversion
    if (clear_cache)
    {
#ifdef OSX
        system("purge");
#else
        system("echo 3 > /proc/sys/vm/drop_caches");
#endif
    }
    
    
    
#if SINGLE_BAR
    Progress progress;
    char message[1000];
    sprintf(message,"Counting kmers");
    progress.timer_mode=1;
    if (verbose == 0 )
        progress.init(total_IO,message);
#endif
    
    //use_compressed_reads=false; // for testing compute_kmer_from_one_seq 
    // how many times we will traverse the whole reads file (has an influence on temp disk space)

   uint64_t iter_partition=0;
    for (uint32_t current_pass = 0; current_pass < nb_passes; current_pass ++)
    {
	// stop computing if all partitions are done Added by Raunaq
        if (iter_partition==temp_partition)
		break;
	if(use_compressed_reads ) //open binary reads for reading
            binread->open(false);
        
        STARTWALL(debpass);
        STARTWALL(debw);
	int initial_value = current_pass*nb_partitions;
        for (uint32_t p=0;p<nb_partitions;p++)
        {
            sprintf(redundant_filename[p],"%s/partition%d.redundant_kmers",temp_dir,p);
            redundant_partitions_file[p] =  new BinaryBankConcurrent (redundant_filename[p],sizeof(kmer_type),true, nb_threads);
            distinct_kmers_per_partition[p]=0;
       	}
	int final_value = ((current_pass+1)*nb_partitions)-1;
	printf("Storing k-mers in partition files between %d and %d \n",initial_value,final_value);
        Sequences->rewind_all();
#if !SINGLE_BAR
        Progress progress;
        progress.timer_mode=1; // to switch to timer mode (show elapsed and estimated remaining time)
        char message[1000];
        sprintf(message,"Pass %d/%d, Step 1: partitioning",current_pass+1,nb_passes);
        if (verbose == 0 )
            progress.init(estimated_NbReads,message);
#endif
     

        
        //current_pass> 0 &&
#if OMP
#pragma omp parallel if(use_compressed_reads)  num_threads(nb_threads)
#endif
        {
            int64_t  nbkmers_written =0;
            int tid =0;
            int64_t NbRead = 0;
            int64_t nread =0;
            int64_t tempread =0;
	    long it_zero_wrt =0;
#if OMP

            tid = omp_get_thread_num();
#endif
            int nreads_in_buffer= 1000;
            KmersBuffer * kbuff =NULL;
            if(use_compressed_reads)
            {
                kbuff = new KmersBuffer (binread, 1000000,  nreads_in_buffer); //buffer size (in nb of kmers), seq per task // the buffer is per thread
                kbuff->binary_read_file = binread->binary_read_file;
            }

            kmer_type * kmer_table ;
            kmer_type * kmer_length_info ; // Added by Raunaq, to store the length of read into the partitions file
	    while(1)
            {

                //read the fasta file
                if(use_compressed_reads) // && current_pass>0
                {
                    nread = kbuff->readkmers();
                    if( nread < 0) break;
                    NbRead+= nread;
                    tempread+= nread;
                }
                else
                {
                    if(! Sequences->get_next_seq(&rseq,&readlen)) break; // read  original fasta file
                    if(readlen > max_read_length) // realloc kmer_table_seq if needed
                    {
                        max_read_length = 2*readlen;
                        kmer_table_seq = (kmer_type * ) realloc(kmer_table_seq,sizeof(kmer_type)*max_read_length);
            		kmer_length_table_seq = (kmer_type * ) realloc(kmer_length_table_seq,sizeof(kmer_type)*max_read_length);
                    }

                }

//                if(use_compressed_reads ) //write compressed read file at first pass //&& current_pass==0
//                    binread->write_read(rseq,readlen);

                int i;
                int nbkmers =readlen-sizeKmer+1;  

                if( use_compressed_reads) //current_pass >0 &&
                {
                    nbkmers = kbuff->nkmers;
                    kmer_table = kbuff->kmers_buffer;
		    kmer_length_info = kbuff->kmer_length;
                } 
                else //old fashion   
                {
                    compute_kmer_table_from_one_seq(readlen,rseq,kmer_table_seq,kmer_length_table_seq,Kmerlist[totalKmers-1]); // Added by Raunaq for computing kmers for all values of k 
                    nbkmers =readlen-Kmerlist[totalKmers-1]+1;  
                    kmer_table = kmer_table_seq;
		    kmer_length_info = kmer_length_table_seq;
                    NbRead++;
                    //printf("Number of kmers read from seq %d \n",nbkmers);
		}
		
                nbkmers_written= 0;
		char  temp_kmer[256];
		int zero;
                //compute the kmers stored in the buffer kmer_table
                for (i=0; i<nbkmers; i++)
                {
                    kmer_type lkmer;
					kmer_type lkmer_length;
                    // kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp);

                    lkmer = kmer_table[i];
					lkmer_length = kmer_length_info[i];
		   // zero = code2seq(lkmer,temp_kmer);
					long pass_lkmer = code2first_n_nucleotide(lkmer,size_for_reestimation);
					unordered_map<long,int>::const_iterator got = part_hash.find(pass_lkmer);
					int p;// compute in which partition this kmer falls into
					if(got==part_hash.end())
						continue;
					else
						p = got->second; 
                    // check if this kmer should be included in the current pass
                    if(!(p >= initial_value && p<= final_value))
						continue;


/*		
#ifdef _ttmath
                    (reduced_kmer % nb_partitions).ToInt(p);
#else
                    p = reduced_kmer % nb_partitions;
#endif
*/
					p = p - current_pass*nb_partitions;  
                    nbkmers_written++;

                    redundant_partitions_file[p]->write_element_buffered(&lkmer,tid); // save this kmer to the right partition file
					redundant_partitions_file[p]->write_buffered(&lkmer_length,sizeof(lkmer_length),tid,false); // save the kmer length next to the kmer in the same partition file
		    // total_kmers_per_partition[p]++; // guillaume probably commented it because updating this variable would require synchronization

                }
                //NbRead++;
#if SINGLE_BAR
                if(verbose==0)
                {
                if (nb_threads == 1)
                    progress.inc(nbkmers_written * sizeof(kmer_type));
                else
                    progress.inc(nbkmers_written * sizeof(kmer_type),tid);
                }
#endif
             //   if ((NbRead%10000)==0)
                if(tempread> 10000)
                {
                    tempread -= 10000;
                    if (verbose)
                        fprintf (stderr,"%cPass %d/%d, loop through reads to separate (redundant) kmers into partitions, processed %lluM reads out of %lluM",13,current_pass+1,nb_passes,(unsigned long long)(NbRead/1000/1000),(unsigned long long)(estimated_NbReads/1000/1000));
#if !SINGLE_BAR
                    else
                        if (nb_threads == 1)
                            progress.set(NbRead);
                        else
                            progress.inc(10000,tid);
#endif
                }
            } //end while
           // printf("Count of zero in write is %lu \n",it_zero_wrt);
            if(use_compressed_reads)
                delete kbuff;
        } // end OMP 


        
#if !SINGLE_BAR
        if (verbose == 0)
        {
            if (nb_threads == 1)
             progress.finish();
            else
              progress.finish_threaded();  // here only one thread
            
            sprintf(message,"Pass %d/%d, Step 2: computing kmer count per partition",current_pass+1,nb_passes);
            progress.init(nb_partitions+1,message);
        }
#endif
        
        if (verbose)fprintf(stderr,"\n");

        if (verbose >= 2)
        {
            STOPWALL(debw,"Writing redundant kmers");
        }
        STARTWALL(debtri);
	


            for (uint32_t p=0;p<nb_partitions;p++)
            {	
				redundant_partitions_file[p]->close();
                redundant_partitions_file[p]->open(false);
            }



        // for better timing: clear the file cache, since the partitions may still be in memory, that's unfair to low mem machines
        if (clear_cache)
        {
#ifdef OSX
            system("purge");
#else
            system("echo 3 > /proc/sys/vm/drop_caches");
#endif
        }

        //quick and dirty parall with omp, testing
        //todo if we want omp and histo : separate histo_count tab per thread that needs to be merged at the end
        // TODO to guillaume: remove that todo above, because it is done, right?
        kmer_type lkmer,lkmer_length,lkmer_temp,exp;
	long it_zero=0;
	OAHash * hash;
	int p,s;
#if OMP 
        //omp_set_numthreads(2);  //num_threads(2) //if(!output_histo) num_threads(nb_threads)
#pragma omp parallel for private (p,s,lkmer,lkmer_length,hash,lkmer_temp,exp)  num_threads(nb_threads)
#endif        
        // load, sort each partition to output solid kmers
        for ( p=0;p<nb_partitions;p++)
        {
			char temp_kmer[256];  // bug check code 
			int zero;
			kmer_type lkmer_revcomp; // to store revcomps
				
           	bool use_hashing_for_this_partition = use_hashing;
			if(hybrid_mode)
			{
				if(   (redundant_partitions_file[p]->nb_elements()*sizeof(kmer_type)) <  (max_memory*1024LL*1024LL) )  // Maintain totalKmers hash for each partition file
				{	
					use_hashing_for_this_partition = false;
				}
				else
				{
					use_hashing_for_this_partition = true;
				}
			}
            int tid =0;
			//int s;
			//Computing if hashing should be used or not for this partition
#if OMP
            tid = omp_get_thread_num();
#endif
            //use_hashing_for_this_partition = false;  //to check the vector part of the code
           	if (use_hashing_for_this_partition)
            {
                // hash partition and save to solid file
				
				hash = new OAHash(max_memory*1024LL*1024LL/2); // One hash to store all types of k-mer lengths

				uint64_t nkmers_read=0;
				redundant_partitions_file[p]->read_element_buffered(&lkmer_length);

				while (redundant_partitions_file[p]->read_element_buffered(&lkmer))
				{
			
					if(lkmer_length == Kmerlist[0])  //only add the largest k-mer 
						hash->increment(lkmer,convert_to_int(lkmer_length));
					else
					{
						unordered_map<int,int>::const_iterator got = kmerlength_map.find(convert_to_int(lkmer_length));
						exp = (((kmer_type)1)<<(got->second*2))-1;
						lkmer_temp = lkmer & exp;
						hash->increment(lkmer_temp,got->second);

					}	
					if(!redundant_partitions_file[p]->read_element_buffered(&lkmer_length)) 
					{
						break;
					}
					nkmers_read++;
#if SINGLE_BAR
					if(verbose==0 && nkmers_read==10000)
					{
						if (nb_threads == 1)
							progress.inc(nkmers_read*sizeof(kmer_type));
						else
							progress.inc(nkmers_read*sizeof(kmer_type),tid);
						nkmers_read=0;
					}
#endif
                }
                
                
				if (verbose >= 2)
					 printf("Pass %d/%d partition %d/%d hash load factor: %0.3f\n",current_pass+1,nb_passes,p+1,nb_partitions,hash->load_factor());
                	for( s=0;s<totalKmers;s++) 
					{
						OAHash * temp_ = new OAHash(max_memory*1024LL*1024LL/2);
						hash->start_iterator();
						while (hash->next_iterator())
                				{
							uint_abundance_t abundance = hash->iterator->value;
        	       		 			uint_abundance_t abund_tid = (current_pass+1)*100+p;
							if(output_histo)
							{
							 uint_abundance_t saturated_abundance;
							 saturated_abundance = (abundance >= 10000) ? 10000 : abundance;
#if OMP
							 histo_count_omp[tid][saturated_abundance]++;
#else
					
							 histo_count[saturated_abundance]++;
#endif
							}
							int length_kmer = hash->iterator->length;
							lkmer = hash->iterator->key;
	                    				if (abundance >= nks && abundance <= max_couv && length_kmer == Kmerlist[s])
							{
								//write if lkmer is the smaller of it and its reverse complement
								lkmer_revcomp = revcomp(lkmer,length_kmer);
								if(lkmer < lkmer_revcomp)
								{
								SolidKmers[s]->write_element_buffered(&(hash->iterator->key),tid);
							
								 NbSolid_omp[tid]++;
								if (write_count)
										SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false);
								}
							}
		                    			distinct_kmers_per_partition[p]++;
							if(s!=totalKmers-1)
							{
								if(length_kmer == Kmerlist[s])
								{
									exp = (((kmer_type)1)<<(Kmerlist[s+1]*2))-1;
									lkmer_temp = lkmer & exp;
									temp_->increment_by_value(lkmer_temp,abundance,Kmerlist[s+1]);
								}else {
									temp_->increment_by_value(lkmer,abundance,length_kmer);
								}
							}
						}
						hash->~OAHash();
						hash = temp_;
					}
				hash->~OAHash();
			//printf("All hashes closed and destroyed \n");
			}
            
			else
			{
				// This part does it in slower fashion
				// sort partition and save to solid file 
        	    //vector < kmer_type > kmers;
				vector < kmer_type > kmers[totalKmers];
                uint64_t nkmers_read=0;
               	//int s=0; 
				
				redundant_partitions_file[p]->read_element_buffered(&lkmer_length);
				while (redundant_partitions_file[p]->read_element_buffered (&lkmer))
				{
    		        for(s=0;s<totalKmers;s++)
					{
						//kmer_type lkmer_temp;
						//kmer_type exp;
						if(lkmer_length<Kmerlist[s])
							continue;
						if(s==0)
							kmers[s].push_back (lkmer);
						else
						{
							exp = (((kmer_type)1)<<(Kmerlist[s]*2))-1;
							lkmer_temp = lkmer & exp; // Converting the kmer to its smaller equivalent in binary 
							kmers[s].push_back (lkmer_temp);
						}
                    }
					nkmers_read++;
					if(!redundant_partitions_file[p]->read_element_buffered(&lkmer_length)) break;  //Added to get the next length of kmer
#if SINGLE_BAR
					if(verbose==0 && nkmers_read==10000)
					{
						if (nb_threads == 1)
							progress.inc(nkmers_read*sizeof(kmer_type));
						else
							progress.inc(nkmers_read*sizeof(kmer_type),tid);
						nkmers_read=0;
					}
#endif
                }
                
                for(s=0;s<totalKmers;s++)
               	{
					sort (kmers[s].begin (), kmers[s].end ());
                
					kmer_type previous_kmer = *(kmers[s].begin ());
					uint_abundance_t abundance = 0;
					for (vector < kmer_type >::iterator it = kmers[s].begin (); it != kmers[s].end ();it++)
					{
						kmer_type current_kmer = *it;
					
						if (current_kmer == previous_kmer)
							abundance++;
						else
						{
							if(output_histo)
							{
									uint_abundance_t saturated_abundance;
									saturated_abundance = (abundance >= 10000) ? 10000 : abundance;
#if OMP
									histo_count_omp[tid][saturated_abundance]++;
#else
									histo_count[saturated_abundance]++;
#endif
					
							}
							if (abundance >= nks  && abundance <= max_couv)
							{
								 NbSolid_omp[tid]++;
								 SolidKmers[s]->write_element_buffered(&previous_kmer,tid);
						
								 if (write_count)
									SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false);
							}		
								abundance = 1;
							distinct_kmers_per_partition[p]++;
						}
						previous_kmer = current_kmer;
					}
                
                //last kmer
					distinct_kmers_per_partition[p]++;
					if(output_histo)
					{
							uint_abundance_t saturated_abundance;
							saturated_abundance = (abundance >= 10000) ? 10000 : abundance;
#if OMP
							histo_count_omp[tid][saturated_abundance]++;
#else
							histo_count[saturated_abundance]++;
#endif
				
					}
					if (abundance >= nks && abundance <= max_couv)
					{
							NbSolid_omp[tid]++;
							SolidKmers[s]->write_element_buffered(&previous_kmer,tid);
				
							if (write_count)
							   SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false);
				
					}	
				}
			}
            
            
		//printf("Done writing kmers for all K \n");
            
            	if (verbose >= 1)
                	fprintf(stderr,"%cPass %d/%d, loaded and sorted partition %d/%d, found %lld solid kmers so far",13,current_pass+1,nb_passes,p+1,nb_partitions,(long long)(NbSolid_omp[tid]));
            
		//printf("Done writing kmers for all K %d check 1 \n",p);
            	if (verbose >= 2)
                	printf("\nPass %d/%d partition %d/%d %ld distinct kmers\n",current_pass+1,nb_passes,p+1,nb_partitions,/*total_kmers_per_partition[p],*/distinct_kmers_per_partition[p]);
            
#if !SINGLE_BAR
            	if (verbose == 0 && nb_threads==1)
                	progress.inc(1);
            	else if (verbose == 0 && nb_threads>1)
                	progress.inc(1,tid);
#endif
            
            	//if(redundant_partitions_file[p]->find_error()) {
		//	printf("Error in the binary file \n");
		//}
            	redundant_partitions_file[p]->close();
		
            	remove(redundant_filename[p]);
 
        } // end for partitions

#if OMP
        //merge histo
        if(output_histo)

        {
            for (int cc=1; cc<10001; cc++) {
                uint64_t sum_omp = 0;
                for(int ii=0;ii<nb_threads;ii++)
                {
                    sum_omp += histo_count_omp[ii][cc];
                }
                histo_count[cc] = sum_omp;
            }
        }
#endif
        
#if !SINGLE_BAR
        if (verbose == 0 && nb_threads == 1)
            progress.finish();
        else if (verbose == 0 && nb_threads > 1 )
            progress.finish_threaded();
#endif

        if (verbose) fprintf(stderr,"\n");

        if (verbose >= 2)
        {
            STOPWALL(debtri,"Reading and sorting partitions");
            STOPWALL(debpass,"Pass total");

        }
       
	//printf("Done writing kmers for all K check 4 \n");
        if(use_compressed_reads)
            binread->close();
        
        //delete
            for (uint32_t p=0;p<nb_partitions;p++)
            {
                delete redundant_partitions_file[p] ;
            }
        
    }
	//printf("Done writing kmers for all K check 5 \n");

    //single bar
#if SINGLE_BAR
    if (verbose == 0 && nb_threads == 1)
        progress.finish();
    else if (verbose == 0 && nb_threads > 1 )
        progress.finish_threaded();
#endif
    
    if(output_histo)
    {
        FILE * histo_file = fopen(return_file_name(histo_file_name),"w");
        for (int cc=1; cc<10001; cc++) {
            fprintf(histo_file,"%i\t%llu\n",cc,(unsigned long long)(histo_count[cc]));
        }
        fclose(histo_file);
    }
    free(histo_count);

    NbSolid = NbSolid_omp[0];
#if OMP
    NbSolid=0;
    for(int ii=0;ii<nb_threads;ii++)
    {
        NbSolid += NbSolid_omp[ii];
    }
#endif
   for ( int s=0;s<totalKmers;s++) 
    	SolidKmers[s]->close();
    printf("\nSaved %lld solid kmers\n",(long long)NbSolid);
    rmdir(temp_dir);

    STOPWALL(count,"Counted kmers");
    fprintf(stderr,"\n------------------ Counted kmers and kept those with abundance >=%i,     \n",nks);
} 
Exemple #16
0
// Fine segmentation of one read against a germline.
//
//   seq        read to segment; its label and sequence are copied into this object
//   germline   supplies seg_method, the k-mer index and the rep_5 / rep_3 collections
//   segment_c  cost model given to the alignments and used to turn scores into p-values
//   threshold  forwarded to checkLeftRightEvaluesThreshold()
//   multiplier factor applied to both the left and the right e-value
//
// The constructor does all the work. It returns early (with 'segmented' still false)
// when the k-mer pre-screen fails, when the germline method is MAX1U, or when the
// e-value check sets 'because' to something other than NOT_PROCESSED.
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,  double threshold, double multiplier)
{
  // Alignment boxes, labelled "5", "4" and "3"
  box_V = new AlignBox("5");
  box_D = new AlignBox("4");
  box_J = new AlignBox("3");
  box_V->marked_pos = 0;
  box_J->marked_pos = 0;

  // Initial, not-yet-processed state
  segmented = false;
  dSegmented = false;
  because = NOT_PROCESSED ;
  segmented_germline = germline ;
  info_extra = "" ;
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost = segment_c;

  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;

  CDR3start = -1;
  CDR3end = -1;
  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  // Strand flags handed to the V and J alignments; they stay false unless the
  // k-mer pre-screen below reports a -1 strand on the corresponding side.
  bool v_is_reversed = false ;
  bool j_is_reversed = false ;

  const bool kmer_prescreen = (germline->seg_method == SEG_METHOD_MAX12)
    || (germline->seg_method == SEG_METHOD_MAX1U);

  if (kmer_prescreen)
    {
      // Is this sequence segmented with MAX12 or MAX1U (default e-value parameters) ?
      KmerSegmenter *prescreen = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);

      if (!prescreen->isSegmented())
        {
          // Nothing more to do: the read stays unsegmented
          delete prescreen ;
          return ;
        }

      reversed = prescreen->isReverse();

      // On a reversed read, 'after' and 'before' swap roles
      // (the 'true' flag presumably flips the affectation's strand -- to be confirmed)
      KmerAffect affect_left = reversed ? KmerAffect(prescreen->after, true) : prescreen->before ;
      KmerAffect affect_right = reversed ? KmerAffect(prescreen->before, true) : prescreen->after ;

      delete prescreen ;

      v_is_reversed = (affect_left.getStrand() == -1);
      j_is_reversed = (affect_right.getStrand() == -1);

      // Provisional code built from both affectations and their index labels
      code = "Unexpected ";
      code += affect_left.toStringSigns() + germline->index->getLabel(affect_left).basename;
      code += "/";
      code += affect_right.toStringSigns() + germline->index->getLabel(affect_right).basename;
      info_extra += " " + affect_left.toString() + "/" + affect_right.toString() + " (" + code + ")";

      // MAX1U stops here, without any alignment
      if (germline->seg_method == SEG_METHOD_MAX1U)
        return ;

      germline->override_rep5_rep3_from_labels(affect_left, affect_right);
    }

  // Strand determination with a KmerSegmenter (default e-value parameters).
  // Only isReverse() is read from it.
  // TODO: flag to force a strand / to test both strands ?
  KmerSegmenter *strand_probe = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = strand_probe->isReverse();
  delete strand_probe ;

  // The sequence, reverse-complemented when 'reversed' is set
  sequence_or_rc = revcomp(sequence, reversed);

  /* Segmentation: align against the 5' collection, then against the 3' collection */
  align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID,
                           v_is_reversed, v_is_reversed, false,
                           box_V, segment_cost);

  align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID,
                           j_is_reversed, !j_is_reversed, false,
                           box_J, segment_cost);

  // The J alignment was run with reverse_both = !j_is_reversed:
  // copy the box information from the right side to the left side.
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end ;
  box_J->del_left = box_J->del_right;

  /* E-values, from the best score of each box */
  evalue_left  = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
  evalue = evalue_left + evalue_right ;

  /* Unsegmentation causes: a side without any position gets a bad e-value */
  if (box_V->end == static_cast<int>(string::npos))
    evalue_left = BAD_EVALUE ;

  if (box_J->start == static_cast<int>(string::npos))
    evalue_right = BAD_EVALUE ;

  const int strand = reversed ? -1 : 1;
  checkLeftRightEvaluesThreshold(threshold, strand);

  if (because != NOT_PROCESSED)
    {
      // The e-value check gave a cause: report both positions and stop
      segmented = false;
      info = " @" + string_of_int(box_V->end + FIRST_POS)
        + "  @" + string_of_int(box_J->start + FIRST_POS) ;
      return ;
    }

  /* The sequence is segmented */
  segmented = true ;
  if (reversed)
    because = SEG_MINUS ;
  else
    because = SEG_PLUS ;

  // Overlap between V and J
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
                                    box_V, box_J, segment_cost);

  // Reset extreme positions
  box_V->start = 0;
  box_J->end = sequence.size()-1;

  // Keep the J start inside the sequence (why could it be outside ?)
  if (box_J->start >= static_cast<int>(sequence.size()))
    box_J->start = sequence.size()-1;

  // seg_N will be recomputed in finishSegmentation()

  boxes.clear();
  boxes.push_back(box_V);
  boxes.push_back(box_J);
  code = codeFromBoxes(boxes, sequence_or_rc);
  info = posFromBoxes(boxes);

  finishSegmentation();
}
Exemple #17
0
/**
 * Align a read against every sequence of a reference collection and store
 * the best-scoring hit in an AlignBox.
 *
 * @param read             read to align; it is passed through
 *                         revcomp(read, reverse_ref) before alignment
 * @param rep              collection of reference sequences
 * @param forbidden_rep_id index of a reference to skip (the caller in this
 *                         file passes NO_FORBIDDEN_ID to skip nothing)
 * @param reverse_ref      forwarded to revcomp() on the read; when set, the
 *                         resulting end position is mirrored back with
 *                         read.length() so that it refers to 'read'
 * @param reverse_both     forwarded twice to the DynProg constructor; also
 *                         selects how box->del_right is derived
 * @param local            true: DynProg::Local mode, full matrix, backtrack on
 *                         every reference; false:
 *                         DynProg::LocalEndWithSomeDeletions, bottom triangle
 *                         only, backtrack only on a new best reference
 * @param box              output: ref_nb, ref_label, ref, marked_pos, start,
 *                         end, del_left, del_right and score are written
 * @param segment_cost     cost model handed to DynProg
 *
 * box->score receives one (score, reference index) pair per evaluated
 * reference, sorted with comp_pair.
 *
 * When no reference yields a score above MINUS_INF (empty collection, or the
 * only entry is the forbidden one), box->ref_nb stays at MINUS_INF, box->start
 * and box->end are set to (int) string::npos, box->score holds whatever was
 * evaluated (possibly nothing), and the other fields are left untouched.
 * Previously this case indexed 'rep' with MINUS_INF.
 */
void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id,
                              bool reverse_ref, bool reverse_both, bool local,
                             AlignBox *box, Cost segment_cost)
{

  int best_score = MINUS_INF ;
  box->ref_nb = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;

  vector<pair<int, int> > score_r;
  score_r.reserve(rep.size());   // at most one entry per reference

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local) dpMode = DynProg::Local;

  // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence
  string sequence_or_rc = revcomp(read, reverse_ref);

  for (int r = 0 ; r < rep.size() ; ++r)
    {
      if (r == forbidden_rep_id)
        continue;

      DynProg dp(sequence_or_rc, rep.sequence(r),
                 dpMode, // DynProg::SemiGlobalTrans,
                 segment_cost, // DNA
                 reverse_both, reverse_both,
                 rep.read(r).marked_pos);

      bool onlyBottomTriangle = !local ;
      int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);

      if (local)
        dp.backtrack();

      if (score > best_score)
        {
          best_score = score ;
          best_best_i = dp.best_i ;
          best_best_j = dp.best_j ;
          // NOTE(review): in non-local mode first_i/first_j are read *before*
          // backtrack() runs below; presumably they are only meaningful in
          // local mode -- confirm against DynProg.
          best_first_i = dp.first_i ;
          best_first_j = dp.first_j ;
          box->ref_nb = r ;
          box->ref_label = rep.label(r) ;

          // In non-local mode the backtrack is only paid for a new best hit
          if (!local)
            dp.backtrack();
          box->marked_pos = dp.marked_pos_i ;
        }

      score_r.push_back(make_pair(score, r));

      // #define DEBUG_SEGMENT

#ifdef DEBUG_SEGMENT
      cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif

    }
  sort(score_r.begin(), score_r.end(), comp_pair);

  box->score = score_r;

  if (box->ref_nb == MINUS_INF)
    {
      // No reference was retained: do not index 'rep' with MINUS_INF, and keep
      // the npos sentinels intact (the reverse_ref mirroring below would
      // otherwise turn npos into read.length()).
      box->start = best_first_i ;
      box->end = best_best_i ;
      return ;
    }

  box->ref = rep.sequence(box->ref_nb);
  box->del_right = reverse_both ? best_best_j : box->ref.size() - best_best_j - 1;
  box->del_left = best_first_j;
  box->start = best_first_i;

#ifdef DEBUG_SEGMENT
  cout << "best: " << box->ref_label << " " << best_score ;
  cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl;
  cout << endl;
#endif

  if (reverse_ref)
    // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ;
    best_best_i = read.length() - best_best_i - 1 ;

  box->end = best_best_i ;
}