void testRevcompRepresentative() { list<Sequence> reads = Fasta("../../data/representative_revcomp.fa").getAll(); KmerRepresentativeComputer krc(reads, "##############"); krc.setOptions(false, 3, 0.5); krc.setCoverageReferenceLength(50); krc.compute(); Sequence representative = krc.getRepresentative(); // Computing reads revcomp for (list <Sequence>::iterator it = reads.begin(); it != reads.end(); it++) { it->sequence = revcomp(it->sequence); } KmerRepresentativeComputer krc2(reads, "##############"); krc2.setOptions(false, 3, 0.5); krc2.setCoverageReferenceLength(50); krc2.compute(); Sequence representative2 = krc2.getRepresentative(); // Check position of [ in label, so that we remove that part, and then we // can compare the labels size_t pos1 = representative.label.find_first_of('['); size_t pos2 = representative2.label.find_first_of('['); TAP_TEST(representative.label.substr(0, pos1) == representative2.label.substr(0, pos2), TEST_KMER_REPRESENTATIVE_REVCOMP, "The two representatives should have the same label"); TAP_TEST(revcomp(representative.sequence) == representative2.sequence, TEST_KMER_REPRESENTATIVE_REVCOMP, "The two representatives should have the same sequence (but revcomp-ed)"); }
/**
 * Returns the label associated with a window.
 *
 * Each entry of windows_labels pairs a sequence of interest with a label.
 * An entry matches when the shorter of (sequence, window) occurs inside the
 * longer one, either as is or reverse-complemented. The label of the first
 * matching entry (in container iteration order) is returned.
 *
 * @param window the window to look up
 * @return the label of the first matching entry, or "" when none matches
 */
string WindowsStorage::getLabel(junction window) {
  // Iterate by const reference: the entries are only read, so there is no
  // need to copy every (sequence, label) pair on each iteration.
  for (const auto &entry : windows_labels) {
    const string &sequence_of_interest = entry.first;
    bool found;

    if (sequence_of_interest.size() < window.size()) {
      // The sequence of interest is the shorter one: look for it in the window.
      found = window.find(sequence_of_interest) != string::npos
        || window.find(revcomp(sequence_of_interest)) != string::npos;
    } else {
      // The window is the shorter (or equally long) one: look for it in the sequence.
      found = sequence_of_interest.find(window) != string::npos
        || sequence_of_interest.find(revcomp(window)) != string::npos;
    }

    if (found)
      return entry.second;
  }
  return "";
}
char check_intron_consensus(unsigned long prev_exon_rend, unsigned long next_exon_lend, const string &genome_seq, char strand) { unsigned int genome_seq_length = genome_seq.length(); if (prev_exon_rend > genome_seq_length || next_exon_lend > genome_seq_length) { stringstream errmsg; errmsg << "Error, coordinates: " << prev_exon_rend << " and " << next_exon_lend << " are not entirely within genome sequence length: " << genome_seq_length; throw (errmsg.str()); } string left_dinuc; left_dinuc += genome_seq[prev_exon_rend]; left_dinuc += genome_seq[prev_exon_rend + 1]; string right_dinuc; right_dinuc += genome_seq[next_exon_lend - 3]; right_dinuc += genome_seq[next_exon_lend - 2]; if (strand == '-') { string left_dinuc_copy = left_dinuc; string right_dinuc_copy = right_dinuc; left_dinuc = revcomp(right_dinuc_copy); right_dinuc = revcomp(left_dinuc_copy); } if ( ((left_dinuc == "GT" || left_dinuc == "GC") && right_dinuc == "AG") || (left_dinuc == "CT" && right_dinuc == "AC") ) { return ('N'); // has proper splice boundaries. } else { return ('D'); } }
void init_aa0(unsigned char **aa0, int n0, int nm0, unsigned char **aa0s, unsigned char **aa1s, int qframe, int qshuffle_flg, int max_tot, struct pstruct *ppst, void **f_str, void **qf_str, void *my_rand_state) { int id; /* note that aa[5,4,3,2] are never used, but are provided so that frame can range from 0 .. 5; likewise for f_str[5..2] */ aa0[5] = aa0[4] = aa0[3] = aa0[2] = aa0[1] = aa0[0]; /* zero out for SSE2/ALTIVEC -- make sure this is ALWAYS done */ for (id=0; id < SEQ_PAD; id++) aa0[0][n0+id] = '\0'; init_work (aa0[0], n0, ppst, &f_str[0]); f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0]; if (qframe == 2) { if ((aa0[1]=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(unsigned char)))==NULL) { fprintf(stderr," cannot allocate aa01[%d]\n", n0); } *aa0[1]='\0'; aa0[1]++; memcpy(aa0[1],aa0[0],n0+1); /* for ALTIVEC/SSE2, must pad with 16 NULL's */ for (id=0; id<SEQ_PAD; id++) {aa0[1][n0+id]=0;} revcomp(aa0[1],n0,ppst->c_nt); init_work (aa0[1], n0, ppst, &f_str[1]); } if (qshuffle_flg) { if ((*aa0s=(unsigned char *)calloc(n0+2+SEQ_PAD,sizeof(char)))==NULL) { fprintf(stderr,"cannot allocate aa0s[%d]\n",n0+2); exit(1); } **aa0s='\0'; (*aa0s)++; memcpy(*aa0s,aa0[0],n0); qshuffle(*aa0s,n0,nm0, my_rand_state); /* for SSE2/ALTIVEC, must pad with 16 NULL's */ for (id=0; id<SEQ_PAD; id++) {(*aa0s)[n0+id]=0;} init_work (*aa0s, n0, ppst, qf_str); } /* always allocate shuffle space */ if((*aa1s=calloc(max_tot+1,sizeof(char))) == NULL) { fprintf(stderr,"unable to allocate shuffled library sequence [%d]\n", max_tot); exit(1); } else { **aa1s=0; (*aa1s)++; } }
/**
 * Builds a Sequence describing this segmenter's read.
 *
 * label_full is always set from `info`. When the read was segmented, the
 * label is the read label followed by " -" or " +" (depending on
 * `reversed`) and the sequence goes through revcomp(sequence, reversed);
 * otherwise the sequence is copied unchanged and the label is left as
 * default-constructed.
 */
Sequence Segmenter::getSequence() const {
  Sequence result;
  result.label_full = info;

  if (!segmented) {
    result.sequence = sequence;
    return result;
  }

  const char *strand_mark = reversed ? "-" : "+";
  result.label = label + " " + strand_mark;
  result.sequence = revcomp(sequence, reversed);
  return result;
}
/**
 * @brief Joins the reverse complement of a master string and the master
 * string itself into one buffer, separated by a `#` sign.
 *
 * The buffer returned by revcomp() is grown to 2 * len + 2 bytes; the `#`
 * is written at offset len and the master string (with its terminating
 * byte) is copied right after it.
 *
 * @param s The master string.
 * @param len Its length.
 * @return The newly concatenated string, or NULL when `s` is NULL.
 */
char *catcomp(char *s, size_t len) {
	if (s == NULL) {
		return NULL;
	}

	char *joined = revcomp(s, len);

	/* room for: revcomp, '#', master string, terminator */
	char *grown = realloc(joined, 2 * len + 2);
	CHECK_MALLOC(grown);
	joined = grown;

	char *separator = joined + len;
	*separator = '#';

	/* len + 1 so that the terminating byte of s comes along */
	memcpy(separator + 1, s, len + 1);

	return joined;
}
/* prepareSeq: prepares sequence string for analysis by shustring-type programs.
 * Does the following: 1) set all residues to upper case
 *                     2) generate reverse complement
 *                     3) concatenate reverse complement to end of forward strand
 * e.g. if the string of the original seq. is ACCGZ\0, (Z for the border)
 * then the new one which includes the reversed complement seq. looks like this: ACCGZCGGTZ\0
 *
 * On return, sequence->len and the number of entries in sequence->borders
 * are doubled, and numNuc / numSbjctNuc are multiplied by two. The
 * temporary reverse-strand object is released with freeSequence().
 */
void prepareSeq(Sequence *sequence){
  Sequence *rstrand;
  Int64 i, j;
  char *nuc = "TCAGtcag";

  strtoupper(sequence->seq);
  /* take care of reverse strand */
  rstrand = revcomp(sequence); /* reverse and complement a sequence */
  /* give rstrand minimal headers/borders allocations; presumably so that
   * freeSequence() below has something valid to release -- TODO confirm */
  rstrand->headers = (char **)emalloc(sizeof(char *));
  rstrand->headers[0] = (char *)emalloc(sizeof(char));
  rstrand->borders = (Int64 *)emalloc(sizeof(Int64));
  rstrand->freqTab = NULL;
  rstrand->numSeq = 1;
  /* terminate the forward strand at its current length so strncat() below
   * appends right after it */
  sequence->seq[sequence->len] = '\0';
  sequence->len += sequence->len; /* new seq. length = 2 x original size */
  sequence->seq = (char *)erealloc(sequence->seq,(size_t)(sequence->len+1)*sizeof(char));
  /* number of borders = 2 x original size */
  sequence->borders = (Int64 *)erealloc(sequence->borders, 2*(size_t)sequence->numSeq * sizeof(Int64));
  /* adjust the border values */
  for(i=1;i<sequence->numSeq;i++){
    /* seq. looks like this: F1 F2 .. Fn Rn .. R2 R1 */
    sequence->borders[2*sequence->numSeq-i-1] = sequence->len - sequence->borders[i-1] - 2;
  }
  /* the last border is the final character of the doubled string */
  sequence->borders[2*sequence->numSeq-1] = sequence->len - 1;
  /* move first border of reverted sequences to the end */
  rstrand->seq++; /* since the last border of the original seq is the first char of the reversed seq */
  //strncat(sequence->seq,rstrand->seq,(size_t)sequence->len); ??
  /* sequence->len is already doubled here, so len / 2 is the original length */
  strncat(sequence->seq,rstrand->seq,(size_t)sequence->len / 2);
  rstrand->seq--; /* return the pointer */
  sequence->seq[sequence->len-1] = BORDER;
  sequence->seq[sequence->len] = '\0';
  freeSequence(rstrand);
  sequence->numNuc = 0;
  for(i = 0; i < 8; i++) {
    //sequence->numNuc += sequence->freqTab[(int)nuc[i]];
    /* NOTE(review): numNuc was reset to 0 just above and is also the bound of
     * this inner loop, so as written the condition is false on entry and the
     * body never runs (numNuc stays 0, freqTab is not doubled). Presumably
     * the bound was meant to be the number of rows of freqTab -- confirm. */
    for (j = 0; j < sequence->numNuc; j ++) {
      sequence->numNuc += sequence->freqTab[j][(Int64)nuc[i]];
      sequence->freqTab[j][(Int64)nuc[i]] *= 2; /* fwd and rev strand */
    }
  }
  sequence->numNuc *= 2;
  sequence->numSbjctNuc *= 2;
}
/**
 * @brief Indexes the first sequence and compares every other sequence to it.
 *
 * The first entry of `sequences` is the subject: it is initialised with
 * seq_subject_init() and indexed with esa_init(). Each remaining sequence is
 * then passed to dist_anchor(), as is when F_FORWARD is set and
 * reverse-complemented when F_REVCOMP is set. A header line is printed to
 * stdout before each comparison.
 *
 * @param sequences - An array of pointers to the sequences.
 * @param n - The number of sequences.
 */
void run(seq_t *sequences, size_t n) {
	const size_t subject_index = 0;
	seq_t *subject = &sequences[subject_index];
	esa_s E;

	int init_failed = seq_subject_init(subject) || esa_init(&E, subject);
	if (init_failed) {
		errx(1, "Failed to create index for %s.", subject->name);
	}

	// now compare every other sequence to the subject
	for (size_t query_index = 0; query_index < n; query_index++) {
		if (query_index != subject_index) {
			seq_t *query = &sequences[query_index];

			// TODO: Provide a nicer progress indicator.
			if (FLAGS & F_EXTRA_VERBOSE) {
#pragma omp critical
				{
					fprintf(stderr, "comparing %zu and %zu\n", subject_index,
							query_index);
				}
			}

			size_t query_length = query->len;

			if (FLAGS & F_FORWARD) {
				printf("> %s\n", query->name);
				dist_anchor(&E, query->S, query_length, subject->gc);
			}

			if (FLAGS & F_REVCOMP) {
				// the reverse complement is a temporary copy owned here
				char *reverse = revcomp(query->S, query_length);
				printf("> %s Reverse\n", query->name);
				dist_anchor(&E, reverse, query_length, subject->gc);
				free(reverse);
			}
		}
	}

	esa_free(&E);
	seq_subject_free(subject);
}
/**
 * Unit test for complement() and revcomp() on plain strings.
 */
void testRevcomp() {
  // complement(): same order, each base replaced by its complement; the
  // expected value is all upper case even for lower-case input ('n' -> 'N').
  TAP_TEST(complement("AATCAGactgactagATCGAn") == "TTAGTCTGACTGATCTAGCTN", TEST_REVCOMP, "");
  // revcomp(): the expected value is the complement above, read backwards.
  TAP_TEST(revcomp("AATCAGactgactagATCGAn") == "NTCGATCTAGTCAGTCTGATT", TEST_REVCOMP, "");
  // Empty input gives empty output.
  TAP_TEST(revcomp("") == "", TEST_REVCOMP, "");
  // Lower-case input gives upper-case output.
  TAP_TEST(revcomp("aaaaaa") == "TTTTTT", TEST_REVCOMP, "");
}
int main (int argc, char **argv) { char c; int mismatch = 0; char *in[3] = {0,0,0}; char *out[5]; char *orep=NULL; int out_n = 0; int in_n = 0; int threads = 1; // not really necessary char verify='\0'; int i; int mino = 6; int pctdiff = 8; // this number tested well on exome data... tweak for best results bool omode = false; char *bfil = NULL; bool norevcomp = false; bool allow_ex = false; while ( (c = getopt (argc, argv, "-dRnbeo:t:v:m:p:r:xV")) != -1) { switch (c) { case '\1': if (!in[0]) in[0]=optarg; else if (!in[1]) in[1]=optarg; else if (!in[2]) in[2]=optarg; else { usage(stderr); return 1; } ++in_n; break; case 'o': if (out_n == 3) { usage(stderr); return 1; } out[out_n++] = optarg; break; case 'r': orep = optarg; break; case 't': threads = atoi(optarg); break; case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); return 0; break; case 'm': mino = atoi(optarg); break; case 'x': allow_ex = true; break; case 'p': pctdiff = atoi(optarg); break; case 'R': norevcomp = true; break; case 'd': ++debug; break; case 'v': if (strlen(optarg)>1) { fprintf(stderr, "Option -v requires a single character argument"); exit(1); } verify = *optarg; break; case '?': if (strchr("otvmpr", optopt)) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint(optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); usage(stderr); return 1; } } if (argc < 3 || !in[1] || (!in[2] && out_n != 1 && out_n != 3) || (in[2] && out_n != 1 && out_n != 5)) { usage(stderr); return 1; } FILE *fin[2]; bool gzin[2]; meminit(gzin); for (i = 0; i < in_n; ++i) { fin[i] = gzopen(in[i], "r",&gzin[i]); if (!fin[i]) { fprintf(stderr, "Error opening file '%s': %s\n",in[i], strerror(errno)); return 1; } } const char *suffix[5]={"un1", "un2", "join", "un3", "join2"}; FILE *fout[5]; meminit(fout); bool gzout[5]; meminit(gzout); char *pre = out[0]; for (i = 0; i < (in[2] ? 
5 : 3); ++i) { // prefix out if (out_n == 1) { out[i]=(char *)malloc(strlen(pre)+10); strcpy(out[i], pre); char *p; if (p=strchr(out[i], '%')) { // substiture instead of append strcpy(p, suffix[i]); strcpy(p+strlen(suffix[i]), pre+(p-out[i])+1); } else { strcat(out[i], suffix[i]); } } // else explicit fout[i] = gzopen(out[i], "w",&gzout[i]); if (!fout[i]) { fprintf(stderr, "Error opening output file '%s': %s\n",out[i], strerror(errno)); return 1; } } //printf("in_n:%d in:%x fo:%x", in_n, in[3], fout[4]); //return 1; FILE *frep = NULL; if (orep) { frep = fopen(orep, "w"); if (!orep) { fprintf(stderr, "Error opening report file '%s': %s\n",out[i], strerror(errno)); return 1; } } // some basic validation of the file formats { for (i=0;i<in_n;++i) { char c=getc(fin[i]); if (c != '@') { fprintf(stderr, "%s doesn't appear to be a fastq file (%c)\n", in[i], c); return 1; } ungetc(c, fin[i]); } } struct fq fq[3]; meminit(fq); int nrec=0; int nerr=0; int nok=0; int joincnt=0; double tlen=0; double tlensq=0; int read_ok; struct fq rc; meminit(rc); // read in 1 record from each file while (read_ok=read_fq(fin[0], nrec, &fq[0])) { for (i=1;i<in_n;++i) { int mate_ok=read_fq(fin[i], nrec, &fq[i]); if (read_ok != mate_ok) { fprintf(stderr, "# of rows in mate file '%s' doesn't match primary file, quitting!\n", in[i]); return 1; } if (verify) { // verify 1 in 100 if (0 == (nrec % 100)) { char *p=strchr(fq[i].id.s,verify); if (!p) { fprintf(stderr, "File %s is missing id verification char %c at line %d", in[i], verify, nrec*4+1); return 1; } int l = p-fq[i].id.s; if (strncmp(fq[0].id.s, fq[i].id.s, l)) { fprintf(stderr, "File %s, id doesn't match file %s at line %d", in[0], in[i], nrec*4+1); return 1; } } } } ++nrec; if (read_ok < 0) continue; if (debug) fprintf(stderr, "seq: %s %d\n", fq[0].seq.s, fq[0].seq.n); if (!norevcomp) { revcomp(&rc, &fq[1]); } else { rc=fq[1]; } if (debug) fprintf(stderr, "comp: %s %d\n", rc.seq.s, rc.seq.n); int maxo = min(fq[0].seq.n, rc.seq.n); int 
bestscore=INT_MAX; int besto=-1; for (i=mino; i <= maxo; ++i) { int mind = (pctdiff * i) / 100; int d; d=hd(fq[0].seq.s+fq[0].seq.n-i, rc.seq.s, i); if (debug) fprintf(stderr, "hd: %d, %d\n", i, d); if (d <= mind) { // squared-distance over length, probably can be proven better (like pearson's) int score = (1000*(d*d+1))/i; if (score < bestscore) { bestscore=score; besto=i; } } } int hasex=0; if (allow_ex && besto<maxo) { if (fq[0].seq.n > rc.seq.n) { int mind = (pctdiff * maxo) / 100; for (i=0; i < fq[0].seq.n-maxo; ++i ) { int d; d=hd(fq[0].seq.s+fq[0].seq.n-rc.seq.n-i-1, rc.seq.s, maxo); if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d); if (d <= mind) { // squared-distance over length, probably can be proven better (like pearson's) int score = (1000*(d*d+1))/maxo; if (score < bestscore) { bestscore=score; // negative overlap! hasex=-i; besto=maxo; } } } } else if (fq[0].seq.n < rc.seq.n) { int mind = (pctdiff * maxo) / 100; for (i=0; i < rc.seq.n-maxo; ++i ) { int d; d=hd(fq[0].seq.s, rc.seq.s+i, maxo); if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d); if (d <= mind) { // squared-distance over length, probably can be proven better (like pearson's) int score = (1000*(d*d+1))/maxo; if (score < bestscore) { bestscore=score; // negative overlap! 
hasex=-i; besto=maxo; } } } } } if (debug) { fprintf(stderr, "best: %d %d\n", besto-hasex, bestscore); } FILE *fmate = NULL; int olen = besto-hasex; if (besto > 0) { ++joincnt; tlen+=olen; tlensq+=olen*olen; char *sav_fqs=NULL, *sav_rcs; char *sav_fqq, *sav_rcq; if (hasex) { sav_fqs=fq[0].seq.s; sav_fqq=fq[0].qual.s; sav_rcs=rc.seq.s; sav_rcq=rc.qual.s; if (fq[0].seq.n < rc.seq.n) { rc.seq.s=rc.seq.s-hasex; rc.qual.s=rc.qual.s-hasex; rc.seq.n=maxo; rc.qual.n=maxo; } else { // fprintf(stderr, "rc negative overlap: %s %d\n", rc.seq.s, hasex); fq[0].seq.s=fq[0].seq.s+fq[0].seq.n-maxo+hasex-1; fq[0].qual.s=fq[0].qual.s+fq[0].seq.n-maxo+hasex-1; fq[0].seq.n=maxo; fq[0].qual.n=maxo; // fprintf(stderr, "negative overlap: %s -> %s, %d\n", fq[0].seq.s, rc.seq.s, maxo); } // ok now pretend everythings normal, 100% overlap //if (debug) } FILE *f=fout[2]; if (verify) { char *p=strchr(fq[0].id.s,verify); if (p) { *p++ = '\n'; *p = '\0'; } } fputs(fq[0].id.s,f); for (i = 0; i < besto; ++i ) { int li = fq[0].seq.n-besto+i; int ri = i; if (debug>=2) printf("%c %c / %c %c / ", fq[0].seq.s[li], rc.seq.s[ri], fq[0].qual.s[li], rc.qual.s[ri]); if (fq[0].seq.s[li] == rc.seq.s[ri]) { fq[0].qual.s[li] = max(fq[0].qual.s[li], rc.qual.s[ri]); // bounded improvement in quality, since there's no independence // fq[0].qual.s[ri] = max(fq[0].qual.s[li], rc.qual.s[ri])+min(3,min(fq[0].qual.s[li],rc.qual.s[ri])-33); } else { // use the better-quality read // this approximates the formula: E = min(0.5,[(1-e2/2) * e1] / [(1-e1) * e2/2 + (1-e2/2) * e1]) if (fq[0].qual.s[li] > rc.qual.s[ri]) { // reduction in quality, based on phred-difference fq[0].qual.s[li] = 33+min(fq[0].qual.s[li],max(fq[0].qual.s[li]-rc.qual.s[ri],3)); } else { fq[0].seq.s[li] = rc.seq.s[ri]; // reduction in quality, based on phred-difference fq[0].qual.s[li] = 33+min(rc.qual.s[ri],max(rc.qual.s[ri]-fq[0].qual.s[li],3)); } } if (debug>=2) printf("%c %c\n", fq[0].seq.s[li], fq[0].qual.s[li]); } 
fwrite(fq[0].seq.s,1,fq[0].seq.n,f); fputs(rc.seq.s+besto,f); fputc('\n',f); fputs(fq[0].com.s,f); fwrite(fq[0].qual.s,1,fq[0].qual.n,f); fputs(rc.qual.s+besto,f); fputc('\n',f); fmate=fout[4]; if (sav_fqs) { fq[0].seq.s=sav_fqs; fq[0].qual.s=sav_fqq; rc.seq.s=sav_rcs; rc.qual.s=sav_rcq; } if (frep) { fprintf(frep, "%d\n", besto); } } else { for (i=0;i<2;++i) { FILE *f=fout[i]; fputs(fq[i].id.s,f); fputs(fq[i].seq.s,f); fputc('\n',f); fputs(fq[i].com.s,f); fputs(fq[i].qual.s,f); fputc('\n',f); } fmate=fout[3]; } if (fmate) { fputs(fq[2].id.s,fmate); fputs(fq[2].seq.s,fmate); fputc('\n',fmate); fputs(fq[2].com.s,fmate); fputs(fq[2].qual.s,fmate); fputc('\n',fmate); } } double dev = sqrt((((double)joincnt)*tlensq-pow((double)tlen,2)) / ((double)joincnt*((double)joincnt-1)) ); printf("Total reads: %d\n", nrec); printf("Total joined: %d\n", joincnt); printf("Average join len: %.2f\n", (double) tlen / (double) joincnt); printf("Stdev join len: %.2f\n", dev); printf("Version: %s.%d\n", VERSION, SVNREV); return 0; }
/**
 * Tells whether the graph contains a k-mer, regardless of its strand.
 *
 * The k-mer is first replaced by the smaller of itself and its reverse
 * complement (computed for the finder's k-mer size); the resulting value is
 * wrapped in a Node and looked up with graph_contains().
 */
bool IFindObserver<span>::contains(KmerType kmer)
{
    const KmerType reverse = revcomp(kmer, this->_find->kmer_size());
    const KmerType canonical = std::min(kmer, reverse);

    Node node = Node(Node::Value(canonical));
    return this->_find->graph_contains(node);
}
int main(int argc, char **argv) { char *cmfile; ESL_ALPHABET *abc; char *seqfile; ESL_SQFILE *sqfp; int format; CM_FILE *cmfp; CM_t *cm; ESL_SQ *seq; float sc, rev_sc; Parsetree_t *tr; Fancyali_t *fali; Fancyali_t *rev_fali; int do_local; /* int status; */ /* char *optname; */ /* char *optarg; */ int optind; int status; char errbuf[eslERRBUFSIZE]; cmfile = seqfile = NULL; abc = NULL; sqfp = NULL; cmfp = NULL; cm = NULL; seq = NULL; tr = NULL; fali = NULL; rev_fali = NULL; format = eslSQFILE_UNKNOWN; do_local = TRUE; /* Should process options, but for now assume none and set optind */ optind = 1; if ( argc - optind != 2 ) cm_Die("Incorrect number of arguments\n"); cmfile = argv[optind++]; seqfile = argv[optind++]; if((status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf)) != eslOK) cm_Die("Failed to open covariance model save file\n"); if ((status = cm_file_Read(cmfp, TRUE, &abc, &cm)) != eslOK) cm_Die("Failed to read a CM from cm file\n"); if (cm == NULL) cm_Die("CM file empty?\n"); cm_file_Close(cmfp); if ( esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK ) cm_Die("Failed to open sequence database file\n"); if (do_local) cm->config_opts |= CM_CONFIG_LOCAL; if((status = cm_Configure(cm, errbuf, -1)) != eslOK) cm_Die(errbuf); /*SetMarginalScores_reproduce_bug_i27(cm);*/ seq = esl_sq_Create(); while ( esl_sqio_Read(sqfp, seq) == eslOK ) { if (seq->n == 0) continue; int i0 = 1; int j0 = seq->n; if (seq->dsq == NULL) esl_sq_Digitize(abc, seq); sc = TrCYK_DnC(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */ /* sc = TrCYK_Inside(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, FALSE, &tr); */ fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons, seq->dsq, FALSE, NULL); /* float sc, struct_sc; * ParsetreeScore(cm, NULL, NULL, tr, seq->dsq, FALSE, &sc, &struct_sc, NULL, NULL, NULL); * printf("Parsetree score: %.4f\n", sc); * ParsetreeDump(stdout, tr, cm, seq->dsq); */ FreeParsetree(tr); 
revcomp(abc, seq, seq); rev_sc = TrCYK_DnC(cm,seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */ rev_fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons,seq->dsq, FALSE, NULL); /*ParsetreeDump(stdout, tr, cm, seq->dsq);*/ FreeParsetree(tr); if (sc > rev_sc) { printf("sequence: %s\n", seq->name); printf("score: %.2f\n",sc); PrintFancyAli(stdout, fali, 0, FALSE, FALSE, 60); } else { printf("sequence: %s (reversed)\n", seq->name); printf("score: %.2f\n",rev_sc); PrintFancyAli(stdout, fali, seq->n, TRUE, FALSE, 60); } FreeFancyAli(fali); FreeFancyAli(rev_fali); esl_sq_Destroy(seq); seq = esl_sq_Create(); } esl_sq_Destroy(seq); FreeCM(cm); esl_sqfile_Close(sqfp); return EXIT_SUCCESS; }
int run (int argc, char* argv[]) { if (argc < 3) { stringstream s; s << "Usage: " << argv[0] << " file.fasta kmer_length [DS_mode]" << endl << endl; cerr << s.str(); return(1); } string fasta_filename (argv[1]); unsigned int kmer_length = atoi(argv[2]); bool DS_mode = (argc >= 3) ? true : false; Fasta_reader fasta_reader(fasta_filename); Ktree ktree; long read_counter = 0; while (fasta_reader.hasNext()) { read_counter++; if (read_counter % 1000 == 0) { cerr << "\rread[" << read_counter << "] "; } Fasta_entry fe = fasta_reader.getNext(); string accession = fe.get_accession(); string sequence = fe.get_sequence(); // cerr << "Processing: " << sequence << endl; if (sequence.length() < kmer_length + 1) { continue; } for (unsigned int i = 0; i <= sequence.length() - kmer_length; i++) { string kmer = sequence.substr(i, kmer_length); if (! contains_non_gatc(kmer)) { ktree.add_kmer(kmer); if (DS_mode) { kmer = revcomp(kmer); ktree.add_kmer(kmer); } } } } ktree.report_kmer_counts(); return(0); }
bool checkMapability(const KmerIndex& index, const std::string &s, const std::vector<std::pair<KmerEntry,int>>& v, std::vector<int> &u) { const int maxMismatch = 2; const int maxSoftclip = 5; Kmer km; KmerEntry val; int p; if (!v.empty()) { p = findFirstMappingKmer(v,val); km = Kmer(s.c_str()+p); } else { return false; } std::vector<int> vtmp; vtmp.reserve(u.size()); for (auto tr : u) { auto trpos = index.findPosition(tr, km, val, p); int tpos = (int)trpos.first; int sz = (int)s.size(); bool add = true; if (trpos.second) { if (tpos < 1 || tpos + sz - 1 > index.target_seqs_[tr].size()) { add = false; } else { //std::cout << index.target_seqs_[tr].substr(tpos,sz) << std::endl; //std::cout << s << std::endl; int mis = 0; for (int i = 0; i < sz - maxSoftclip; i++) { if (index.target_seqs_[tr][tpos-1 + i] != s[i]) { ++mis; if (mis > maxMismatch) { break; } } } add = (mis <= maxMismatch); } } else { if (tpos > index.target_seqs_[tr].size() || tpos - sz < 1) { add = false; } else { std::string rs = revcomp(s); //std::cout << index.target_seqs_[tr].substr(tpos - sz, sz) << std::endl; //std::cout << rs << std::endl; int mis = 0; for (int i = sz-1; i >= maxSoftclip; i--) { if (index.target_seqs_[tr][tpos-sz+i] != rs[sz]) { ++mis; if (mis > maxMismatch) { break; } } } add = (mis <= maxMismatch); } } if (add) { vtmp.push_back(tr); } } if (vtmp.empty()) { return false; } if (vtmp.size() < u.size()) { u = vtmp; // copy } return true; }
// main k-mer counting function, shared between minia and dsk // verbose == 0 : stderr progress bar // verbose >= 1 : print basic status // verbose >= 2 : print extra partition information // write_count == True: include kmer count in results file, in that form: // - save kmer count for each kmer in the resulting binary file // - the very first four bytes of the result file are the kmer length void sorting_count(Bank *Sequences, char *prefix, int max_memory, int max_disk_space, bool write_count, int verbose) { // create a temp dir from the prefix char temp_dir[1024]; sprintf(temp_dir,"%s_temp",prefix); // clear the temp folder (needs to be done before estimating disk space) DIR* dp; struct dirent* ep; char p_buf[512] = {0}; dp = opendir(temp_dir); while ( (dp != NULL) && ((ep = readdir(dp)) != NULL)) { sprintf(p_buf, "%s/%s", temp_dir, ep->d_name); remove(p_buf); } if(dp != NULL) closedir(dp); if (max_disk_space == 0) { // default max disk space struct statvfs buffer ; char current_path[1000]; getcwd(current_path,sizeof(current_path)); // int ret = statvfs(current_path, &buffer); int available = (int)(((double)buffer.f_bavail * (double)buffer.f_bsize) / 1024 / 1024); uint32_t tt_new_temp = (uint32_t) (((double)Sequences->filesizes)/(1024*1024)); printf("Available disk space in %s: %d %u %llu MB\n",current_path,available,tt_new_temp,Sequences->filesizes); // not working in osx (is that a TODO then?) max_disk_space = min((uint32_t)available/2, tt_new_temp); } if (max_disk_space <= 0) // still 0? max_disk_space = 10000; // = default for osx // estimate number of iterations TODO Check if multiplication with totalKmers is actually required or not. It may be just increasing number of partitions for no reason //uint64_t volume = totalKmers*Sequences->estimate_kmers_volume(smallestKmer); //Since there are totalKmers no of kmers and an upper bound can be estimated by using the smallest size of kmer. 
Added by Raunaq uint64_t volume = Sequences->estimate_kmers_volume(smallestKmer); //Since there are totalKmers no of kmers and an upper bound can be estimated by using the smallest size of kmer. Added by Raunaq uint32_t nb_passes = ( volume / max_disk_space ) + 1; int passes_hash ; int nb_threads=1; #if OMP use_compressed_reads =true; nb_threads = 8; max_memory /= nb_threads; max_memory = max (max_memory,1); #endif // temp bugfix: don't use compressed reads for long reads if (Sequences->estimate_max_readlen() > 1000000) use_compressed_reads = false; uint64_t volume_per_pass,volume_per_partition; uint32_t nb_partitions; int partitions_hash; // loop to lower the number of partitions below the maximum number of simulatenously open files do { volume_per_pass = volume / nb_passes; nb_partitions = ( volume_per_pass * totalKmers / max_memory ) + 1; //printf("volume per pass and total volume %llu %llu \n",volume_per_pass,(unsigned long long)volume); // if partitions are hashed instead of sorted, adjust for load factor // (as in the worst case, all kmers in the partition are distinct and partition may be slightly bigger due to hash-repartition) if (use_hashing) { nb_partitions = (uint32_t) ceil((float) nb_partitions / load_factor); nb_partitions = ((nb_partitions * OAHash::size_entry() ) + sizeof(key_type)-1) / sizeof(key_type); // also adjust for hash overhead } struct rlimit lim; int max_open_files = 1000; int err = getrlimit(RLIMIT_NOFILE, &lim); if (err == 0) max_open_files = lim.rlim_cur / 2; if (nb_partitions >= max_open_files) nb_passes++; else break; } while (1); volume_per_partition= volume_per_pass/nb_partitions; passes_hash = ceil(log(nb_passes)/log(4)); partitions_hash = ceil(log(nb_partitions)/log(4)); int size_for_reestimation = ceil((passes_hash + partitions_hash)*1.8); double * lmer_counts = (double * ) malloc(sizeof(long)*pow(4,size_for_reestimation)); long * lmers_for_hash = (long * ) malloc(sizeof(long)*pow(4,size_for_reestimation)); int * 
partitions_for_lmers =(int * ) malloc(sizeof(int)*pow(4,size_for_reestimation)); Sequences->count_kmers_for_small_value(size_for_reestimation,lmer_counts); int temp_partition=reestimate_partitions(size_for_reestimation,volume_per_partition,lmer_counts,lmers_for_hash,partitions_for_lmers); unordered_map<long,int> part_hash; int total_lmers=pow(4,size_for_reestimation); for(int it=0;it<total_lmers;it++) { pair<long,int> temp_pair(lmers_for_hash[it],partitions_for_lmers[it]); part_hash.insert (temp_pair); // Add element to the hash } //uint64_t up_passes_size = volume_per_pass; do { //recompute the number of partitions based on updated partitions estimate nb_partitions = ceil(temp_partition*1.0/nb_passes); struct rlimit lim; int max_open_files = 1000; int err = getrlimit(RLIMIT_NOFILE, &lim); if (err == 0) max_open_files = lim.rlim_cur / 2; if (nb_partitions >= max_open_files) nb_passes++; else break; }while(1); printf("no of partitions before %lu and after %d passes %lu \n",nb_partitions*nb_passes,temp_partition,nb_passes); uint64_t total_IO = volume * 2LL * 1024LL*1024LL ;// in bytes + nb_passes * ( volume / (sizeof(kmer_type)*4) ) ; // in bytes uint64_t temp_IO = 0; BinaryBankConcurrent * redundant_partitions_file[nb_partitions]; char redundant_filename[nb_partitions][256]; kmer_type kmer; int max_read_length = KMERSBUFFER_MAX_READLEN; kmer_type * kmer_table_seq = (kmer_type * ) malloc(sizeof(kmer_type)*max_read_length); ; kmer_type * kmer_length_table_seq = (kmer_type * ) malloc(sizeof(kmer_type)*max_read_length); BinaryReads * binread = NULL; if(use_compressed_reads) binread = new BinaryReads(return_file_name(binary_read_file),true); fprintf(stderr,"Sequentially counting ~%llu MB of kmers with %d partition(s) and %d passes using %d thread(s), ~%d MB of memory and ~%d MB of disk space\n", (unsigned long long)volume, nb_partitions,nb_passes, nb_threads, max_memory * nb_threads, max_disk_space); STARTWALL(count); mkdir(temp_dir, S_IRWXU | S_IRWXG | S_IROTH | 
S_IXOTH); // Open totalKmers files to store counts of totalKmers different k's BinaryBankConcurrent * SolidKmers[totalKmers]; for (int s=0;s<totalKmers;s++) { char temp[1024]; sprintf(temp,"%s.%d",return_file_name(solid_kmers_file),Kmerlist[s]); uint64_t exp = (((uint64_t)1)<<(Kmerlist[s]*2))-1; SolidKmers[s] = new BinaryBankConcurrent(temp,sizeof(kmer),true,nb_threads); //printf("kmer is %d exp is %llu \n",Kmerlist[s],exp); //BinaryBankConcurrent * SolidKmers = new BinaryBankConcurrent(return_file_name(solid_kmers_file),sizeof(kmer),true,nb_threads); if (write_count) { // write k-mer nbits as the first 4 bytes; and actual k-mer size as the next 4 bits uint32_t kmer_nbits = sizeof(kmer) * 8; SolidKmers[s]->write_buffered(&kmer_nbits, 4,0); SolidKmers[s]->write_buffered(&Kmerlist[s], 4,0); SolidKmers[s]->flush(0); } } int64_t estimated_NbReads = Sequences->estimate_nb_reads(); char * rseq; int readlen; int64_t NbSolid = 0; int64_t * NbSolid_omp = (int64_t *) calloc(nb_threads,sizeof(int64_t)); //long total_kmers_per_partition[nb_partitions]; //guillaume probably commented it because updating this variable would require synchronization long distinct_kmers_per_partition[nb_partitions]; uint64_t * histo_count = (uint64_t *) calloc(10001,sizeof(uint64_t)); #if OMP uint64_t ** histo_count_omp = (uint64_t **) calloc(nb_threads,sizeof(uint64_t *)); for(int ii=0;ii<nb_threads;ii++) { histo_count_omp[ii]= (uint64_t *) calloc(10001,sizeof(uint64_t)); } #endif //start by the conversion of the file to binary format if(use_compressed_reads) { char * pt_begin; int idx =0 ; int64_t NbRead = 0; Progress progress_conversion; // progress_conversion.timer_mode=1; // to switch to timer mode (show elapsed and estimated remaining time) progress_conversion.init(estimated_NbReads,"First step: Converting input file into Binary format"); Sequences->rewind_all(); while(1) { if(! 
Sequences->get_next_seq(&rseq,&readlen)) break; // read original fasta file if(readlen > max_read_length) // realloc kmer_table_seq if needed { max_read_length = 2*readlen; kmer_table_seq = (kmer_type * ) realloc(kmer_table_seq,sizeof(kmer_type)*max_read_length); kmer_length_table_seq = (kmer_type * ) realloc(kmer_length_table_seq,sizeof(kmer_type)*max_read_length); } pt_begin = rseq; //should be ok while (pt_begin < (rseq+ readlen)) { idx=0; // start a new read //skips NN while (*pt_begin =='N' && pt_begin < (rseq+ readlen)) { pt_begin ++; } // goes to next N or end of seq while ( (pt_begin[idx] !='N') && ((pt_begin +idx) < (rseq+ readlen)) ) { idx++; } //we have a seq beginning at pt_begin of size idx ,without any N, will be treated as a read: binread->write_read(pt_begin,idx); revcomp_sequence(pt_begin,idx); // reverse complement the string binread->write_read(pt_begin,idx); // write reverse complement string revcomp_sequence(pt_begin,idx); // restore the string pt_begin += idx; } // binread->write_read(rseq,readlen); NbRead++; if ((NbRead%10000)==0) { progress_conversion.inc(10000); } } //printf("Number of reads converted to binary %d \n",NbRead); progress_conversion.finish(); binread->close(); } ///fin conversion if (clear_cache) { #ifdef OSX system("purge"); #else system("echo 3 > /proc/sys/vm/drop_caches"); #endif } #if SINGLE_BAR Progress progress; char message[1000]; sprintf(message,"Counting kmers"); progress.timer_mode=1; if (verbose == 0 ) progress.init(total_IO,message); #endif //use_compressed_reads=false; // for testing compute_kmer_from_one_seq // how many times we will traverse the whole reads file (has an influence on temp disk space) uint64_t iter_partition=0; for (uint32_t current_pass = 0; current_pass < nb_passes; current_pass ++) { // stop computing if all partitions are done Added by Raunaq if (iter_partition==temp_partition) break; if(use_compressed_reads ) //open binary reads for reading binread->open(false); STARTWALL(debpass); 
STARTWALL(debw); int initial_value = current_pass*nb_partitions; for (uint32_t p=0;p<nb_partitions;p++) { sprintf(redundant_filename[p],"%s/partition%d.redundant_kmers",temp_dir,p); redundant_partitions_file[p] = new BinaryBankConcurrent (redundant_filename[p],sizeof(kmer_type),true, nb_threads); distinct_kmers_per_partition[p]=0; } int final_value = ((current_pass+1)*nb_partitions)-1; printf("Storing k-mers in partition files between %d and %d \n",initial_value,final_value); Sequences->rewind_all(); #if !SINGLE_BAR Progress progress; progress.timer_mode=1; // to switch to timer mode (show elapsed and estimated remaining time) char message[1000]; sprintf(message,"Pass %d/%d, Step 1: partitioning",current_pass+1,nb_passes); if (verbose == 0 ) progress.init(estimated_NbReads,message); #endif //current_pass> 0 && #if OMP #pragma omp parallel if(use_compressed_reads) num_threads(nb_threads) #endif { int64_t nbkmers_written =0; int tid =0; int64_t NbRead = 0; int64_t nread =0; int64_t tempread =0; long it_zero_wrt =0; #if OMP tid = omp_get_thread_num(); #endif int nreads_in_buffer= 1000; KmersBuffer * kbuff =NULL; if(use_compressed_reads) { kbuff = new KmersBuffer (binread, 1000000, nreads_in_buffer); //buffer size (in nb of kmers), seq per task // the buffer is per thread kbuff->binary_read_file = binread->binary_read_file; } kmer_type * kmer_table ; kmer_type * kmer_length_info ; // Added by Raunaq, to store the length of read into the partitions file while(1) { //read the fasta file if(use_compressed_reads) // && current_pass>0 { nread = kbuff->readkmers(); if( nread < 0) break; NbRead+= nread; tempread+= nread; } else { if(! 
Sequences->get_next_seq(&rseq,&readlen)) break; // read original fasta file if(readlen > max_read_length) // realloc kmer_table_seq if needed { max_read_length = 2*readlen; kmer_table_seq = (kmer_type * ) realloc(kmer_table_seq,sizeof(kmer_type)*max_read_length); kmer_length_table_seq = (kmer_type * ) realloc(kmer_length_table_seq,sizeof(kmer_type)*max_read_length); } } // if(use_compressed_reads ) //write compressed read file at first pass //&& current_pass==0 // binread->write_read(rseq,readlen); int i; int nbkmers =readlen-sizeKmer+1; if( use_compressed_reads) //current_pass >0 && { nbkmers = kbuff->nkmers; kmer_table = kbuff->kmers_buffer; kmer_length_info = kbuff->kmer_length; } else //old fashion { compute_kmer_table_from_one_seq(readlen,rseq,kmer_table_seq,kmer_length_table_seq,Kmerlist[totalKmers-1]); // Added by Raunaq for computing kmers for all values of k nbkmers =readlen-Kmerlist[totalKmers-1]+1; kmer_table = kmer_table_seq; kmer_length_info = kmer_length_table_seq; NbRead++; //printf("Number of kmers read from seq %d \n",nbkmers); } nbkmers_written= 0; char temp_kmer[256]; int zero; //compute the kmers stored in the buffer kmer_table for (i=0; i<nbkmers; i++) { kmer_type lkmer; kmer_type lkmer_length; // kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp); lkmer = kmer_table[i]; lkmer_length = kmer_length_info[i]; // zero = code2seq(lkmer,temp_kmer); long pass_lkmer = code2first_n_nucleotide(lkmer,size_for_reestimation); unordered_map<long,int>::const_iterator got = part_hash.find(pass_lkmer); int p;// compute in which partition this kmer falls into if(got==part_hash.end()) continue; else p = got->second; // check if this kmer should be included in the current pass if(!(p >= initial_value && p<= final_value)) continue; /* #ifdef _ttmath (reduced_kmer % nb_partitions).ToInt(p); #else p = reduced_kmer % nb_partitions; #endif */ p = p - current_pass*nb_partitions; nbkmers_written++; redundant_partitions_file[p]->write_element_buffered(&lkmer,tid); 
// save this kmer to the right partition file redundant_partitions_file[p]->write_buffered(&lkmer_length,sizeof(lkmer_length),tid,false); // save the kmer length next to the kmer in the same partition file // total_kmers_per_partition[p]++; // guillaume probably commented it because updating this variable would require synchronization } //NbRead++; #if SINGLE_BAR if(verbose==0) { if (nb_threads == 1) progress.inc(nbkmers_written * sizeof(kmer_type)); else progress.inc(nbkmers_written * sizeof(kmer_type),tid); } #endif // if ((NbRead%10000)==0) if(tempread> 10000) { tempread -= 10000; if (verbose) fprintf (stderr,"%cPass %d/%d, loop through reads to separate (redundant) kmers into partitions, processed %lluM reads out of %lluM",13,current_pass+1,nb_passes,(unsigned long long)(NbRead/1000/1000),(unsigned long long)(estimated_NbReads/1000/1000)); #if !SINGLE_BAR else if (nb_threads == 1) progress.set(NbRead); else progress.inc(10000,tid); #endif } } //end while // printf("Count of zero in write is %lu \n",it_zero_wrt); if(use_compressed_reads) delete kbuff; } // end OMP #if !SINGLE_BAR if (verbose == 0) { if (nb_threads == 1) progress.finish(); else progress.finish_threaded(); // here only one thread sprintf(message,"Pass %d/%d, Step 2: computing kmer count per partition",current_pass+1,nb_passes); progress.init(nb_partitions+1,message); } #endif if (verbose)fprintf(stderr,"\n"); if (verbose >= 2) { STOPWALL(debw,"Writing redundant kmers"); } STARTWALL(debtri); for (uint32_t p=0;p<nb_partitions;p++) { redundant_partitions_file[p]->close(); redundant_partitions_file[p]->open(false); } // for better timing: clear the file cache, since the partitions may still be in memory, that's unfair to low mem machines if (clear_cache) { #ifdef OSX system("purge"); #else system("echo 3 > /proc/sys/vm/drop_caches"); #endif } //quick and dirty parall with omp, testing //todo if we want omp and histo : separate histo_count tab per thread that needs to be merged at the end // TODO to 
guillaume: remove that todo above, because it is done, right? kmer_type lkmer,lkmer_length,lkmer_temp,exp; long it_zero=0; OAHash * hash; int p,s; #if OMP //omp_set_numthreads(2); //num_threads(2) //if(!output_histo) num_threads(nb_threads) #pragma omp parallel for private (p,s,lkmer,lkmer_length,hash,lkmer_temp,exp) num_threads(nb_threads) #endif // load, sort each partition to output solid kmers for ( p=0;p<nb_partitions;p++) { char temp_kmer[256]; // bug check code int zero; kmer_type lkmer_revcomp; // to store revcomps bool use_hashing_for_this_partition = use_hashing; if(hybrid_mode) { if( (redundant_partitions_file[p]->nb_elements()*sizeof(kmer_type)) < (max_memory*1024LL*1024LL) ) // Maintain totalKmers hash for each partition file { use_hashing_for_this_partition = false; } else { use_hashing_for_this_partition = true; } } int tid =0; //int s; //Computing if hashing should be used or not for this partition #if OMP tid = omp_get_thread_num(); #endif //use_hashing_for_this_partition = false; //to check the vector part of the code if (use_hashing_for_this_partition) { // hash partition and save to solid file hash = new OAHash(max_memory*1024LL*1024LL/2); // One hash to store all types of k-mer lengths uint64_t nkmers_read=0; redundant_partitions_file[p]->read_element_buffered(&lkmer_length); while (redundant_partitions_file[p]->read_element_buffered(&lkmer)) { if(lkmer_length == Kmerlist[0]) //only add the largest k-mer hash->increment(lkmer,convert_to_int(lkmer_length)); else { unordered_map<int,int>::const_iterator got = kmerlength_map.find(convert_to_int(lkmer_length)); exp = (((kmer_type)1)<<(got->second*2))-1; lkmer_temp = lkmer & exp; hash->increment(lkmer_temp,got->second); } if(!redundant_partitions_file[p]->read_element_buffered(&lkmer_length)) { break; } nkmers_read++; #if SINGLE_BAR if(verbose==0 && nkmers_read==10000) { if (nb_threads == 1) progress.inc(nkmers_read*sizeof(kmer_type)); else progress.inc(nkmers_read*sizeof(kmer_type),tid); 
nkmers_read=0; } #endif } if (verbose >= 2) printf("Pass %d/%d partition %d/%d hash load factor: %0.3f\n",current_pass+1,nb_passes,p+1,nb_partitions,hash->load_factor()); for( s=0;s<totalKmers;s++) { OAHash * temp_ = new OAHash(max_memory*1024LL*1024LL/2); hash->start_iterator(); while (hash->next_iterator()) { uint_abundance_t abundance = hash->iterator->value; uint_abundance_t abund_tid = (current_pass+1)*100+p; if(output_histo) { uint_abundance_t saturated_abundance; saturated_abundance = (abundance >= 10000) ? 10000 : abundance; #if OMP histo_count_omp[tid][saturated_abundance]++; #else histo_count[saturated_abundance]++; #endif } int length_kmer = hash->iterator->length; lkmer = hash->iterator->key; if (abundance >= nks && abundance <= max_couv && length_kmer == Kmerlist[s]) { //write if lkmer is the smaller of it and its reverse complement lkmer_revcomp = revcomp(lkmer,length_kmer); if(lkmer < lkmer_revcomp) { SolidKmers[s]->write_element_buffered(&(hash->iterator->key),tid); NbSolid_omp[tid]++; if (write_count) SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false); } } distinct_kmers_per_partition[p]++; if(s!=totalKmers-1) { if(length_kmer == Kmerlist[s]) { exp = (((kmer_type)1)<<(Kmerlist[s+1]*2))-1; lkmer_temp = lkmer & exp; temp_->increment_by_value(lkmer_temp,abundance,Kmerlist[s+1]); }else { temp_->increment_by_value(lkmer,abundance,length_kmer); } } } hash->~OAHash(); hash = temp_; } hash->~OAHash(); //printf("All hashes closed and destroyed \n"); } else { // This part does it in slower fashion // sort partition and save to solid file //vector < kmer_type > kmers; vector < kmer_type > kmers[totalKmers]; uint64_t nkmers_read=0; //int s=0; redundant_partitions_file[p]->read_element_buffered(&lkmer_length); while (redundant_partitions_file[p]->read_element_buffered (&lkmer)) { for(s=0;s<totalKmers;s++) { //kmer_type lkmer_temp; //kmer_type exp; if(lkmer_length<Kmerlist[s]) continue; if(s==0) kmers[s].push_back (lkmer); else { exp = 
(((kmer_type)1)<<(Kmerlist[s]*2))-1; lkmer_temp = lkmer & exp; // Converting the kmer to its smaller equivalent in binary kmers[s].push_back (lkmer_temp); } } nkmers_read++; if(!redundant_partitions_file[p]->read_element_buffered(&lkmer_length)) break; //Added to get the next length of kmer #if SINGLE_BAR if(verbose==0 && nkmers_read==10000) { if (nb_threads == 1) progress.inc(nkmers_read*sizeof(kmer_type)); else progress.inc(nkmers_read*sizeof(kmer_type),tid); nkmers_read=0; } #endif } for(s=0;s<totalKmers;s++) { sort (kmers[s].begin (), kmers[s].end ()); kmer_type previous_kmer = *(kmers[s].begin ()); uint_abundance_t abundance = 0; for (vector < kmer_type >::iterator it = kmers[s].begin (); it != kmers[s].end ();it++) { kmer_type current_kmer = *it; if (current_kmer == previous_kmer) abundance++; else { if(output_histo) { uint_abundance_t saturated_abundance; saturated_abundance = (abundance >= 10000) ? 10000 : abundance; #if OMP histo_count_omp[tid][saturated_abundance]++; #else histo_count[saturated_abundance]++; #endif } if (abundance >= nks && abundance <= max_couv) { NbSolid_omp[tid]++; SolidKmers[s]->write_element_buffered(&previous_kmer,tid); if (write_count) SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false); } abundance = 1; distinct_kmers_per_partition[p]++; } previous_kmer = current_kmer; } //last kmer distinct_kmers_per_partition[p]++; if(output_histo) { uint_abundance_t saturated_abundance; saturated_abundance = (abundance >= 10000) ? 
10000 : abundance; #if OMP histo_count_omp[tid][saturated_abundance]++; #else histo_count[saturated_abundance]++; #endif } if (abundance >= nks && abundance <= max_couv) { NbSolid_omp[tid]++; SolidKmers[s]->write_element_buffered(&previous_kmer,tid); if (write_count) SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false); } } } //printf("Done writing kmers for all K \n"); if (verbose >= 1) fprintf(stderr,"%cPass %d/%d, loaded and sorted partition %d/%d, found %lld solid kmers so far",13,current_pass+1,nb_passes,p+1,nb_partitions,(long long)(NbSolid_omp[tid])); //printf("Done writing kmers for all K %d check 1 \n",p); if (verbose >= 2) printf("\nPass %d/%d partition %d/%d %ld distinct kmers\n",current_pass+1,nb_passes,p+1,nb_partitions,/*total_kmers_per_partition[p],*/distinct_kmers_per_partition[p]); #if !SINGLE_BAR if (verbose == 0 && nb_threads==1) progress.inc(1); else if (verbose == 0 && nb_threads>1) progress.inc(1,tid); #endif //if(redundant_partitions_file[p]->find_error()) { // printf("Error in the binary file \n"); //} redundant_partitions_file[p]->close(); remove(redundant_filename[p]); } // end for partitions #if OMP //merge histo if(output_histo) { for (int cc=1; cc<10001; cc++) { uint64_t sum_omp = 0; for(int ii=0;ii<nb_threads;ii++) { sum_omp += histo_count_omp[ii][cc]; } histo_count[cc] = sum_omp; } } #endif #if !SINGLE_BAR if (verbose == 0 && nb_threads == 1) progress.finish(); else if (verbose == 0 && nb_threads > 1 ) progress.finish_threaded(); #endif if (verbose) fprintf(stderr,"\n"); if (verbose >= 2) { STOPWALL(debtri,"Reading and sorting partitions"); STOPWALL(debpass,"Pass total"); } //printf("Done writing kmers for all K check 4 \n"); if(use_compressed_reads) binread->close(); //delete for (uint32_t p=0;p<nb_partitions;p++) { delete redundant_partitions_file[p] ; } } //printf("Done writing kmers for all K check 5 \n"); //single bar #if SINGLE_BAR if (verbose == 0 && nb_threads == 1) progress.finish(); else if (verbose == 0 
&& nb_threads > 1 ) progress.finish_threaded(); #endif if(output_histo) { FILE * histo_file = fopen(return_file_name(histo_file_name),"w"); for (int cc=1; cc<10001; cc++) { fprintf(histo_file,"%i\t%llu\n",cc,(unsigned long long)(histo_count[cc])); } fclose(histo_file); } free(histo_count); NbSolid = NbSolid_omp[0]; #if OMP NbSolid=0; for(int ii=0;ii<nb_threads;ii++) { NbSolid += NbSolid_omp[ii]; } #endif for ( int s=0;s<totalKmers;s++) SolidKmers[s]->close(); printf("\nSaved %lld solid kmers\n",(long long)NbSolid); rmdir(temp_dir); STOPWALL(count,"Counted kmers"); fprintf(stderr,"\n------------------ Counted kmers and kept those with abundance >=%i, \n",nks); }
/**
 * Build a fine segmentation of 'seq' against 'germline'.
 *
 * Steps, in order:
 *  - allocate the three AlignBox objects and reset the result fields;
 *  - when the germline uses SEG_METHOD_MAX12 or SEG_METHOD_MAX1U, run a
 *    KmerSegmenter first: stop if it does not segment the sequence, record an
 *    "Unexpected ..." code otherwise, then stop (MAX1U) or let the germline
 *    override its rep_5/rep_3 from the two affectations (MAX12);
 *  - pick the strand with a KmerSegmenter, align the (possibly
 *    reverse-complemented) sequence against germline->rep_5 and
 *    germline->rep_3, and derive the left/right e-values;
 *  - if checkLeftRightEvaluesThreshold() left a cause in 'because', report the
 *    positions in 'info' and stop; otherwise mark the sequence as segmented,
 *    resolve the overlap between the two boxes and call finishSegmentation().
 *
 * @param seq         sequence to segment (its label and sequence are copied)
 * @param germline    germline providing seg_method, index, rep_5 and rep_3
 * @param segment_c   cost used for the alignments and the p-value conversion
 * @param threshold   forwarded to checkLeftRightEvaluesThreshold()
 * @param multiplier  factor applied to both e-values
 */
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
                             double threshold, double multiplier)
{
  // One alignment box per region
  box_V = new AlignBox("5");
  box_D = new AlignBox("4");
  box_J = new AlignBox("3");

  // Initial state: nothing processed yet
  segmented = false;
  dSegmented = false;
  because = NOT_PROCESSED ;
  segmented_germline = germline ;
  info_extra = "" ;
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost = segment_c;
  evalue = NO_LIMIT_VALUE;
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
  box_V->marked_pos = 0;
  box_J->marked_pos = 0;
  CDR3start = -1;
  CDR3end = -1;
  JUNCTIONstart = -1;
  JUNCTIONend = -1;

  bool reverse_V = false ;
  bool reverse_J = false ;

  const bool kmer_precheck = (germline->seg_method == SEG_METHOD_MAX12)
    || (germline->seg_method == SEG_METHOD_MAX1U);

  if (kmer_precheck)
    {
      // We check whether this sequence is segmented with MAX12 or MAX1U
      // (with default e-value parameters)
      KmerSegmenter *precheck = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);

      if (!precheck->isSegmented())
        {
          // Nothing to refine
          delete precheck ;
          return ;
        }

      reversed = precheck->isReverse();

      // On the reverse strand, the two affectations are swapped and rebuilt
      // through the two-argument KmerAffect constructor
      KmerAffect left = reversed ? KmerAffect(precheck->after, true) : precheck->before ;
      KmerAffect right = reversed ? KmerAffect(precheck->before, true) : precheck->after ;

      delete precheck ;

      reverse_V = (left.getStrand() == -1);
      reverse_J = (right.getStrand() == -1);

      code = "Unexpected ";
      code += left.toStringSigns() + germline->index->getLabel(left).basename;
      code += "/";
      code += right.toStringSigns() + germline->index->getLabel(right).basename;
      info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";

      // MAX1U stops here, MAX12 goes on with the repertoires chosen from the labels
      if (germline->seg_method == SEG_METHOD_MAX1U)
        return ;

      germline->override_rep5_rep3_from_labels(left, right);
    }

  // Strand determination, with KmerSegmenter (with default e-value parameters)
  // Note that we use only the 'strand' component
  // When the KmerSegmenter fails, continue with positive strand
  // TODO: flag to force a strand / to test both strands ?
  KmerSegmenter *strand_probe = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = strand_probe->isReverse();
  delete strand_probe ;

  sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed

  /* Segmentation */
  align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID,
                           reverse_V, reverse_V, false, box_V, segment_cost);
  align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID,
                           reverse_J, !reverse_J, false, box_J, segment_cost);

  // J was run with '!reverseJ', we copy the box informations from right to left
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end ;
  box_J->del_left = box_J->del_right;

  /* E-values */
  // Common factor of both e-values: multiplier times the read length
  const double scale = multiplier * sequence.size();
  evalue_left = scale * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = scale * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
  evalue = evalue_left + evalue_right ;

  /* Unsegmentation causes */
  if (box_V->end == (int) string::npos)
    evalue_left = BAD_EVALUE ;

  if (box_J->start == (int) string::npos)
    evalue_right = BAD_EVALUE ;

  const int strand = reversed ? -1 : 1;
  checkLeftRightEvaluesThreshold(threshold, strand);

  if (because != NOT_PROCESSED)
    {
      // A cause was recorded: report where both sides ended and give up
      segmented = false;
      const string left_pos = string_of_int(box_V->end + FIRST_POS);
      const string right_pos = string_of_int(box_J->start + FIRST_POS);
      info = " @" + left_pos + " @" + right_pos ;
      return ;
    }

  /* The sequence is segmented */
  segmented = true ;
  if (reversed)
    because = SEG_MINUS ;
  else
    because = SEG_PLUS ;

  // overlap VJ
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
                                    box_V, box_J, segment_cost);

  // Reset extreme positions
  box_V->start = 0;
  box_J->end = sequence.length()-1;

  // Why could this happen ?
  if (box_J->start >= (int) sequence.length())
    box_J->start = sequence.length()-1;

  // seg_N will be recomputed in finishSegmentation()

  boxes.clear();
  boxes.push_back(box_V);
  boxes.push_back(box_J);
  code = codeFromBoxes(boxes, sequence_or_rc);
  info = posFromBoxes(boxes);

  finishSegmentation();
}
void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id, bool reverse_ref, bool reverse_both, bool local, AlignBox *box, Cost segment_cost) { int best_score = MINUS_INF ; box->ref_nb = MINUS_INF ; int best_best_i = (int) string::npos ; int best_best_j = (int) string::npos ; int best_first_i = (int) string::npos ; int best_first_j = (int) string::npos ; vector<pair<int, int> > score_r; DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions; if (local==true) dpMode = DynProg::Local; // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence string sequence_or_rc = revcomp(read, reverse_ref); for (int r = 0 ; r < rep.size() ; r++) { if (r == forbidden_rep_id) continue; DynProg dp = DynProg(sequence_or_rc, rep.sequence(r), dpMode, // DynProg::SemiGlobalTrans, segment_cost, // DNA reverse_both, reverse_both, rep.read(r).marked_pos); bool onlyBottomTriangle = !local ; int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT); if (local==true){ dp.backtrack(); } if (score > best_score) { best_score = score ; best_best_i = dp.best_i ; best_best_j = dp.best_j ; best_first_i = dp.first_i ; best_first_j = dp.first_j ; box->ref_nb = r ; box->ref_label = rep.label(r) ; if (!local) dp.backtrack(); box->marked_pos = dp.marked_pos_i ; } score_r.push_back(make_pair(score, r)); // #define DEBUG_SEGMENT #ifdef DEBUG_SEGMENT cout << rep.label(r) << " " << score << " " << dp.best_i << endl ; #endif } sort(score_r.begin(),score_r.end(),comp_pair); box->ref = rep.sequence(box->ref_nb); box->del_right = reverse_both ? 
best_best_j : box->ref.size() - best_best_j - 1; box->del_left = best_first_j; box->start = best_first_i; box->score = score_r; #ifdef DEBUG_SEGMENT cout << "best: " << box->ref_label << " " << best_score ; cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl; cout << endl; #endif if (reverse_ref) // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ; best_best_i = read.length() - best_best_i - 1 ; box->end = best_best_i ; }