Example #1
0
static void mem_collect_intv(const SalmonOpts& sopt, const mem_opt_t *opt, SalmonIndex* sidx, int len, const uint8_t *seq, smem_aux_t *a)
{
    const bwt_t* bwt = sidx->bwaIndex()->bwt;
    int i, k, x = 0, old_n;
    int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;
    int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
    a->mem.n = 0;

    // first pass: find all SMEMs
    if (sidx->hasAuxKmerIndex()) {
        KmerIntervalMap& auxIdx = sidx->auxIndex();
        uint32_t klen = auxIdx.k();
        while (x < len) {
            if (seq[x] < 4) {
                // Make sure there are at least k bases left
                if (len - x < klen) { x = len; continue; }
                // search for this key in the auxiliary index
                KmerKey kmer(const_cast<uint8_t*>(&(seq[x])), klen);
                auto it = auxIdx.find(kmer);
                // if we can't find it, move to the next key
                if (it == auxIdx.end()) { ++x; continue; }
                // otherwise, start the search using the initial interval @it->second from the hash
                int xb = x;
                x = bwautils::bwt_smem1_with_kmer(bwt, len, seq, x, start_width, it->second, &a->mem1, a->tmpv);
                for (i = 0; i < a->mem1.n; ++i) {
                    bwtintv_t *p = &a->mem1.a[i];
                    int slen = (uint32_t)p->info - (p->info>>32); // seed length
                    if (slen >= opt->min_seed_len)
                        kv_push(bwtintv_t, a->mem, *p);
                }
            } else ++x;
        }
Example #2
0
string Bayesian::getTaxonomy(Sequence* seq) {
	try {
		string tax = "";
		Kmer kmer(kmerSize);
		flipped = false;
		
		//get words contained in query
		//getKmerString returns a string where the index in the string is hte kmer number 
		//and the character at that index can be converted to be the number of times that kmer was seen
		string queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
		
		vector<int> queryKmers;
		for (int i = 0; i < queryKmerString.length()-1; i++) {	// the -1 is to ignore any kmer with an N in it
			if (queryKmerString[i] != '!') { //this kmer is in the query
				queryKmers.push_back(i);
			}
		}
		
		//if user wants to test reverse compliment and its reversed use that instead
		if (flip) {	
			if (isReversed(queryKmers)) { 
				flipped = true;
				seq->reverseComplement(); 
				queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
				queryKmers.clear();
				for (int i = 0; i < queryKmerString.length()-1; i++) {	// the -1 is to ignore any kmer with an N in it
					if (queryKmerString[i] != '!') { //this kmer is in the query
						queryKmers.push_back(i);
					}
				}
			}  
		}
		
		if (queryKmers.size() == 0) {  m->mothurOut(seq->getName() + " is bad. It has no kmers of length " + toString(kmerSize) + "."); m->mothurOutEndLine(); simpleTax = "unknown;";  return "unknown;"; }
		
		
		int index = getMostProbableTaxonomy(queryKmers);
		
		if (m->control_pressed) { return tax; }
					
		//bootstrap - to set confidenceScore
		int numToSelect = queryKmers.size() / 8;
	
        if (m->debug) {  m->mothurOut(seq->getName() + "\t"); }
        
		tax = bootstrapResults(queryKmers, index, numToSelect);
        
        if (m->debug) {  m->mothurOut("\n"); }
		
		return tax;	
	}
	catch(exception& e) {
		m->errorOut(e, "Bayesian", "getTaxonomy");
		exit(1);
	}
}
Example #3
0
int main(int argc, char** argv) {
  Options opt(argc, argv);

  kmers_map_type kmers_map;

  MMAPReads readfile(opt.readFName);

#pragma omp parallel
  {
#pragma omp single
    {
      std::cout << "Using " << omp_get_num_threads() << " threads" << std::endl;
    }

#pragma omp for
    for(int read_index = opt.read_st; read_index < opt.read_ed; read_index++) {
      bool orig = readfile.isOrig(read_index);
      inner_map_type occ_map;
      std::tr1::shared_ptr<DNASeq> cur_read(new DNASeq(readfile[read_index]));

      for (int pos = 0; pos <= cur_read->size() - opt.K; pos++) {
        Kmer kmer(cur_read, pos, opt.K);
        if (occ_map.find(kmer) == occ_map.end() || !orig) {
          occ_map[kmer] = KmerOccurrence(read_index, pos);
        }
      }

      for(inner_map_type::iterator it = occ_map.begin(); it != occ_map.end(); it++) {
        kmers_map_type::const_accessor ac;
        if (kmers_map.find(ac, it->first)) {
          if (ac->second.size() > KmerHashMap::MaxBinSize) {
            continue;
          }
        } else {
          kmers_map.insert(ac, it->first);
        }
        ac.release();

        kmers_map_type::accessor acw;
        if (kmers_map.find(acw, it->first)) {
          acw->second.push_back(it->second);
        } else {
          std::cout << "ERROR: read is not found" << std::endl; 
        }
        acw.release();
      }
    }
  }

  dump_hash(kmers_map, opt.fpre, opt.fsuf);

  return 0;
}
Example #4
0
//********************************************************************************************************************
/**
 * Fills in the reverseProb field of every WordPairDiffArr entry.
 *
 * For each word number w, reverseProb is copied from the prob field of the
 * entry whose index is kmer.getReverseKmerNumber(w).
 *
 * @return 0 on completion. Any std::exception is reported through
 *         m->errorOut and terminates the process with exit(1).
 */
int Bayesian::generateWordPairDiffArr(){
	try{
		Kmer kmer(kmerSize);

		int word = 0;
		while (word < WordPairDiffArr.size()) {
			const int mirrored = kmer.getReverseKmerNumber(word);
			WordPairDiffArr[word].reverseProb = WordPairDiffArr[mirrored].prob;
			++word;
		}
		
		return 0;
	}catch(exception& e) {
		m->errorOut(e, "Bayesian", "generateWordPairDiffArr");
		exit(1);
	}
}
Example #5
0
// Converts a 2-bit-per-base packed k-mer back into its text form.
//
// The lowest two bits of intval encode the LAST base of the k-mer, so the
// output string is filled from its final position towards the front while
// intval is shifted right two bits per base. Each 2-bit code is translated
// through the _int_to_base lookup.
//
//   intval       packed k-mer value
//   kmer_length  number of bases to decode
//   returns      a string of exactly kmer_length characters
string decode_kmer_from_intval(kmer_int_type_t intval, unsigned int kmer_length) {

  string decoded(kmer_length, ' ');

  for (unsigned int pos = kmer_length; pos > 0; --pos) {
	const int code = intval & 3ll;   // two lowest bits = current base
	decoded[pos - 1] = _int_to_base[code];
	intval = intval >> 2;            // move on to the preceding base
  }

  return(decoded);
}
Example #6
0
/*
 * Reads an integer k and a text from "rosalind_ba3a.txt", echoes both, then
 * prints every k-mer of the text, one per line, as produced by kmer().
 *
 * Returns 0 on success and 1 when the file cannot be opened or parsed, when
 * k is not in the range 1..strlen(text), or when kmer() yields NULL.
 */
int main(void)
{
    FILE *fp = fopen("rosalind_ba3a.txt", "r");
    if (fp == NULL)
    {
        printf("File open failed!");
        return 1;
    }

    int k = 0;
    char txt[TXTLEN];

    /* Both fields must parse, otherwise k / txt would be used uninitialised.
     * The %9s width limits the read to 9 characters plus the terminator;
     * this assumes TXTLEN >= 10 - TODO confirm against its definition. */
    int parsed = fscanf(fp, "%d", &k);
    if (parsed == 1)
    {
        parsed = fscanf(fp, "%9s", txt);
    }

    /* The file is not needed past this point, so close it before any of the
     * early returns below can leak the handle. */
    if (fclose(fp) != 0)
    {
        printf("File close failed!");
    }

    if (parsed != 1)
    {
        printf("File read failed!");
        return 1;
    }

    printf("%d %s\n", k, txt);

    /* strlen() is unsigned: computing len - k + 1 directly would wrap around
     * to a huge count when k exceeds the text length, so validate k first. */
    size_t len = strlen(txt);
    if (k <= 0 || (size_t)k > len)
    {
        printf("Invalid k!");
        return 1;
    }
    size_t count = len - (size_t)k + 1;

    /* kmer() is defined elsewhere; guard in case it reports failure as NULL. */
    char **kmers = kmer(k, txt);
    if (kmers == NULL)
    {
        printf("K-mer generation failed!");
        return 1;
    }

    for (size_t i = 0; i < count; i++)
    {
        printf("%s\n", kmers[i]);
    }

    for (size_t i = 0; i < count; i++)
    {
        free(kmers[i]);
    }
    free(kmers);

    return 0;
}
 // Test body: builds a succinct de Bruijn graph (k = 4, alphabet "$ACGT") from
 // the 4-mers of "TACGTCGACGACT" and checks its navigation queries against
 // hand-computed values, then cross-checks it against debruijn_basic built
 // from the same k-mers.
 //
 // The k-mer list is produced by sliding a window that starts as "$$$$" over
 // the text one character at a time, plus one final shift that appends '$'.
 void operator() ()
 {
     section("simple debruijn test based on the example data in the paper");
     std::string test = "TACGTCGACGACT";
     std::string alphabet = "$ACGT";
     std::vector<std::string> kmers;
     std::string kmer("$$$$");
     
     for (const char * cursor = test.c_str(); *cursor; ++cursor)
     {
         // shift kmer to the left by one
         kmer.erase(0,1);
         
         // append the next nucleotide
         kmer.push_back(*cursor);
         
         // and store
         kmers.push_back(kmer);
     }
     
     // and shift it once more and add the $
     kmer.erase(0,1);
     kmer.push_back('$');
     kmers.push_back(kmer);
     
     // build the graph!
     debruijn_succinct db(4,alphabet,kmers.begin(),kmers.end());
             
     // and .. test it
     // (each label names the call actually being checked, so a failure
     //  report points at the right query)
     check_equal(5UL, db.forward(2), "forward(2)");
     check_equal(2UL, db.backward(5), "backward(5)");
     check_equal(7UL, db.backward(1), "backward(1)");
     check_equal(3UL, db.backward(7), "backward(7)");
     check_equal(db.no_node, db.backward(0), "backward(0)");
     
     check_equal(2, db.outdegree(6), "outdegree(6)");
     check_equal(1, db.outdegree(0), "outdegree(0)");
     check_equal(12UL, db.forward(8), "forward(8)");
     check_equal(10UL, db.outgoing(6,'T'), "outgoing(6,T)");

     // expected forward() result for each of the 13 edge indices; the
     // trailing comment on each row shows "index kmer -> successor kmer"
     const unsigned long forward_expected[13] = 
     {
         10, //0 $$$T -> $$TA
         4,  //1 CGAC -> GACT
         5,  //2 $TAC -> TACG-
         8,  //3 GACG -> ACGT
         11, //4 GACT -> ACT$
         
         8,  //5 TACG- -> ACGT
         9,  //6 GTCG -> TCGA-
         1,  //7 ACGA -> CGAC   
         12, //8 ACGT -> CGTC
         1,  //9 TCGA -> CGAC
         2, //10 $$TA -> $TAC
         (unsigned long)-1,//11 ACT$ -> -1
         6, //12 CGTC -> GTCG
     };
     for (int edge = 0; edge < 13; edge++)
         check_equal(forward_expected[edge], db.forward(edge), "forward(i)");

     // node labels (the k-1 = 3 character prefixes), one per node
     check_equal(std::string("$$$"), db.label(0), "label(0)");
     check_equal(std::string("CGA"), db.label(1), "label(1)");
     check_equal(std::string("$TA"), db.label(2), "label(2)");
     check_equal(std::string("GAC"), db.label(3), "label(3)");
     check_equal(std::string("TAC"), db.label(4), "label(4)");
     check_equal(std::string("GTC"), db.label(5), "label(5)");
     check_equal(std::string("ACG"), db.label(6), "label(6)");
     check_equal(std::string("TCG"), db.label(7), "label(7)");
     check_equal(std::string("$$T"), db.label(8), "label(8)");
     check_equal(std::string("ACT"), db.label(9), "label(9)");
     check_equal(std::string("CGT"), db.label(10), "label(10)");
     
     check_equal(0,db.indegree(0), "indegree(0)");
     check_equal(2,db.indegree(1), "indegree(1)");
     check_equal(2,db.indegree(6), "indegree(6)");
     check_equal(1,db.indegree(2), "indegree(2)");
     
     check_equal(4UL,db.incoming(6,'T'), "incoming(6,T)");
     
     check_equal(11UL, db.num_nodes(), "num_nodes");
     
     section("test succinct against basic debruijn");
     debruijn_basic db_basic(4,kmers.begin(),kmers.end());
     debruijn_comparer dc(db_basic,db,alphabet);
     dc.run(this);
    
 }