示例#1
0
int main(int argc, char *argv[])
{
    if(argc <  3)
    {
        fprintf (stderr,"%s: [d]isk [s]treaming of [k]-mers (constant-memory k-mer counting)\n",argv[0]);
        fprintf (stderr,"usage:\n");
        fprintf (stderr," %s input_file kmer_size [-t min_abundance] [-m max_memory] [-d max_disk_space] [-o out_prefix] [-histo]\n",argv[0]);
        fprintf (stderr,"details:\n [-t min_abundance] filters out k-mers seen ( < min_abundance ) times, default: 1 (all kmers are returned)\n [-m max_memory] is in MB, default: min(total system memory / 2, 5 GB) \n [-d max_disk_space] is in MB, default: min(available disk space / 2, reads file size)\n [-o out_prefix] saves results in [out_prefix].solid_kmers. default out_prefix = basename(input_file)\n [-histo] outputs histogram of kmers abundance\n [-rev] outputs only one of forward or reverse complement k-mers\n Input file can be fasta, fastq, gzipped or not, or a file containing a list of file names.\n");
#ifdef SVN_REV
fprintf(stderr,"Running dsk version %s\n",STR(SVN_REV));
#endif
        return 0;
    }

    // reads file
    Bank *Reads = new Bank(argv[1]);

    if (argv[2][0] == '-')
    {
        printf("please specify a k value\n");
        exit(1);
    }
	/* Changes by Raunaq
	 * Code addition for taking in multiple values of k. The file containing should have values of k sorted in decreasing order  
	 *
	*/
	int *Kmerlist =  loadKmers(argv[2]);
	
    // kmer size
	//fprintf(stderr,"Smallest kmer is %d \n",smallestKmer);
    sizeKmer = Kmerlist[0];
    if (sizeKmer>(int)(sizeof(kmer_type)*4))
    {
        printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4);
        exit(1);
    }
    kmerMask=(((kmer_type)1)<<(sizeKmer*2))-1;

    // default solidity 
    nks = 1;

    // default max memory
    max_memory = 5*1024;
    #ifndef OSX
    struct sysinfo info;
    sysinfo(&info);
    int total_ram = (int)(((double)info.totalram*(double)info.mem_unit)/1024/1024);
    printf("Total RAM: %d MB\n",total_ram);
#else
    int total_ram = 128*1024;
#endif


    // default prefix is the reads file basename
    char *reads_path=strdup(argv[1]);
    string reads_name(basename(reads_path)); // posix basename() may alter reads_path
    free(reads_path);
    int lastindex = reads_name.find_last_of("."); 
    strcpy(prefix,reads_name.substr(0, lastindex).c_str()); 

    for (int n_a = 3; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"-t")==0)
            nks = atoi(argv[n_a+1]);

        if (strcmp(argv[n_a],"-o")==0)
            strcpy(prefix,argv[n_a+1]);
    }

    int verbose = 0;
    bool reverse = false;
    max_disk_space = 0;

    output_histo =false;
    // parse the remaining arguments: these will override the default max memory / max disk
    for (int n_a = 3; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"-m")==0)
            max_memory = atoi(argv[n_a+1]);

        if (strcmp(argv[n_a],"-d")==0)
            max_disk_space = atoi(argv[n_a+1]);

        if (strcmp(argv[n_a],"-v")==0)
            verbose = 1;

        if (strcmp(argv[n_a],"-vv")==0)
            verbose = 2;
        
        if (strcmp(argv[n_a],"-histo")==0)
            output_histo =true;
	
	if (strcmp(argv[n_a],"-rev")==0)
	    reverse = true;
    }

    if (max_memory > total_ram)
    {
        printf("Maximum memory (%d MB), exceeds total RAM (%d MB). Setting maximum memory to %d MB.\n",max_memory,total_ram,total_ram/2);
        max_memory = total_ram/2;
    }

    STARTWALL(0);

    sorting_count(Reads,prefix,max_memory,max_disk_space,true,verbose,reverse);

    STOPWALL(0,"Total");

    delete Reads;

    return 0;
}
int main(int argc, char *argv[])
{
    
    if(argc <  6)
    {
        fprintf (stderr,"usage:\n");
        fprintf (stderr," %s input_file kmer_size min_abundance estimated_genome_size prefix\n",argv[0]);
        fprintf (stderr,"hints:\n min_abundance ~ 3\n estimated_genome_size is in bp, does not need to be accurate, only controls memory usage\n prefix is any name you want the results to start with\n");

        return 1;
    }

    bool FOUR_BLOOM_VERSION = true;

     // shortcuts to go directly to assembly using serialized bloom and serialized hash
    int START_FROM_SOLID_KMERS=0; // if = 0, construct the fasta file of solid kmers, if = 1, start directly from that file 
    int LOAD_FALSE_POSITIVE_KMERS=0; // if = 0, construct the fasta file of false positive kmers (debloom), if = 1, load that file into the hashtable
    int NO_FALSE_POSITIVES_AT_ALL=0; // if = 0, normal behavior, if = 1, don't load false positives (will be a probabilistic de bruijn graph)
    int max_disk_space = 0;// let dsk decide
    for (int n_a = 6; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"--original") == 0)
    	    FOUR_BLOOM_VERSION = false;

        if (strcmp(argv[n_a],"--dont-count")==0)
            START_FROM_SOLID_KMERS = 1;

        if (strcmp(argv[n_a],"--dont-debloom")==0)
            LOAD_FALSE_POSITIVE_KMERS = 1;

        if (strcmp(argv[n_a],"--just-assemble")==0)
        {
            START_FROM_SOLID_KMERS = 1;
            LOAD_FALSE_POSITIVE_KMERS = 1;
        }

        if (strcmp(argv[n_a],"--titus-mode")==0)
            NO_FALSE_POSITIVES_AT_ALL = 1;
        
        
        if (strcmp(argv[n_a],"-d")==0)
            max_disk_space = atoi(argv[n_a+1]);
        
        
        if (strcmp(argv[n_a],"-maxc")==0)
	    max_couv = atoi(argv[n_a+1]);
        
        if (strcmp(argv[n_a],"--le-changement")==0)
            {printf("c'est maintenant!\n");exit(0);}
    }


    // kmer size
    sizeKmer=27; // let's make it even for now, because i havnt thought of how to handle palindromes (dont want to stop on them)
    if(argc >=  3)
    {
        sizeKmer = atoi(argv[2]);
        if (sizeKmer%2==0)
        {
            sizeKmer-=1;
            printf("Need odd kmer size to avoid palindromes. I've set kmer size to %d.\n",sizeKmer);
        }
        if (sizeKmer>((int)sizeof(kmer_type)*4))
        {
            printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4);
            exit(1);
        }
    }

    if (sizeKmer == (int)(sizeof(kmer_type)*4))
        kmerMask = -1;
    else
        kmerMask=(((kmer_type)1)<<(sizeKmer*2))-1;

    double lg2 = log(2);
   
    if (sizeKmer > 128)
    {
        FOUR_BLOOM_VERSION = false;
        printf("Reverted to single Bloom filter implementation for k>128\n");
    }

    if (!FOUR_BLOOM_VERSION) 
      NBITS_PER_KMER = log(16*sizeKmer*(lg2*lg2))/(lg2*lg2); // needed to process argv[5]
    else 
      NBITS_PER_KMER = rvalues[sizeKmer][1];

    // solidity 
    nks =NNKS;
    if(argc >=  4)
    {
        nks = atoi(argv[3]);
        if (nks==0) nks=1; // min abundance can't be 0
    }


   if(argc >=  5)
    {
       genome_size  = atoll(argv[4]);
      // int estimated_bloom_size = max( (int)ceilf(log2f(genome_size * NBITS_PER_KMER )), 1);
        uint64_t estimated_bloom_size = (uint64_t) (genome_size * NBITS_PER_KMER);

       uint64_t estimated_nb_FP =  (uint64_t)(genome_size * 4 * powf(0.6,11)); // just indicative
    
       //max_memory = max( (1LL << estimated_bloom_size)/8LL /1024LL/1024LL, 1LL );
        max_memory =  max((int64_t) estimated_bloom_size/8LL /1024LL/1024LL,1LL);

      printf("estimated values: nbits Bloom %lli, nb FP %lld, max memory %i MB\n",estimated_bloom_size,estimated_nb_FP,max_memory);

    }

    // output prefix
    if(argc >=  6)
    {
        strcpy(prefix,argv[5]);
    }

   


    fprintf (stderr,"taille cell %lu \n", sizeof(cell<kmer_type>));

    STARTWALL(0);

    Bank *Reads = new Bank(argv[1]);
    
    // counter kmers, write solid kmers to disk
    if (!START_FROM_SOLID_KMERS)
    {
        int verbose = 0;
        bool write_count = false;
        bool skip_binary_conversion = false;

        sorting_count(Reads,prefix,max_memory,max_disk_space,write_count,verbose, skip_binary_conversion);
    }

    // debloom, write false positives to disk, insert them into false_positives
    if (! LOAD_FALSE_POSITIVE_KMERS)
    {
        debloom(order, max_memory);
    }
    
    bloo1 = bloom_create_bloo1((BloomCpt *)NULL, false);

    if (! NO_FALSE_POSITIVES_AT_ALL)
    {
        // load false positives from disk into false_positives
        if (!FOUR_BLOOM_VERSION) 
            false_positives = load_false_positives();
	else
	    false_positives = load_false_positives_cascading4();
    }
    else
    {
        // titus mode: no FP's
        false_positives = dummy_false_positives();
    }

    //  return 1;
    assemble(); 

    STOPWALL(0,"Total");

    delete Reads;
    return 0;
}