Ejemplo n.º 1
0
unsigned long int fq_stream_trimmer(UT_string *fq_fn, int pipe_fd, UT_string *out_prefix, int no_pre, int len_pre, unsigned long int *comp_cnt, unsigned long int *org, char split, int fmt_fasta) {
    
    UT_string *new_head_data;
    utstring_new(new_head_data);
    UT_string *head_data;
    utstring_new(head_data);
    UT_string *seq_data;
    utstring_new(seq_data);
    UT_string *extra_data;
    utstring_new(extra_data);
    UT_string *qual_data;
    utstring_new(qual_data);
    
    unsigned long int cnt = 0;
    
    char *start = NULL; 
    char *end = NULL;
    char *suffix = NULL;
    
    FILE *fq_file = NULL;

    FILE *pipe_in = fdopen(pipe_fd, "w");
    
    if (!(utstring_len(fq_fn))) {
        fclose(pipe_in);
        return(0);
    }
    
    // Try to open the fastq file
    if (!(fq_file = gzopen(utstring_body(fq_fn), "r"))) {
        utstring_printf(fq_fn, ".gz");
        if (!(fq_file = gzopen(utstring_body(fq_fn), "r"))) {
            fclose(pipe_in);
            return(0);
        }
    } 
    
    int x = 0;
    
    char head_char = '@';
    
    if (fmt_fasta) {
        head_char = '>';
    }
    
    while (ss_gzget_utstring(fq_file, head_data)) {
        
        ss_gzget_utstring(fq_file, seq_data);
        ss_gzget_utstring(fq_file, extra_data);
        ss_gzget_utstring(fq_file, qual_data);
        
        if (!split || ((suffix = strchr(utstring_body(head_data), '/')) && (suffix[1] == split))) {

            (*org)++;
            
            if ((x = trimmer(utstring_body(qual_data))) >= min_len) {  // Keep at least some of read
                
                // Reject read if complexity is too low
                if ((entropy_cutoff < 0.0) || (entropy_calc(utstring_body(seq_data), x) >= entropy_cutoff)) {
                    
                    // Truncate sequence
                    
                    ss_trim_utstring(seq_data, x);
                    ss_strcat_utstring(seq_data, "\n");
                    
                    if (!fmt_fasta) {
                        ss_trim_utstring(qual_data, x);
                        ss_strcat_utstring(qual_data, "\n");
                    }
                    
                    // Fixup the read name
                    utstring_clear(new_head_data);
                    
                    end = strchr(utstring_body(head_data), ':');
                    
                    if (no_pre) {
                        if ((start = strchr(utstring_body(head_data), '|'))) {
                            start++;
                        } else {
                            if (colorspace_flag) {
                                start = utstring_body(head_data) + 4;
                            } else {
                                start = utstring_body(head_data) + 1;
                            }
                        }
                        *end = '\0';
                    } else {
                        start = utstring_body(out_prefix);
                    }
                    
                    end++;
                    
                    if (colorspace_flag) {
                        if (len_pre) {
                            utstring_printf(new_head_data, "%c%.2s+%u|%s:%s",head_char,utstring_body(head_data)+1,x,start,end);
                        } else {
                            utstring_printf(new_head_data, "%c%.2s+%s:%s",head_char,utstring_body(head_data)+1,start,end);
                        }
                    } else {
                        if (len_pre) {
                            utstring_printf(new_head_data, "%c%u|%s:%s",head_char,x,start,end);
                        } else {
                            utstring_printf(new_head_data, "%c%s:%s",head_char,start,end);
                        }
                    }
                    
                    fputs(utstring_body(new_head_data), pipe_in);
                    fputs(utstring_body(seq_data), pipe_in);
                    
                    if (!fmt_fasta) {
                        fputs(utstring_body(extra_data), pipe_in);
                        fputs(utstring_body(qual_data), pipe_in);
                    }
                
                    cnt++;
                    
                } else {
                    // rejected by entropy filter
                    // Send along placeholder read to be discarded, keeping read1 and read2 in sync
                    // Empty fastq header is a read to be rejected by consumer threads
                    (*comp_cnt)++;
                    fputs("\n", pipe_in);
                }
            } else {
                // rejected by minimum length cutoff
                // Send along placeholder read to be discarded, keeping read1 and read2 in sync
                // Empty fastq header is a read to be rejected by consumer threads
                fputs("\n", pipe_in);
            }
        }
    }
    
    fclose(pipe_in);
    
    gzclose(fq_file);
    
    utstring_free(new_head_data);
    utstring_free(head_data);
    utstring_free(seq_data);
    utstring_free(extra_data);
    utstring_free(qual_data);

    return(cnt);
}
Ejemplo n.º 2
0
int main(int argc, char* argv[])
{
  Getopt getopt;
  getopt.addToHelp(" <input file>");
  if (getopt.processOpts(argc, argv)) {
    getopt.showHelp(std::cout);
    exit(1);
  }
  if(argc - getopt.first_non_opt() != 1) {
    getopt.showHelp(std::cout);
    exit(1);
  }

  const char* filename = argv[getopt.first_non_opt()];

  // read data, create System pointer
  printf("reading input file %s\n", filename);
  LinAlg::System::ptr genes;
  try {
    Cq::CqFile cqfile(filename);
    genes = Cq::read_data(cqfile);
  }
  catch(std::exception& e) {
    printf("%s\n", e.what());
    return 1;
  }

  Stat::PTree tree;
  printf("adding vectors to tree\n");
  tree.add_vectors(*genes);

  printf("calculating probabilities\n");
  tree.calc_probs();
  //  printf("running show_probs\n");
  //  tree.show_probs(stdout);

  double total_prob = 0.0;
  double entropy = 0.0;
  for(size_t i = 0; i < genes->n_vectors(); ++i) {
    const LinAlg::ConstVector cv(i, *genes);
    const double prob = tree.vector_prob(cv);
    total_prob += prob;
    entropy += prob * log(prob) * -1.0;
  }
  printf("total prob=%f\n", total_prob);
  printf("exp(entropy)=%f\n", exp(entropy));
  printf("quitting\n");
  return 0;

#if 0
  const size_t nSymbols = LinAlg::count_letters(*genes);
  Stat::EntropyCalculator entropy_calc(nSymbols, genes);

  const double entropy = entropy_calc.entropy();
  printf("entropy for %d genes: %f\n", (int)genes->n_vectors(), entropy);

  const double should_be_one = entropy_calc.prob_check();
  printf("should be 1 => %f\n", should_be_one);

  return 0;
#endif
}