/*
 * Read records from a (possibly gzip-compressed) fastq/fasta file, trim and
 * filter each one, rename it, and write the survivors to a pipe.
 *
 * Four lines are read per record (header, sequence, extra, quality). A record
 * is considered only when `split` is 0 or the character following the first
 * '/' in its header equals `split`. Each considered record bumps *org and then:
 *   - trimmer() on the quality line gives the length to keep; if that is below
 *     min_len a placeholder line ("\n") is written instead of the read;
 *   - if entropy_cutoff is non-negative and entropy_calc() on the kept part of
 *     the sequence is below it, *comp_cnt is bumped and a placeholder is written;
 *   - otherwise the sequence (and, unless fmt_fasta, the quality) is truncated
 *     to that length, the header is rewritten, and the record is emitted.
 * (min_len, entropy_cutoff and colorspace_flag are defined outside this
 * function.)
 *
 * Parameters:
 *   fq_fn      - input file name; ".gz" is appended to it in place when the
 *                plain name cannot be opened.
 *   pipe_fd    - file descriptor wrapped with fdopen(..., "w") for output; the
 *                resulting stream is closed before returning.
 *   out_prefix - name prefix used in the new header when no_pre is 0.
 *   no_pre     - non-zero: keep the original name part of the header (the text
 *                after the first '|', or after the leading 1 / 4 characters
 *                for non-colorspace / colorspace input) up to the first ':'.
 *   len_pre    - non-zero: include the kept length in the new header.
 *   comp_cnt   - incremented for every read rejected by the entropy filter.
 *   org        - incremented for every read considered.
 *   split      - 0, or the character that must follow '/' in the header.
 *   fmt_fasta  - non-zero: headers start with '>' and only header + sequence
 *                are written.
 *
 * Returns the number of reads written, or 0 when the output stream or the
 * input file cannot be opened or fq_fn is empty.
 */
unsigned long int fq_stream_trimmer(UT_string *fq_fn, int pipe_fd, UT_string *out_prefix, int no_pre, int len_pre, unsigned long int *comp_cnt, unsigned long int *org, char split, int fmt_fasta) {

    unsigned long int cnt = 0;

    char *start = NULL;
    char *end = NULL;
    char *suffix = NULL;

    // NOTE(review): gzopen()/gzclose() work with gzFile; the handle is kept as
    // FILE * here because that is what this function has always declared --
    // confirm against ss_gzget_utstring()'s prototype.
    FILE *fq_file = NULL;

    FILE *pipe_in = fdopen(pipe_fd, "w");

    // Without an output stream nothing can be written (and fclose(NULL) is
    // not allowed), so give up immediately.
    if (!pipe_in) {
        return(0);
    }

    if (!(utstring_len(fq_fn))) {
        fclose(pipe_in);
        return(0);
    }

    // Try to open the fastq file
    if (!(fq_file = gzopen(utstring_body(fq_fn), "r"))) {
        // Retry with a ".gz" suffix appended to the caller's file name
        utstring_printf(fq_fn, ".gz");
        if (!(fq_file = gzopen(utstring_body(fq_fn), "r"))) {
            fclose(pipe_in);
            return(0);
        }
    }

    // The line buffers are allocated only once every early exit is behind us,
    // so none of the returns above can leak them.
    UT_string *new_head_data;
    utstring_new(new_head_data);
    UT_string *head_data;
    utstring_new(head_data);
    UT_string *seq_data;
    utstring_new(seq_data);
    UT_string *extra_data;
    utstring_new(extra_data);
    UT_string *qual_data;
    utstring_new(qual_data);

    int x = 0;

    char head_char = '@';

    if (fmt_fasta) {
        head_char = '>';
    }

    while (ss_gzget_utstring(fq_file, head_data)) {

        ss_gzget_utstring(fq_file, seq_data);
        ss_gzget_utstring(fq_file, extra_data);
        ss_gzget_utstring(fq_file, qual_data);

        if (!split || ((suffix = strchr(utstring_body(head_data), '/')) && (suffix[1] == split))) {

            (*org)++;

            if ((x = trimmer(utstring_body(qual_data))) >= min_len) { // Keep at least some of read

                // Reject read if complexity is too low
                if ((entropy_cutoff < 0.0) || (entropy_calc(utstring_body(seq_data), x) >= entropy_cutoff)) {

                    // The renamed header is built around the first ':' of the
                    // original one. A header without ':' cannot be renamed, so
                    // it is replaced by the same placeholder used for filtered
                    // reads (keeps read1 and read2 in sync) instead of
                    // dereferencing a NULL pointer below.
                    end = strchr(utstring_body(head_data), ':');

                    if (!end) {
                        fputs("\n", pipe_in);
                        continue;
                    }

                    // Truncate sequence
                    ss_trim_utstring(seq_data, x);
                    ss_strcat_utstring(seq_data, "\n");

                    if (!fmt_fasta) {
                        ss_trim_utstring(qual_data, x);
                        ss_strcat_utstring(qual_data, "\n");
                    }

                    // Fixup the read name
                    utstring_clear(new_head_data);

                    if (no_pre) {
                        if ((start = strchr(utstring_body(head_data), '|'))) {
                            start++;
                        } else {
                            if (colorspace_flag) {
                                start = utstring_body(head_data) + 4;
                            } else {
                                start = utstring_body(head_data) + 1;
                            }
                        }
                        // Terminate the name part at the colon so that `start`
                        // prints only the text before it
                        *end = '\0';
                    } else {
                        start = utstring_body(out_prefix);
                    }

                    // Everything after the colon is copied into the new header
                    end++;

                    if (colorspace_flag) {
                        // Colorspace headers keep the two characters that
                        // follow the leading header character
                        if (len_pre) {
                            utstring_printf(new_head_data, "%c%.2s+%u|%s:%s",head_char,utstring_body(head_data)+1,(unsigned int)x,start,end);
                        } else {
                            utstring_printf(new_head_data, "%c%.2s+%s:%s",head_char,utstring_body(head_data)+1,start,end);
                        }
                    } else {
                        if (len_pre) {
                            utstring_printf(new_head_data, "%c%u|%s:%s",head_char,(unsigned int)x,start,end);
                        } else {
                            utstring_printf(new_head_data, "%c%s:%s",head_char,start,end);
                        }
                    }

                    fputs(utstring_body(new_head_data), pipe_in);
                    fputs(utstring_body(seq_data), pipe_in);

                    if (!fmt_fasta) {
                        fputs(utstring_body(extra_data), pipe_in);
                        fputs(utstring_body(qual_data), pipe_in);
                    }

                    cnt++;

                } else {
                    // rejected by entropy filter
                    // Send along placeholder read to be discarded, keeping read1 and read2 in sync
                    // Empty fastq header is a read to be rejected by consumer threads
                    (*comp_cnt)++;
                    fputs("\n", pipe_in);
                }
            } else {
                // rejected by minimum length cutoff
                // Send along placeholder read to be discarded, keeping read1 and read2 in sync
                // Empty fastq header is a read to be rejected by consumer threads
                fputs("\n", pipe_in);
            }
        }
    }

    fclose(pipe_in);

    gzclose(fq_file);

    utstring_free(new_head_data);
    utstring_free(head_data);
    utstring_free(seq_data);
    utstring_free(extra_data);
    utstring_free(qual_data);

    return(cnt);
}
int main(int argc, char* argv[]) { Getopt getopt; getopt.addToHelp(" <input file>"); if (getopt.processOpts(argc, argv)) { getopt.showHelp(std::cout); exit(1); } if(argc - getopt.first_non_opt() != 1) { getopt.showHelp(std::cout); exit(1); } const char* filename = argv[getopt.first_non_opt()]; // read data, create System pointer printf("reading input file %s\n", filename); LinAlg::System::ptr genes; try { Cq::CqFile cqfile(filename); genes = Cq::read_data(cqfile); } catch(std::exception& e) { printf("%s\n", e.what()); return 1; } Stat::PTree tree; printf("adding vectors to tree\n"); tree.add_vectors(*genes); printf("calculating probabilities\n"); tree.calc_probs(); // printf("running show_probs\n"); // tree.show_probs(stdout); double total_prob = 0.0; double entropy = 0.0; for(size_t i = 0; i < genes->n_vectors(); ++i) { const LinAlg::ConstVector cv(i, *genes); const double prob = tree.vector_prob(cv); total_prob += prob; entropy += prob * log(prob) * -1.0; } printf("total prob=%f\n", total_prob); printf("exp(entropy)=%f\n", exp(entropy)); printf("quitting\n"); return 0; #if 0 const size_t nSymbols = LinAlg::count_letters(*genes); Stat::EntropyCalculator entropy_calc(nSymbols, genes); const double entropy = entropy_calc.entropy(); printf("entropy for %d genes: %f\n", (int)genes->n_vectors(), entropy); const double should_be_one = entropy_calc.prob_check(); printf("should be 1 => %f\n", should_be_one); return 0; #endif }