void show_pairwise(FILE *finp, int i, int j) { // shows score between sequence i and j int dplus, drc; num_seqs = i>j?i:j; num_seqs++; read_sequences(finp,0,num_seqs); dplus = distpair(work,i,j,0); drc = distpair(work,i,j,1); fprintf(outf,"%d %d\n",dplus,drc); }
void compared2nummatches(FILE *finp, int opt) { // This is an undocumented feature, useful for // doing statistics. The current version of // num_matches does not necessarily compute the // tne number of matches unless the heuristic threshold // is set to 0 int i, j; int num_mat_pos, num_mat_rc, posd2,negd2,d2,h1,h2,count,sp,sr; int passh1=0, passh2=0, passd2=0,d2table[100],h1fh2p=0,step=1; if(opt==2) step=17; for(i=0;i<100;i++) d2table[i]=0; read_sequences(finp,0,num_seqs); for(i=0; i<num_seqs; i=i+1) { set_up_word_table(work,i); for(j=i+1; j<num_seqs; j=j+step) { sample_heuristic(work,i,j,&sp,&sr); tv_heuristic(work,i,j,&num_mat_pos,&num_mat_rc); if (seqInfo[i].len<window_len || seqInfo[j].len<window_len) { posd2=negd2=-1; } else { if(num_mat_pos>=0 || num_mat_rc >=0) { posd2 = d2pair(work,i,j,0); negd2 = d2pair(work,i,j,1); d2 = MIN(posd2,negd2); h1 = MAX(sp,sr); h2 = MAX(num_mat_pos,num_mat_rc); if (h1 > 1) passh1++; if (h2 >= NM_threshold) passh2++; if ((num_mat_pos>= NM_threshold)&&(sp<= 1)) h1fh2p++; if ((num_mat_rc >= NM_threshold)&&(sr<= 1)) h1fh2p++; if (d2 <= theta) { passd2++; //if (count<max_count) max_count=count; } d2table[d2/10]++; if ( (h2<7) && (d2<50)) fprintf(outf,"H2 FAIL: %d %d : h2=%d d2=%d\n",i,j,h2,d2); if ( (h1<2) && (d2<60)) fprintf(outf,"H1 FAIL: %d %d :h1=%d,h2=%d\n",i,j, h1, d2); if (opt==2) fprintf(outf,"%3d,%8d,%8d,%2d,%3d,%3d,%2d,%3d,%3d,%2d,%3d,%3d\n",i,j, count,sp,num_mat_pos,posd2,sr,num_mat_rc, negd2, MAX(sp,sr), MAX(num_mat_pos,num_mat_rc), MIN(posd2,negd2)); } } } clear_word_table(work,i); } fprintf(outf,"h1=%d; h2 %d; h1fh2p %d; d2 %d\n",passh1,passh2,h1fh2p,passd2); for(i=0; i<40; i++) { fprintf(outf,"%3d %10d\n",i,d2table[i]); } }
/**
 * Single-threaded worker loop of one pipeline stage.
 *
 * Blocks of reads are obtained either from the input file (rank 0) or from
 * the previous stage (every other rank).  Each read of a block is trimmed
 * (rank 0 only, if enabled), filtered, searched, and appended to
 * printable_solutions.  The finished block is then printed to the SAM output
 * by the last rank (my_rank == nprocs-1) or sent to the next stage by every
 * other rank.  The loop ends when a block with 0 reads is obtained.
 *
 * @param search  module instance providing configuration, indexes and I/O
 * @param id      passed through to receive_from_previous() / send_to_next()
 */
/* static */ void Module_DMAP::generic_worker_single_thr(Module_DMAP * search, int id) {
	Mask * sequences;
	int read_seq;
	vector<Mask> printable_solutions;
	Transmitting_Result received;

	if (search->my_rank == 0) {
		sequences = new Mask[SEQUENCES_FOR_BLOCK];
		read_seq = read_sequences(search->input_file1, SEQUENCES_FOR_BLOCK, sequences, search->fastqformat, search->gui_output);
		// The loop below is skipped when nothing was read; release the
		// buffer here, exactly as the refill at the end of the loop does.
		// Without this the first (empty) block was leaked.
		if (read_seq == 0)
			delete [] sequences;
	} else {
		received = search->receive_from_previous(id);
		sequences = received.first;
		read_seq = received.second;
		// NOTE(review): whether received.first owns memory when
		// received.second == 0 cannot be told from here -- confirm in
		// receive_from_previous().
	}

	while (read_seq != 0) {
		Items solutions;
		//ItemsGapped solutions_gapped;
		t_errors calculated_errors;

		for (int i = 0; i < read_seq; i++) {
			Mask & s = sequences[i];

			if (search->my_rank == 0 and search->trim) {
				// check_routine(sequences[i], 0);
				s.quality_trimming_MOTT(search->min_phred_value_MOTT,search->min_mean_quality,search->min_size);
			}

			// Discarded reads are only flagged and passed along.
			if (s.status_discarded()) {
				if (s.status_low_complexity())
					s.low_complexity = true; //s.set_type(low_complexity);
				else
					s.low_quality = true; //s.set_type(quality_discarded);
				printable_solutions.push_back(s);
				continue;
			}

			// Error budget: proportional to the usable length, or fixed.
			if (search->auto_errors)
				calculated_errors = round((double)s.get_good_length() / search->errors_rate);
			else
				calculated_errors = search->common_errors_allowed;

			// A later stage never accepts more errors than the alignment
			// already carried by the read.
			if (search->my_rank != 0 and s.algn > 0 and s.NM < calculated_errors)
				calculated_errors = s.NM;

			// Count N/n in the good region, stopping as soon as the budget
			// is exceeded.
			t_errors count = 0;
			for (t_pattern_length pos = s.get_good_region_start()-1; (pos < s.get_good_region_stop()) and (count <= calculated_errors); pos++)
				if (s.sequence[pos] == 'N' or s.sequence[pos] == 'n')
					count++;
			if (count > calculated_errors) {
				//s.set_type(alignments_not_found);
				printable_solutions.push_back(s);
				continue;
			}

			/** ALIGNMENT **/
			solutions.clear();
			if (search->my_rank == 0 and search->contamination_check) {
				search->CR.search(s.get_good_sequence(),solutions,calculated_errors);
				if (solutions.size() > 0)
					s.contaminated = true;
			}
			if (not s.contaminated)
				search->H.search(s.get_good_sequence(),solutions,calculated_errors);

			if (solutions.size() == 0) {
				/** Try gapped **/
				/*
				solutions_gapped.clear();
				if (search->gap)
					search->H.search_gapped(s.get_good_sequence(),solutions_gapped,search->seed_sizes,search->seed_errors,calculated_errors,search->max_gap);
				*/
				/* if (solutions_gapped.size() == 0) {// size 0 means no alignment found */
				//s.set_type(alignments_not_found);
				printable_solutions.push_back(s);
				continue;
				/*
				} else {
					if (not search->printAll) {
						Random_Choice_Result r;
						bool improved = (s.NM + s.NM_gap) > (solutions_gapped.at(0).errors1 + solutions_gapped.at(0).errors2);
						if (improved)
							r = Search_MPI::random_choice_from_previous(0,solutions_gapped.size());
						else
							r = Search_MPI::random_choice_from_previous(s.algn,solutions_gapped.size());
						if (not improved and r.first) {
							// take the previous solution
							s.algn += solutions_gapped.size();
						} else {
							// update solution
							const ResultItemGapped & HM = solutions_gapped.at(r.second);
							s.globalPosition = HM.GlobalPosition1;
							if (improved)
								s.algn = solutions_gapped.size();
							else
								s.algn += solutions_gapped.size();
							s.HI = 1;
							s.IH = 1;
							s.primary = true;
							s.strand = HM.strand;
							s.NM = HM.errors1;
							s.NM_gap = HM.errors2;
							s.contig = HM.contig;
							s.position = HM.GlobalPosition1 - search->H.globaltolocal.startPositions[HM.contig] + 1 ;
							s.position_gap = HM.GlobalPosition2 - search->H.globaltolocal.startPositions[HM.contig] + 1 ;
							s.length1_gap = HM.length1;
							s.length2_gap = HM.length2;
							s.contig = search->contig_conversion.convert(s.contig);
						}
						printable_solutions.push_back(s);
					} else { // printALL
				*/
				/*
						unsigned int processed=0;
						unsigned int alignments;
						(search->toBePrinted < solutions_gapped.size()) ? alignments = search->toBePrinted : alignments = solutions_gapped.size() ;
						while(processed < alignments) {
							const ResultItemGapped & HM = solutions_gapped.at(processed);
							s.globalPosition = HM.GlobalPosition1;
							s.algn = alignments;
							s.HI =1;
							s.IH =1;
							(processed == 0 ) ? s.primary = true : s.primary = false;
							s.strand = HM.strand;
							s.NM = HM.errors1;
							s.NM_gap = HM.errors2;
							s.contig = HM.contig ;
							s.position = HM.GlobalPosition1 - search->H.globaltolocal.startPositions[HM.contig] + 1 ;
							s.position_gap = HM.GlobalPosition2 - search->H.globaltolocal.startPositions[HM.contig] + 1 ;
							s.length1_gap = HM.length1;
							s.length2_gap = HM.length2;
							s.contaminated = contaminated;
							printable_solutions.push_back(s);
							processed++;
						}
				*/
				/*
						ERROR_CHANNEL << "--print-all option not implemented yet!" << endl;
						exit(3);
					}
				}
				*/
			} else if (not search->printAll) {
				sort(solutions.begin(), solutions.end(), ResultItem::less()); // sort solutions
				solutions.erase(unique(solutions.begin(), solutions.end(), ResultItem::equal()), solutions.end());

				Random_Choice_Result r;
				// "improved": the best new hit has fewer errors than the
				// alignment the read already carries.
				bool improved = (s.NM + s.NM_gap) > (solutions.at(0).errors);
				if (improved)
					r = Module_DMAP::random_choice_from_previous(0,solutions.size());
				else {
					r = Module_DMAP::random_choice_from_previous(s.algn,solutions.size());
					s.algn += solutions.size();
				}
				// NOTE(review): in the improved case s.algn keeps its old
				// value, while the disabled gapped code above resets it to
				// the new solution count -- confirm which is intended.

				if (not r.first) {
					// Replace the stored alignment with solution r.second.
					const ResultItem & HM = solutions.at(r.second);
					s.HI = 1;
					s.IH = 1;
					s.primary = true;
					s.globalPosition = HM.globalPosition;
					s.strand = HM.strand;
					s.NM = HM.errors;
					s.NM_gap = 0;
					if ((search->my_rank == 0) and s.contaminated) {
						s.contig = search->CR.globalToLocal.searchContig(HM.globalPosition); // find the contig/scaffold
						s.position = HM.globalPosition - search->CR.globalToLocal.startPositions[s.contig] + 1;
						s.contig = search->contig_conversion.convert(s.contig);
					} else {
						s.contig = search->H.globalToLocal.searchContig(HM.globalPosition); // find the contig/scaffold
						s.position = HM.globalPosition - search->H.globalToLocal.startPositions[s.contig] + 1;
						s.contig = search->contig_conversion.convert(s.contig);
					}
				}
				printable_solutions.push_back(s);
				continue;
			} else { // printAll
				/*
				// memorize all printable solutions
				sort(solutions.begin(), solutions.end(), ResultItem::less()); // sort solutions
				solutions.erase(unique(solutions.begin(), solutions.end(), ResultItem::equal()), solutions.end());
				unsigned int processed=0;
				unsigned int alignments;
				(search->toBePrinted < solutions.size()) ? alignments = search->toBePrinted : alignments = solutions.size() ;
				while(processed < alignments) { // while I print enough solution or there are no more solutions
					const ResultItem & HM = solutions.at(processed);
					s.globalPosition = HM.GlobalPosition;
					s.algn = solutions.size();
					s.IH = alignments;
					s.HI = processed+1;
					(processed == 0 ) ? s.primary = true : s.primary = false;
					s.strand = HM.strand;
					s.NM = HM.errors;
					if (contaminated) {
						s.contig = search->CR.globaltolocal.searchContig(HM.GlobalPosition); // find the contig/scaffold
						s.position = HM.GlobalPosition - search->CR.globaltolocal.startPositions[s.contig] + 1;
					} else {
						s.contig = search->H.globaltolocal.searchContig(HM.GlobalPosition); // find the contig/scaffold
						s.position = HM.GlobalPosition - search->H.globaltolocal.startPositions[s.contig] + 1;
					}
					s.contaminated = contaminated;
					printable_solutions.push_back(s);
				}
				*/
				ERROR_CHANNEL << "--print-all option not implemented yet!" << endl;
				exit(3);
			}
		}

		if (search->my_rank == (search->nprocs-1)) {
			// now print all
			for (unsigned int k = 0; k < printable_solutions.size(); k++)
				search->output_samfile.print_output(printable_solutions.at(k));
			search->processed += read_seq;
		} else // send data to next node
			search->send_to_next(printable_solutions,id);

		delete [] sequences;
		printable_solutions.clear();

		// Fetch the next block.
		if (search->my_rank == 0) {
			sequences = new Mask[SEQUENCES_FOR_BLOCK];
			read_seq = read_sequences(search->input_file1, SEQUENCES_FOR_BLOCK, sequences, search->fastqformat, search->gui_output);
			if (read_seq == 0)
				delete [] sequences;
		} else {
			received = search->receive_from_previous(id);
			sequences = received.first;
			read_seq = received.second;
		}
	}
}
struct alignment* detect_and_read_sequences(struct alignment* aln,struct parameters* param) { int feature = 0; char **input = 0; unsigned short int* input_type = 0; unsigned short int* input_numseq = 0; int num_input = 0; int i = 0; int j = 0; int c = 0; int a,b; int free_read = 1; unsigned int numseq = get_kalign_context()->numseq; while(free_read == 1 || param->infile[i]){ num_input++; i++; free_read = 0; } numseq = 0; input = malloc(sizeof(char*) * num_input); input_type = malloc(sizeof(unsigned short int) * num_input); input_numseq = malloc(sizeof(unsigned short int) * num_input); for (i = 0; i < num_input;i++){ input[i] = 0; input_type[i] = 0; input_numseq[i] = 0; } free_read = 0; if(param->quiet){ c = 1; }else{ c = 0; } for (i = c; i < num_input;i++){ if(!param->infile[i]){ k_printf("reading from STDIN: "); }else{ k_printf("reading from %s: ",param->infile[i]); } input[i] = get_input_into_string(input[i],param->infile[i]); if(input[i]){ free_read++; if (byg_start("<macsim>",input[i]) != -1){ input_numseq[i] = count_sequences_macsim(input[i]); feature = 1; input_type[i] = 1; }else if (byg_start("<uniprot",input[i]) != -1){ input_numseq[i] = count_sequences_uniprot(input[i]); input_type[i] = 2; }else if(byg_start("This SWISS-PROT",input[i]) != -1){ input_numseq[i] = count_sequences_swissprot(input[i]); input_type[i] = 3; }else if (byg_start("This Swiss-Prot",input[i]) != -1){ input_numseq[i] = count_sequences_swissprot(input[i]); input_type[i] = 3; }else if (byg_start("CLUSTAL W",input[i]) != -1){ input_numseq[i] = count_sequences_clustalw(input[i]); input_type[i] = 4; }else if (byg_start("PileUp",input[i]) != -1){ input_numseq[i] = count_sequences_clustalw(input[i]); input_type[i] = 4; }else if (byg_start("MSF:",input[i]) != -1){ input_numseq[i] = count_sequences_clustalw(input[i]); input_type[i] = 4; }else if (byg_start("STOCKHOLM",input[i]) != -1){ input_numseq[i] = count_sequences_stockholm(input[i]); input_type[i] = 5; }else{ input_numseq[i] = 
count_sequences_fasta(input[i]); input_type[i] = 0; } k_printf("found %d sequences\n",input_numseq[i]); if(input_numseq[i] < 1){ free(input[i]); input[i] = 0; }else{ numseq += input_numseq[i]; } }else{ k_printf("found no sequences.\n"); if(!param->outfile && i){ param->outfile = param->infile[i]; k_printf("-> output file, in "); //try to set format.... if(!param->format){ if (byg_start("msf",param->outfile) != -1){ param->format = "msf"; }else if (byg_start("clustal",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("aln",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("macsim",param->outfile) != -1){ param->format = "macsim"; }else{ param->format = "fasta"; } if(param->reformat){ k_printf("unaligned fasta format\n"); }else if(param->format){ k_printf("%s format\n",param->format); }else{ k_printf("fasta format\n"); } } } k_printf("\n"); } } if(numseq < 2){ k_printf("%s\n", usage); if(!numseq){ k_printf("\nWARNING: No sequences found.\n\n"); }else{ k_printf("\nWARNING: Only one sequence found.\n\n"); } for (i = 0; i < num_input;i++){ free(input[i]); } free(input_numseq); free(input_type); free(input); free_param(param); exit(0); } if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){ if( free_read < 2){ k_printf("\nWARNING: You are trying to perform a profile - profile alignment but ony one input file was detected.\n\n"); param->alignment_type = "default"; } } if (param->feature_type && !feature){ for (i = 0; i < num_input;i++){ free(input[i]); } free(input_numseq); free(input_type); free(input); free_param(param); throwKalignException(k_printf("\nWARNING: You are trying to perform a feature alignment but the input format(s) do not contain feature information.\n")); } get_kalign_context()->numprofiles = (numseq << 1) - 1; aln = aln_alloc(aln); //numseq = 0; if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){ j = 0; for (i = 0; i < num_input;i++){ if(input[i]){ switch(input_type[i]){ case 0: 
aln = read_alignment(aln,input[i]); break; case 1: aln = read_alignment_macsim_xml(aln,input[i]); break; case 2: aln = read_alignment_uniprot_xml(aln,input[i]); break; case 3: aln = read_alignment_from_swissprot(aln, input[i]); break; case 4: aln = read_alignment_clustal(aln,input[i]); break; case 5: aln = read_alignment_stockholm(aln,input[i]); break; default: aln = read_alignment(aln,input[i]); break; } input[i] = 0; //create partial profile.... aln->nsip[numseq+j] = input_numseq[i]; aln->sip[numseq+j] = malloc(sizeof(int)*aln->nsip[numseq+j]); //k_printf("%d %d\n",numseq+j,aln->sl[numseq+j]); j++; } } num_input = j; c = 0; for (i = 0;i < num_input;i++){ // for ( j = 0; j < aln->nsip[numseq+i];j++){ aln->sip[numseq+i][j] = c; c++; // k_printf("%d ",aln->sip[numseq+i][j]); } aln->sl[numseq+i] = aln->sl[aln->sip[numseq+i][0]]; // k_printf("PROFILE:%d contains: %d long:%d\n",i+numseq,aln->nsip[numseq+i],aln->sl[numseq+i]); // k_printf("\n"); } //sanity check -are all input for (i = 0;i < num_input;i++){ for ( j = 0; j < aln->nsip[numseq+i]-1;j++){ a = aln->sip[numseq+i][j]; a = aln->sl[a]; for (c = j+1; j < aln->nsip[numseq+i];j++){ b = aln->sip[numseq+i][c]; b = aln->sl[b]; if(a != b){ for (i = 0; i < num_input;i++){ free(input[i]); } free(input_numseq); free(input_type); free(input); free_aln(aln); free_param(param); throwKalignException(k_printf("Unaligned sequences in input %s.\n",param->infile[i])); } } } } //exit(0); /*for (i = 0; i < numseq;i++){ k_printf("len%d:%d\n",i,aln->sl[i]); for ( j =0 ; j < aln->sl[i];j++){ //if(aln->s[i][j]> 23 || aln->s[i][j] < 0){ // aln->s[i][j] = -1; //} k_printf("%d ",aln->s[i][j]); } // k_printf("\n"); } exit(0);*/ }else{ for (i = 0; i < num_input;i++){ if(input[i]){ switch(input_type[i]){ case 0: aln = read_sequences(aln,input[i]); break; case 1: aln = read_sequences_macsim_xml(aln,input[i]); break; case 2: aln = read_sequences_uniprot_xml(aln,input[i]); break; case 3: aln = read_sequences_from_swissprot(aln, input[i]); 
break; case 4: aln = read_sequences_clustal(aln,input[i]); break; case 5: aln = read_sequences_stockholm(aln,input[i]); break; default: aln = read_sequences(aln,input[i]); break; } /*if (byg_start("<macsim>",input[i]) != -1){ aln = read_sequences_macsim_xml(aln,input[i]); }else if (byg_start("<uniprot",input[i]) != -1){ aln = read_sequences_uniprot_xml(aln,input[i]); }else if(byg_start("This SWISS-PROT entry is copyright.",input[i]) != -1){ aln = read_sequences_from_swissprot(aln, input[i]); }else if (byg_start("This Swiss-Prot entry is copyright.",input[i]) != -1){ aln = read_sequences_from_swissprot(aln, input[i]); }else if (byg_start("CLUSTAL W",input[i]) != -1){ aln = read_sequences_clustal(aln,input[i]); }else if (byg_start("PileUp",input[i]) != -1){ aln = read_sequences_clustal(aln,input[i]); }else if (byg_start("MSF:",input[i]) != -1){ aln = read_sequences_clustal(aln,input[i]); }else if (byg_start("STOCKHOLM",input[i]) != -1){ aln = read_sequences_stockholm(aln,input[i]); }else{ aln = read_sequences(aln,input[i]); }*/ input[i] = 0; } } } if(numseq < 2){ free_param(param); throwKalignException(k_printf("\nNo sequences could be read.\n")); } if(!param->format && param->outfile){ if (byg_start("msf",param->outfile) != -1){ param->format = "msf"; }else if (byg_start("clustal",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("aln",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("macsim",param->outfile) != -1){ param->format = "macsim"; } k_printf("Output file: %s, in %s format.\n",param->outfile,param->format); } free(input); free(input_type); free(input_numseq); return aln; }
int main(int argc, char *argv[]) { struct tms usage; FILE *finp; int i,j, ticks; int numinfirst; char chkfile[255]; i=0; dump_file=NULL; do_cluster=do_pairwise_cluster; srandom(563573); bzero(&prog_opts,sizeof(ProgOptionsType)); outf=stdout; // set default distance function dist = d2; distpair= d2pair; #ifdef MPI MPI_Init(&argc, &argv); MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myid); #endif if(myid==0) { // Master process_options(argc, argv); } else { process_slave_options(argc, argv); } if (prog_opts.show_version || (argc==1)) { if (myid==0) printf("Version \n%s\n",version); #ifdef MPI MPI_Finalize(); #endif exit(0); } // Allocate space for the RC table for big words rc_big = calloc(BIG_WORD_TSIZE, sizeof(SeqElt)); // work is an array of work blocks. If non-parallel, there'll only // be one. work[0] acts a template work = (WorkPtr) calloc(num_threads,sizeof(WorkBlock)); work->filename = argv[optind]; work->index = NULL; if(prog_opts.do_dump) dump_file = fopen(prog_opts.dname,"w"); #ifdef MPI if (numprocs > 1) if (myid>0) { // slaves if (prog_opts.split) { MPI_Finalize(); return 0; } handleMPISlaveSetup(&num_seqs); initialise(work, prog_opts.edfile); internalTest(); perform_clustering(work); transmitMPISlaveResponse(work); if (prog_opts.show_perf) show_performance(outf); MPI_Finalize(); exit(0); } #else if (numprocs > 1) { printf("This version of wcd is not compiled with MPI\n"); printf("You cannot run it with a multiple processes\n"); printf("Either only run it with one process or do a \n"); printf(" ./configure --enable-mpi\n"); printf(" make clean\n"); printf(" make \n"); exit(5); } #endif // work out number of sequences // if the user has specified a value for num_seqs then // use that, else use the number of sequences in the file num_seqs = count_seqs(argv[optind], &data_size)+reindex_value; seq = (SeqPtr *) calloc(num_seqs,sizeof(SeqPtr)); seqInfo = (SeqInfoPtr) 
calloc(num_seqs,sizeof(SeqInfoStruct)); tree= (UnionFindPtr) calloc(num_seqs,sizeof(UnionFindStruct)); data= (SeqPtr) calloc(data_size,sizeof(SeqElt)); init_dummy_sequences(); #ifndef AUXINFO seqID = (SeqIDPtr) calloc(num_seqs,sizeof(SeqIDStruct)); #endif if (seq == NULL) { perror("SeqStruct allocation"); exit(50); } numinfirst = global_i_end = num_seqs; global_j_beg = 0; // if merging, need to check the other file too if (prog_opts.domerge || prog_opts.doadd ) { global_j_beg = global_i_end; num_seqs = handleMerge(argv[optind+2], num_seqs); if (prog_opts.doadd) global_i_end = num_seqs; } initialise(work, prog_opts.edfile); if (data == NULL) { sprintf(chkfile,"Main data store (%d bytes)",data_size); perror(chkfile); exit(51); } for(i=0; i<num_seqs; i++) seqInfo[i].flag=0; // reopen sequence file for reading finp = fopen(argv[optind],"r"); if (finp == NULL) { perror(argv[optind]); exit(51); } // Some messy stuff to hande auxiliary options // Skip to next comment on first reading if (prog_opts.pairwise==1) { sscanf(argv[optind+1], "%d", &i); sscanf(argv[optind+2], "%d", &j); show_pairwise(finp,i,j); return 0; } if (prog_opts.statgen) { compared2nummatches(finp,prog_opts.statgen); return 0; } if (prog_opts.range) { sscanf(argv[optind+1], "%d", &global_i_beg); sscanf(argv[optind+2], "%d", &global_i_end); } if (prog_opts.show_comp==41) { char * fname; fname = malloc(255); sscanf(argv[optind+1], "%s", fname); read_sequences(finp,reindex_value,num_seqs); checkfile = fopen(fname,"r"); sscanf(argv[optind+2], "%d", &j); while (fscanf(checkfile,"%d", &i) != -1) { do_compare(finp,i,j,1); } return 0; } if (prog_opts.show_comp) { sscanf(argv[optind+1], "%d", &i); sscanf(argv[optind+2], "%d", &j); //printf("Comparing %d and %d of %d flag %d\n",i,j,num_seqs,prog_opts.flag); read_sequences(finp,reindex_value,num_seqs); do_compare(finp,i,j,prog_opts.flag); return 0; } if (prog_opts.show_index) { show_sequence(finp, prog_opts.index,prog_opts.flag); return 0; } // Now read in the 
sequences if (do_cluster == do_pairwise_cluster||do_cluster==do_MPImaster_cluster||do_cluster == do_suffix_cluster) read_sequences(finp,reindex_value,numinfirst); else init_sequences(finp,reindex_value,numinfirst); fclose(finp); //printf("%d Allocated %d, start=%d, last=%d\n",num_seqs,data_size,data,seq[num_seqs-1].seq); if (prog_opts.split) { process_split(prog_opts.clfname1, prog_opts.split); #ifdef MPI MPI_Finalize(); #endif return 0; } if (prog_opts.consfname1) process_constraints(prog_opts.consfname1,0); if (prog_opts.clustercomp) { cluster_compare(argv[optind+1]); return 0; } // If merging or adding need to open the second sequence file if (prog_opts.domerge || prog_opts.doadd) { finp = fopen(argv[optind+2], "r"); if (finp == NULL) { perror(argv[optind]); exit(1); } if (do_cluster == do_pairwise_cluster) read_sequences(finp,numinfirst+reindex_value,num_seqs); else init_sequences(finp,numinfirst+reindex_value,num_seqs); get_clustering(argv[optind+1],0); if (prog_opts.domerge) get_clustering(argv[optind+3],numinfirst); } if (prog_opts.init_cluster) get_clustering(prog_opts.clfname1, 0); if (prog_opts.recluster) reclustering(work,prog_opts.clfname2); else { // This really assumes there is only one thread for suffix if (prog_opts.pairwise==2) { matrix_compare(finp); return 0; } work->workflag = prog_opts.noninterleavednlc;//kludge for suffixarray global_j_end = num_seqs; perform_clustering(work); #ifdef MPI if (myid>0) transmitMPISlaveResponse(work); #endif } if (prog_opts.show_ext) show_EXT(outf); if (prog_opts.show_histo) show_histogram(work); if (prog_opts.show_clust&1) show_clusters(outf); if (prog_opts.show_clust&8) produce_clusters(prog_opts.clthresh,prog_opts.dirname); if (prog_opts.show_perf) show_performance(outf); if (prog_opts.do_dump) { strcpy(chkfile,prog_opts.dname); strcat(chkfile,"-FIN"); fclose(dump_file); dump_file = fopen(chkfile,"w"); times(&usage); ticks = sysconf(_SC_CLK_TCK); fprintf(dump_file,"Completed %ld %ld", usage.tms_utime/ticks, 
usage.tms_stime*1000/ticks); fclose(dump_file); } if (prog_opts.show_version) fprintf(outf,"\n%s\n",version); fclose(outf); #ifdef MPI MPI_Finalize(); #endif exit(0); }
RcppExport SEXP FitPhasingBurst(SEXP R_signal, SEXP R_flowCycle, SEXP R_read_sequence, SEXP R_phasing, SEXP R_burstFlows, SEXP R_maxEvalFlow, SEXP R_maxSimFlow) { SEXP ret = R_NilValue; char *exceptionMesg = NULL; try { Rcpp::NumericMatrix signal(R_signal); Rcpp::NumericMatrix phasing(R_phasing); // Standard phasing parameters string flowCycle = Rcpp::as<string>(R_flowCycle); Rcpp::StringVector read_sequences(R_read_sequence); Rcpp::NumericVector phasing_burst(R_burstFlows); Rcpp::NumericVector max_eval_flow(R_maxEvalFlow); Rcpp::NumericVector max_sim_flow(R_maxSimFlow); int window_size = 38; // For normalization ion::FlowOrder flow_order(flowCycle, flowCycle.length()); unsigned int num_flows = flow_order.num_flows(); unsigned int num_reads = read_sequences.size(); // Containers to store results Rcpp::NumericVector null_fit(num_reads); Rcpp::NumericMatrix null_prediction(num_reads, num_flows); Rcpp::NumericVector best_fit(num_reads); Rcpp::NumericVector best_ie_value(num_reads); Rcpp::NumericMatrix best_prediction(num_reads, num_flows); BasecallerRead bc_read; DPTreephaser dpTreephaser(flow_order); DPPhaseSimulator PhaseSimulator(flow_order); vector<double> cf_vec(num_flows, 0.0); vector<double> ie_vec(num_flows, 0.0); vector<double> dr_vec(num_flows, 0.0); // IE Burst Estimation Loop for (unsigned int iRead=0; iRead<num_reads; iRead++) { // Set read object vector<float> my_signal(num_flows); for (unsigned int iFlow=0; iFlow<num_flows; iFlow++) my_signal.at(iFlow) = signal(iRead, iFlow); bc_read.SetData(my_signal, num_flows); string my_sequence = Rcpp::as<std::string>(read_sequences(iRead)); // Default phasing as baseline double my_best_fit, my_best_ie; double base_cf = (double)phasing(iRead, 0); double base_ie = (double)phasing(iRead, 1); double base_dr = (double)phasing(iRead, 2); int burst_flow = (int)phasing_burst(iRead); vector<float> my_best_prediction; cf_vec.assign(num_flows, base_cf); dr_vec.assign(num_flows, base_dr); int my_max_flow = min((int)num_flows, 
(int)max_sim_flow(iRead)); int my_eval_flow = min(my_max_flow, (int)max_eval_flow(iRead)); PhaseSimulator.SetBaseSequence(my_sequence); PhaseSimulator.SetMaxFlows(my_max_flow); PhaseSimulator.SetPhasingParameters_Basic(base_cf, base_ie, base_dr); PhaseSimulator.UpdateStates(my_max_flow); PhaseSimulator.GetPredictions(bc_read.prediction); dpTreephaser.WindowedNormalize(bc_read, (my_eval_flow/window_size), window_size, true); my_best_ie = base_ie; my_best_prediction = bc_read.prediction; my_best_fit = 0; for (int iFlow=0; iFlow<my_eval_flow; iFlow++) { double residual = bc_read.raw_measurements.at(iFlow) - bc_read.prediction.at(iFlow); my_best_fit += residual*residual; } for (unsigned int iFlow=0; iFlow<num_flows; iFlow++) null_prediction(iRead, iFlow) = bc_read.prediction.at(iFlow); null_fit(iRead) = my_best_fit; // Make sure that there are enough flows to fit a burst if (burst_flow < my_eval_flow-10) { int num_steps = 0; double step_size = 0.0; double step_start = 0.0; double step_end = 0.0; // Brute force phasing burst value estimation using grid search, crude first, then refine for (unsigned int iIteration = 0; iIteration<3; iIteration++) { switch(iIteration) { case 0: step_size = 0.05; step_end = 0.8; break; case 1: step_end = (floor(my_best_ie / step_size)*step_size) + step_size; step_start = max(0.0, (step_end - 2.0*step_size)); step_size = 0.01; break; default: step_end = (floor(my_best_ie / step_size)*step_size) + step_size; step_start = max(0.0, step_end - 2*step_size); step_size = step_size / 10; } num_steps = 1+ ((step_end - step_start) / step_size); for (int iPhase=0; iPhase <= num_steps; iPhase++) { double try_ie = step_start+(iPhase*step_size); ie_vec.assign(num_flows, try_ie); PhaseSimulator.SetBasePhasingParameters(burst_flow, cf_vec, ie_vec, dr_vec); PhaseSimulator.UpdateStates(my_max_flow); PhaseSimulator.GetPredictions(bc_read.prediction); dpTreephaser.WindowedNormalize(bc_read, (my_eval_flow/window_size), window_size, true); double my_fit = 0.0; 
for (int iFlow=burst_flow+1; iFlow<my_eval_flow; iFlow++) { double residual = bc_read.raw_measurements.at(iFlow) - bc_read.prediction.at(iFlow); my_fit += residual*residual; } if (my_fit < my_best_fit) { my_best_fit = my_fit; my_best_ie = try_ie; my_best_prediction = bc_read.prediction; } } } } // Set output information for this read best_fit(iRead) = my_best_fit; best_ie_value(iRead) = my_best_ie; for (unsigned int iFlow=0; iFlow<num_flows; iFlow++) best_prediction(iRead, iFlow) = my_best_prediction.at(iFlow); } ret = Rcpp::List::create(Rcpp::Named("null_fit") = null_fit, Rcpp::Named("null_prediction") = null_prediction, Rcpp::Named("burst_flow") = phasing_burst, Rcpp::Named("best_fit") = best_fit, Rcpp::Named("best_ie_value") = best_ie_value, Rcpp::Named("best_prediction") = best_prediction); } catch(std::exception& ex) { forward_exception_to_r(ex); } catch(...) { ::Rf_error("c++ exception (unknown reason)"); } if(exceptionMesg != NULL) Rf_error(exceptionMesg); return ret; }
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. 
average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", 
options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. 
if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }