Exemple #1
0
void show_pairwise(FILE *finp, int i, int j) {
  /* Print the two distpair() scores for sequences i and j on one line:
   * first the score obtained with the last argument 0, then with 1
   * (presumably forward strand and reverse complement -- confirm against
   * the distpair implementation). */
  int score_fwd, score_rc;

  /* Only sequences up to the larger of the two indices are needed. */
  if (i > j)
    num_seqs = i;
  else
    num_seqs = j;
  num_seqs += 1;

  read_sequences(finp, 0, num_seqs);

  score_fwd = distpair(work, i, j, 0);
  score_rc  = distpair(work, i, j, 1);
  fprintf(outf, "%d %d\n", score_fwd, score_rc);
}
Exemple #2
0
void compared2nummatches(FILE *finp, int opt) {
  // Undocumented feature, useful for gathering statistics on how the
  // heuristics (sample_heuristic, tv_heuristic) relate to the d2 score.
  // The current version of num_matches does not necessarily compute the
  // number of matches unless the heuristic threshold is set to 0.
  //
  // finp: open sequence file handed to read_sequences.
  // opt:  when 2, only every 17th j is compared with each i and one CSV
  //       line is written per compared pair.
  //
  // Output goes to outf: per-pair "H1 FAIL"/"H2 FAIL" diagnostics, a
  // summary line of counters, and the first D2_ROWS_PRINTED histogram
  // buckets of min(posd2,negd2)/D2_BUCKET_WIDTH.
  enum { D2_BUCKETS = 100, D2_BUCKET_WIDTH = 10, D2_ROWS_PRINTED = 40 };
  int i, j;
  int num_mat_pos, num_mat_rc, posd2, negd2, d2, h1, h2, sp, sr, bucket;
  // count is never computed in this function; it is initialised so that
  // the opt==2 CSV line prints a defined value instead of garbage.
  int count = 0;
  int passh1 = 0, passh2 = 0, passd2 = 0, d2table[D2_BUCKETS], h1fh2p = 0, step = 1;

  if (opt == 2) step = 17;
  for (i = 0; i < D2_BUCKETS; i++) d2table[i] = 0;
  read_sequences(finp, 0, num_seqs);
  for (i = 0; i < num_seqs; i = i + 1) {
    set_up_word_table(work, i);
    for (j = i + 1; j < num_seqs; j = j + step) {
      sample_heuristic(work, i, j, &sp, &sr);
      tv_heuristic(work, i, j, &num_mat_pos, &num_mat_rc);
      if (seqInfo[i].len < window_len || seqInfo[j].len < window_len) {
        // At least one sequence is shorter than the window: no d2 score.
        posd2 = negd2 = -1;
      } else {
        if (num_mat_pos >= 0 || num_mat_rc >= 0) {
          posd2 = d2pair(work, i, j, 0);
          negd2 = d2pair(work, i, j, 1);
          d2 = MIN(posd2, negd2);
          h1 = MAX(sp, sr);
          h2 = MAX(num_mat_pos, num_mat_rc);
          if (h1 > 1) passh1++;
          if (h2 >= NM_threshold) passh2++;
          // Pairs that pass the tv heuristic but fail the sample heuristic,
          // counted separately for each orientation.
          if ((num_mat_pos >= NM_threshold) && (sp <= 1)) h1fh2p++;
          if ((num_mat_rc >= NM_threshold) && (sr <= 1)) h1fh2p++;
          if (d2 <= theta) {
            passd2++;
          }
          // Guard the histogram: a d2 outside [0, D2_BUCKETS*D2_BUCKET_WIDTH)
          // would otherwise index outside d2table. Only the first
          // D2_ROWS_PRINTED buckets are ever reported, so dropping
          // out-of-range values does not change the output.
          bucket = d2 / D2_BUCKET_WIDTH;
          if (bucket >= 0 && bucket < D2_BUCKETS) d2table[bucket]++;
          if ((h2 < 7) && (d2 < 50)) fprintf(outf,"H2 FAIL: %d %d : h2=%d d2=%d\n",i,j,h2,d2);
          // NOTE(review): the second value printed here is d2 although the
          // label says h2; left unchanged to keep the output format stable.
          if ((h1 < 2) && (d2 < 60)) fprintf(outf,"H1 FAIL: %d %d :h1=%d,h2=%d\n",i,j, h1, d2);

          if (opt == 2)
            fprintf(outf,"%3d,%8d,%8d,%2d,%3d,%3d,%2d,%3d,%3d,%2d,%3d,%3d\n",i,j,
                    count,sp,num_mat_pos,posd2,sr,num_mat_rc, negd2,
                    MAX(sp,sr), MAX(num_mat_pos,num_mat_rc),
                    MIN(posd2,negd2));
        }
      }
    }
    clear_word_table(work, i);
  }
  fprintf(outf,"h1=%d; h2 %d; h1fh2p %d; d2 %d\n",passh1,passh2,h1fh2p,passd2);
  for (i = 0; i < D2_ROWS_PRINTED; i++) {
    fprintf(outf,"%3d %10d\n",i,d2table[i]);
  }
}
Exemple #3
0
/* static */
/**
 * Worker loop that processes the input in blocks of reads.
 *
 * Rank 0 reads up to SEQUENCES_FOR_BLOCK reads from search->input_file1;
 * every other rank receives its block from the previous node. Each read is
 * optionally quality-trimmed (rank 0 only), filtered, aligned with
 * search->H (or flagged as contaminated via search->CR on rank 0), and
 * appended to a list of printable results. The last rank prints the
 * results to search->output_samfile; the others forward them with
 * send_to_next(). The loop ends when a block with zero reads is obtained.
 *
 * @param search  module instance providing configuration, indexes and I/O
 * @param id      identifier passed through to receive_from_previous() and
 *                send_to_next()
 *
 * Gapped search and the --print-all mode are not implemented: reads with
 * no ungapped alignment are emitted as-is, and printAll terminates the
 * program with exit code 3.
 */
void Module_DMAP::generic_worker_single_thr(Module_DMAP * search, int id) {
	Mask * sequences;
	int read_seq;
	vector<Mask> printable_solutions;
	Transmitting_Result received;

	if (search->my_rank == 0) {
		sequences = new Mask[SEQUENCES_FOR_BLOCK];
		read_seq = read_sequences(search->input_file1, SEQUENCES_FOR_BLOCK, sequences, search->fastqformat, search->gui_output);
		// Empty input: the loop below is never entered, so release the block
		// here exactly as is done after every later read.
		if (read_seq == 0)
			delete [] sequences;
	} else {
		received = search->receive_from_previous(id);
		sequences = received.first;
		read_seq = received.second;
	}

	while (read_seq != 0) {
		Items solutions;
		t_errors calculated_errors;

		for (int i = 0; i < read_seq; i++) {
			Mask & s = sequences[i];
			// Trimming is done once, on the node that read the data.
			if (search->my_rank == 0 and search->trim) {
					s.quality_trimming_MOTT(search->min_phred_value_MOTT,search->min_mean_quality,search->min_size);
			}
			if (s.status_discarded()) {
				// Record why the read was discarded and pass it on untouched.
				if (s.status_low_complexity())
					s.low_complexity  =true;
				else
					s.low_quality = true;
				printable_solutions.push_back(s);
				continue;
			}
			// Error budget: proportional to the usable length, or a fixed value.
			if (search->auto_errors)
				calculated_errors = round((double)s.get_good_length() / search->errors_rate);
			else
				calculated_errors = search->common_errors_allowed;
			// A previous node already found an alignment: only look for
			// alignments that are at least as good.
			if (search->my_rank != 0 and s.algn > 0 and s.NM < calculated_errors)
				calculated_errors = s.NM;
			// Count ambiguous bases in the good region; stop as soon as the
			// budget is exceeded.
			t_errors count = 0;
			for (t_pattern_length pos = s.get_good_region_start()-1; (pos < s.get_good_region_stop()) and (count <= calculated_errors); pos++)
				if (s.sequence[pos] == 'N' or s.sequence[pos] == 'n')
					count++;
			if (count > calculated_errors) {
				// Too many N's to be alignable within the error budget.
				printable_solutions.push_back(s);
				continue;
			}

			/** ALIGNMENT **/
			solutions.clear();

			if (search->my_rank == 0 and search->contamination_check) {
				search->CR.search(s.get_good_sequence(),solutions,calculated_errors);
				if (solutions.size() > 0)
					s.contaminated = true;
			}

			if (not s.contaminated)
				search->H.search(s.get_good_sequence(),solutions,calculated_errors);
			if (solutions.size() == 0) {
				// No alignment found. Gapped search is not implemented in this
				// worker, so the read is emitted unchanged.
				printable_solutions.push_back(s);
				continue;
			} else if (not search->printAll) {
				sort(solutions.begin(), solutions.end(), ResultItem::less()); // sort solutions
				solutions.erase(unique(solutions.begin(), solutions.end(), ResultItem::equal()), solutions.end());

				Random_Choice_Result r;
				// The new best hit beats whatever alignment the read already carries.
				bool improved = (s.NM + s.NM_gap) > (solutions.at(0).errors);

				// NOTE(review): in the improved case s.algn is not reset to
				// solutions.size(); confirm this is intended.
				if (improved)
					r = Module_DMAP::random_choice_from_previous(0,solutions.size());
				else {
					r = Module_DMAP::random_choice_from_previous(s.algn,solutions.size());
					s.algn += solutions.size();
				}
				// r.first set means "keep the previous alignment"; otherwise
				// r.second selects the solution that replaces it.
				if (not r.first) {
					const ResultItem & HM = solutions.at(r.second);
					s.HI = 1;
					s.IH = 1;
					s.primary = true;
					s.globalPosition = HM.globalPosition;
					s.strand = HM.strand;
					s.NM = HM.errors;
					s.NM_gap = 0;
					if ((search->my_rank == 0) and s.contaminated) {
						s.contig = search->CR.globalToLocal.searchContig(HM.globalPosition); // find the contig/scaffold
						s.position = HM.globalPosition - search->CR.globalToLocal.startPositions[s.contig] + 1;
						s.contig = search->contig_conversion.convert(s.contig);
					} else {
						s.contig = search->H.globalToLocal.searchContig(HM.globalPosition); // find the contig/scaffold
						s.position = HM.globalPosition - search->H.globalToLocal.startPositions[s.contig] + 1;
						s.contig = search->contig_conversion.convert(s.contig);
					}
				}
				printable_solutions.push_back(s);
				continue;
			} else { // printAll
				ERROR_CHANNEL << "--print-all option not implemented yet!" << endl;
				exit(3);
			}
		}

		if (search->my_rank == (search->nprocs-1)) {
			// now print all
			for(unsigned int i=0; i < printable_solutions.size(); i++)
				search->output_samfile.print_output(printable_solutions.at(i));
			search->processed += read_seq;
		} else // send data to next node
			search->send_to_next(printable_solutions,id);

		delete [] sequences;
		printable_solutions.clear();

		// Fetch the next block the same way the first one was obtained.
		if (search->my_rank == 0) {
			sequences = new Mask[SEQUENCES_FOR_BLOCK];
			read_seq = read_sequences(search->input_file1, SEQUENCES_FOR_BLOCK, sequences, search->fastqformat, search->gui_output);
			if (read_seq == 0)
				delete [] sequences;
		} else {
			received = search->receive_from_previous(id);
			sequences = received.first;
			read_seq = received.second;
		}
	}

}
Exemple #4
0
/*
 * Load every input named in param->infile (slot 0 may be NULL, meaning
 * STDIN), detect each input's format from marker strings, count its
 * sequences, and read everything into aln.
 *
 * For a profile alignment (param->alignment_type matches prof/profile)
 * each input is read as an already-aligned block and registered as a
 * partial profile; all sequences within one input must have equal length.
 * Otherwise the inputs are read as plain sequences.
 *
 * An input that yields no data may be adopted as the output file name when
 * none was given. With fewer than two sequences in total the usage text is
 * printed and the process exits with status 0. Other fatal conditions are
 * reported through throwKalignException after the local buffers and param
 * have been released.
 *
 * Returns the populated alignment.
 */
struct alignment* detect_and_read_sequences(struct alignment* aln,struct parameters* param)
{
	
	int feature = 0;
	char **input = 0;
	unsigned short int* input_type = 0;
	unsigned short int* input_numseq = 0;
	
	int num_input = 0;
	int i = 0;
	int j = 0;
	int c = 0;
	int a,b;
	int free_read = 1;
	const char *bad_input = 0;
	unsigned int numseq = get_kalign_context()->numseq;

	/* Count input slots: slot 0 is always counted (free_read acts as a
	 * one-shot flag here), then every following non-NULL entry. */
	while(free_read == 1 || param->infile[i]){
		num_input++;
		i++;
		free_read = 0;
	}
	numseq = 0;

	
	input = malloc(sizeof(char*) * num_input);
	input_type = malloc(sizeof(unsigned short int) * num_input);
	input_numseq = malloc(sizeof(unsigned short int) * num_input);
	
	for (i = 0; i < num_input;i++){
		input[i] = 0;
		input_type[i] = 0;
		input_numseq[i] = 0;
	}

	/* From here on free_read counts the inputs that produced data. */
	free_read = 0;
	
	/* In quiet mode slot 0 is skipped. */
	if(param->quiet){
		c = 1;
	}else{
		c = 0;
	}
	
	
	for (i = c; i < num_input;i++){
		if(!param->infile[i]){
			k_printf("reading from STDIN: ");
		}else{
			k_printf("reading from %s: ",param->infile[i]);
		}
		input[i] = get_input_into_string(input[i],param->infile[i]);
		if(input[i]){
			free_read++;
			/* Format codes: 0 fasta, 1 macsim, 2 uniprot, 3 swissprot,
			 * 4 clustal/msf/pileup, 5 stockholm. */
			if (byg_start("<macsim>",input[i]) != -1){
				input_numseq[i] = count_sequences_macsim(input[i]);
				feature = 1;
				input_type[i] = 1;
			}else if (byg_start("<uniprot",input[i]) != -1){
				input_numseq[i] = count_sequences_uniprot(input[i]);
				input_type[i] = 2;
			}else if(byg_start("This SWISS-PROT",input[i]) != -1){
				input_numseq[i] = count_sequences_swissprot(input[i]);
				input_type[i] = 3;
			}else if (byg_start("This Swiss-Prot",input[i]) != -1){
				input_numseq[i] = count_sequences_swissprot(input[i]);
				input_type[i] = 3;
			}else if (byg_start("CLUSTAL W",input[i]) != -1){
				input_numseq[i] = count_sequences_clustalw(input[i]);
				input_type[i] = 4;
			}else if (byg_start("PileUp",input[i]) != -1){
				input_numseq[i] = count_sequences_clustalw(input[i]);
				input_type[i] = 4;
			}else if (byg_start("MSF:",input[i]) != -1){
				input_numseq[i] = count_sequences_clustalw(input[i]);
				input_type[i] = 4;
			}else if (byg_start("STOCKHOLM",input[i]) != -1){
				input_numseq[i] = count_sequences_stockholm(input[i]);
				input_type[i] = 5;
			}else{
				input_numseq[i]  = count_sequences_fasta(input[i]);
				input_type[i] = 0;
			}
			k_printf("found %d sequences\n",input_numseq[i]);
			
			if(input_numseq[i] < 1){
				free(input[i]);
				input[i] = 0;
			}else{
				numseq += input_numseq[i];
			}
		}else{
			k_printf("found no sequences.\n");
			/* An unreadable non-first argument is taken to be the output file. */
			if(!param->outfile && i){
				param->outfile = param->infile[i];
				k_printf("-> output file, in ");
				//try to set format.... 
				if(!param->format){
					if (byg_start("msf",param->outfile) != -1){
						param->format = "msf";
					}else if (byg_start("clustal",param->outfile) != -1){
						param->format = "clustal";
					}else if (byg_start("aln",param->outfile) != -1){
						param->format = "clustal";
					}else if (byg_start("macsim",param->outfile) != -1){
						param->format = "macsim";
					}else{
						param->format = "fasta";
					}
					if(param->reformat){
						k_printf("unaligned fasta format\n");
					}else if(param->format){
						k_printf("%s format\n",param->format);
					}else{
						k_printf("fasta format\n");
					}
				}
			}
			k_printf("\n");
		}
	}

	
	if(numseq < 2){
		k_printf("%s\n", usage);
		if(!numseq){
		k_printf("\nWARNING: No sequences found.\n\n");
		}else{
		k_printf("\nWARNING: Only one sequence found.\n\n");
		}
		for (i = 0; i < num_input;i++){
			free(input[i]);
		}
		free(input_numseq);
		free(input_type);
		free(input);
		free_param(param);
		exit(0);
	}

	if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){
		if( free_read  < 2){
			k_printf("\nWARNING: You are trying to perform a profile - profile alignment but ony one input file was detected.\n\n");
			param->alignment_type = "default";
		}
	}

	
	if (param->feature_type && !feature){
		for (i = 0; i < num_input;i++){
			free(input[i]);
		}
		free(input_numseq);
		free(input_type);
		free(input);
		free_param(param);
		throwKalignException(k_printf("\nWARNING: You are trying to perform a feature alignment but the input format(s) do not contain feature information.\n"));
	}
	
	get_kalign_context()->numprofiles = (numseq << 1) - 1;
	aln = aln_alloc(aln);
	if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){
		j = 0;
		for (i = 0; i < num_input;i++){
			
			if(input[i]){
					
				switch(input_type[i]){
					case 0:
						aln = read_alignment(aln,input[i]);
						break;
					case 1:
						aln = read_alignment_macsim_xml(aln,input[i]);
						break;
					case 2:
						aln = read_alignment_uniprot_xml(aln,input[i]);
						break;
					case 3:
						aln = read_alignment_from_swissprot(aln, input[i]);
						break;
					case 4:
						aln = read_alignment_clustal(aln,input[i]);
						break;
					case 5:
						aln = read_alignment_stockholm(aln,input[i]);
						break;
					
					default:
						aln = read_alignment(aln,input[i]);
						break;
				}
				/* The buffer is not freed here; presumably the reader
				 * consumed it -- confirm against the read_* functions. */
				input[i] = 0;
				//create partial profile....
				aln->nsip[numseq+j] = input_numseq[i];
				aln->sip[numseq+j] = malloc(sizeof(int)*aln->nsip[numseq+j]);
				
				j++;
			}
		}
		/* num_input now counts the profiles that were actually read. */
		num_input = j;
		/* Assign consecutive sequence indices to each profile. */
		c = 0;
		for (i = 0;i < num_input;i++){
			for ( j = 0; j < aln->nsip[numseq+i];j++){
				aln->sip[numseq+i][j] = c;
				c++;
			}
			aln->sl[numseq+i] = aln->sl[aln->sip[numseq+i][0]];
		}
		
		/* Sanity check: every sequence of a profile must have the same
		 * length. Comparing each member with the first one is sufficient. */
		for (i = 0;i < num_input;i++){
			a = aln->sl[aln->sip[numseq+i][0]];
			for ( j = 1; j < aln->nsip[numseq+i];j++){
				b = aln->sl[aln->sip[numseq+i][j]];
				if(a != b){
					/* Remember the name before param is released and use a
					 * separate index for cleanup so i is not clobbered.
					 * NOTE(review): i is the profile index, which differs
					 * from the infile index when inputs were skipped. */
					bad_input = param->infile[i];
					if(!bad_input){
						bad_input = "STDIN";
					}
					for (c = 0; c < num_input;c++){
						free(input[c]);
					}
					free(input_numseq);
					free(input_type);
					free(input);
					free_aln(aln);
					/* assumes free_param does not free the file name
					 * strings themselves -- TODO confirm */
					free_param(param);
					throwKalignException(k_printf("Unaligned sequences in input %s.\n",bad_input));
				}
			}
		}
	}else{
		for (i = 0; i < num_input;i++){
			if(input[i]){
				switch(input_type[i]){
					case 0:
						aln = read_sequences(aln,input[i]);
						break;
					case 1:
						aln = read_sequences_macsim_xml(aln,input[i]);
						break;
					case 2:
						aln = read_sequences_uniprot_xml(aln,input[i]);
						break;
					case 3:
						aln = read_sequences_from_swissprot(aln, input[i]);
						break;
					case 4:
						aln = read_sequences_clustal(aln,input[i]);
						break;
					case 5:
						aln = read_sequences_stockholm(aln,input[i]);
						break;
					
					default:
						aln = read_sequences(aln,input[i]);
						break;
				}
				input[i] = 0;
			}
		}
	}
	if(numseq < 2){
		free_param(param);
		throwKalignException(k_printf("\nNo sequences could be read.\n"));
	}
	if(!param->format && param->outfile){
			if (byg_start("msf",param->outfile) != -1){
				param->format = "msf";
			}else if (byg_start("clustal",param->outfile) != -1){
				param->format = "clustal";
			}else if (byg_start("aln",param->outfile) != -1){
				param->format = "clustal";
			}else if (byg_start("macsim",param->outfile) != -1){
				param->format = "macsim";
			}
			/* format may still be unset here; report it as fasta (as the
			 * input loop above does) rather than passing NULL to %s. */
			k_printf("Output file: %s, in %s format.\n",param->outfile,param->format ? param->format : "fasta");
	}
	
	
	free(input);
	free(input_type);
	free(input_numseq);
	return aln;
}
Exemple #5
0
/*
 * Program entry point.
 *
 * Parses the options (master or slave variant depending on myid),
 * allocates the global sequence tables, then either runs one of the
 * auxiliary modes (pairwise score, statistics, comparison, index display,
 * split, cluster comparison, matrix comparison) and returns, or reads the
 * sequences, performs the clustering and prints the requested reports.
 *
 * Positional arguments are taken from argv[optind] onwards; argv[optind]
 * is always the sequence file, the meaning of the following ones depends
 * on the selected mode.
 */
int main(int argc, char *argv[]) {
  struct tms usage;
  FILE *finp;
  int i,j, ticks;
  int numinfirst;
  char chkfile[255];

  i=0;
  j=0; // defined value in case a later sscanf does not match
  dump_file=NULL;

  do_cluster=do_pairwise_cluster;
  srandom(563573);
  bzero(&prog_opts,sizeof(ProgOptionsType));
  outf=stdout;
  // set default distance function
  dist = d2;
  distpair= d2pair;
#ifdef MPI
  MPI_Init(&argc, &argv);
  MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
#endif


  if(myid==0) { // Master
    process_options(argc, argv);
  } else {
    process_slave_options(argc, argv);
  }

  if (prog_opts.show_version || (argc==1)) {
      if (myid==0) printf("Version \n%s\n",version);
#ifdef MPI      
      MPI_Finalize();
#endif
      exit(0);
    }


  // Allocate space for the RC table for big words
  rc_big = calloc(BIG_WORD_TSIZE, sizeof(SeqElt));

  // work is an array of work blocks. If non-parallel, there'll only
  // be one. work[0] acts a template

  work = (WorkPtr) calloc(num_threads,sizeof(WorkBlock));
  if (work == NULL) {
    perror("WorkBlock allocation");
    exit(50);
  }
  work->filename = argv[optind];
  work->index    = NULL;


  if(prog_opts.do_dump) {
    dump_file = fopen(prog_opts.dname,"w");
    // Report the failure but carry on; dump_file stays NULL.
    if (dump_file == NULL) perror(prog_opts.dname);
  }

#ifdef MPI
  if (numprocs > 1)  
    if (myid>0) {  // slaves
      if (prog_opts.split) {
	MPI_Finalize();
	return 0;
      }
      handleMPISlaveSetup(&num_seqs);
      initialise(work, prog_opts.edfile);
      internalTest();

      perform_clustering(work);
      transmitMPISlaveResponse(work);
      if (prog_opts.show_perf)     show_performance(outf);
      MPI_Finalize();
      exit(0);
    }   
#else
  if (numprocs > 1) {
    printf("This version of wcd is not compiled with MPI\n");
    printf("You cannot run it with a multiple processes\n");
    printf("Either only run it with one process or do a \n");
    printf("  ./configure --enable-mpi\n");
    printf("  make clean\n");
    printf("  make \n");
    exit(5);
  }
#endif

  // work out number of sequences
  // if the user has specified a value for num_seqs then
  // use that, else use the number of sequences in the file
  num_seqs = count_seqs(argv[optind], &data_size)+reindex_value;

  seq = (SeqPtr *) calloc(num_seqs,sizeof(SeqPtr));
  seqInfo     = (SeqInfoPtr) calloc(num_seqs,sizeof(SeqInfoStruct));
  tree= (UnionFindPtr) calloc(num_seqs,sizeof(UnionFindStruct));
  data= (SeqPtr)  calloc(data_size,sizeof(SeqElt));
  init_dummy_sequences();
#ifndef AUXINFO
  seqID = (SeqIDPtr) calloc(num_seqs,sizeof(SeqIDStruct));
#endif
  if (seq == NULL) {
    perror("SeqStruct allocation");
    exit(50);
  }
  numinfirst = global_i_end = num_seqs;
  global_j_beg = 0;
  // if merging, need to check the other file too
  if (prog_opts.domerge || prog_opts.doadd ) {
    global_j_beg = global_i_end;
    num_seqs = handleMerge(argv[optind+2], num_seqs);
    if (prog_opts.doadd) global_i_end = num_seqs; 
  }

  initialise(work, prog_opts.edfile);
  if (data == NULL) {
    snprintf(chkfile,sizeof chkfile,"Main data store (%d bytes)",data_size);
    perror(chkfile);
    exit(51);
  }
  for(i=0; i<num_seqs; i++) seqInfo[i].flag=0;
  // reopen sequence file for reading
  finp = fopen(argv[optind],"r");
  if (finp == NULL)  {
    perror(argv[optind]);
    exit(51);
  }
  // Some messy stuff to handle auxiliary options
  // Skip to next comment on first reading
  if (prog_opts.pairwise==1) {
    sscanf(argv[optind+1], "%d", &i);
    sscanf(argv[optind+2], "%d", &j);
    show_pairwise(finp,i,j);
    return 0;
  }
  if (prog_opts.statgen) {
    compared2nummatches(finp,prog_opts.statgen);
    return 0;
  }
  if (prog_opts.range) {
    sscanf(argv[optind+1], "%d", &global_i_beg);
    sscanf(argv[optind+2], "%d", &global_i_end);
  }     
  if (prog_opts.show_comp==41) {
    // Compare every index listed in the given file against sequence j.
    char fname[255] = "";
    sscanf(argv[optind+1], "%254s", fname); // bounded to the buffer size
    read_sequences(finp,reindex_value,num_seqs); 
    checkfile = fopen(fname,"r");
    if (checkfile == NULL) {
      perror(fname);
      exit(51);
    }
    sscanf(argv[optind+2], "%d", &j);
    // Stop on EOF and also on a token that is not a number; testing only
    // for EOF would spin forever on malformed input.
    while (fscanf(checkfile,"%d", &i) == 1) {
      do_compare(finp,i,j,1);
    }
    return 0;
  }

  if (prog_opts.show_comp) {
    sscanf(argv[optind+1], "%d", &i);
    sscanf(argv[optind+2], "%d", &j);
    read_sequences(finp,reindex_value,num_seqs); 
    do_compare(finp,i,j,prog_opts.flag);
    return 0;
  }
  if (prog_opts.show_index) {
    show_sequence(finp, prog_opts.index,prog_opts.flag);
    return 0;
  }
  // Now read in the sequences
  if (do_cluster == do_pairwise_cluster||do_cluster==do_MPImaster_cluster||do_cluster == do_suffix_cluster) 
    read_sequences(finp,reindex_value,numinfirst);
  else
    init_sequences(finp,reindex_value,numinfirst);
  fclose(finp);

  if (prog_opts.split) {
    process_split(prog_opts.clfname1, prog_opts.split);
#ifdef MPI
    MPI_Finalize();
#endif
    return 0;
  }
  if (prog_opts.consfname1) process_constraints(prog_opts.consfname1,0);
  if (prog_opts.clustercomp) {
    cluster_compare(argv[optind+1]);
    return 0;
  }
  // If merging or adding need to open the second sequence file
  if (prog_opts.domerge || prog_opts.doadd) {
    finp = fopen(argv[optind+2], "r");
    if (finp == NULL)  {
      perror(argv[optind+2]); // name the file that actually failed to open
      exit(1);
    }
    if (do_cluster == do_pairwise_cluster)
      read_sequences(finp,numinfirst+reindex_value,num_seqs);
    else
      init_sequences(finp,numinfirst+reindex_value,num_seqs);
    get_clustering(argv[optind+1],0);
    if (prog_opts.domerge) get_clustering(argv[optind+3],numinfirst);
  }
  if (prog_opts.init_cluster) get_clustering(prog_opts.clfname1, 0);
  if (prog_opts.recluster)
    reclustering(work,prog_opts.clfname2);
  else {
    // This really assumes there is only one thread for suffix
    if (prog_opts.pairwise==2) {
      // NOTE(review): unless merging/adding reopened it, finp was closed
      // above; confirm matrix_compare does not read from it.
      matrix_compare(finp);
      return 0;
    }
    work->workflag = prog_opts.noninterleavednlc;//kludge for suffixarray
    global_j_end = num_seqs;
    perform_clustering(work);
#ifdef MPI
    if (myid>0) transmitMPISlaveResponse(work);
#endif
  }
  if (prog_opts.show_ext)      show_EXT(outf);
  if (prog_opts.show_histo)    show_histogram(work);
  if (prog_opts.show_clust&1) show_clusters(outf); 
  if (prog_opts.show_clust&8) 
    produce_clusters(prog_opts.clthresh,prog_opts.dirname);
  if (prog_opts.show_perf)     show_performance(outf);
  if (prog_opts.do_dump) {
    // Write a "<dname>-FIN" marker file with the CPU times used.
    snprintf(chkfile,sizeof chkfile,"%s-FIN",prog_opts.dname);
    if (dump_file != NULL) fclose(dump_file);
    dump_file = fopen(chkfile,"w");
    if (dump_file == NULL) {
      perror(chkfile);
    } else {
      times(&usage);
      ticks = sysconf(_SC_CLK_TCK);
      // NOTE(review): user time is scaled to seconds but system time to
      // milliseconds; kept as in the original output format.
      fprintf(dump_file,"Completed %ld %ld", (long)(usage.tms_utime/ticks), (long)(usage.tms_stime*1000/ticks));
      fclose(dump_file);
    }
  }
  if (prog_opts.show_version) fprintf(outf,"\n%s\n",version);
  fclose(outf);
#ifdef MPI
  MPI_Finalize();
#endif
  exit(0);
}
Exemple #6
0
/**
 * R entry point: for every read, fit an incomplete-extension (IE) value
 * that applies from a given "burst" flow onwards.
 *
 * For each read the baseline prediction is simulated with the read's
 * standard phasing parameters (columns 0..2 of R_phasing are used as
 * cf, ie, dr) and its squared error against the measured signal over the
 * first my_eval_flow flows is stored as the null fit. If the burst flow
 * lies more than 10 flows before the evaluation limit, a three-pass grid
 * search over IE values is run (step 0.05 up to 0.8, then two refinements
 * around the current best); a candidate is accepted when its squared error
 * over the flows after the burst flow is below the best error so far.
 *
 * Returns an R list with null_fit, null_prediction, burst_flow, best_fit,
 * best_ie_value and best_prediction. C++ exceptions are forwarded to R.
 */
RcppExport SEXP FitPhasingBurst(SEXP R_signal, SEXP R_flowCycle, SEXP R_read_sequence,
                SEXP R_phasing, SEXP R_burstFlows, SEXP R_maxEvalFlow, SEXP R_maxSimFlow) {

 SEXP ret = R_NilValue;
 char *exceptionMesg = NULL;

 try {

     // Unpack the R arguments
     Rcpp::NumericMatrix  signal(R_signal);
     Rcpp::NumericMatrix  phasing(R_phasing);
     string flowCycle   = Rcpp::as<string>(R_flowCycle);
     Rcpp::StringVector   sequences(R_read_sequence);
     Rcpp::NumericVector  burst_flows(R_burstFlows);
     Rcpp::NumericVector  eval_limits(R_maxEvalFlow);
     Rcpp::NumericVector  sim_limits(R_maxSimFlow);
     int norm_window    = 38; // window size used for normalization

     ion::FlowOrder flow_order(flowCycle, flowCycle.length());
     unsigned int n_flows = flow_order.num_flows();
     unsigned int n_reads = sequences.size();

     // Per-read results handed back to R
     Rcpp::NumericVector null_fit(n_reads);
     Rcpp::NumericMatrix null_prediction(n_reads, n_flows);
     Rcpp::NumericVector best_fit(n_reads);
     Rcpp::NumericVector best_ie_value(n_reads);
     Rcpp::NumericMatrix best_prediction(n_reads, n_flows);

     BasecallerRead bc_read;
     DPTreephaser dpTreephaser(flow_order);
     DPPhaseSimulator PhaseSimulator(flow_order);
     vector<double> cf_vec(n_flows, 0.0);
     vector<double> ie_vec(n_flows, 0.0);
     vector<double> dr_vec(n_flows, 0.0);

     for (unsigned int r = 0; r < n_reads; ++r) {

       // Load the measured signal of this read
       vector<float> read_signal(n_flows);
       for (unsigned int f = 0; f < n_flows; ++f) {
         read_signal.at(f) = signal(r, f);
       }
       bc_read.SetData(read_signal, n_flows);
       string bases = Rcpp::as<std::string>(sequences(r));

       // Baseline phasing parameters for this read
       double fit_best, ie_best;
       double base_cf  = (double)phasing(r, 0);
       double base_ie  = (double)phasing(r, 1);
       double base_dr  = (double)phasing(r, 2);
       int burst_flow = (int)burst_flows(r);
       vector<float> prediction_best;

       cf_vec.assign(n_flows, base_cf);
       dr_vec.assign(n_flows, base_dr);
       int sim_flows  = min((int)n_flows, (int)sim_limits(r));
       int eval_flows = min(sim_flows, (int)eval_limits(r));

       // Null model: simulate with the baseline parameters only
       PhaseSimulator.SetBaseSequence(bases);
       PhaseSimulator.SetMaxFlows(sim_flows);
       PhaseSimulator.SetPhasingParameters_Basic(base_cf, base_ie, base_dr);
       PhaseSimulator.UpdateStates(sim_flows);
       PhaseSimulator.GetPredictions(bc_read.prediction);
       dpTreephaser.WindowedNormalize(bc_read, (eval_flows/norm_window), norm_window, true);

       ie_best = base_ie;
       prediction_best = bc_read.prediction;
       fit_best = 0;
       for (int f = 0; f < eval_flows; ++f) {
         double residual = bc_read.raw_measurements.at(f) - bc_read.prediction.at(f);
         fit_best += residual*residual;
       }
       for (unsigned int f = 0; f < n_flows; ++f) {
         null_prediction(r, f) = bc_read.prediction.at(f);
       }
       null_fit(r) = fit_best;

       // Only search when enough flows follow the burst flow
       if (burst_flow < eval_flows-10) {
         int    n_steps   = 0;
         double grid_step = 0.0;
         double grid_lo   = 0.0;
         double grid_hi   = 0.0;

         // Grid search: one coarse pass, then two refinements around the best value
         for (unsigned int pass = 0; pass < 3; ++pass) {

           if (pass == 0) {
             grid_step = 0.05;
             grid_hi   = 0.8;
           } else if (pass == 1) {
             // bounds are derived from the previous step size before it is replaced
             grid_hi   = (floor(ie_best / grid_step)*grid_step) + grid_step;
             grid_lo   = max(0.0, (grid_hi - 2.0*grid_step));
             grid_step = 0.01;
           } else {
             grid_hi   = (floor(ie_best / grid_step)*grid_step) + grid_step;
             grid_lo   = max(0.0, grid_hi - 2*grid_step);
             grid_step = grid_step / 10;
           }
           n_steps = 1 + ((grid_hi - grid_lo) / grid_step);

           for (int k = 0; k <= n_steps; ++k) {

             double candidate_ie = grid_lo + (k*grid_step);
             ie_vec.assign(n_flows, candidate_ie);

             PhaseSimulator.SetBasePhasingParameters(burst_flow, cf_vec, ie_vec, dr_vec);
             PhaseSimulator.UpdateStates(sim_flows);
             PhaseSimulator.GetPredictions(bc_read.prediction);
             dpTreephaser.WindowedNormalize(bc_read, (eval_flows/norm_window), norm_window, true);

             // Squared error over the flows after the burst flow
             double candidate_fit = 0.0;
             for (int f = burst_flow+1; f < eval_flows; ++f) {
               double residual = bc_read.raw_measurements.at(f) - bc_read.prediction.at(f);
               candidate_fit += residual*residual;
             }
             if (candidate_fit < fit_best) {
               fit_best        = candidate_fit;
               ie_best         = candidate_ie;
               prediction_best = bc_read.prediction;
             }
           }
         }
       }

       // Store the outcome for this read
       best_fit(r)      = fit_best;
       best_ie_value(r) = ie_best;
       for (unsigned int f = 0; f < n_flows; ++f) {
         best_prediction(r, f) = prediction_best.at(f);
       }
     }

     ret = Rcpp::List::create(Rcpp::Named("null_fit")        = null_fit,
                              Rcpp::Named("null_prediction") = null_prediction,
                              Rcpp::Named("burst_flow")      = burst_flows,
                              Rcpp::Named("best_fit")        = best_fit,
                              Rcpp::Named("best_ie_value")   = best_ie_value,
                              Rcpp::Named("best_prediction") = best_prediction);

 } catch(std::exception& ex) {
   forward_exception_to_r(ex);
 } catch(...) {
   ::Rf_error("c++ exception (unknown reason)");
 }

 if(exceptionMesg != NULL)
   Rf_error(exceptionMesg);
 return ret;

}
Exemple #7
0
/*************************************************************************
 * Entry point for centrimo
 *
 * Processes the command line, loads the sequences and the motif databases,
 * scans every sequence with every motif, tallies the best site positions
 * and writes three outputs into the output directory: the plain text site
 * counts, the HTML/JSON report (only when the HTML template can be opened)
 * and the text report of the motifs that pass the E-value threshold.
 *
 * Returns 0 on completion; unrecoverable problems are reported via die().
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MREAD_T *mread;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  // the length of the first sequence is used as the length of all of them
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  // the E-value threshold divided by the number of motifs, in log space
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  // A failed open must not go unnoticed: the stream is written to for every
  // motif and later closed, both of which are undefined on a NULL FILE*.
  if (sites_file == NULL) {
    die("Couldn't open %s for writing.\n", sites_path);
  }
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    // carry on without the HTML report; the other outputs are still written
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array (closed after the scanning loop below)
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  // NOTE(review): when no sequences were read seqlen is 0 and
  // counts.allocated becomes -1 -- confirm read_sequences rejects empty input.
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      // (rev_motif and rev_pssm stay NULL when only one strand is scanned.)
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // check if it passes the threshold; the stats are kept for the text
      // report whether or not a JSON writer is available, so that a missing
      // HTML template does not silently empty the text output
      if (stats->log_adj_pvalue <= log_pvalue_thresh) {
        if (json) output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    // the template is expected to hold exactly one JSON replacement
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}