void actionMakeOtusMothur() { estrarray uarr; eseqclusterData cdata; ldieif(argvc<4,"syntax: "+efile(argv[0]).basename()+" -makeotus_mothur <alignment> <mergelog> <cutoff>"); cout << "# loading seqs file: " << argv[1] << endl; load_seqs(argv[1],uarr); cdata.load(argv[2],uarr.size()); float t=estr(argv[3]).f(); earray<eintarray> otuarr; cdata.getOTU(t,otuarr,uarr.size()); cout << "label\tnumOtus"; for (long i=0; i<otuarr.size(); ++i) cout << "\tOTU" << i; cout << endl; cout << (1.0-t) << "\t" << otuarr.size(); for (long i=0; i<otuarr.size(); ++i){ // cout << ">OTU" << i << " otu_size="<< otuarr[i].size() << endl; cout << "\t" << uarr.keys(otuarr[i][0]); for (long j=1; j<otuarr[i].size(); ++j) cout << "," << uarr.keys(otuarr[i][j]); } cout << endl; exit(0); }
int main(int argc, char **argv) { FILE *input; FILE *repeats = 0; FILE *output; int start_x, end_x, start_y, end_y; debug_config(progname); get_options(argc, argv, progname); unsigned long start_mem, cand_mem, table_mem; input = fopen(sequence_filename, "r"); if(!input) fatal("couldn't open %s: %s\n",sequence_filename,strerror(errno)); if(repeat_filename) { repeats = fopen(repeat_filename, "r"); if(!repeats) fatal("couldn't open %s: %s\n",repeat_filename,strerror(errno)); } if(output_filename) { output = fopen(output_filename, "w"); } else { output = stdout; } // Data is in the form: // >id metadata // data // >id metadata // data // >> // ... set_k(kmer_size); set_window_size(window_size); // If we only give one file, do an all vs. all // on them. if(!second_sequence_filename) { num_seqs = load_seqs(input); start_x = 0; end_x = num_seqs; start_y = 0; end_y = num_seqs; } // If we had two files, do not compare ones from // the same file to each other. else { FILE *input2 = fopen(second_sequence_filename, "r"); if(!input2) { fprintf(stderr, "Could not open file %s for reading.\n", second_sequence_filename); exit(1); } num_seqs = load_seqs_two_files(input, &end_x, input2, &end_y); start_x = 0; start_y = end_x; debug(D_DEBUG,"First file contains %d sequences, stored from (%d,%d].\n", end_x, start_x, end_x); debug(D_DEBUG,"Second file contains %d sequences, stored from (%d,%d].\n", end_y-end_x, start_y, end_y); } fclose(input); debug(D_DEBUG,"Loaded %d sequences\n",num_seqs); init_cand_table(num_seqs * 5); init_mer_table(num_seqs * 5); if(repeats) { int repeat_count = init_repeat_mer_table(repeats, 2000000, 0); fclose(repeats); debug(D_DEBUG,"Loaded %d repeated mers\n", repeat_count); } if(rectangle_size == -1) { // Do get_mem_avail*0.95 to leave some memory for overhead rectangle_size = DYNAMIC_RECTANGLE_SIZE(max_mem_kb); debug(D_DEBUG,"Mem avail: %lu, rectangle size: %d\n",(unsigned long)MEMORY_FOR_MERS(max_mem_kb), rectangle_size); } int curr_start_x = start_x; int 
curr_start_y = start_y; candidate_t *output_list = 0; int num_in_list; while(curr_start_y < end_y) { while(curr_start_x < end_x) { if(start_x == start_y) { debug(D_DEBUG,"Loading mer table (%d,%d)\n", curr_rect_x, curr_rect_y); } else { debug(D_DEBUG,"Loading mer table for [%d,%d) and [%d,%d)\n",curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y)); } start_mem = get_mem_usage(); load_mer_table_subset(curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y), (curr_start_x == curr_start_y)); table_mem = get_mem_usage(); debug(D_DEBUG,"Finished loading, now generating candidates\n"); debug(D_DEBUG,"Memory used: %lu\n", table_mem - start_mem); generate_candidates(); cand_mem = get_mem_usage(); debug(D_DEBUG,"Total candidates generated: %llu\n", (long long unsigned int) total_cand); debug(D_DEBUG,"Candidate memory used: %lu\n", cand_mem - table_mem); output_list = retrieve_candidates(&num_in_list); output_candidate_list(output, output_list, num_in_list); free(output_list); fflush(output); debug(D_DEBUG,"Now freeing\n"); free_cand_table(); free_mer_table(); debug(D_DEBUG,"Successfully output and freed!\n"); curr_rect_x++; curr_start_x += rectangle_size; } curr_rect_y++; curr_start_y += rectangle_size; curr_rect_x = curr_rect_y; if(start_y == 0) { curr_start_x = curr_start_y; } else { curr_start_x = start_x; } } fclose(output); return 0; }
/*
 * Forward simulator entry point: simulates a birth-death tree, assigns HLA
 * types down the tree, evolves codon sequences along it, and writes the
 * resulting reference/query FASTA files plus a .csv of query HLA types.
 *
 * Inputs come from the command line (dgen_parse_cmdline) and a JSON
 * parameter file; optional FASTA inputs supply a consensus (cons_path) and
 * root (root_path) sequence. Relies on many file-level globals set by the
 * cmdline parser (paths, output FILE*s, N, M, lambda, mu_tree, kappa, ...).
 * Returns EXIT_SUCCESS; dies with a message on invalid input.
 */
int main(int argc, char **argv) {
  init_rand();
  setup_gsl_dgen();
  dgen_parse_cmdline(argc, argv);
  int i, j, h, p, k, c;                       // loop indices
  int cons_cap, num_cons = -1, root_cap, num_root = -1;
  char **cons_seqs, **root_seqs;
  int internal_node_index = 0, leaf_node_index = M;
  int max_internal_node_index = 2000000;      // hard caps on simulated tree size
  int max_leaf_node_index = 2000000;
  int sum_seen_or_unseen = 0;
  int ploidy, codon_sequence_length = -1, n_genes, total_n_HLA;
  Decimal mu;
  if(json_parameters_path == NULL) die("No .json file passed.");
  // First pass over the JSON: only the dimensioning quantities.
  load_lengths_for_simulation_from_json(json_parameters_path, &kappa, &mu, &codon_sequence_length, &total_n_HLA, &ploidy, &n_genes);
  if(ploidy < 1) die("Ploidy is less than 1.");
  if(n_genes < 1) die("Number of genes is less than 1.");
  if(cons_path != NULL) {
    printf("Loading sequences to obtain consensus...\n");
    num_cons = load_seqs(cons_path, &cons_seqs, &cons_cap);
    assert(num_cons > 0);
    printf("Loaded %i sequences to determine consensus.\n", num_cons);
    // The consensus file overrides the JSON sequence length; initially in
    // nucleotides, converted to codons after the length checks below.
    codon_sequence_length = strlen(cons_seqs[0]);
    printf("Codon_sequence_length: %i\n",codon_sequence_length/3);
    if(codon_sequence_length % 3 != 0)
      die("Sequences contain partial codons [%i mod 3 != 0].", codon_sequence_length);
    for(c = 0; c < num_cons; c++) {
      if((int) strlen(cons_seqs[c]) != codon_sequence_length) {
        die("Sequences from which to derive the consensus sequence aren't all "
            "the same length.");
      }
    }
    codon_sequence_length = codon_sequence_length/3;  // nucleotides -> codons
  }
  if(root_path != NULL) {
    printf("Loading sequences to obtain root...\n");
    // NOTE(review): unlike num_cons above, num_root is not asserted > 0 here
    // before root_seqs[0] is dereferenced — confirm load_seqs dies on empty.
    num_root = load_seqs(root_path, &root_seqs, &root_cap);
    printf("Loaded %i sequences to determine root.\n", num_root);
    if(cons_path == NULL) die("Did not pass a file to find the consensus sequence.");
    if((int) (strlen(root_seqs[0])/3) != codon_sequence_length)
      die("Sequences used to determine the root are different lengths to those used for the consensus.");
    for(c = 0; c < num_root; c++) {
      if((int) strlen(root_seqs[c]) != 3*codon_sequence_length) {
        die("Sequences from which to derive the root sequence aren't all "
            "the same length.");
      }
    }
  }
  Decimal *internal_node_times = my_malloc(max_internal_node_index * sizeof(Decimal) , __FILE__, __LINE__);
  Decimal *leaf_node_times = my_malloc(max_leaf_node_index * sizeof(Decimal), __FILE__, __LINE__);
  int *seen_or_unseen = my_malloc(max_internal_node_index * sizeof(int), __FILE__, __LINE__);
  // Simulate the tree topology times; fills node counts and seen/unseen flags.
  birth_death_simulation_backwards(max_internal_node_index, max_leaf_node_index,
    internal_node_times, leaf_node_times, &internal_node_index, &leaf_node_index,
    seen_or_unseen, N, M, lambda, mu_tree, past_sampling);
  for(i = 0; i < internal_node_index; i++) sum_seen_or_unseen += seen_or_unseen[i];
  // Total nodes in the constructed tree after dropping unseen internals.
  int total_nodes = (2 * leaf_node_index) - 1 + internal_node_index - sum_seen_or_unseen;
  Tree *tree = my_malloc((total_nodes+1) * sizeof(Tree), __FILE__, __LINE__);
  // Now malloc the memory that this points to: one flat HLA block, sliced
  // per node (ploidy * n_genes entries each).
  int *HLAs_in_tree = my_malloc((total_nodes+1) * ploidy * n_genes * sizeof(int), __FILE__, __LINE__);
  for(i = 0; i < total_nodes; i++) tree[i].HLAs = &HLAs_in_tree[i * ploidy * n_genes];
  construct_birth_death_tree(leaf_node_index, internal_node_index, leaf_node_times, internal_node_times, M, seen_or_unseen, tree);
  // Reverse the direction that time is measured in the tree.
  // DEV: Don't need to do this, waste of computation - sort.
  // DEV: The parent times are wrong when there are unseen nodes.
  for(i = 0; i < total_nodes; i++)
    tree[i].node_time = tree[total_nodes-1].node_time - tree[i].node_time;
  int root_node = tree[total_nodes-1].node;  // last constructed node is the root
  if(write_newick_tree_to_file == true) {
    write_newick_tree(newick_tree_data_file, tree, root_node, 1);
    fclose(newick_tree_data_file);
  }
  // Per-codon synonymous / non-synonymous rate portions (kappa weights
  // transitions vs transversions).
  Decimal S_portion[NUM_CODONS];
  Decimal NS_portion[NUM_CODONS];
  for(c = 0; c < NUM_CODONS; c++) {
    S_portion[c] = kappa * beta_S[c] + beta_V[c];
    NS_portion[c] = kappa * alpha_S[c] + alpha_V[c];
  }
  int n_HLA[n_genes];  // number of HLA types per gene (filled from JSON below)
  printf("Total number of HLA types: %i.\n", total_n_HLA);
  Decimal HLA_prevalences[total_n_HLA];
  int wildtype_sequence[codon_sequence_length];
  Decimal *R = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  Decimal *omega = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  Decimal *reversion_selection = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  memory_allocation(num_cons, num_root, codon_sequence_length, max_internal_node_index,
    max_leaf_node_index, total_nodes, ploidy, n_genes, total_n_HLA, leaf_node_index);
  int (*codon_sequence_matrix)[codon_sequence_length] = my_malloc(total_nodes * sizeof(int[codon_sequence_length]), __FILE__, __LINE__);
  Decimal (*HLA_selection_profiles)[codon_sequence_length] = my_malloc(total_n_HLA * sizeof(Decimal[codon_sequence_length]), __FILE__, __LINE__);
  // Second pass over the JSON: the per-site parameter arrays.
  load_parameters_for_simulation_from_json(json_parameters_path, codon_sequence_length,
    omega, R, reversion_selection, total_n_HLA, n_genes, n_HLA, HLA_prevalences, HLA_selection_profiles);
  // Sanity check: prevalences within each gene must sum to ~1 (tolerant of
  // rounding in the JSON).
  Decimal sum_check;
  for(i = 0, k = 0; i < n_genes; i++) {
    sum_check = 0;
    for(h = 0; h < n_HLA[i]; h++, k++) {
      sum_check += HLA_prevalences[k];
    }
    if(sum_check > 1.00001 || sum_check < 0.9999)
      die("HLA prevalences for gene %i do not sum to 1\n", i+1);
  }
  if(cons_path != NULL) {
    printf("Mapping gaps to consensus...\n");
    // Set the consensus sequence - the consensus of the optional sequence file
    // that is passed.
    char wildtype_sequence_dummy[3*codon_sequence_length+1];
    generate_consensus(cons_seqs, num_cons, 3*codon_sequence_length, wildtype_sequence_dummy);
    printf("Wildtype sequence:\n%s\n", wildtype_sequence_dummy);
    // By default, set the root as the wildtype sequence.
    for(i = 0; i < codon_sequence_length; i++)
      wildtype_sequence[i] = (int) amino_to_code(wildtype_sequence_dummy+i*3);
    if(root_path == NULL) {
      for(i = 0; i < codon_sequence_length; i++)
        codon_sequence_matrix[root_node][i] = wildtype_sequence[i];
    } else {
      printf("Mapping gaps to root...\n");
      char root_sequence_dummy[3*codon_sequence_length+1];
      generate_consensus(root_seqs, num_root, 3*codon_sequence_length, root_sequence_dummy);
      printf("Root sequence:\n%s\n", root_sequence_dummy);
      for(i = 0; i < codon_sequence_length; i++)
        codon_sequence_matrix[root_node][i] = (int) amino_to_code(root_sequence_dummy+i*3);
      printf("Number of root sequences: %i.\n", num_root);
      for(c = 0; c < num_root; c++) free(root_seqs[c]);
      free(root_seqs);
    }
    printf("Number of consensus sequences: %i.\n", num_cons);
    for(c = 0; c < num_cons; c++) free(cons_seqs[c]);
    free(cons_seqs);
  } else {
    for(i = 0; i < codon_sequence_length; i++) {
      // Sample the root sequence according to the HIV codon usage information.
      codon_sequence_matrix[root_node][i] = discrete_sampling_dist(NUM_CODONS, prior_C1);
      // As default, set the root node to the consensus sequence.
      wildtype_sequence[i] = codon_sequence_matrix[root_node][i];
    }
  }
  // No matter what is read in, there is no recombination simulated - so make sure it's set to 0.
  for(i = 0; i < codon_sequence_length; i++) R[i] = 0;
  write_summary_json(json_summary_file, mu, codon_sequence_length, ploidy, n_genes,
    n_HLA, total_n_HLA, HLA_prevalences, omega, R, reversion_selection, HLA_selection_profiles);
  free(R);
  // Write the root sequence as FASTA.
  fprintf(simulated_root_file, ">root_sequence\n");
  for(i = 0; i < codon_sequence_length; i++)
    fprintf(simulated_root_file, "%s", code_to_char(codon_sequence_matrix[root_node][i]));
  fprintf(simulated_root_file, "\n");
  // Sample the root's HLA types: for each gene, ploidy independent draws from
  // that gene's prevalence distribution, offset into the global HLA indexing.
  int root_HLA[ploidy * n_genes];
  int cumulative_n_HLA = 0;
  for(i = 0, k = 0; i < n_genes; i++) {
    for(p = 0; p < ploidy; p++, k++) {
      root_HLA[k] = cumulative_n_HLA + discrete_sampling_dist(n_HLA[i], &HLA_prevalences[cumulative_n_HLA]);
      tree[root_node].HLAs[k] = root_HLA[k];
    }
    cumulative_n_HLA = cumulative_n_HLA + n_HLA[i];
  }
  printf("Passing HLA information...\n");
  pass_HLA(ploidy, n_genes, root_node, tree, leaf_node_index, total_n_HLA, n_HLA, HLA_prevalences);
  printf("Passed HLA information\n");
  // printf("Printing the tree\n");
  // for(i = 0; i < total_nodes; i++) {
  //   printf("%i %i %i "DECPRINT" %i", tree[i].node, tree[i].daughter_nodes[0],
  //          tree[i].daughter_nodes[1], tree[i].node_time,
  //          tree[i].seen_or_unseen);
  //   for(j = 0; j < (ploidy * n_genes); j++) {
  //     printf(" %i", tree[i].HLAs[j]);
  //   }
  //   printf("\n");
  // }
  if(write_tree_to_file == true) {
    write_tree(tree_data_file, tree, root_node, ploidy, n_genes);
    fclose(tree_data_file);
  }
  printf("Passing sequence information...\n");
  // Evolve codon sequences down the tree under the mutation/selection model.
  pass_codon_sequence_change(codon_sequence_length, ploidy, n_genes, total_n_HLA,
    root_node, mu, codon_sequence_matrix, tree, leaf_node_index, S_portion, NS_portion,
    HLA_selection_profiles, wildtype_sequence, omega, reversion_selection);
  printf("Passed sequence information\n"
         "Now generating .fasta files of reference and query sequences, and\n"
         "a .csv file of the HLA information associated to the query sequences.\n");
  if(num_queries < 0) {
    // Set the number of query sequences.
    num_queries = (int) (query_fraction * leaf_node_index);
    printf("Number of queries: %i.\n", num_queries);
  } else {
    printf("Number of queries: %i.\n", num_queries);
  }
  if(num_queries > leaf_node_index) die("Number of query sequences larger than the number of leaves");
  int *all_sequences = my_malloc(leaf_node_index * sizeof(int), __FILE__, __LINE__);
  int num_refs = leaf_node_index - num_queries;
  for(i = 0; i < leaf_node_index; i++) all_sequences[i] = i;
  // Leaves [0, num_refs) become references, [num_refs, leaf_node_index) queries.
  save_simulated_ref_and_query_fasta(num_queries, num_refs, leaf_node_index,
    all_sequences, codon_sequence_length, codon_sequence_matrix, tree, ploidy, n_genes);
  // Now save the hla types to a .csv file.
  // Header: empty corner cell, then one column per HLA type (1-based).
  fprintf(hla_query_file, "\"\",");
  for(h = 0; h < total_n_HLA-1; h++) fprintf(hla_query_file, "\"%i\",", h+1);
  fprintf(hla_query_file, "\"%i\"\n", total_n_HLA);
  // Write the HLA types of the leaves to a file.
  // hla_types[i][h] is a 0/1 indicator: does leaf i carry HLA type h?
  int (*hla_types)[total_n_HLA] = my_malloc(leaf_node_index * sizeof(int[total_n_HLA]), __FILE__, __LINE__);
  for(i = 0; i < leaf_node_index; i++) {
    for(h = 0; h < total_n_HLA; h++) hla_types[i][h] = 0;
    for(j = 0; j < (n_genes * ploidy); j++) hla_types[i][tree[i].HLAs[j]] = 1;
  }
  // Write the query HLA types to a .csv file.
  for(i = num_refs; i < leaf_node_index; i++) {
    fprintf(hla_query_file,"\"simulated_seq_%i_HLA", all_sequences[i]+1);
    for(h = 0; h < (ploidy * n_genes); h++)
      fprintf(hla_query_file, "_%i", tree[all_sequences[i]].HLAs[h]);
    fprintf(hla_query_file, "\"");
    for(h = 0; h < total_n_HLA; h++) {
      fprintf(hla_query_file, ",%i", hla_types[all_sequences[i]][h]);
    }
    fprintf(hla_query_file, "\n");
  }
  // Cleanup. tree[0].HLAs points at the start of the flat HLAs_in_tree block,
  // so freeing it releases every node's HLA storage.
  free(hla_types);
  free(internal_node_times);
  free(leaf_node_times);
  free(seen_or_unseen);
  free(codon_sequence_matrix);
  free(HLA_selection_profiles);
  free(all_sequences);
  free(omega);
  free(reversion_selection);
  free(tree[0].HLAs);
  free(tree);
  fclose(summary_file);
  fclose(json_summary_file);
  fclose(simulated_refs_file);
  fclose(simulated_root_file);
  fclose(simulated_queries_file);
  fclose(hla_query_file);
  clearup_gsl_dgen();
  return EXIT_SUCCESS;
}
// Pick a representative sequence for each OTU and print it to stdout.
//
// Usage: <prog> -makereps <alignment> <otu>
//   argv[1] alignment file, argv[2] OTU listing: a '>' line starts each OTU,
//   following lines are tab-delimited entries whose first field names a
//   member sequence.
// For every OTU the member with the highest abundance-weighted average
// identity to the rest of the OTU is chosen and emitted as a FASTA-like
// record (">OTU<n> <name> avg_id=... otu_size=...").
// Terminates via exit(0); dies on malformed input.
// NOTE(review): relies on file-level globals (arr, seqlen, dists, partsTotal,
// taskman, dfunc, mutex, nthreads, winlen) — behavior depends on their state.
void actionMakeReps() {
  ldieif(argvc<3,"syntax: "+efile(argv[0]).basename()+" -makereps <alignment> <otu>");
  estrhashof<INDTYPE> seqind;   // sequence name -> index (into global arr)
  estrarray uarr;               // uncompressed copy, used to print sequences
  cout << "# loading seqs file: " << argv[1] << endl;
  load_seqs_compressed(argv[1],arr,seqind,seqlen);
  load_seqs(argv[1],uarr);
  earray<ebasicarray<INDTYPE> > otus;
  efile f;
  estr line;
  estrarray parts;
  // Parse the OTU file: '>' starts a new OTU, '#'/empty lines are skipped,
  // any other line adds one member to the current OTU.
  f.open(argv[2],"r");
  while (!f.eof()){
    f.readln(line);
    if (line.len()==0 || line[0]=='#') continue;
    if (line[0]=='>'){
      otus.add(ebasicarray<INDTYPE>());
      continue;
    }
    ldieif(otus.size()==0,"first entry not start of OTU or missing '>'");
    parts=line.explode("\t");
    ldieif(parts.size()==0,"array empty: "+line);
    ldieif(!seqind.exists(parts[0]),"sequence not found: "+parts[0]);
    otus[otus.size()-1].add(seqind[parts[0]]);
  }
  cerr << endl;
  // Collapse duplicates: uniqmask[i] holds the multiplicity of unique
  // sequence i (0 for sequences that are duplicates of another) — presumably
  // including the sequence itself; confirm against finduniq().
  ebasicarray<INDTYPE> tuniqind;
  earray<ebasicarray<INDTYPE> > dupslist;
  finduniq(tuniqind,dupslist);
  eintarray uniqmask;
  uniqmask.init(arr.size(),0);
  for (long i=0; i<tuniqind.size(); ++i) uniqmask[tuniqind[i]]=dupslist[i].size();
//  ebasicarray<INDTYPE> uniqind;
  taskman.createThread(nthreads);
  ebasicarray<INDTYPE> uniqind;
  const float t=0.0;  // distance threshold passed to the worker tasks
  efloatarray avgdist;
  for (long j=0; j<otus.size(); ++j){
//    cout << "# computing distances for otu "<< j << " size: " << otus[j].size() << endl;
    // Singleton OTU: its only member is the representative.
    if (otus[j].size()==1){
      cout << ">OTU" << j << " " << arr.keys(otus[j][0]) << " avg_id=1.0 otu_size=1" << endl;
      cout << uarr.values(otus[j][0]) << endl;
      continue;
    }
    // Restrict to the unique representatives present in this OTU.
    uniqind.clear();
    for (long l=0; l<otus[j].size(); ++l){
      if (uniqmask[otus[j][l]]!=0) uniqind.add(otus[j][l]);
    }
//    uniqind=otus[j];
    ldieif(uniqind.size()==0,"empty OTU");
    // All members identical to a single unique sequence: trivial choice.
    if (uniqind.size()==1){
      cout << ">OTU" << j << " " << arr.keys(uniqind[0]) << " avg_id=1.0 otu_size=" << otus[j].size() << endl;
      cout << uarr.values(uniqind[0]) << endl;
      continue;
    }
    // Compute pairwise distances within the OTU in parallel, splitting the
    // work into partsTotal tasks over the shared taskman thread pool.
    avgdist.clear();
    avgdist.init(arr.size(),0.0);
    dists.clear();
    partsTotal=10000;
    if (partsTotal>(uniqind.size()-1l)*uniqind.size()/20l) partsTotal=(uniqind.size()-1l)*uniqind.size()/20l; // make fewer tasks if too few calculations per task
    if (partsTotal<=0) partsTotal=1;
    taskman.clear();
    for (long i=0; i<partsTotal; ++i)
      taskman.addTask(dfunc.value().calcfunc,evararray(mutex,uniqind,arr,dists,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen));
    taskman.wait();
    // Accumulate abundance-weighted scores per unique member. NOTE(review):
    // d.dist appears to hold similarity/identity (the output labels the
    // result avg_id) — confirm in the distance function.
    for (long i=0; i<dists.size(); ++i){
      eseqdist& d(dists[i]);
      avgdist[d.x]+=d.dist*uniqmask[d.y];
      avgdist[d.y]+=d.dist*uniqmask[d.x];
//      cout << "# "<< arr.keys(d.x) << " " << arr.keys(d.y) << " " << d.dist << " " << uniqmask[d.x] << " " << uniqmask[d.y] << endl;
    }
    // Choose the member with the highest accumulated score; the
    // uniqmask[ti]-1 term credits a member's own duplicates as identity 1.
    long k=uniqind[0];
    for (long i=0; i<uniqind.size(); ++i){
      long ti=uniqind[i];
      avgdist[ti]+=uniqmask[ti]-1;
      if (avgdist[k]<avgdist[ti]) {
//        cout << "# " << arr.keys(ti) << " " << ti << " " << uniqmask[ti] << " " << avgdist[ti] << " " << counts[ti] << endl;
        k=ti;
      }
    }
//    cout << "OTU" << j << " " << otus[j].size() << " " << arr.keys(k) << " " << avgdist[k]/(otus[j].size()-1) << " " << dists.size() << endl;
    cout << ">OTU" << j << " " << arr.keys(k) << " avg_id=" << avgdist[k]/(otus[j].size()-1) << " otu_size=" << otus[j].size() << endl;
    cout << uarr.values(k) << endl;
  }
  cerr << endl;
  exit(0);
}