Beispiel #1
0
/**
 * Reclaim the storage referenced by one instance of a compound type.
 *
 * The reclaimer offset is first aligned to the compound's alignment, then
 * every field (and every element of an array-valued field) is reclaimed in
 * declaration order via bin_reclaim_datar. On success the offset is left
 * pointing just past the compound instance (aligned start + tsym->typ.size).
 *
 * @param tsym       symbol describing the compound type
 * @param reclaimer  memory cursor; its offset is updated in place
 * @return NC_NOERR on success, otherwise the first error reported by
 *         bin_reclaim_datar (the offset is then left wherever the failing
 *         call put it)
 */
static int
bin_reclaim_compound(Symbol* tsym, Reclaim* reclaimer)
{
    int stat = NC_NOERR;
    int nfields;
    size_t fid, i, arraycount;
    ptrdiff_t saveoffset;

    reclaimer->offset = read_alignment(reclaimer->offset,tsym->typ.cmpdalign);
    saveoffset = reclaimer->offset;

    /* Get info about each field in turn and reclaim it */
    nfields = listlength(tsym->subnodes);
    for(fid=0;fid<(size_t)nfields;fid++) {
        Symbol* field = (Symbol*)listget(tsym->subnodes,fid);
        int ndims = field->typ.dimset.ndims;

        /* Compute the total number of elements in the field array.
         * The product must restart at 1 for every field: a scalar field
         * (ndims == 0) has exactly one element, and a stale value from the
         * previous field must not leak into this one. */
        arraycount = 1;
        for(i=0;i<(size_t)ndims;i++)
            arraycount *= field->typ.dimset.dimsyms[i]->dim.declsize;

        reclaimer->offset = read_alignment(reclaimer->offset,field->typ.alignment);
        /* The offset is not touched between calls here, so each call is
         * relied upon to step past the element it reclaimed. */
        for(i=0;i<arraycount;i++) {
            if((stat = bin_reclaim_datar(field->typ.basetype, reclaimer))) goto done;
        }
    }
    /* Skip the whole instance, including any trailing padding */
    reclaimer->offset = saveoffset;
    reclaimer->offset += tsym->typ.size;
done:
    return stat;
}
Beispiel #2
0
/**
 * Read an alignment without knowing its alphabet in advance.
 *
 * The file is first read with the "nt" alphabet code; if that attempt raises
 * an AlphabetException the read is repeated with the "aa" code. Any other
 * exception, or a failure of the second attempt, propagates to the caller.
 *
 * @param filename     path of the alignment file
 * @param file_format  format name forwarded to the four-argument overload
 * @param interleaved  forwarded unchanged to the four-argument overload
 * @return the site container produced by whichever attempt succeeded
 */
shared_ptr<VectorSiteContainer> SiteContainerBuilder::read_alignment(string filename,
        string file_format, bool interleaved)
                throw (Exception) {
    try {
        shared_ptr<VectorSiteContainer> sites =
                read_alignment(filename, file_format, "nt", interleaved);
        return sites;
    }
    catch (AlphabetException &) {
        // The "nt" alphabet was rejected: retry with "aa".
        shared_ptr<VectorSiteContainer> sites =
                read_alignment(filename, file_format, "aa", interleaved);
        return sites;
    }
}
/**
 * Load an alignment file and turn its column patterns into a Counts table.
 *
 * @param T       tree whose leaf names give the species order of the result
 * @param data    output; nalpha, nspecies, nstates, c, N and species are overwritten
 * @param fname   alignment file handed to read_alignment
 * @param nalpha  requested alphabet size; only 4 is supported. Any other
 *                value prints a notice and processing continues with 4.
 * @throws std::length_error when the species of the file cannot be matched
 *         with the species of the tree
 */
void read_counts(Tree &T, Counts &data, std::string fname, long nalpha) {
  int sp;
  unsigned int pat;
  long slot;
  State file_state, tree_state;
  std::string pattern;
  std::vector<long> perm;

  // The reader is hard-wired to a four-letter alphabet.
  if (nalpha != 4) {
    std::cout << "Reading counts only implemented for 4 letters." << std::endl;
  }

  // Parse the alignment; the patterns end up in _orbitals and their
  // multiplicities in _counts.
  read_alignment(fname);

  // Basic dimensions of the table.
  data.nalpha = 4;
  data.nspecies = g_numSpecies;

  // nstates = nalpha raised to the power nspecies.
  data.nstates = 1;
  for (sp = 0; sp < data.nspecies; ++sp) {
    data.nstates *= data.nalpha;
  }

  // Two scratch states: one in file order, one in tree order.
  create_state(file_state, data.nspecies, data.nalpha);
  create_state(tree_state, data.nspecies, data.nalpha);

  // Work out how the species of the file map onto the species of the tree.
  const bool matched = match_species(T, g_nameSpecies, g_numSpecies, perm);
  if (!matched) {
    throw std::length_error( "Could not match species in tree with species in fasta." );
  }

  // Copy every pattern count into the slot of its (permuted) state and
  // accumulate the grand total.
  data.c.resize(data.nstates);
  data.N = 0;
  for (pat = 0; pat < _orbitals.size(); ++pat) {
    pattern = transform_adn_chain_val_to_string(_orbitals[pat]);
    string2state(pattern, file_state);
    permute_state(perm, file_state, tree_state);
    slot = state2index(tree_state);
    data.c[slot] = static_cast<double>(_counts[pat]);
    data.N += data.c[slot];
  }

  // Species names are taken from the tree leaves.
  data.species.resize(data.nspecies);
  for (sp = 0; sp < T.nleaves; ++sp) {
    data.species[sp] = T.names[sp];
  }
}
Beispiel #4
0
/**
 * Align the given sequences in place through temporary files.
 *
 * The sequences are written as FASTA records named by their index to one
 * temporary file, align_file is run from that file to a second temporary
 * file, and the rows read back from the result replace the contents of seqs.
 * Both temporary files are removed afterwards unless the NPGE_DEBUG option
 * is true.
 *
 * @param seqs  sequences to align; replaced by the rows read from the output
 */
void ExternalAligner::align_seqs_impl(Strings& seqs) const {
    std::string input = tmp_file();
    ASSERT_FALSE(input.empty());
    std::string output = tmp_file();
    ASSERT_FALSE(output.empty());
    {
        // Inner scope: the stream handle is released before align_file runs.
        boost::shared_ptr<std::ostream> fasta = name_to_ostream(input);
        int index = 0;
        while (index < seqs.size()) {
            write_fasta(*fasta, TO_S(index), "", seqs[index], 60);
            index++;
        }
    }
    align_file(input, output);

    Strings rows;
    read_alignment(rows, output);
    ASSERT_EQ(rows.size(), seqs.size());
    seqs.swap(rows);

    // Keep the temporary files around when debugging is requested.
    const bool keep_files = go("NPGE_DEBUG").as<bool>();
    if (keep_files) {
        return;
    }
    remove_file(input);
    remove_file(output);
}
Beispiel #5
0
/**
 * Reclaim the storage referenced by one variable-length (vlen) instance.
 *
 * The nc_vlen_t found at the current reclaimer position is inspected; when
 * it owns a payload (p != NULL) each of its len elements is reclaimed via
 * bin_reclaim_datar and the payload itself is released with efree. On
 * success the reclaimer offset is advanced past the vlen slot whether or not
 * a payload was present, so the caller continues with the next element.
 *
 * @param tsym       symbol describing the vlen type
 * @param reclaimer  memory cursor of the enclosing object; offset updated in place
 * @return NC_NOERR on success, otherwise the first error reported by
 *         bin_reclaim_datar (payload is then not freed and the offset is
 *         left unchanged)
 */
static int
bin_reclaim_vlen(Symbol* tsym, Reclaim* reclaimer)
{
    int stat = NC_NOERR;
    size_t i;
    Symbol* basetype = tsym->typ.basetype;
    nc_vlen_t* vl = (nc_vlen_t*)(reclaimer->memory+reclaimer->offset);

    /* Free up each entry in the vlen list */
    if(vl->p != NULL) {
        /* Walk the payload with its own cursor, starting at offset 0 */
        Reclaim vreclaimer;
        vreclaimer.memory = vl->p;
        vreclaimer.offset = 0;
        for(i=0;i<vl->len;i++) {
            vreclaimer.offset = read_alignment(vreclaimer.offset,basetype->typ.alignment);
            if((stat = bin_reclaim_datar(basetype,&vreclaimer))) goto done;
            /* NOTE(review): if bin_reclaim_datar already steps the cursor past
             * the element it reclaimed, this adds the element size a second
             * time - confirm against bin_reclaim_datar. */
            vreclaimer.offset += basetype->typ.size;
        }
        efree(vl->p);
    }
    /* The vlen slot occupies tsym->typ.size bytes even when it is empty, so
     * the parent cursor must move on in both cases. */
    reclaimer->offset += tsym->typ.size;
done:
    return stat;
}
Beispiel #6
0
/* Free every still-loaded input buffer plus the three per-input bookkeeping arrays. */
static void release_input_buffers(char** input,unsigned short int* input_type,unsigned short int* input_numseq,int num_input)
{
	int k;
	for (k = 0; k < num_input;k++){
		free(input[k]);
	}
	free(input_numseq);
	free(input_type);
	free(input);
}

/*
 * Load every input named in param->infile, detect its format, count its
 * sequences and read them into aln.
 *
 * Slot 0 of param->infile is always considered (a NULL name there is
 * reported as STDIN); further slots are used up to the first NULL entry.
 * Each loaded buffer is classified by marker strings:
 *   0 = fasta (fallback), 1 = macsim xml, 2 = uniprot xml, 3 = swissprot,
 *   4 = clustal / PileUp / MSF, 5 = stockholm.
 * A slot after the first that cannot be loaded is taken as the output file
 * name when param->outfile is not yet set.
 *
 * When param->alignment_type selects a profile alignment, every input is read
 * as an existing alignment and becomes one partial profile; all sequences of
 * such an input must have the same length. Otherwise the inputs are read as
 * plain sequences.
 *
 * Fewer than two sequences in total prints the usage text and calls exit(0).
 * A feature alignment without feature-bearing input, or an unaligned profile
 * input, frees the working data (including param) and raises through
 * throwKalignException.
 *
 * Returns the alignment returned by the last reader call.
 */
struct alignment* detect_and_read_sequences(struct alignment* aln,struct parameters* param)
{
	int feature = 0;
	char **input = 0;
	unsigned short int* input_type = 0;
	unsigned short int* input_numseq = 0;

	int num_input = 0;
	int num_files = 0;
	int i = 0;
	int j = 0;
	int c = 0;
	int a,b;
	int src = 0;
	const char* bad_name = 0;
	int free_read = 0;
	/* The context value is discarded: the local counter restarts from zero below. */
	unsigned int numseq = get_kalign_context()->numseq;

	/* Slot 0 is always counted; after that, count names up to the first NULL. */
	num_input = 1;
	while(param->infile[num_input]){
		num_input++;
	}
	num_files = num_input;
	/* NOTE(review): the running total lives only in this local; nothing in this
	 * function stores it back into the kalign context - confirm that
	 * aln_alloc and the readers do not depend on it. */
	numseq = 0;

	input = malloc(sizeof(char*) * num_input);
	input_type = malloc(sizeof(unsigned short int) * num_input);
	input_numseq = malloc(sizeof(unsigned short int) * num_input);

	for (i = 0; i < num_input;i++){
		input[i] = 0;
		input_type[i] = 0;
		input_numseq[i] = 0;
	}

	/* With param->quiet set, slot 0 is skipped altogether. */
	if(param->quiet){
		c = 1;
	}else{
		c = 0;
	}

	for (i = c; i < num_input;i++){
		if(!param->infile[i]){
			k_printf("reading from STDIN: ");
		}else{
			k_printf("reading from %s: ",param->infile[i]);
		}
		input[i] = get_input_into_string(input[i],param->infile[i]);
		if(input[i]){
			/* free_read counts the inputs that could be loaded at all */
			free_read++;
			if (byg_start("<macsim>",input[i]) != -1){
				input_numseq[i] = count_sequences_macsim(input[i]);
				feature = 1;
				input_type[i] = 1;
			}else if (byg_start("<uniprot",input[i]) != -1){
				input_numseq[i] = count_sequences_uniprot(input[i]);
				input_type[i] = 2;
			}else if(byg_start("This SWISS-PROT",input[i]) != -1 || byg_start("This Swiss-Prot",input[i]) != -1){
				input_numseq[i] = count_sequences_swissprot(input[i]);
				input_type[i] = 3;
			}else if (byg_start("CLUSTAL W",input[i]) != -1 || byg_start("PileUp",input[i]) != -1 || byg_start("MSF:",input[i]) != -1){
				input_numseq[i] = count_sequences_clustalw(input[i]);
				input_type[i] = 4;
			}else if (byg_start("STOCKHOLM",input[i]) != -1){
				input_numseq[i] = count_sequences_stockholm(input[i]);
				input_type[i] = 5;
			}else{
				input_numseq[i]  = count_sequences_fasta(input[i]);
				input_type[i] = 0;
			}
			k_printf("found %d sequences\n",input_numseq[i]);

			if(input_numseq[i] < 1){
				/* Nothing usable in this input: drop it so later passes skip it */
				free(input[i]);
				input[i] = 0;
			}else{
				numseq += input_numseq[i];
			}
		}else{
			k_printf("found no sequences.\n");
			/* An unreadable name after the first slot is taken as the output file */
			if(!param->outfile && i){
				param->outfile = param->infile[i];
				k_printf("-> output file, in ");
				/* Guess the output format from the file name when none was given */
				if(!param->format){
					if (byg_start("msf",param->outfile) != -1){
						param->format = "msf";
					}else if (byg_start("clustal",param->outfile) != -1){
						param->format = "clustal";
					}else if (byg_start("aln",param->outfile) != -1){
						param->format = "clustal";
					}else if (byg_start("macsim",param->outfile) != -1){
						param->format = "macsim";
					}else{
						param->format = "fasta";
					}
					/* param->format is always set at this point */
					if(param->reformat){
						k_printf("unaligned fasta format\n");
					}else{
						k_printf("%s format\n",param->format);
					}
				}
			}
			k_printf("\n");
		}
	}

	if(numseq < 2){
		k_printf("%s\n", usage);
		if(!numseq){
			k_printf("\nWARNING: No sequences found.\n\n");
		}else{
			k_printf("\nWARNING: Only one sequence found.\n\n");
		}
		release_input_buffers(input,input_type,input_numseq,num_input);
		free_param(param);
		exit(0);
	}

	/* A profile alignment needs at least two loaded inputs; otherwise fall back */
	if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){
		if( free_read  < 2){
			k_printf("\nWARNING: You are trying to perform a profile - profile alignment but ony one input file was detected.\n\n");
			param->alignment_type = "default";
		}
	}

	if (param->feature_type && !feature){
		release_input_buffers(input,input_type,input_numseq,num_input);
		free_param(param);
		throwKalignException(k_printf("\nWARNING: You are trying to perform a feature alignment but the input format(s) do not contain feature information.\n"));
	}

	get_kalign_context()->numprofiles = (numseq << 1) - 1;
	aln = aln_alloc(aln);

	if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){
		/* Profile mode: every usable input becomes one partial profile,
		 * stored at index numseq + (running profile number). */
		j = 0;
		for (i = 0; i < num_input;i++){
			if(input[i]){
				switch(input_type[i]){
					case 1:
						aln = read_alignment_macsim_xml(aln,input[i]);
						break;
					case 2:
						aln = read_alignment_uniprot_xml(aln,input[i]);
						break;
					case 3:
						aln = read_alignment_from_swissprot(aln, input[i]);
						break;
					case 4:
						aln = read_alignment_clustal(aln,input[i]);
						break;
					case 5:
						aln = read_alignment_stockholm(aln,input[i]);
						break;
					case 0:
					default:
						aln = read_alignment(aln,input[i]);
						break;
				}
				/* presumably the reader has released the buffer - TODO confirm */
				input[i] = 0;
				/* create partial profile */
				aln->nsip[numseq+j] = input_numseq[i];
				aln->sip[numseq+j] = malloc(sizeof(int)*aln->nsip[numseq+j]);
				j++;
			}
		}
		num_input = j;

		/* Number the member sequences of each profile consecutively and give
		 * the profile the length of its first member. */
		c = 0;
		for (i = 0;i < num_input;i++){
			for ( j = 0; j < aln->nsip[numseq+i];j++){
				aln->sip[numseq+i][j] = c;
				c++;
			}
			aln->sl[numseq+i] = aln->sl[aln->sip[numseq+i][0]];
		}

		/* Sanity check: all sequences of one input must have the same length.
		 * Comparing every member with the first one is sufficient. */
		for (i = 0;i < num_input;i++){
			a = aln->sl[aln->sip[numseq+i][0]];
			for ( j = 1; j < aln->nsip[numseq+i];j++){
				b = aln->sl[aln->sip[numseq+i][j]];
				if(a != b){
					/* Profile i is the i-th input that yielded sequences; find
					 * its slot in param->infile before anything is freed. */
					c = 0;
					for (src = 0; src < num_files;src++){
						if(input_numseq[src] >= 1){
							if(c == i){
								break;
							}
							c++;
						}
					}
					if(src >= num_files){
						bad_name = "(unknown)";
					}else if(!param->infile[src]){
						bad_name = "STDIN";
					}else{
						/* assumes free_param does not free the name strings
						 * themselves - TODO confirm */
						bad_name = param->infile[src];
					}
					release_input_buffers(input,input_type,input_numseq,num_input);
					free_aln(aln);
					free_param(param);
					throwKalignException(k_printf("Unaligned sequences in input %s.\n",bad_name));
				}
			}
		}
	}else{
		/* Default mode: read every usable input as plain sequences */
		for (i = 0; i < num_input;i++){
			if(input[i]){
				switch(input_type[i]){
					case 1:
						aln = read_sequences_macsim_xml(aln,input[i]);
						break;
					case 2:
						aln = read_sequences_uniprot_xml(aln,input[i]);
						break;
					case 3:
						aln = read_sequences_from_swissprot(aln, input[i]);
						break;
					case 4:
						aln = read_sequences_clustal(aln,input[i]);
						break;
					case 5:
						aln = read_sequences_stockholm(aln,input[i]);
						break;
					case 0:
					default:
						aln = read_sequences(aln,input[i]);
						break;
				}
				/* presumably the reader has released the buffer - TODO confirm */
				input[i] = 0;
			}
		}
	}
	if(numseq < 2){
		free_param(param);
		throwKalignException(k_printf("\nNo sequences could be read.\n"));
	}
	if(!param->format && param->outfile){
			if (byg_start("msf",param->outfile) != -1){
				param->format = "msf";
			}else if (byg_start("clustal",param->outfile) != -1){
				param->format = "clustal";
			}else if (byg_start("aln",param->outfile) != -1){
				param->format = "clustal";
			}else if (byg_start("macsim",param->outfile) != -1){
				param->format = "macsim";
			}
			/* NOTE(review): param->format is still NULL here when no pattern
			 * matched, and is then passed to %s - confirm k_printf copes. */
			k_printf("Output file: %s, in %s format.\n",param->outfile,param->format);
	}

	free(input);
	free(input_type);
	free(input_numseq);
	return aln;
}