Example #1
0
/*
 * Test case: parses the FASTA fixture "test/data/test.fasta" with
 * read_fasta_file(), asserts that it reported E_SUCCESS, and logs every
 * sequence stored in the resulting table.
 *
 * t: test context handed to the t_set_msg / t_assert_msg / t_log helpers.
 *
 * NOTE(review): the end of this function is not visible in this excerpt.
 */
void test_read_fasta_file(struct test *t) {
  t_set_msg(t, "Testing reading a fasta File...");
  /* Table head filled in by read_fasta_file(); starts out empty. */
  struct genome_sequence *sequence_table = NULL;
  char filename[30] = "test/data/test.fasta";
  int err = read_fasta_file(&sequence_table, filename);
  t_assert_msg(t, err == E_SUCCESS, "Parsing failed");
  /* The status code is logged unconditionally, even when parsing succeeded. */
  t_log(t, "Error: %d\n", err);

  struct genome_sequence *s = NULL;
  struct genome_sequence *tmp = NULL;
  /* Visit every entry of the table (presumably uthash's HASH_ITER, with
     `tmp` as its scratch pointer -- confirm against the project headers). */
  HASH_ITER(hh, sequence_table, s, tmp) {
    /* Terminate the data with a NUL byte so it can be printed with %s.
       NOTE(review): assumes s->data has room for index s->n -- confirm
       against the reader's allocation. */
    s->data[s->n] = 0;
    t_log(t, "%s %s \n\n", s->chrom, s->data);
  }
Example #2
0
/*
 * Reads every FASTA file listed in `chrom_files` and packs all chromosomes
 * found there into `genome`, in file order.
 *
 * On return:
 *   - genome.num_of_chroms    = number of chromosome sequences read
 *   - genome.length_of_genome = sum of all chromosome lengths
 *   - genome.name[i] / genome.length[i] describe chromosome i
 *   - genome.start_index[i]   = offset of chromosome i in genome.sequence;
 *     the extra last element holds the total length
 *   - genome.sequence         = all chromosomes concatenated, each base
 *     upper-cased and then passed through toACGT()
 *   - genome.strand           = '+'
 *
 * Progress messages are written to stderr.
 *
 * NOTE(review): lengths are accumulated in uint32_t, so a genome whose
 * total length exceeds UINT32_MAX would wrap -- confirm this limit is
 * acceptable for the intended inputs.
 */
void ReadGenome(const vector<string>& chrom_files, Genome& genome) {
  fprintf(stderr, "[READING CHROMOSOMES]\n");
  vector<string> chrom_names;
  vector<string> chrom_seqs;
  uint32_t all_chroms_len = 0;
  for (uint32_t i = 0; i < chrom_files.size(); ++i) {
    vector<string> tmp_chrom_names;
    vector<string> tmp_chrom_seqs;

    /* read chromosome names and sequences from the chromosome file */
    read_fasta_file(chrom_files[i].c_str(), tmp_chrom_names, tmp_chrom_seqs);

    for (uint32_t j = 0; j < tmp_chrom_seqs.size(); ++j) {
      chrom_names.push_back(tmp_chrom_names[j]);
      chrom_seqs.push_back(tmp_chrom_seqs[j]);
      all_chroms_len += tmp_chrom_seqs[j].size();
    }
  }

  /* copy chroms sequences to genome */
  genome.num_of_chroms = chrom_seqs.size();
  genome.length_of_genome = all_chroms_len;
  fprintf(stderr, "[THERE ARE %u CHROMOSOMES IN THE GENOME]\n",
          genome.num_of_chroms);
  fprintf(stderr, "[THE TOTAL LENGTH OF ALL CHROMOSOMES IS %u]\n",
          all_chroms_len);

  genome.name.resize(genome.num_of_chroms);
  genome.length.resize(genome.num_of_chroms);

  /* The last element of genome.start_index is the length of all chroms */
  genome.start_index.resize(genome.num_of_chroms + 1);
  genome.strand = '+';
  genome.sequence.resize(all_chroms_len);
  uint32_t k = 0;
  for (uint32_t i = 0; i < genome.num_of_chroms; ++i) {
    genome.name[i] = chrom_names[i];
    genome.length[i] = chrom_seqs[i].size();
    genome.start_index[i] = k;

    for (uint32_t j = 0; j < chrom_seqs[i].size(); ++j) {
      /* toupper() requires a value representable as unsigned char (or EOF);
         passing a negative plain char (e.g. a stray non-ASCII byte in the
         input file) is undefined behaviour, hence the cast. */
      genome.sequence[k] =
          toupper(static_cast<unsigned char>(chrom_seqs[i][j]));
      genome.sequence[k] = toACGT(genome.sequence[k]);
      k++;
    }
  }
  genome.start_index[genome.num_of_chroms] = k;
}
Example #3
0
// Unit-test driver: loads the FASTA file named by the first command-line
// argument and prints the header and sequence of every entry.
// Exit status: 0 on success, 1 when no file was given or it could not be read.
int main(int argc, char * argv[])
{
  multifasta *mf;
  int entry;

  // A file name is mandatory.
  if (argc < 2)
  {
    puts ("ERROR: no FASTA-file specified!");
    return 1;
  }

  mf = read_fasta_file (argv[1]);
  if (!mf)
    return 1;

  // Dump the entries in the order they are stored.
  entry = 0;
  while (entry < mf->length)
  {
    printf("Entry %d \n", entry);
    printf("Header: %s \n", mf->entries[entry]->header);
    printf("Sequenz: %s \n", mf->entries[entry]->sequence);
    entry++;
  }

  multifasta_delete (mf);
  return 0;
}
Example #4
0
/*
 * Entry point of the gapped pairwise aligner.
 *
 * Steps:
 *   1. decode the command-line switches and validate them;
 *   2. read the text (seq_a) and the pattern (seq_b) from FASTA files,
 *      swapping them when the pattern is longer than the text;
 *   3. allocate the score matrices G and the backtracing matrices H
 *      (num_mat layers of (n + 1) x (m + 1) cells each);
 *   4. run the dynamic programming routine selected by L (0: semi-global,
 *      1: local) and by whether a maximum gap length was given;
 *   5. pick the optimal solution, backtrace the gap positions and write
 *      the results to the output file.
 *
 * Returns 0 on success and 1 on any error (bad arguments, unreadable input,
 * allocation failure, or a failing computation/output step).
 */
int main ( int argc, char ** argv)
 {
   struct TSwitch  sw;
   char   * out_file;

   unsigned int MAXnumgaps;	//input argument		
   double gap_open_pen;		//input argument
   double gap_extend_pen;	//input argument
   unsigned int scoring_matrix; //input argument
   unsigned int MAXgap;	
   unsigned int num_mat;        //input argument + 1

   double MAXscore;		//to be computed
   unsigned int MINnumgaps;	//to be computed		
   
   unsigned int istart;		//where to start backtracing
   unsigned int jstart;		//where to start backtracing
   unsigned int iend;		//where to end backtracing
   unsigned int jend;		//where to end backtracing
   unsigned int * gaps_pos;	//position of the gap(s)
   unsigned int * gaps_len;	//len of the gap(s)
   unsigned int * where;	//where is the gap(s): text [1] or pattern [2]
   
   struct TSeq * t;		//text t
   unsigned int n; 		//length of text
   struct TSeq * p;		//pattern p
   unsigned int m; 		//length of pattern
   unsigned int L;		//local alignment	
   
   double ***       G; 		//dynamic programming matrix
   int *** 	 H; 		//backtracing matrix
   	
   unsigned int swap;           //swap the text and the pattern in case m < n        
   unsigned int i, j;

   size_t cells;		//number of cells in one (n + 1) x (m + 1) layer
   size_t gap_slots;		//number of elements allocated for the gap arrays

   /* checks the arguments */
   i = decode_switches ( argc, argv, &sw );

   if ( i < 5 || ! sw . seq_a || ! sw . seq_b ) 
    {
      usage ();
      return ( 1 );
    }
   else 
    {
      gap_open_pen   = - sw . gap_open_pen;	//the penalties should have a negative value
      gap_extend_pen = - sw . gap_extend_pen;
      out_file       =   sw . out_file;
      L		     =   sw . L;

      if ( ! strcmp ( "EDNAFULL", sw . matrix ) )       scoring_matrix = 0;
      else if ( ! strcmp ( "EBLOSUM62", sw . matrix ) ) scoring_matrix = 1;
      else
       {
         fprintf ( stderr, "Error: scoring matrix argument should be `EDNAFULL' for nucleotide sequences or `EBLOSUM62' for protein sequences!!!\n" );
         return ( 1 );
       }
      if ( L > 1 )
       {
         fprintf (stderr, "Error: wrong argument with L: should be either 0 (semi-global) or 1 (local)\n" );
         return ( 1 );
       }
    }
   
   /* reads the text */
   t = read_fasta_file ( sw . seq_a );
   if ( ! t )
    {
      fprintf (stderr, "Error: cannot read file %s!!!\n", sw . seq_a );
      return ( 1 );
    }
   if ( ! t -> header ) t -> header = strdup ( "Seq A" );
   
   /* reads the pattern */
   p = read_fasta_file ( sw . seq_b );
   if ( ! p )
    {
      fprintf( stderr, "Error: cannot read file %s!!!\n", sw . seq_b );
      return ( 1 );
    }
   if ( ! p -> header ) p -> header = strdup ( "Seq B" );
   
   /* calculate text's and pattern's length */
   n = strlen ( t -> data );
   m = strlen ( p -> data );
   
   /* checks the lengths of text and pattern and swaps if needed */
   if ( m > n )
    {
      swap_txt_pat ( &t, &n, &p, &m );
      swap = 1;
    }
   else
      swap = 0;
   
   /* checks the max num of gaps: MAXnumgaps < n (defaults to 2 when not given) */
   MAXnumgaps =  ( sw . max_num_gaps <= -1 ) ?  2 : sw . max_num_gaps;
   if( MAXnumgaps >= n )
    {
      fprintf ( stderr, "Error: the max number of gaps should be less than the length of the text!!!\n" );
      return ( 1 );
    }

   /* local alignment needs one extra layer */
   if ( L == 1 ) num_mat = MAXnumgaps + 1;
   else num_mat = MAXnumgaps;

   /* checks the max gap length: MAXgap < n (defaults to n - 1 when not given) */
   MAXgap =  ( sw . max_gap <= -1 ) ?  n - 1 : sw . max_gap;
   if( MAXgap >= n )
    {
      fprintf ( stderr, "Error: the max gap length should be less than the length of the text!!!\n" );
      return ( 1 );
    }
   
   /* Size of one layer, computed in size_t so that the product cannot wrap
      around in unsigned int arithmetic and silently under-allocate. */
   cells = ( size_t ) ( n + 1 ) * ( m + 1 );

   /* 3d dynamic memory allocation for matrices G and H:
      each layer is one contiguous block plus an array of row pointers into it */
   if( ( G = ( double *** ) malloc ( ( num_mat ) * sizeof( double ** ) ) ) == NULL )
    {
      fprintf( stderr, "G could not be allocated\n" );
      return ( 1 );
    } 
	
   for ( i = 0; i < num_mat; i ++ )
    {
      if( ( G[i] = ( double ** ) malloc ( ( n + 1 ) * sizeof( double * ) ) ) == NULL )
       {
         fprintf( stderr, "G could not be allocated\n" );
         return ( 1 );
       }
 
      if( ( G[i][0] = ( double * ) calloc ( cells, sizeof( double ) ) ) == NULL )
       {
	  fprintf( stderr, "G could not be allocated\n" );
	  return ( 1 );
       } 

      /* typed pointer arithmetic: row j starts j * (m + 1) elements into the block */
      for ( j = 1; j < n + 1; j++ )
        G[i][j] = G[i][0] + ( size_t ) j * ( m + 1 );
    }

   if( ( H = ( int *** ) malloc ( ( num_mat ) * sizeof( int ** ) ) ) == NULL )
    {
      fprintf( stderr, "H could not be allocated\n" );
      return ( 1 );
    } 
	
   for ( i = 0; i < num_mat; i ++ )
    {
      if( ( H[i] = ( int ** ) malloc ( ( n + 1 ) * sizeof( int * ) ) ) == NULL )
       {
         fprintf( stderr, "H could not be allocated\n" );
         return ( 1 );
       }
 
      if( ( H[i][0] = ( int * ) calloc ( cells, sizeof( int ) ) ) == NULL )
       {
	  fprintf( stderr, "H could not be allocated\n" );
	  return ( 1 );
       } 

      for ( j = 1; j < n + 1; j++ )
        H[i][j] = H[i][0] + ( size_t ) j * ( m + 1 );
    }

   /* dynamic programming algorithm */
   if ( L == 0 )	
    {
     /* MAXgap == n - 1 means no effective limit on the gap length */
     if ( MAXgap == n - 1 )
      {
        if ( ! ( dp_algorithm( G, MAXnumgaps, H, t -> data, n, p -> data, m, scoring_matrix, gap_open_pen, gap_extend_pen ) ) )
         {
           fprintf ( stderr, "Error: dp_algorithm() failed!!!\n" );
           return ( 1 );	
         }
      }
     else
      {
        if ( ! ( dp_algorithm_pruned( G, MAXnumgaps, H, t -> data, n, p -> data, m, scoring_matrix, gap_open_pen, gap_extend_pen, MAXgap ) ) )
         {
           fprintf ( stderr, "Error: dp_algorithm_pruned() failed!!!\n" );
           return ( 1 );	
         }
      }
    }
   else
    {
     if ( MAXgap == n - 1 )
      {
        if ( ! ( dp_algorithm_lcl( G, MAXnumgaps, H, t -> data, n, p -> data, m, scoring_matrix, gap_open_pen, gap_extend_pen ) ) )
         {
           fprintf ( stderr, "Error: dp_algorithm_lcl() failed!!!\n" );
           return ( 1 );	
         }
      }
     else
      {
        fprintf ( stderr, "Error: option `- m' is only for global alignment!!!\n" );
        return ( 1 );	
      }
    }


   MINnumgaps = 0;	//to be computed		
   /* finds the optimal alignment based on the matrices scores */
   if ( L == 0 ) opt_solution ( G, MAXnumgaps, n, m, &MAXscore, &istart, &MINnumgaps );
   if ( L == 1 ) opt_solution_lcl ( G, MAXnumgaps, n, m, &MAXscore, &istart, &jstart, &MINnumgaps );

   /* calloc ( 0, ... ) is allowed to return NULL, which would be misreported
      as an allocation failure when no gap is needed; always request at
      least one element. */
   gap_slots = MINnumgaps ? MINnumgaps : 1;

   if( ( gaps_pos = ( unsigned int * ) calloc ( gap_slots, sizeof( unsigned int ) ) ) == NULL )
    {
      fprintf( stderr, "gaps_pos could not be allocated\n" );
      return ( 1 );
    }

   if( ( gaps_len = ( unsigned int * ) calloc ( gap_slots, sizeof( unsigned int ) ) ) == NULL )
    {
      fprintf( stderr, "gaps_len could not be allocated\n" );
      return ( 1 );
    }

   if( ( where = ( unsigned int * ) calloc ( gap_slots, sizeof( unsigned int ) ) ) == NULL )
    {
      fprintf( stderr, "where could not be allocated\n" );
      return ( 1 );
    }
 
   /* computes the position of the gap */
   /* NOTE(review): in semi-global mode this indexes H[MINnumgaps - 1], which
      presumes opt_solution() always reports at least one gap -- confirm. */
   if ( L == 0 ) backtracing ( H[MINnumgaps - 1], m, n, istart, gaps_pos, MINnumgaps, gaps_len, where );
   if ( L == 1 ) backtracing_lcl ( G[MINnumgaps], m, n, H[MINnumgaps], istart, jstart, gaps_pos, MINnumgaps, gaps_len, where, &iend, &jend );

   /* disabled debug dump of the G and H matrices */
   #if 0
   int s;
   for ( s = 0;  s < num_mat; s++ )
        {

                for(i = 0; i < n+1; i++)                //Matrix G output
                {
                        for(j = 0; j < m+1; j++)
                        {
                                fprintf( stderr,"%d\t", (int) G[s][i][j] );
                        }
                        fprintf(stderr,"\n");
                }

                fprintf(stderr,"\n");

                #if 1
                for(i = 0; i < n+1; i++)                //Matrix H output
                {
                        for(j = 0; j < m+1; j++)
                        {
                                fprintf( stderr,"%d\t", H[s][i][j]);
                        }
                        fprintf(stderr,"\n");
                }

                fprintf(stderr,"\n");
                #endif
        }
   if ( L == 1 ) fprintf( stderr,"\n[%d,%d]-->[%d,%d]\n", istart, jstart, iend, jend );
   #endif

   /* outputs the results */
   if ( L == 0 )
     if ( ! ( results ( out_file, t, n, p, m, MAXscore, gaps_pos, MINnumgaps, gaps_len, where, swap, scoring_matrix, gap_open_pen, gap_extend_pen, L ) ) )
      {
        fprintf(stderr, "Error: results() failed!!!\n");
        return ( 1 );	
      }

   if ( L == 1 )
     if ( ! ( results_lcl ( out_file, t, n, p, m, MAXscore, gaps_pos, MINnumgaps, gaps_len, where, istart, iend, jstart, jend, swap, scoring_matrix, gap_open_pen, gap_extend_pen, L ) ) )
      {
        fprintf(stderr, "Error: results_lcl() failed!!!\n");
        return ( 1 );	
      }

   /* releases the matrices layer by layer: the contiguous cell block first,
      then the row-pointer array */
   for ( i = 0;  i < num_mat; i++ )
    {
      free ( G[i][0] );
      free ( G[i] );
      free ( H[i][0] );
      free ( H[i] );
    }
   free ( G );
   free ( H );
   free ( gaps_pos );
   free ( gaps_len );
   free ( where );
   /* NOTE(review): t -> data and p -> data are not released here -- confirm
      whether read_fasta_file() allocates them separately. */
   free ( t -> header );
   free ( p -> header );
   free ( t );
   free ( p );
   free ( sw . out_file );
   free ( sw . matrix );
   return ( 0 );
 }