/*
 * Test case for read_fasta_file().
 *
 * Calls read_fasta_file() on "test/data/test.fasta" with the address of a
 * NULL-initialised genome_sequence table pointer, asserts that the returned
 * code equals E_SUCCESS (failure message: "Parsing failed") and logs the code.
 * It then walks the resulting table with HASH_ITER; for every record it writes
 * a 0 byte at data[n] and logs the record's chrom and data as strings.
 *
 * NOTE(review): writing data[n] assumes the buffer holds at least n + 1
 * bytes -- confirm against read_fasta_file().
 * NOTE(review): the closing brace of this function is not present on this
 * line; the definition appears to be truncated in this view.
 */
void test_read_fasta_file(struct test *t) { t_set_msg(t, "Testing reading a fasta File..."); struct genome_sequence *sequence_table = NULL; char filename[30] = "test/data/test.fasta"; int err = read_fasta_file(&sequence_table, filename); t_assert_msg(t, err == E_SUCCESS, "Parsing failed"); t_log(t, "Error: %d\n", err); struct genome_sequence *s = NULL; struct genome_sequence *tmp = NULL; HASH_ITER(hh, sequence_table, s, tmp) { s->data[s->n] = 0; t_log(t, "%s %s \n\n", s->chrom, s->data); }
void ReadGenome(const vector<string>& chrom_files, Genome& genome) { fprintf(stderr, "[READING CHROMOSOMES]\n"); vector<string> chrom_names; vector<string> chrom_seqs; uint32_t all_chroms_len = 0; for (uint32_t i = 0; i < chrom_files.size(); ++i) { vector<string> tmp_chrom_names; vector<string> tmp_chrom_seqs; /* read chromosome name and seqeunce from the chromosome file */ read_fasta_file(chrom_files[i].c_str(), tmp_chrom_names, tmp_chrom_seqs); for (uint32_t j = 0; j < tmp_chrom_seqs.size(); ++j) { chrom_names.push_back(tmp_chrom_names[j]); chrom_seqs.push_back(tmp_chrom_seqs[j]); all_chroms_len += tmp_chrom_seqs[j].size(); } } /* copy chroms sequences to genome */ genome.num_of_chroms = chrom_seqs.size(); genome.length_of_genome = all_chroms_len; fprintf(stderr, "[THERE ARE %u CHROMOSOMES IN THE GENOME]\n", genome.num_of_chroms); fprintf(stderr, "[THE TOTAL LENGTH OF ALL CHROMOSOMES IS %u]\n", all_chroms_len); genome.name.resize(genome.num_of_chroms); genome.length.resize(genome.num_of_chroms); /* The last element of genome.start-index is the length of all chroms */ genome.start_index.resize(genome.num_of_chroms + 1); genome.strand = '+'; genome.sequence.resize(all_chroms_len); uint32_t k = 0; for (uint32_t i = 0; i < genome.num_of_chroms; ++i) { genome.name[i] = chrom_names[i]; genome.length[i] = chrom_seqs[i].size(); genome.start_index[i] = k; for (uint32_t j = 0; j < chrom_seqs[i].size(); ++j) { genome.sequence[k] = toupper(chrom_seqs[i][j]); genome.sequence[k] = toACGT(genome.sequence[k]); k++; } } genome.start_index[genome.num_of_chroms] = k; }
// unit test: read fasta file and print all entries int main(int argc, char * argv[]) { if (argc < 2) { puts ("ERROR: no FASTA-file specified!"); return 1; } multifasta *mf = read_fasta_file (argv[1]); if (mf == NULL) return 1; int i; for (i = 0; i < mf->length; i++) { printf("Entry %d \n", i); printf("Header: %s \n", mf->entries[i]->header); printf("Sequenz: %s \n", mf->entries[i]->sequence); } multifasta_delete (mf); return 0; }
int main ( int argc, char ** argv) { struct TSwitch sw; char * out_file; unsigned int MAXnumgaps; //input argument double gap_open_pen; //input argument double gap_extend_pen; //input argument unsigned int scoring_matrix; //input argument unsigned int MAXgap; unsigned int num_mat; //input argument + 1 double MAXscore; //to be computed unsigned int MINnumgaps; //to be computed unsigned int istart; //where to start backtracing unsigned int jstart; //where to start backtracing unsigned int iend; //where to end backtracing unsigned int jend; //where to end backtracing unsigned int * gaps_pos; //position of the gap(s) unsigned int * gaps_len; //len of the gap(s) unsigned int * where; //where is the gap(s): text [1] or pattern [2] struct TSeq * t; //text t unsigned int n; //length of text struct TSeq * p; //pattern p unsigned int m; //length of pattern unsigned int L; //local alignment double *** G; //dynamic programming matrix int *** H; //backtracing matrix unsigned int swap; //swap the text and the pattern in case m < n unsigned int i, j; /* checks the arguments */ i = decode_switches ( argc, argv, &sw ); if ( i < 5 || ! sw . seq_a || ! sw . seq_b ) { usage (); return ( 1 ); } else { gap_open_pen = - sw . gap_open_pen; //the penalties should have a negative value gap_extend_pen = - sw . gap_extend_pen; out_file = sw . out_file; L = sw . L; if ( ! strcmp ( "EDNAFULL", sw . matrix ) ) scoring_matrix = 0; else if ( ! strcmp ( "EBLOSUM62", sw . matrix ) ) scoring_matrix = 1; else { fprintf ( stderr, "Error: scoring matrix argument should be `EDNAFULL' for nucleotide sequences or `EBLOSUM62' for protein sequences!!!\n" ); return ( 1 ); } if ( L > 1 ) { fprintf (stderr, "Error: wrong argument with L: should be either 0 (semi-global) or 1 (local)\n" ); return ( 1 ); } } /* reads the text */ t = read_fasta_file ( sw . seq_a ); if ( ! t ) { fprintf (stderr, "Error: cannot read file %s!!!\n", sw . seq_a ); return ( 1 ); } if ( ! 
t -> header ) t -> header = strdup ( "Seq A" ); /* reads the pattern */ p = read_fasta_file ( sw . seq_b ); if ( ! p ) { fprintf( stderr, "Error: cannot read file %s!!!\n", sw . seq_b ); return ( 1 ); } if ( ! p -> header ) p -> header = strdup ( "Seq B" ); /* calculate text's and pattern's length */ n = strlen ( t -> data ); m = strlen ( p -> data ); /* checks the lengths of text and pattern and swaps if needed */ if ( m > n ) { swap_txt_pat ( &t, &n, &p, &m ); swap = 1; } else swap = 0; /* checks the max num of gaps: MAXnumgaps < n */ MAXnumgaps = ( sw . max_num_gaps <= -1 ) ? 2 : sw . max_num_gaps; if( MAXnumgaps >= n ) { fprintf ( stderr, "Error: the max gap length should be less than the length of the text!!!\n" ); return ( 1 ); } if ( L == 1 ) num_mat = MAXnumgaps + 1; else num_mat = MAXnumgaps; MAXgap = ( sw . max_gap <= -1 ) ? n - 1 : sw . max_gap; if( MAXgap >= n ) { fprintf ( stderr, "Error: the max gap length should be less than the length of the text!!!\n" ); return ( 1 ); } /* 3d dynamic memory allocation for matrices G and H*/ if( ( G = ( double *** ) malloc ( ( num_mat ) * sizeof( double ** ) ) ) == NULL ) { fprintf( stderr, "G could not be allocated\n" ); return 0; } for ( i = 0; i < num_mat; i ++ ) { if( ( G[i] = ( double ** ) malloc ( ( n + 1 ) * sizeof( double * ) ) ) == NULL ) { fprintf( stderr, "G could not be allocated\n" ); return 0; } if( ( G[i][0] = ( double * ) calloc ( ( n + 1 ) * ( m + 1 ), sizeof( double ) ) ) == NULL ) { fprintf( stderr, "G could not be allocated\n" ); return 0; } for ( j = 1; j < n + 1; j++ ) G[i][j] = ( void * ) G[i][0] + j * ( m + 1 ) * sizeof( double ); } if( ( H = ( int *** ) malloc ( ( num_mat ) * sizeof( int ** ) ) ) == NULL ) { fprintf( stderr, "H could not be allocated\n" ); return 0; } for ( i = 0; i < num_mat; i ++ ) { if( ( H[i] = ( int ** ) malloc ( ( n + 1 ) * sizeof( int * ) ) ) == NULL ) { fprintf( stderr, "H could not be allocated\n" ); return 0; } if( ( H[i][0] = ( int * ) calloc ( ( n + 1 ) * ( m + 1 
), sizeof( int ) ) ) == NULL ) { fprintf( stderr, "H could not be allocated\n" ); return 0; } for ( j = 1; j < n + 1; j++ ) H[i][j] = ( void* ) H[i][0] + j * ( m + 1 ) * sizeof( int ); } /* dynamic programming algorithm */ if ( L == 0 ) { if ( MAXgap == n - 1 ) { if ( ! ( dp_algorithm( G, MAXnumgaps, H, t -> data, n, p -> data, m, scoring_matrix, gap_open_pen, gap_extend_pen ) ) ) { fprintf ( stderr, "Error: dp_algorithm() failed!!!\n" ); return ( 1 ); } } else { if ( ! ( dp_algorithm_pruned( G, MAXnumgaps, H, t -> data, n, p -> data, m, scoring_matrix, gap_open_pen, gap_extend_pen, MAXgap ) ) ) { fprintf ( stderr, "Error: dp_algorithm_pruned() failed!!!\n" ); return ( 1 ); } } } else { if ( MAXgap == n - 1 ) { if ( ! ( dp_algorithm_lcl( G, MAXnumgaps, H, t -> data, n, p -> data, m, scoring_matrix, gap_open_pen, gap_extend_pen ) ) ) { fprintf ( stderr, "Error: dp_algorithm() failed!!!\n" ); return ( 1 ); } } else { fprintf ( stderr, "Error: option `- m' is only for global alignment!!!\n" ); return ( 1 ); } } MINnumgaps = 0; //to be computed /* finds the optimal alignment based on the matrices scores */ if ( L == 0 ) opt_solution ( G, MAXnumgaps, n, m, &MAXscore, &istart, &MINnumgaps ); if ( L == 1 ) opt_solution_lcl ( G, MAXnumgaps, n, m, &MAXscore, &istart, &jstart, &MINnumgaps ); if( ( gaps_pos = ( unsigned int * ) calloc ( MINnumgaps, sizeof( unsigned int ) ) ) == NULL ) { fprintf( stderr, "gaps_pos could not be allocated\n" ); return 0; } if( ( gaps_len = ( unsigned int * ) calloc ( MINnumgaps, sizeof( unsigned int ) ) ) == NULL ) { fprintf( stderr, "gaps_pos could not be allocated\n" ); return 0; } if( ( where = ( unsigned int * ) calloc ( MINnumgaps, sizeof( unsigned int ) ) ) == NULL ) { fprintf( stderr, "where could not be allocated\n" ); return 0; } /* computes the position of the gap */ if ( L == 0 ) backtracing ( H[MINnumgaps - 1], m, n, istart, gaps_pos, MINnumgaps, gaps_len, where ); if ( L == 1 ) backtracing_lcl ( G[MINnumgaps], m, n, H[MINnumgaps], 
istart, jstart, gaps_pos, MINnumgaps, gaps_len, where, &iend, &jend ); #if 0 int s; for ( s = 0; s < num_mat; s++ ) { for(i = 0; i < n+1; i++) //Matrix G output { for(j = 0; j < m+1; j++) { fprintf( stderr,"%d\t", (int) G[s][i][j] ); } fprintf(stderr,"\n"); } fprintf(stderr,"\n"); #if 1 for(i = 0; i < n+1; i++) //Matrix H output { for(j = 0; j < m+1; j++) { fprintf( stderr,"%d\t", H[s][i][j]); } fprintf(stderr,"\n"); } fprintf(stderr,"\n"); #endif } if ( L == 1 ) fprintf( stderr,"\n[%d,%d]-->[%d,%d]\n", istart, jstart, iend, jend ); #endif /* outputs the results */ if ( L == 0 ) if ( ! ( results ( out_file, t, n, p, m, MAXscore, gaps_pos, MINnumgaps, gaps_len, where, swap, scoring_matrix, gap_open_pen, gap_extend_pen, L ) ) ) { fprintf(stderr, "Error: results() failed!!!\n"); return ( 1 ); } if ( L == 1 ) if ( ! ( results_lcl ( out_file, t, n, p, m, MAXscore, gaps_pos, MINnumgaps, gaps_len, where, istart, iend, jstart, jend, swap, scoring_matrix, gap_open_pen, gap_extend_pen, L ) ) ) { fprintf(stderr, "Error: results() failed!!!\n"); return ( 1 ); } for ( i = 0; i < num_mat; i++ ) { free ( G[i][0] ); free ( G[i] ); free ( H[i][0] ); free ( H[i] ); } free ( G ); free ( H ); free ( gaps_pos ); free ( gaps_len ); free ( where ); free ( t -> header ); free ( p -> header ); free ( t ); free ( p ); free ( sw . out_file ); free ( sw . matrix ); return ( 0 ); }