/*
 * Builds a "flat" CodonFrequency: each non-stop codon receives
 * 1 / (number of non-stop, N-free codons that code for the same amino acid).
 * Stop codons keep a frequency of 0.0.
 *
 * ct  - codon table used to classify codons and map them to amino acid numbers
 *
 * Returns a newly allocated CodonFrequency.
 */
CodonFrequency * flat_CodonFrequency(CodonTable * ct)
{
  int per_aa[26];
  CodonFrequency * out;
  int idx;
  int codon;
  int aa;

  out = CodonFrequency_alloc();

  for(idx=0;idx<26;idx++)
    per_aa[idx] = 0;

  for(idx=0;idx<64;idx++)
    out->freq[idx] = 0.0;

  /* tally, over the 125-value codon space, how many codons each amino acid has */
  for(codon=0;codon<125;codon++) {
    if( has_random_bases(codon) != FALSE )
      continue;
    if( is_stop_codon(codon,ct) != FALSE )
      continue;
    per_aa[aminoacid_no_from_codon(ct,codon)]++;
  }

  /* share the probability mass evenly between the codons of one amino acid */
  for(idx=0;idx<64;idx++) {
    codon = codon_from_base4_codon(idx);
    if( is_stop_codon(codon,ct) != FALSE )
      continue;

    aa = aminoacid_no_from_codon(ct,codon);
    if( per_aa[aa] != 0 )
      out->freq[idx] = 1.0 / per_aa[aa];
  }

  return out;
}
/*
 * Scans every non-reference sequence of the alignment for an in-frame stop
 * codon inside the given feature.
 *
 * feat     - feature whose start/end (1-based, inclusive), strand and frame are used
 * msa      - alignment; sequence 0 is skipped as the reference
 * problems - list that receives a NONSENSE problem for the first stop found
 *
 * Returns 1 when no stop codon is found, 0 as soon as one is found.
 */
int is_nonsense_clean(GFF_Feature *feat, MSA *msa, List *problems) {
  char seq[feat->end - feat->start + 2];
  int seqidx;

  /* sequence 0 is the reference and is not checked here */
  for (seqidx = 1; seqidx < msa->nseqs; seqidx++) {
    int col, pos;
    int len = 0;

    /* collect the feature's columns for this sequence, dropping gap characters */
    for (col = feat->start - 1; col < feat->end; col++) {
      if (ss_get_char_pos(msa, col, seqidx, 0) == GAP_CHAR)
        continue;
      seq[len] = ss_get_char_pos(msa, col, seqidx, 0);
      len++;
    }
    seq[len] = '\0';

    if (feat->strand == '-')
      msa_reverse_compl_seq(seq, len);

    /* walk codon by codon, starting at the first complete in-frame codon */
    for (pos = (3 - feat->frame) % 3; pos <= len - 3; pos += 3) {
      int problem_start;

      if (!is_stop_codon(&seq[pos]))
        continue;

      problem_start = (feat->strand == '+') ? feat->start + pos
                                            : feat->end - pos - 2;
      problem_add(problems, feat, NONSENSE, problem_start, problem_start + 2);
      return 0;
    }
  }

  return 1;
}
/*
 * Fills codon_array[0..124] from a per-amino-acid array.
 *
 * codon_array   - output, one entry per codon number (0..124)
 * protein_array - per-amino-acid values handed to map_codon_CodonMapper
 * stop          - value written for every stop codon
 * cm            - codon mapper; its codon table decides what a stop codon is
 */
void map_codon_array_CodonMapper(double * codon_array,double * protein_array,double stop,CodonMapper * cm)
{
  int codon = 0;

  while( codon < 125 ) {
    if( is_stop_codon(codon,cm->ct) == TRUE ) {
      codon_array[codon] = stop;
    } else {
      codon_array[codon] = map_codon_CodonMapper(codon,protein_array,cm);
    }
    codon++;
  }
}
/*
 * Stop-codon potential for EST evidence.
 *
 * data      - EstEvidence *, passed as void *
 * seq       - complex sequence the codon is read from
 * jposition - position handed to CSEQ_GENOMIC_CODON
 *
 * Returns 100 when the codon at jposition is a stop codon in the evidence's
 * codon table, -1000 otherwise.
 */
int est_stop_pot(void * data,ComplexSequence *seq,int jposition)
{
  EstEvidence * est = (EstEvidence *)data;

  return is_stop_codon(CSEQ_GENOMIC_CODON(seq,jposition),est->ct) ? 100 : -1000;
}
/*
 * Builds a symmetric 125x125 codon-vs-codon matrix:
 *   - 1.0      when either codon contains a random (N) base
 *   - 0.00001  when either codon is a stop codon
 *   - otherwise the comp->comp entry for the two encoded amino acids
 *
 * ct   - codon table used for stop detection and amino acid lookup
 * comp - amino acid comparison probabilities
 *
 * Returns a newly allocated CodonMatrix.
 */
CodonMatrix * naive_CodonMatrix(CodonTable * ct,CompProb * comp)
{
  CodonMatrix * out = CodonMatrix_alloc();
  int a;
  int b;

  /* only the upper triangle is visited; each value is mirrored */
  for(a=0;a<125;a++) {
    for(b=a;b<125;b++) {
      if( has_random_bases(a) == TRUE || has_random_bases(b) == TRUE ) {
        out->prob[a][b] = out->prob[b][a] = 1.0;
        continue;
      }

      if( is_stop_codon(a,ct) == TRUE || is_stop_codon(b,ct) == TRUE ) {
        out->prob[a][b] = out->prob[b][a] = 0.00001;
        continue;
      }

      out->prob[a][b] = out->prob[b][a] =
        comp->comp[aminoacid_no_from_codon(ct,a)][aminoacid_no_from_codon(ct,b)];
    }
  }

  return out;
}
/*
 * Builds a RandomCodon in which every stop codon has probability 0.0 and
 * every other codon number (0..124) has probability 1/61.
 *
 * ct - codon table deciding which codons are stops
 *
 * Returns a newly allocated RandomCodon.
 */
RandomCodon * flat_RandomCodon(CodonTable * ct)
{
  RandomCodon * rc = RandomCodon_alloc();
  int codon;

  for(codon=0;codon<125;codon++)
    rc->codon[codon] = is_stop_codon(codon,ct) ? 0.0 : 1.0 / 61.0;

  return rc;
}
/*
 * Start-codon potential for EST evidence.
 *
 * data      - EstEvidence *, passed as void *
 * seq       - complex sequence the codon is read from
 * jposition - position handed to CSEQ_GENOMIC_CODON
 *
 * Returns -10000 for a stop codon, 1200 for ATG and 0 for anything else.
 */
int est_start_pot(void * data,ComplexSequence *seq,int jposition)
{
  const int atg = (BASE_A*25+BASE_T*5+BASE_G);
  EstEvidence * est = (EstEvidence *)data;
  int codon = CSEQ_GENOMIC_CODON(seq,jposition);

  if( is_stop_codon(codon,est->ct) )
    return -10000;

  return (codon == atg) ? 1200 : 0;
}
/*
 * Coding-sequence potential for EST evidence.
 *
 * data      - EstEvidence *, passed as void *
 * seq       - complex sequence the codon is read from
 * jposition - position being scored
 *
 * Exons are examined in order; for the first exon that covers jposition and
 * yields a verdict:
 *   - a stop codon at jposition        -> -1000000
 *   - exon is not coding               -> 80
 *   - coding exon, position in frame   -> 125
 * A coding exon that covers jposition out of frame yields no verdict and the
 * search continues with the following exons.
 *
 * Returns -1000000 when no exon yields a verdict.
 */
int est_cds_pot(void * data,ComplexSequence *seq,int jposition)
{
  EstEvidence * est = (EstEvidence *)data;
  int e;
  int relative_frame;

  for(e=0;e<est->len;e++) {
    /* skip exons that do not cover this position */
    if( jposition < est->exon[e]->start || est->exon[e]->end < jposition )
      continue;

    if( is_stop_codon(CSEQ_GENOMIC_CODON(seq,jposition),est->ct) )
      return -1000000;

    /* not a coding exon - return 80 */
    if( est->exon[e]->is_coding != TRUE )
      return 80;

    /*
     * phase calculation: offset of the position from the exon start.
     * more complex than it looks due to the convention of where a codon
     * lies and the phase convention
     */
    relative_frame = (jposition - est->exon[e]->start) % 3;

    if( (relative_frame == 2 && est->exon[e]->phase == 0) ||
        (relative_frame == 1 && est->exon[e]->phase == 1) ||
        (relative_frame == 0 && est->exon[e]->phase == 2) )
      return 125;
  }

  /*
   * we have to return the same as the stop codon penalty, otherwise
   * we can just dodge stop codons using evidence lines
   */
  return -1000000;
}
/*
 * Dump the sub-alignment corresponding to a feature.
 *
 * F          - output stream
 * feat       - feature whose start/end select the columns; strand '-' makes
 *              the sub-alignment reverse-complemented before printing
 * msa        - source alignment
 * show_frame - when non-zero, a marker is printed in front of every column j
 *              with (j + feat->frame) % 3 == 0: '*' if a stop codon starts
 *              there, '.' otherwise
 *
 * One line is written per sequence: the name, left-justified in 12 columns,
 * followed by the residues.
 */
void dump_aln(FILE *F, GFF_Feature *feat, MSA *msa, int show_frame) {
  int row, col;
  MSA *sub = msa_sub_alignment(msa, NULL, 0, feat->start - 1, feat->end);

  /* make sure explicit sequences exist when only sufficient statistics do */
  if (sub->seqs == NULL && sub->ss != NULL)
    ss_to_msa(sub);

  if (feat->strand == '-')
    msa_reverse_compl(sub);

  for (row = 0; row < sub->nseqs; row++) {
    char *residues = sub->seqs[row];

    fprintf(F, "%-12s ", sub->names[row]);

    for (col = 0; col < sub->length; col++) {
      int codon_start = show_frame && (col + feat->frame) % 3 == 0;

      if (codon_start) {
        /* this may not be right when there are frameshift gaps
           (e.g., temporary ones with compensatory indels) or gaps
           interrupting stop codons, but it will be good enough to
           see what's going on most of the time */
        if (is_stop_codon(&residues[col]))
          fprintf(F, "*");
        else
          fprintf(F, ".");
      }
      fprintf(F, "%c", residues[col]);
    }

    fprintf(F, "\n");
  }

  msa_free(sub);
}
/*
 * Converts raw codon counts into per-amino-acid codon frequencies.
 *
 * codon - 64 raw counts, indexed by base-4 codon number
 * ct    - codon table; ct->codon_str is indexed by the codon number returned
 *         from codon_from_base4_codon and holds the amino acid letter
 *
 * For each non-stop codon, freq = count / (sum of counts of all codons that
 * code for the same amino acid). Stop codons, codons with a count below
 * 1e-10, and codons whose amino acid has no usable total all get 0.0.
 *
 * Returns a newly allocated CodonFrequency.
 */
CodonFrequency * CodonFrequence_from_raw_counts(double * codon,CodonTable * ct)
{
  double total[26];
  CodonFrequency * cf;
  int i;
  int c;
  int aa;

  for(i=0;i<26;i++)
    total[i] = 0.0;

  /*
   * One pass over the codons. Counts are still added to each amino acid's
   * total in ascending codon order. Symbols outside 'A'..'Z' (e.g. the stop
   * symbol) are ignored, exactly as the letter-by-letter comparison did.
   */
  for(i=0;i<64;i++) {
    c  = codon_from_base4_codon(i);
    aa = ct->codon_str[c] - 'A';
    if( aa >= 0 && aa < 26 )
      total[aa] += codon[i];
  }

  cf = CodonFrequency_alloc();

  for(i=0;i<64;i++) {
    /*
     * Write every slot explicitly, as flat_CodonFrequency does, rather than
     * relying on what CodonFrequency_alloc leaves in freq[]; otherwise stop
     * codons and zero-total amino acids would never be written here.
     */
    cf->freq[i] = 0.0;

    c = codon_from_base4_codon(i);
    if( is_stop_codon(c,ct) )
      continue;

    if( codon[i] < 0.0000000001 )
      continue;

    aa = ct->codon_str[c] - 'A';
    if( aa < 0 || aa >= 26 || total[aa] < 0.00000001 ) {
      /* codon_str is indexed by c, not by the base-4 index i */
      warn("For codon %d, amino acid %c, we have no frequency",i,ct->codon_str[c]);
      continue;
    }

    cf->freq[i] = codon[i] / total[aa];
  }

  return cf;
}
/*
 * Flattens the diagonal of a PairBaseCodonModel, i.e. the entries where a
 * codon is paired with itself (each position built as MAKE_PAIRBASE(x,x)).
 *
 * m  - model whose codon[] entries are overwritten
 * ct - codon table deciding which codons are stops
 *
 * For every base triple (a,b,c) with each base in 0..4:
 *   - stop codon (only tested when all three bases are < 4) -> 0.0
 *   - all three bases equal to 4                            -> 0.5
 *   - anything else                                         -> 1.0
 */
void flatten_diagonal_PairBaseCodonModel(PairBaseCodonModel * m,CodonTable * ct)
{
  int a,b,c;
  pairbase_type seq[3];
  int codon_a;
  int p;

  for(a=0;a<5;a++) {
    for(b=0;b<5;b++) {
      for(c=0;c<5;c++) {
        /*
         * Build the sequence and its index first. The stop-codon branch
         * below writes m->codon[p], so p must already refer to this codon
         * and not to the one handled in the previous iteration.
         */
        seq[0] = MAKE_PAIRBASE(a,a);
        seq[1] = MAKE_PAIRBASE(b,b);
        seq[2] = MAKE_PAIRBASE(c,c);
        p = pairbase_codon_from_seq(seq);

        if( a < 4 && b < 4 && c < 4 ) {
          codon_a = (25 * a) + (5 * b) + c;
          if( is_stop_codon(codon_a,ct) ) {
            m->codon[p] = 0.0;
            continue;
          }
        }

        if( a == 4 && b == 4 && c == 4 ) {
          m->codon[p] = 0.5;
        } else {
          m->codon[p] = 1.0;
        }
      }
    }
  }
}
/*
 * Returns the protein of a translation, building it on first use and caching
 * it in ts->protein.
 *
 * ts - translation; must be non-NULL (asserted)
 * ct - codon table used for translating; must be non-NULL (asserted)
 *
 * The cDNA of ts->parent is translated codon by codon. A stop codon in the
 * final codon position ends the translation silently; a stop codon anywhere
 * else is reported with warn() and written as '*'. The sequence is named
 * "<cDNA name>.tr".
 *
 * Returns ts->protein, or NULL (after a warn()) when there is no parent, no
 * cDNA, no base sequence, or a zero-length base sequence.
 */
Protein * get_Protein_from_Translation(Translation * ts,CodonTable * ct)
{
  cDNA * cd;
  int i,j;
  Sequence * seq;
  char buffer[64];

  assert(ts);
  assert(ct);

  /* already built: hand back the cached protein */
  if( ts->protein != NULL)
    return ts->protein;

  if( ts->parent == NULL ) {
    warn("Cannot get Protein from translation as no parent!");
    return NULL;
  }

  cd = get_cDNA_from_Transcript(ts->parent);

  if( cd == NULL ) {
    warn("Cannot make translation as can't get transcript!");
    return NULL;
  }

  if( cd->baseseq == NULL ) {
    warn("A bad error - a non NULL cDNA with a null sequence object. No translation here!");
    return NULL;
  }

  if( cd->baseseq->len == 0 ) {
    warn("Attempting to translate a zero length cDNA. Yikes!");
    return NULL;
  }

  seq = Sequence_alloc();

  /*
   * Bounded write: a cDNA name longer than the buffer is truncated instead
   * of overrunning the stack buffer.
   */
  snprintf(buffer,sizeof(buffer),"%s.tr",cDNA_name(cd));
  seq->name = stringalloc(buffer);

  /* one residue per codon, plus room for a trailing partial codon and the NUL */
  seq->seq = ckcalloc((cd->baseseq->len/3) + 2,sizeof(char));
  seq->type = SEQUENCE_PROTEIN;

  if( cd->baseseq->len%3 != 0 ) {
    warn("Problem in making translation, cDNA is not mod3! - length is %d - transcript id %s",cd->baseseq->len,seq->name);
  }

  /*
   * NOTE(review): when the length is not a multiple of 3 the last iteration
   * hands fewer than three bases to codon_from_seq / aminoacid_from_seq;
   * confirm that those helpers stop at the terminating NUL.
   */
  for(i=0,j=0;i<cd->baseseq->len;i+=3,j++) {
    if( is_stop_codon(codon_from_seq(cd->baseseq->seq+i),ct) == TRUE ) {
      /* a stop in the last codon is the normal end of the translation */
      if( i+3 >= cd->baseseq->len )
        break;

      warn("Got a stop codon in the middle of a translation at postion [%d]. Yuk!",i);
      seq->seq[j] = '*';
    } else {
      seq->seq[j] = aminoacid_from_seq(ct,cd->baseseq->seq+i);
    }
  }
  seq->seq[j]='\0';

  make_len_type_Sequence(seq);
  /* set the type again after make_len_type_Sequence, as the original code does */
  seq->type = SEQUENCE_PROTEIN;

  ts->protein = Protein_from_Sequence(seq);

  return ts->protein;
}
int main(int argc,char ** argv) { Sequence * gen; Genomic * genomic; CodonTable * ct = NULL; GenomeEvidenceSet * ges = NULL; RandomCodonScore * rcs; FILE * ifp = NULL; ComplexSequence * cs = NULL; ComplexSequenceEvalSet * cses = NULL; AlnBlock * alb; PackAln * pal; GenomicRegion * gr; int i; Protein * trans; cDNA * cdna; int kbyte = 10000; int stop_codon_pen = 200; int start_codon_pen = 30; int new_gene = 5000; int switch_cost = 100; int smell = 8; DPRunImpl * dpri = NULL; EstEvidence * est; boolean show_trans = TRUE; boolean show_cdna = FALSE; boolean show_genes = TRUE; boolean show_alb = FALSE; boolean show_pal = FALSE; boolean show_gff = TRUE; boolean show_debug = FALSE; boolean show_geneu = TRUE; char * divide_string = "//"; strip_out_boolean_def_argument(&argc,argv,"geneutr",&show_geneu); strip_out_boolean_def_argument(&argc,argv,"genes",&show_genes); strip_out_boolean_def_argument(&argc,argv,"trans",&show_trans); strip_out_boolean_def_argument(&argc,argv,"gff",&show_gff); strip_out_boolean_def_argument(&argc,argv,"alb",&show_alb); strip_out_boolean_def_argument(&argc,argv,"pal",&show_pal); strip_out_boolean_def_argument(&argc,argv,"debug",&show_debug); strip_out_boolean_def_argument(&argc,argv,"cdna",&show_cdna); strip_out_integer_argument(&argc,argv,"stop",&stop_codon_pen); strip_out_integer_argument(&argc,argv,"start",&start_codon_pen); strip_out_integer_argument(&argc,argv,"gene",&new_gene); strip_out_integer_argument(&argc,argv,"switch",&switch_cost); strip_out_integer_argument(&argc,argv,"smell",&smell); dpri = new_DPRunImpl_from_argv(&argc,argv); if( dpri == NULL ) { fatal("Unable to build DPRun implementation. 
Bad arguments"); } strip_out_standard_options(&argc,argv,show_help,show_version); if( argc != 3 ) { show_help(stdout); exit(12); } ct = read_CodonTable_file("codon.table"); gen = read_fasta_file_Sequence(argv[1]); ifp = openfile(argv[2],"r"); ges = read_est_evidence(ifp,ct); for(i=0;i<ges->len;i++) { est = (EstEvidence *) ges->geu[i]->data; est->in_smell = smell; } rcs= RandomCodonScore_alloc(); for(i=0;i<125;i++) { if( is_stop_codon(i,ct) ) { rcs->codon[i] = -1000000; } else { rcs->codon[i] = 0; } /* fprintf(stderr,"Got %d for %d\n",rcs->codon[i],i); */ } cses = default_genomic_ComplexSequenceEvalSet(); cs = new_ComplexSequence(gen,cses); pal = PackAln_bestmemory_GenomeWise9(ges,cs,-switch_cost,-new_gene,-start_codon_pen,-stop_codon_pen,rcs,NULL,dpri); alb = convert_PackAln_to_AlnBlock_GenomeWise9(pal); genomic = Genomic_from_Sequence(gen); gr = new_GenomicRegion(genomic); add_Genes_to_GenomicRegion_GeneWise(gr,1,gen->len,alb,gen->name,0,NULL); if( show_genes ) { show_pretty_GenomicRegion(gr,0,stdout); fprintf(stdout,"%s\n",divide_string); } if( show_gff ) { show_GFF_GenomicRegion(gr,gen->name,"genomwise",stdout); fprintf(stdout,"%s\n",divide_string); } if( show_trans ) { for(i=0;i<gr->len;i++) { if( gr->gene[i]->ispseudo == TRUE ) { fprintf(stdout,"#Gene %d is a pseudo gene - no translation possible\n",i); } else { trans = get_Protein_from_Translation(gr->gene[i]->transcript[0]->translation[0],ct); write_fasta_Sequence(trans->baseseq,stdout); } } fprintf(stdout,"%s\n",divide_string); } if( show_cdna ) { for(i=0;i<gr->len;i++) { cdna = get_cDNA_from_Transcript(gr->gene[i]->transcript[0]); write_fasta_Sequence(cdna->baseseq,stdout); } fprintf(stdout,"%s\n",divide_string); } if( show_geneu ) { show_utr_exon_genomewise(alb,stdout); fprintf(stdout,"%s\n",divide_string); } if( show_alb ) { mapped_ascii_AlnBlock(alb,id,1,stdout); fprintf(stdout,"%s\n",divide_string); } if( show_debug ) { debug_genomewise(alb,ges,ct,gen,stdout); fprintf(stdout,"%s\n",divide_string); } 
if( show_pal ) { show_simple_PackAln(pal,stdout); fprintf(stdout,"%s\n",divide_string); } return 0; }
/*
 * Checks to see if the reference sequence (sequence 0 of the alignment)
 * looks okay with respect to a given list of features.
 *
 * features      - list of GFF_Feature pointers to check
 * msa           - alignment; only the reference sequence (index 0) is read
 * offset3       - offset into the extracted sequence at which the 3' splice
 *                 check starts
 * indel_strict  - when non-zero, additionally apply the gap-related checks
 *                 on signal features and on short CDS features
 * splice_strict - passed through to the splice-site validators
 * problems      - list that receives one problem record per failed check
 *
 * Returns TRUE when no feature fails a check, FALSE otherwise. Checking
 * continues after a failure so that every problem is recorded.
 */
int ref_seq_okay(List *features, MSA *msa, int offset3, int indel_strict,
                 int splice_strict, List *problems) {
  List *signals = NULL;
  char *seq = NULL;
  int seqalloc = 0;
  int idx, retval = TRUE;
  GFF_Feature *feat, *lastfeat_helper = NULL;

  /* the list of signal feature names is built once and reused for every
     feature; it is released at the end of the function */
  if (indel_strict) {
    signals = lst_new_ptr(10);
    str_split(str_new_charstr(SIGNALS), ",", signals);
  }

  for (idx = 0; idx < lst_size(features); idx++) {
    int i, j, len, has_gaps = 0;
    int needed;

    feat = lst_get_ptr(features, idx);

    /* room for every column of the feature plus the terminating NUL; grow
       from the required size so short features (length 1 or 2) also get
       enough space */
    needed = feat->end - feat->start + 2;
    if (needed < 1) needed = 1;
    if (seqalloc < needed) {
      seqalloc = needed * 2;
      seq = srealloc(seq, seqalloc * sizeof(char));
    }

    /* copy the reference sequence without gaps, remembering if any were seen */
    for (i = feat->start - 1, len = 0; i < feat->end; i++) {
      if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR)
        seq[len++] = ss_get_char_pos(msa, i, 0, 0);
      else if (!has_gaps) has_gaps = 1;
    }
    seq[len] = '\0';
    if (feat->strand == '-') msa_reverse_compl_seq(seq, len);

    if (str_equals_charstr(feat->feature, GFF_START_TYPE) &&
        strcmp(seq, "ATG") != 0) {
      problem_add(problems, feat, BAD_REF_START, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_STOP_TYPE) &&
             (feat->frame != 0 || !is_stop_codon(seq))) {
      problem_add(problems, feat, BAD_REF_STOP, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_5) &&
             !is_valid_5splice(seq, splice_strict)) {
      problem_add(problems, feat, BAD_REF_5_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_3) &&
             !is_valid_3splice(&seq[offset3], splice_strict)) {
      problem_add(problems, feat, BAD_REF_3_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      /* scan the in-frame codons for a stop codon */
      for (i = (3 - feat->frame) % 3; i <= len - 3; i += 3) {
        if (is_stop_codon(&seq[i])) {
          problem_add(problems, feat, BAD_REF_ORF, -1, -1);
          retval = FALSE;
          break;
        }
      }
    }

    if (indel_strict) {
      int strict_okay = TRUE;

      if (str_in_list(feat->feature, signals)) {
        /* reject any signal feature with gaps in the ref seq, unless they
           appear in a non-critical part of a splice site or in a
           "prestart" feature */
        if (has_gaps) {
          if (str_starts_with_charstr(feat->feature, SPLICE_5)) {
            if (ss_get_char_pos(msa, feat->start-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->start, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (str_starts_with_charstr(feat->feature, SPLICE_3)) {
            if (ss_get_char_pos(msa, feat->end-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->end-2, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (!str_equals_charstr(feat->feature, "prestart"))
            strict_okay = FALSE;
        }

        /* in addition, if two signals occur consec. with gaps and only
           gaps between them, assume a violation of --indel-strict */
        if (lastfeat_helper != NULL && lastfeat_helper->end < feat->start-1) {
          int allgaps = 1;
          for (j = lastfeat_helper->end; allgaps && j < feat->start-1; j++)
            /* note indexing: -1+1 for end and -1 for start */
            if (ss_get_char_pos(msa, j, 0, 0) != GAP_CHAR) allgaps = 0;
          if (allgaps) strict_okay = FALSE;
        }
        lastfeat_helper = feat;
      }
      else lastfeat_helper = NULL;

      /* also exclude CDS exons of length less than 6 in indel_strict case
         -- these cause problems in exoniphy training because start_codon
         is adjacent to cds5ss */
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && len <= 6)
        strict_okay = FALSE;

      if (!strict_okay) {
        problem_add(problems, feat, BAD_REF_INDEL_STRICT_FAIL, -1, -1);
        retval = FALSE;
      }
    }
  }

  if (signals != NULL) {
    lst_free_strings(signals);
    lst_free(signals);
  }
  if (seq != NULL) sfree(seq);
  return retval;
}
PairBaseCodonModel * make_PairBaseCodonModel(CodonMatrix * codon_matrix,Probability nonm,Probability gap,CodonTable * ct) { PairBaseCodonModel * out; int a,b,c,x,y,z; int i; int codon_a; int codon_b; int p; pairbase_type seq[3]; assert(codon_matrix); assert(ct); out = PairBaseCodonModel_alloc(); for(i=0;i<PAIRBASE_CODON_LENGTH;i++) { out->codon[i] = 0.0; } for(a=0;a<5;a++) { for(b=0;b<5;b++) { for(c=0;c<5;c++) { for(x=0;x<5;x++) { for(y=0;y<5;y++) { for(z=0;z<5;z++) { /* build the sequence */ seq[0] = MAKE_PAIRBASE(a,x); seq[1] = MAKE_PAIRBASE(b,y); seq[2] = MAKE_PAIRBASE(c,z); p = pairbase_codon_from_seq(seq); codon_a = (a * 25) + (b * 5) + c; codon_b = (x * 25) + (y * 5) + z; if( is_stop_codon(codon_a,ct) || is_stop_codon(codon_b,ct) ) { out->codon[p] = 0.0; continue; } /* else */ out->codon[p] = codon_matrix->prob[codon_a][codon_b]; } } } } } } /* now to do blank and gap scores */ for(a=0;a<5;a++) { for(b=0;b<5;b++) { for(c=0;c<5;c++) { codon_a = (a * 25) + (b * 5) + c; seq[0] = MAKE_PAIRBASE(a,BASE_GAP); seq[1] = MAKE_PAIRBASE(b,BASE_GAP); seq[2] = MAKE_PAIRBASE(c,BASE_GAP); p = pairbase_codon_from_seq(seq); if( is_stop_codon(codon_a,ct) ) { out->codon[p] = 0.0; } else { out->codon[p] = gap; } seq[0] = MAKE_PAIRBASE(BASE_GAP,a); seq[1] = MAKE_PAIRBASE(BASE_GAP,b); seq[2] = MAKE_PAIRBASE(BASE_GAP,c); p = pairbase_codon_from_seq(seq); if( is_stop_codon(codon_a,ct) ) { out->codon[p] = 0.0; } else { out->codon[p] = gap; } seq[0] = MAKE_PAIRBASE(a,BASE_OPEN); seq[1] = MAKE_PAIRBASE(b,BASE_OPEN); seq[2] = MAKE_PAIRBASE(c,BASE_OPEN); p = pairbase_codon_from_seq(seq); if( is_stop_codon(codon_a,ct) ) { out->codon[p] = 0.0; } else { out->codon[p] = nonm; } seq[0] = MAKE_PAIRBASE(BASE_OPEN,a); seq[1] = MAKE_PAIRBASE(BASE_OPEN,b); seq[2] = MAKE_PAIRBASE(BASE_OPEN,c); p = pairbase_codon_from_seq(seq); if( is_stop_codon(codon_a,ct) ) { out->codon[p] = 0.0; } else { out->codon[p] = nonm; } } } } return out; }
/*
 * Builds a CodonMapper: for each codon number (0..124), a row of 26 weights,
 * one per amino acid.
 *
 * ct - codon table; a hard link to it is stored in the mapper
 * cf - codon frequencies, indexed by base-4 codon number
 *
 * Rows are filled as follows:
 *   - codon without N bases, stop codon: all zeros
 *   - codon without N bases, otherwise : its frequency in the slot of the
 *     amino acid it encodes
 *   - codon with N bases: for every concrete codon it could stand for (each
 *     N position matches any of the four bases), the frequency of that
 *     concrete codon is added to the slot of its amino acid; concrete stop
 *     codons are skipped
 *
 * Returns a newly allocated CodonMapper.
 */
CodonMapper * new_CodonMapper(CodonTable * ct,CodonFrequency * cf)
{
  CodonMapper * out;
  int codon;
  int aa;
  int b4;
  int p1,p2,p3;
  int concrete;
  base first;
  base second;
  base third;

  out = CodonMapper_alloc();
  out->ct = hard_link_CodonTable(ct);

  for(codon=0;codon<125;codon++) {
    for(aa=0;aa<26;aa++)
      out->codon_map[codon][aa] = 0.0;

    if( has_random_bases(codon) == FALSE ) {
      /* fully specified codon: a stop codon keeps its all-zero row */
      if( is_stop_codon(codon,ct) != TRUE ) {
        out->codon_map[codon][aminoacid_no_from_codon(ct,codon)] = cf->freq[base4_codon_from_codon(codon)];
      }
      continue;
    }

    /*
     * The codon contains at least one N. Visit all 64 concrete codons in
     * ascending base-4 order and let through those that agree with this
     * codon at every position that is not an N.
     */
    all_bases_from_codon(codon,&first,&second,&third);

    for(b4=0;b4<64;b4++) {
      p1 = b4 / 16;
      p2 = (b4 / 4) % 4;
      p3 = b4 % 4;

      if( first != BASE_N && first != p1 )
        continue;
      if( second != BASE_N && second != p2 )
        continue;
      if( third != BASE_N && third != p3 )
        continue;

      concrete = codon_from_base4_codon(b4);
      if( is_stop_codon(concrete,ct) )
        continue;

      out->codon_map[codon][aminoacid_no_from_codon(ct,concrete)] += cf->freq[base4_codon_from_codon(concrete)];
    }
  }

  return out;
}