cDNA * get_cDNA_from_Transcript(Transcript * trs) { Genomic * gn; Sequence * base; int i; char buffer[64]; if( trs->cDNA != NULL) return trs->cDNA; if( trs->parent == NULL ) { warn("Cannot get cDNA, as no parent Gene!"); return NULL; } if ( (gn = get_Genomic_from_Gene(trs->parent)) == NULL ) { warn("Cannot get cDNA, as cannot get Genomic sequence from Gene"); return NULL; } base = Sequence_alloc(); sprintf(buffer,"%s.sp",Genomic_name(gn)); base->name = stringalloc(buffer); base->seq = ckcalloc(length_Transcript(trs)+1,sizeof(char)); base->seq[0]='\0'; for(i=0;i<trs->ex_len;i++) { strncat(base->seq,gn->baseseq->seq+trs->exon[i]->start,trs->exon[i]->end-trs->exon[i]->start); } make_len_type_Sequence(base); base->type = SEQUENCE_CDNA; trs->cDNA = cDNA_from_Sequence(base); return trs->cDNA; }
boolean is_simple_prediction_Gene(Gene * g) { if( g->len > 1 ) return FALSE; if( g->transcript[0]->len > 1 ) return FALSE; if( g->transcript[0]->translation[0]->start != 0 || g->transcript[0]->translation[0]->end != length_Transcript(g->transcript[0]) ) return FALSE; return TRUE; }
Gene * read_EMBL_feature_Gene(char * buffer,int maxlen,FILE * ifp) { Gene * gene; Transcript * tr; Translation * ts; Exon * exon; char * runner; char * base; char * next; int i; int exon_start[MAX_EMBL_EXON_PARSE]; int exon_end[MAX_EMBL_EXON_PARSE]; int number; int exon_no = 0; int isstart = 1; int is_complement = 0; int is_cds = 0; int break_at_end = 0; if( strstartcmp(buffer,"FT") != 0 ) { warn("passed in a bad line [%s] to be used for feature table parsing",buffer); return NULL; } if( (runner=strtok(buffer+2,spacestr)) == NULL ) { warn("Bad embl feature line [%s]",buffer); return NULL; } if( strcmp(runner,"CDS") != 0 && strcmp(runner,"mRNA") != 0 ) { warn("passed in a feature line to read_EMBL_feature_Gene with a %s tag. This only handles CDS and mRNA tags",runner); return NULL; } if( strcmp(runner,"CDS") == 0 ) { is_cds = TRUE; } runner = strtok(NULL,spacestr); if( runner == NULL ) { warn("Bad embl feature line [%s]",buffer); return NULL; } if( strstartcmp(runner,"complement") == 0 ) { runner = strchr(runner,'('); if( runner == NULL) { warn("Could not find bracket on EMBL feature complement line"); return NULL; } is_complement = 1; runner++; } if( strstartcmp(runner,"join") == 0 ) { runner = strchr(runner,'('); runner++; } else if( isdigit((int)*runner) || *runner == '<' ) { /** ok - starts with the numbers. We'll cope!**/ } else { warn("Expecting a join statement, got a [%s]",runner); return NULL; } /*** ok, now the major number loop ***/ for(;;) { base= runner; for(;*runner && *runner != ')' && *runner != '.' && *runner != ',' && *runner != '>' && !isspace((int)*runner);runner++) ; /*fprintf(stderr,"Got a runner of %s\n ",runner); */ if( *runner == '\0' ) next = runner; else next = runner+1; if( *runner == ')' ) { break_at_end = TRUE; /* out of reading exons */ } *runner='\0'; if( strstartcmp(base,"complement(") == 0 ) { is_complement = TRUE; for(;*base != '(';base++) ; base++; break_at_end = FALSE; /* we found an bracket too early! */ } if( is_integer_string(base,&number) == FALSE ) { warn("Got a non integer [%s] in the middle of a join statement in EMBL parsing",runner); return NULL; } /** put this number away **/ if( isstart ) { exon_start[exon_no] = number; isstart = 0; } else { exon_end[exon_no++] = number; isstart = 1; } if( break_at_end == TRUE) break; for(runner=next;*runner && (*runner == '.' || isspace((int)*runner));runner++) ; if( *runner == '\0' ) { if( next_feature_tab_line(buffer,maxlen,ifp) == FALSE) { warn("In the middle of getting a join statement, got a [%s]. Yuk!",buffer); return NULL; } if( !isdigit((int)buffer[0]) && buffer[0] != '.' && buffer[0] != ',') { /*** ok - sometimes people very boring end things in here ***/ /* warn("In the middle of getting a join statement, got a [%s]. Ugh!",buffer); */ break; } runner = buffer; } } if( isstart == 0 ) { warn("I have read an uneven number of start-end points in the exon thing. Yuk!"); return NULL; } /** runner should now be on bracket **/ if( is_complement == 1 ) { /** ok . should be another bracket. Do we care? **/ } gene = Gene_alloc_len(1); tr = Transcript_alloc_len(exon_no); add_Gene(gene,tr); tr->parent = gene; if( is_complement == 1 ) { gene->start = exon_end[exon_no-1]-1; gene->end = exon_start[0] -1; for(i=exon_no -1;i >= 0;i--) { exon = Exon_alloc(); exon->start = (gene->start+1) - exon_end[i]; exon->end = (gene->start+1) - exon_start[i] +1; add_ex_Transcript(tr,exon); } } else { gene->start = exon_start[0] -1; gene->end = exon_end[exon_no-1] -1; for(i=0;i<exon_no;i++) { exon = Exon_alloc(); exon->start = exon_start[i] - (gene->start+1); exon->end = exon_end[i] - (gene->start+1)+1; add_ex_Transcript(tr,exon); } } if( is_cds == TRUE ) { ts = Translation_alloc(); ts->start = 0; ts->end = length_Transcript(tr); ts->parent = tr; add_Transcript(tr,ts); } /*** read the rest of this feature ***/ while( next_feature_tab_line(buffer,maxlen,ifp) == TRUE) ; return gene; }