/*
 * wo
 *   Reads one chunk of sequence as triplets and scans every suffix of it.
 *
 *   len         - maximum number of triplets to read; seq must hold at
 *                 least len+2 bytes (that many are cleared here)
 *   spp         - sequence port to read from
 *   iseg        - port position at which the chunk starts
 *   cloc        - result record; curlevel/curstart/curend are reset on entry
 *                 and updated by wo1(); on return curend has had curstart
 *                 added to it
 *   invrescount - running invalid-residue counter, forwarded to dusttripfind()
 *   seq         - caller-supplied triplet buffer
 *
 *   Returns the number of triplets dusttripfind() stored in seq.
 */
static Int4 wo (Int4 len, SeqPortPtr spp, Int4 iseg, DCURLOC PNTR cloc,
                Int4 PNTR invrescount, UcharPtr seq)
{
	Int4 ntrip;
	Int4 offset;
	UcharPtr window;

	/* start from an empty result record */
	cloc->curend = 0;
	cloc->curstart = 0;
	cloc->curlevel = 0;

	/* fetch the chunk as triplets into a freshly zeroed buffer */
	SeqPortSeek (spp, iseg, SEEK_SET);
	MemSet (seq, 0, len+2);
	ntrip = dusttripfind (spp, seq, iseg, len, invrescount);

	/* hand every suffix of the chunk, with its offset, to wo1() */
	offset = 0;
	window = seq;
	while (offset < ntrip) {
		wo1 (ntrip - offset, window, offset, cloc);
		++window;
		++offset;
	}

	cloc->curend += cloc->curstart;

	return ntrip;
}
/*
 * FillCSANWithSeq
 *   WARNING: not called and not tested...
 *
 *   Allocates pcsanThis->pcSeqAln (iLen characters plus a terminating NUL)
 *   and fills it with the residues of the protein Bioseq pbsq, read through
 *   a SeqPort in Seq_code_ncbieaa.  If the sequence yields fewer than iLen
 *   residues, the remainder is padded with '-'.
 *
 *   pcsanThis - alignment node that receives the new string
 *   pbsq      - Bioseq to read; must be a protein (ISA_aa)
 *   iLen      - number of characters to produce
 *
 *   Returns the number of characters written (iLen) on success, or 0 when
 *   an argument is unusable, the Bioseq is not a protein, or the SeqPort or
 *   the buffer cannot be created.
 *
 *   NOTE(review): any string already stored in pcsanThis->pcSeqAln is
 *   overwritten without being freed - confirm ownership with the callers.
 */
static Int4 FillCSANWithSeq(PCSAN pcsanThis, BioseqPtr pbsq, Int4 iLen)
{
    SeqPortPtr spp = NULL;
    Uint1 code = Seq_code_ncbieaa;
    Uint1 residue;
    Int4 iCount = 0;
    CharPtr pcA;

    if (!pcsanThis) return 0;
    if (!pbsq) return 0;
    if (!ISA_aa(pbsq->mol)) return 0;
    if (iLen <= 0) return 0;

    spp = SeqPortNew(pbsq, 0, -1, 0, code);
    if (!spp) return 0;
    SeqPortSeek(spp, 0, SEEK_SET);

    pcA = (CharPtr) MemNew((size_t) iLen + 1);
    if (!pcA) {
        /* do not leak the port when the buffer cannot be allocated */
        SeqPortFree(spp);
        return 0;
    }
    pcsanThis->pcSeqAln = pcA;

    /*
     * Copy residues until the sequence ends or the buffer is full.
     * The bound used to read (iLen < iCount), which is never true on
     * entry, so nothing was copied and the result was all '-'.
     */
    residue = SeqPortGetResidue(spp);
    while ((residue != SEQPORT_EOF) && (residue != '\0') && (iCount < iLen)) {
        *pcA = (char) residue;
        pcA++;
        iCount++;
        residue = SeqPortGetResidue(spp);
    }

    /* pad a short sequence out to the requested length */
    while (iCount < iLen) {
        *pcA = '-';
        pcA++;
        iCount++;
    }
    pcsanThis->pcSeqAln[iLen] = '\0';

    SeqPortFree(spp);
    return iCount;
}
/******************************************************************
*
*	print_protein_for_cds(sfp, buf, loc, reverse_minus)
*	Writes into buf one character per position of the coding
*	region sfp that falls inside loc:
*	  '^'      the computed amino-acid index is outside 0..end_pos
*	  residue  at the middle base of a codon (val%3 == 1); any
*	           residue that is not a letter, '*' or '-' is shown
*	           as '?'
*	  ' '      at the other two bases of a codon
*
*	sfp:	the Seq-feat; must have data.choice == 3 and a
*		non-NULL data pointer (used as a CdRegion)
*	buf:	output buffer. If buf[0] is '\0' on entry the
*		function also writes the terminating '\0'
*		("seal_ends"); otherwise it only overwrites characters
*	loc:	the Seq-loc whose length (SeqLocLen) defines the
*		buffer coordinate system
*	reverse_minus:	when TRUE and ck_reverse() reports that the
*		strands of sfp->location and loc differ, the
*		buffer is filled from its last position backwards
*
*	The protein is read from sfp->product through a SeqPort when
*	a product exists and IS_BOGO_Product(sfp->ext) is false;
*	otherwise it is obtained from ProteinFromCdRegion(sfp, TRUE).
*
*	Returns -1 when an argument is unusable or no protein could
*	be obtained. Otherwise returns the amino-acid index of the
*	first residue written; if no residue was written the result
*	is the last computed index (aa) when p_pos ends non-zero, and
*	-1 when p_pos ends at 0.
*
*	NOTE(review): p_pos and buf_len are Int2 while SeqLocLen()
*	is assigned from an Int4 context - presumably loc is expected
*	to be shorter than 32K; confirm with callers.
*
******************************************************************/
NLM_EXTERN Int4 print_protein_for_cds(SeqFeatPtr sfp, CharPtr buf, SeqLocPtr loc, Boolean reverse_minus)
{
	CdRegionPtr crp;
	Int4 frame_offset, start_offset;
	Uint1 f_strand;
	Boolean reverse;
	Int4 cd_len;	/* summed length of the CDS intervals already visited */
	GatherRange gr;
	Int2 p_pos, buf_len;	/* current write index in buf / length of loc */
	Int4 a_left, a_right;	/* offsets along the concatenated CDS intervals */
	Int4 aa, val;
	SeqLocPtr slp;
	SeqPortPtr spp;	/* protein source 1: the product sequence */
	ByteStorePtr p_data;	/* protein source 2: the translation */
	Int4 end_pos, start_pos = -1;	/* last valid aa index / first aa written */
	Uint1 residue;
	Boolean seal_ends = FALSE;
	Boolean reverse_order;

	if(sfp == NULL || sfp->data.choice != 3)
		return -1;
	if(buf == NULL || loc == NULL)
		return -1;
	crp = sfp->data.value.ptrvalue;
	if(crp == NULL)
		return -1;
	/* an empty buffer means this call is responsible for terminating it */
	if(buf[0] == '\0')
		seal_ends = TRUE;

	/* prefer the product sequence; fall back to translating the CDS */
	spp = NULL;
	p_data = NULL;
	if(sfp->product !=NULL && !IS_BOGO_Product(sfp->ext))
	{
		spp = SeqPortNewByLoc(sfp->product, Seq_code_ncbieaa);
		if(spp !=NULL)
		{
			SeqPortSeek(spp, 0, SEEK_SET);
			end_pos = spp->totlen-1;
		}
	}
	if(spp == NULL)
	{
		p_data = ProteinFromCdRegion(sfp, TRUE);
		/* p_data = ProteinFromCdRegion(sfp, FALSE); */
		if(p_data !=NULL)
		{
			BSSeek(p_data, 0, SEEK_SET);
			end_pos = BSLen(p_data)-1;
		}
	}
	if(spp == NULL && p_data == NULL)
		return -1;

	/* crp->frame of 0 is treated like 1: no leading bases are skipped */
	if(crp->frame == 0)
		frame_offset = 0;
	else
		frame_offset = (Int4)crp->frame-1;
	start_offset = frame_offset;

	f_strand = SeqLocStrand(sfp->location);
	reverse = ck_reverse(f_strand, SeqLocStrand(loc));
	/*if reverse == TRUE, the translated protein is written backwards*/
	if(reverse && reverse_minus)
		reverse_order = TRUE;
	else
		reverse_order = FALSE;

	slp = NULL;
	cd_len = 0;
	aa = 0;
	buf_len = SeqLocLen(loc);
	if(reverse_order)
	{
		/* start at the last position; the terminator is written up front,
		   so seal_ends is cleared and the tail handling below is skipped */
		p_pos = buf_len -1;
		if(seal_ends)
		{
			buf[p_pos+1] = '\0';
			seal_ends = FALSE;
		}
	}
	else
		p_pos = 0;

	/* visit every interval of the coding region */
	while((slp = SeqLocFindNext(sfp->location, slp))!=NULL)
	{
		/* presumably: gr = the part of loc covered by slp - TODO confirm
		   against the SeqLocOffset documentation */
		if(SeqLocOffset(loc, slp, &gr, 0))
		{
			/* jump the write position to where this interval begins */
			if(reverse_order)
			{
				if(gr.right < p_pos)
					p_pos = (Int2)(gr.right);
			}
			else
			{
				if(p_pos < gr.left)
					p_pos = (Int2)(gr.left);
			}
			/* same call with the arguments swapped, then shifted by the
			   length of the intervals already processed */
			SeqLocOffset(slp, loc, &gr, 0);
			a_left = gr.left + cd_len;
			a_right = gr.right + cd_len;
			/* if(reverse_order) { temp = a_right; a_right = -a_left; a_left = -temp; } */
			for(; a_left<=a_right; ++a_left)
			{
				/* val: base offset after removing the frame shift;
				   aa: index of the codon that base belongs to */
				val = ABS(a_left) - start_offset;
				aa = val/3;
				if(aa < 0 || aa > end_pos)/*stop & partial codon*/
				{
					buf[p_pos] = '^';
				}
				else
				{
					if(val%3==1)/*label aa in the middle of 3-bp codon*/
					{
						if(start_pos == -1)
							start_pos = aa;
						if(spp !=NULL)
						{
							SeqPortSeek(spp, aa, SEEK_SET);
							residue = SeqPortGetResidue(spp);
						}
						else
						{
							BSSeek(p_data, aa, SEEK_SET);
							residue = (Uint1)BSGetByte(p_data);
						}
						if(IS_ALPHA(residue) || residue == '*' || residue == '-')
							buf[p_pos] = residue;
						else
							buf[p_pos] = '?';
					}
					else
						buf[p_pos] = ' ';
				}
				if(reverse_order)
					-- p_pos;
				else
				{
					++p_pos;
					/* leaves only the for loop; the next interval is
					   still examined */
					if (p_pos > buf_len)
						break;
				}
			}
		}
		cd_len += SeqLocLen(slp);
		/*frame_offset = (cd_len - start_offset)%3; if(frame_offset > 0) --frame_offset;*/
	}

	if(spp != NULL)
		SeqPortFree(spp);
	if(p_data != NULL)
		BSFree(p_data);

	if(p_pos == 0) /*all the residues are introns*/
	{
		/* nothing was written: fill the whole line with '~' */
		if(seal_ends)
		{
			end_pos = buf_len;
			MemSet((Pointer)buf, '~', (size_t)(end_pos) * sizeof(Char));
			buf[end_pos] = '\0';
		}
	}
	else
	{
		if(seal_ends)
		{
			buf[p_pos] = '\0';
		}
		/* no residue was labelled: report the last codon index instead */
		if(start_pos == -1)
			start_pos = aa;
	}

	return start_pos;
}
/******************************************************************
*
*	aa_to_codon(sfp, aa_start, aa_stop)
*	generate a list of CodonVector to show the codons of an
*	amino acid sequence
*	sfp: the Seq-feat for cds
*	aa_start: the start position of protein sequence
*	aa_stop: the stop position of protein sequence
*
*	One CodonVector is appended for every interval of
*	sfp->location that overlaps aa_start..aa_stop and whose
*	Bioseq can be locked. Each vector records the Seq-id, strand,
*	exon count, frame, aa_index and dna_pos of the interval, and
*	three strings (buf[0..2]) holding the first, second and third
*	base of each codon in lower case.
*
*	Returns the head of the ValNode list, or NULL when sfp is
*	NULL, is not a coding region (data.choice != 3), has no
*	CdRegion or location, or the work buffers cannot be
*	allocated. The caller owns the returned list.
*
******************************************************************/
NLM_EXTERN ValNodePtr aa_to_codon(SeqFeatPtr sfp, Int4 aa_start, Int4 aa_stop)
{
	BioseqPtr bsp;
	Int4 frame_offset, start_offset;
	SeqLocPtr slp = NULL;
	SeqLocPtr cdloc;
	CdRegionPtr crp;
	Uint1 frame;
	Boolean is_end;	/**is the end for process reached?**/
	Int4 p_start=0, p_stop=0;	/**protein start & stop in defined
					corresponding CdRegion Seq-loc**/
	Int4 line_len;
	Int4 cur_pos;	/**current protein position in process**/
	Int4 cd_len;	/**length of the cDNA for the coding region**/
	Int2 i, j;
	Int2 k, n;
	CharPtr PNTR buf;
	Boolean is_new;	/**Is cur_pos at the begin of new Seq-loc?**/
	CharPtr temp;
	SeqPortPtr spp;
	Uint1 residue;
	Boolean end_partial;
	Int4 d_start, seq_pos;
	Int2 pos;
	ValNodePtr head= NULL;
	CodonVectorPtr cvp;
	Boolean prt_stop_codon;
	Uint2 exon;

	/* sfp used to be dereferenced unchecked; guard it the same way
	   print_protein_for_cds does */
	if(sfp == NULL)
		return NULL;
	if(sfp->data.choice !=3)
		return NULL;
	crp = sfp->data.value.ptrvalue;
	if(!crp)
		return NULL;

	frame = crp->frame;
	cdloc = sfp->location;
	if(cdloc == NULL )
		return NULL;
	if(frame>0)
		frame_offset = frame-1;
	else
		frame_offset = 0;
	start_offset = frame_offset;

	prt_stop_codon = (aa_stop == SeqLocStop(sfp->product));
	line_len = (aa_stop - aa_start + 1) + 1;
	/* +1 for the possible partial start codon*/
	if(prt_stop_codon)/*can be either as a stop codon or partial stop*/
		++line_len;

	/* three scratch lines, one per codon position; bail out cleanly
	   if any of them cannot be allocated */
	buf = MemNew((size_t)3 * sizeof(CharPtr));
	if(buf == NULL)
		return NULL;
	for(i =0; i<3; ++i)
		buf[i] = MemNew((size_t)(line_len + 1) * sizeof (Char));
	if(buf[0] == NULL || buf[1] == NULL || buf[2] == NULL)
	{
		for(i=0; i<3; ++i)
			MemFree(buf[i]);
		MemFree(buf);
		return NULL;
	}

	cur_pos= aa_start;
	cd_len = 0;
	is_end = FALSE;
	p_start = 0;
	slp = NULL;
	exon = 0;
	while(!is_end && ((slp = SeqLocFindNext(cdloc, slp))!=NULL))
	{
		++exon;
		cd_len += SeqLocLen(slp);
		/* does this interval end in the middle of a codon? */
		end_partial = ((cd_len - start_offset)%3 != 0);
		p_stop = (cd_len - start_offset)/3 -1;
		if(end_partial)
			++p_stop;
		if(p_stop > aa_stop || (p_stop == aa_stop && !end_partial))
		{
			p_stop = aa_stop;	/**check if the end is reached**/
			is_end = TRUE;
		}

		if(p_stop >= cur_pos)	/*get the exon*/
		{
			bsp = BioseqLockById(SeqLocId(slp));
			if(bsp)
			{
				is_new = (p_start == cur_pos);	/*start a new exon?*/
				cvp = MemNew(sizeof(CodonVector));
				if(cvp == NULL)
				{
					/* out of memory: return what has been built so far */
					BioseqUnlock(bsp);
					break;
				}
				cvp->sip = SeqIdDup(find_sip(bsp->id));
				cvp->strand = SeqLocStrand(slp);
				cvp->exonCount = exon;
				if(is_new)
				{
					if(frame_offset == 0)
						cvp->frame = 0;
					else
						cvp->frame = 3- (Uint1)frame_offset;
				}
				else
					cvp->frame = 0;
				if(cur_pos==0 && frame_offset > 0)	/*partial start codon*/
					cvp->aa_index = 0;
				else
					cvp->aa_index = 1;

				if(is_new)	/**special case of the first partial**/
					d_start = SeqLocStart(slp);
				else
				{
					if(frame_offset && p_start >0)
						++p_start;
					d_start = SeqLocStart(slp) + 3*(cur_pos - p_start) + frame_offset;
				}
				/**p_start is the start position of aa in the current Seq-loc
				   cur_pos is the current aa that is in process. The offset will
				   help to locate the position on the DNA Seq-loc for translation
				   d_start is the position of the starting DNA in the coordinates
				   of DNA segment, used for marking the sequence
				**/

				seq_pos = d_start - SeqLocStart(slp);	/**the pos in spp**/
				if(SeqLocStrand(slp)== Seq_strand_minus)
					d_start = SeqLocStop(slp) - seq_pos;
				cvp->dna_pos = d_start;

				n = (Int2)cur_pos - (Int2)aa_start + cvp->aa_index;	/*position in buffer*/
				for(i =0; i<3; ++i)
					make_empty(buf[i], (Int2)line_len);
				spp = SeqPortNewByLoc(slp, Seq_code_iupacna);
				SeqPortSeek(spp, seq_pos, SEEK_SET);

				/**store the partial codons**/
				if(is_new && frame_offset > 0)
				{
					k = (Int2)frame_offset;
					while(k > 0)
					{
						residue = SeqPortGetResidue(spp);
						temp = buf[3-k];	/**the position**/
						pos = n;
						temp[pos] = TO_LOWER(residue);
						--k;
					}
					++n;
					if(cur_pos!=0)
						++cur_pos;
				}

				/**load the codons**/
				k =0;
				while((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF && cur_pos <= p_stop)
				{
					j= (Uint1)k%3;
					temp = buf[j];
					temp[n] = TO_LOWER(residue);
					if(j ==2)
					{	/**the last base**/
						++n;
						if(!prt_stop_codon|| !is_end)	/*for the last codon*/
							/**prt_end controls to print the whole loc**/
							++cur_pos;
					}
					++k;
				}	/**end of while**/

				SeqPortFree(spp);
				/* the vector keeps its own copies of the three lines */
				for(i =0; i<3; ++i)
					cvp->buf[i] = StringSave(buf[i]);
				ValNodeAddPointer(&head, 0, (Pointer)cvp);
				BioseqUnlock(bsp);
			}/*end of if(bsp)*/
		}/**end of if for matched intervals**/

		/* carry the protein position and the frame into the next interval */
		if(end_partial)
			p_start = p_stop;
		else
			p_start = p_stop +1;

		frame_offset = (cd_len - start_offset)%3;
		if(frame_offset >0)
			frame_offset = 3-frame_offset;
	}/**end of while(slp && !is_end) **/

	for(i=0; i<3; ++i)
		MemFree(buf[i]);
	MemFree(buf);

	return head;
}
/*
 * dusttripfind
 *   Reads residues from spp starting at icur and packs them into
 *   overlapping triplets: each stored byte combines three consecutive
 *   residue codes, shifted left by two bits per step.
 *   (presumably spp delivers 2-bit nucleotide codes - TODO confirm)
 *
 *   spp         - sequence port to read
 *   s1          - output buffer; must be zeroed by the caller and hold at
 *                 least max+2 bytes
 *   icur        - port position to seek to before reading
 *   max         - maximum number of triplets to store
 *   invrescount - counter of invalid residues; incremented for each one,
 *                 and only the first three are reported through ErrPostEx
 *
 *   Returns the number of complete triplets stored.  Reading stops at
 *   SEQPORT_EOF, after max triplets, or at any non-residue value other
 *   than SEQPORT_EOS.  Returns 0 if either of the first two reads hits
 *   EOF, a segment end or a virtual segment.
 */
static Int4 dusttripfind (SeqPortPtr spp, UcharPtr s1, Int4 icur, Int4 max, Int4 PNTR invrescount)
{
	Int4 where;
	Int4 ntrip = 0;
	Int2 step;
	Int2 c;
	UcharPtr s2 = s1 + 1;
	UcharPtr s3 = s1 + 2;

	SeqPortSeek (spp, icur, SEEK_SET);

	/* prime the first two residues; no triplet is complete yet */
	for (step = 0; step < 2; step++) {
		c = SeqPortGetResidue (spp);
		if (c == SEQPORT_EOF || c == SEQPORT_EOS || c == SEQPORT_VIRT) {
			return ntrip;
		}
		if (!IS_residue (c)) {
			c = 0;                          /* 255 it's 'A' */
			if (*invrescount < 3) {
				where = SeqPortTell (spp);
				ErrPostEx (SEV_INFO, 5, 1,
				           "Invalid residue converted to 'A': %ld",
				           (long) where);
				ErrShow ();
			}
			(*invrescount)++;
		}
		if (step == 0) {
			*s1 |= c;
			*s1 <<= 2;
		} else {
			*s1 |= c;
			*s2 |= c;
		}
	}

	/* every further residue completes one triplet */
	for (;;) {
		c = SeqPortGetResidue (spp);
		if (c == SEQPORT_EOF || ntrip >= max) {
			break;
		}

		if (c == INVALID_RESIDUE) {
			c = 0;                          /* 255 it's 'A' */
			if (*invrescount < 3) {
				where = SeqPortTell (spp);
				ErrPostEx (SEV_INFO, 5, 1,
				           "Invalid residue converted to 'A': %ld",
				           (long) where);
				ErrShow ();
			}
			(*invrescount)++;
		}

		if (IS_residue (c)) {
			*s1 <<= 2;
			*s2 <<= 2;
			*s1 |= c;
			*s2 |= c;
			*s3 |= c;
			s1++;
			s2++;
			s3++;
			ntrip++;
			continue;
		}

		/* a segment end (252) is skipped; a virtual segment (251) or
		   anything else ends the chunk - don't dust across it */
		if (c != SEQPORT_EOS) {
			break;
		}
	}

	return ntrip;
}
/*
 * BioseqRawToRaw
 *   Appends the id/title line and (unless idonly) the residues of one raw
 *   or const Bioseq to caller-owned, malloc'ed strings.
 *
 *   bsp      - Bioseq to convert; ignored unless its repr is raw or const
 *   idonly   - when true only the id line (" N) id title\n") is appended
 *              to *seqid and no sequence is read
 *   whichSeq - 1-based index of the sequence wanted, or 0 for every one
 *   seqnum   - running counter, incremented for each raw/const Bioseq seen
 *   seq      - in/out: NULL or a malloc'ed string; residues are appended
 *   seqid    - in/out: NULL or a malloc'ed string; the id text is appended
 *              (separated by "; " when not idonly)
 *   seqlen   - out: length of *seq after appending
 *
 *   This may be called several times for a single sequence because the
 *   caller explores parts and joins them, so *seq, *seqid and *seqlen may
 *   already contain data.
 *
 *   On allocation failure the function returns early, but *seq and *seqid
 *   always remain valid NUL-terminated strings (or NULL) and the SeqPort
 *   is released.
 */
void BioseqRawToRaw(BioseqPtr bsp, Boolean idonly, short whichSeq, short *seqnum,
                    char **seq, char **seqid, long *seqlen)
{
  SeqPortPtr spp;
  SeqIdPtr   bestid;
  Uint1      repr, code, residue;
  CharPtr    tmp, title;
  long       outlen, outmax, room;
  char       localid[256], *sp, *grown;

  if (bsp == NULL) return;
  repr = Bioseq_repr(bsp);
  if (!(repr == Seq_repr_raw || repr == Seq_repr_const)) return;

  (*seqnum)++;
  if (!(whichSeq == *seqnum || whichSeq == 0)) return;

  /* build "<id> <title>" (prefixed with " N)" when idonly) in localid */
  bestid = SeqIdFindBest(bsp->id, (Uint1) 0);
  title = BioseqGetTitle(bsp);
  if (idonly) {
    sprintf(localid, " %d) ", *seqnum);
    tmp = localid + strlen(localid) - 1;
  } else {
    strcpy(localid, " ");
    tmp = localid;
  }
  tmp = SeqIdPrint(bestid, tmp, PRINTID_FASTA_SHORT);
  tmp = StringMove(tmp, " ");
  if (tmp != NULL) {
    /* At most 200 title characters, never past the end of localid, and
       always terminated: the bare StringNCpy(tmp, title, 200) left the
       buffer unterminated for titles of 200+ characters. */
    room = (long) sizeof(localid) - 1 - (long) (tmp - localid);
    if (room > 200) room = 200;
    if (room > 0) {
      StringNCpy(tmp, title, (size_t) room);
      tmp[room] = '\0';
    }
  }

  /* append localid to the variable-sized *seqid */
  outmax = (long) strlen(localid) + 3;
  if (*seqid == NULL) {
    *seqid = (char*) malloc((size_t) outmax);
    if (*seqid == NULL) return;
    strcpy(*seqid, localid);
  } else {
    outmax += (long) strlen(*seqid) + 2;
    grown = (char*) realloc(*seqid, (size_t) outmax);
    if (grown == NULL) return;          /* the old *seqid is still valid */
    *seqid = grown;
    if (!idonly) strcat(*seqid, "; ");
    strcat(*seqid, localid);
  }

  if (idonly) {
    strcat(*seqid, "\n");
    return;
  }

  if (ISA_na(bsp->mol)) code = Seq_code_iupacna;
  else code = Seq_code_iupacaa;
  spp = SeqPortNew(bsp, 0, -1, 0, code);
  SeqPortSeek(spp, 0, SEEK_SET);

  /* get an output buffer with 500 bytes of head-room */
  sp = *seq;
  if (sp == NULL) {
    outlen = 0;
    outmax = 500;
    grown = (char*) malloc((size_t) outmax);
  } else {
    outlen = (long) strlen(sp);
    outmax = outlen + 500;
    grown = (char*) realloc(sp, (size_t) outmax);
  }
  if (grown == NULL) {
    SeqPortFree(spp);                   /* was leaked on this path */
    return;                             /* *seq is untouched and valid */
  }
  sp = grown;

  /* Append residues.  One byte is always kept in reserve for the NUL so
     the buffer can be terminated even if a later realloc fails. */
  while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
    if (outlen + 1 >= outmax) {
      grown = (char*) realloc(sp, (size_t) (outlen + 500));
      if (grown == NULL) break;         /* keep what has been read so far */
      sp = grown;
      outmax = outlen + 500;
    }
    sp[outlen++] = (char) residue;
  }
  sp[outlen] = '\0';

  /* trim to the exact size; if that fails the larger block is kept */
  grown = (char*) realloc(sp, (size_t) (outlen + 1));
  if (grown != NULL) sp = grown;

  *seq = sp;
  *seqlen = outlen;
  SeqPortFree(spp);
  return;
}
/*
 * DustSeqPort
 *   Reads the range start..stop from spp in chunks of at most MAXSEQCHUNK
 *   positions, packs each chunk into overlapping triplets and passes it to
 *   Dust(), chaining the regions it returns.
 *
 *   spp   - sequence port to read (NULL returns NULL)
 *   start - first position to process
 *   stop  - last position to process
 *   ddp   - dust parameters, forwarded to Dust() (NULL returns NULL)
 *
 *   Returns the first DustRegion produced, or NULL when an argument is
 *   NULL, the triplet buffer cannot be allocated, or either of the first
 *   two residues of a chunk is EOF, a segment end or a virtual segment.
 *   Invalid residues are treated as 0 ('A'); the first three are reported
 *   individually and the total is reported at the end.
 *
 *   NOTE(review): when a later chunk fails, NULL is returned even though
 *   earlier chunks may already have produced regions - confirm whether
 *   callers expect the partial list instead.
 */
extern DustRegionPtr DustSeqPort (SeqPortPtr spp, Int4 start, Int4 stop, DustDataPtr ddp)
{
	Int4 i, posn, pos;
	Uint1 c;
	Uint1Ptr shead, s1, s2, s3;
	Boolean flagVS;
	Int4 this_start, this_stop;
	DustRegionPtr drphead = NULL, drp = NULL;
	Int4 invrescount = 0;

	if (spp == NULL || ddp == NULL)
		return NULL;

	/* first chunk: the whole range, or MAXSEQCHUNK positions of it */
	if (stop-start+1 > MAXSEQCHUNK) {
		this_start = start;
		this_stop = this_start + MAXSEQCHUNK;
	} else {
		this_start = start;
		this_stop = stop;
	}

	while (this_stop <= stop) {
		shead = (Uint1Ptr) MemNew ((size_t)(sizeof (Uchar)*(stop-start+1)));
		if (shead == NULL)
			return NULL;
		MemSet (shead, 0, (size_t) (sizeof (Uint1) * (stop-start+1)));

		s1 = shead;
		s2 = s1 + 1;
		s3 = s2 + 1;
		posn = start-1;

		SeqPortSeek (spp, this_start, SEEK_SET);

		/* set up 1 */
		c = SeqPortGetResidue (spp);
		if (c == SEQPORT_EOF || c == SEQPORT_EOS || c == SEQPORT_VIRT) {
			MemFree (shead);        /* was leaked on this path */
			return NULL;
		}
		if (!IS_residue (c)) {
			c = 0;                  /* 255 it's 'A' */
			pos = SeqPortTell (spp);
			if (invrescount < 3) {
				ErrPostEx (SEV_INFO, 5, 1,
				           "Invalid residue converted to 'A': %ld",
				           (long) pos);
				ErrShow ();
			}
			invrescount++;
		}
		*s1 |= c;

		/* set up 2 */
		c = SeqPortGetResidue (spp);
		if (c == SEQPORT_EOF || c == SEQPORT_EOS || c == SEQPORT_VIRT) {
			MemFree (shead);        /* was leaked on this path */
			return NULL;
		}
		if (!IS_residue (c)) {
			c = 0;                  /* 255 it's 'A' */
			pos = SeqPortTell (spp);
			if (invrescount < 3) {
				ErrPostEx (SEV_INFO, 5, 1,
				           "Invalid residue converted to 'A': %ld",
				           (long) pos);
				ErrShow ();
			}
			invrescount++;
		}
		*s1 <<= 2;
		*s1 |= c;
		*s2 |= c;

		/* triplet fill loop */
		flagVS = FALSE;
		for (i = this_start+2; i <= this_stop; i++) {
			if ((c = SeqPortGetResidue (spp)) != SEQPORT_EOF) {
				if (c == INVALID_RESIDUE) {     /* 255 */
					c = 0;                  /* ping! -- it's 'A' */
					pos = SeqPortTell (spp);
					if (invrescount < 3) {
						ErrPostEx (SEV_INFO, 5, 1,
						           "Invalid residue converted to 'A': %ld",
						           (long) pos);
						ErrShow ();
					}
					invrescount++;
				}
				if (IS_residue (c)) {
					*s1 <<= 2;
					*s2 <<= 2;
					*s1 |= c;
					*s2 |= c;
					*s3 |= c;
					s1++;
					s2++;
					s3++;
					posn++;
				} else {
					switch (c) {
						/* this should be okay unless segments
						   are being used weirdly */
						case SEQPORT_EOS:       /* 252 */
							break;
						/* start again at virtual sequence
						   boundaries */
						case SEQPORT_VIRT:      /* 251 */
							flagVS = TRUE;
							break;
						/* and/or just ignore anything odd at
						   this point */
						default:
							break;
					}
					if (flagVS)
						break;
				}
			}
		}

		if (this_stop != posn+2+this_start) {
			ErrPostEx (SEV_WARNING, 1, 10,
			           "Possible seqport read error: Expected: %ld Actual: %ld",
			           (long) this_stop, (long) posn);
			ErrShow ();
		}

		/* dust this chunk, passing in the previous return value */
		drp = Dust (shead, this_start, this_stop, ddp, drp);
		if (drphead == NULL) {
			drphead = drp;
		}
		MemFree (shead);

		if (this_stop == stop) {
			break;
		} else {
			/* next chunk starts after the last position consumed */
			this_start = posn+3;
			this_stop = this_start + MAXSEQCHUNK;
			if (this_stop > stop)
				this_stop = stop;
		}
	}

	if (invrescount > 0) {
		ErrPostEx (SEV_INFO, 3, 2,
		           "Total invalid residues found: %ld", (long) invrescount);
		ErrShow ();
	}

	return drphead;
}