示例#1
0
int main(int argc, char **argv)
{
    ajint begin, end;
    AjPSeqall seqall;
    AjPSeq seq;
    EmbPShow ss;
    AjPFile outfile;
    AjPStr tablename;
    ajint table;
    AjPRange uppercase;
    AjPRange highlight;
    AjBool threeletter;
    AjBool numberseq;
    AjBool nameseq;
    ajint width;
    ajint length;
    ajint margin;
    AjBool description;
    ajint offset;
    AjBool html;
    AjPStr descriptionline;
    ajint orfminsize;
    AjPTrn trnTable;
    AjBool translation;
    AjBool reverse;
    AjBool cutlist;
    AjBool flat;
    EmbPMatMatch mm = NULL;

    AjPStr *framelist;
    AjBool frames[6];   /* frames to be translated 1 to 3, -1 to -3 */
	 
    /* stuff for tables and lists of enzymes and hits */
    ajint default_mincuts = 1;
    ajint default_maxcuts = 2000000000;
    AjPTable hittable; /* enzyme hits */

    /* stuff lifted from Alan's 'restrict.c' */
    AjPStr enzymes = NULL;
    ajint mincuts;
    ajint maxcuts;
    ajint sitelen;
    AjBool single;
    AjBool blunt;
    AjBool sticky;
    AjBool ambiguity;
    AjBool plasmid;
    AjBool commercial;
    AjBool limit;
    AjBool methyl;
    AjPFile enzfile  = NULL;
    AjPFile equfile  = NULL;
    AjPFile methfile = NULL;
    AjPTable retable = NULL;
    ajint hits;
    AjPList restrictlist = NULL;

    embInit("remap", argc, argv);

    seqall      = ajAcdGetSeqall("sequence");
    outfile     = ajAcdGetOutfile("outfile");
    tablename   = ajAcdGetListSingle("table");
    uppercase   = ajAcdGetRange("uppercase");
    highlight   = ajAcdGetRange("highlight");
    threeletter = ajAcdGetBoolean("threeletter");
    numberseq   = ajAcdGetBoolean("number");
    width       = ajAcdGetInt("width");
    length      = ajAcdGetInt("length");
    margin      = ajAcdGetInt("margin");
    nameseq     = ajAcdGetBoolean("name");
    description = ajAcdGetBoolean("description");
    offset      = ajAcdGetInt("offset");
    html        = ajAcdGetBoolean("html");
    orfminsize  = ajAcdGetInt("orfminsize");
    translation = ajAcdGetBoolean("translation");
    reverse     = ajAcdGetBoolean("reverse");
    cutlist     = ajAcdGetBoolean("cutlist");
    flat        = ajAcdGetBoolean("flatreformat");
    framelist   = ajAcdGetList("frame");
    
    /*  restriction enzyme stuff */
    mincuts    = ajAcdGetInt("mincuts");
    maxcuts    = ajAcdGetInt("maxcuts");
    sitelen    = ajAcdGetInt("sitelen");
    single     = ajAcdGetBoolean("single");
    blunt      = ajAcdGetBoolean("blunt");
    sticky     = ajAcdGetBoolean("sticky");
    ambiguity  = ajAcdGetBoolean("ambiguity");
    plasmid    = ajAcdGetBoolean("plasmid");
    commercial = ajAcdGetBoolean("commercial");
    limit      = ajAcdGetBoolean("limit");
    enzymes    = ajAcdGetString("enzymes");
    methfile   = ajAcdGetDatafile("mfile");
    methyl     = ajAcdGetBoolean("methylation");
    
    if(!blunt  && !sticky)
	ajFatal("Blunt/Sticky end cutters shouldn't both be disabled.");

    /* get the number of the genetic code used */
    ajStrToInt(tablename, &table);
    trnTable = ajTrnNewI(table);

    /* read the local file of enzymes names */
    remap_read_file_of_enzyme_names(&enzymes);

    /* get the frames to be translated */
    remap_GetFrames(framelist, frames);
	 
    while(ajSeqallNext(seqall, &seq))
    {
	/* get begin and end positions */
	begin = ajSeqGetBegin(seq)-1;
	end   = ajSeqGetEnd(seq)-1;

	/* do the name and description */
	if(nameseq)
	{
	    if(html)
		ajFmtPrintF(outfile, "<H2>%S</H2>\n",
				   ajSeqGetNameS(seq));
	    else
		ajFmtPrintF(outfile, "%S\n", ajSeqGetNameS(seq));
	}

	if(description)
	{
	    /*
	    **  wrap the description line at the width of the sequence
	    **  plus margin
	    */
	    if(html)
		ajFmtPrintF(outfile, "<H3>%S</H3>\n",
				   ajSeqGetDescS(seq));
	    else
	    {
		descriptionline = ajStrNew();
		ajStrAssignS(&descriptionline, ajSeqGetDescS(seq));
		ajStrFmtWrap(&descriptionline, width+margin);
		ajFmtPrintF(outfile, "%S\n", descriptionline);
		ajStrDel(&descriptionline);
	    }
	}

	/* get the restriction cut sites */
	/*
	**  most of this is lifted from the program 'restrict.c' by Alan
	**  Bleasby
	 */
	if(single)
	    maxcuts=mincuts=1;
	retable = ajTablestrNew(EQUGUESS);
	enzfile = ajDatafileNewInNameC(ENZDATA);
	if(!enzfile)
	    ajFatal("Cannot locate enzyme file. Run REBASEEXTRACT");

	if(limit)
	{
	    equfile = ajDatafileNewInNameC(EQUDATA);
	    if(!equfile)
		limit = ajFalse;
	    else
		remap_read_equiv(&equfile, &retable, commercial);
	}

	ajFileSeek(enzfile, 0L, 0);
	restrictlist = ajListNew();
	/* search for hits, but don't use mincuts and maxcuts criteria yet */
	hits = embPatRestrictMatch(seq, begin+1, end+1, enzfile, methfile,
                                   enzymes, sitelen,plasmid, ambiguity,
                                   default_mincuts, default_maxcuts, blunt,
                                   sticky, commercial, methyl,
				   restrictlist);

	ajDebug("Remap found %d hits\n", hits);

	if(hits)
	{
	    /* this bit is lifted from printHits */
	    embPatRestrictRestrict(restrictlist, hits, !limit,
					  ajFalse);
	    if(limit)
		remap_RestrictPreferred(restrictlist,retable);
	}


	ajFileClose(&enzfile);
	ajFileClose(&methfile);


	/*
	** Remove those violating the mincuts and maxcuts
	** criteria, but save them in hittable for printing out later.
	** Keep a count of how many hits each enzyme gets in hittable.
	*/
        hittable = ajTablestrNewCase(TABLEGUESS);
	remap_RemoveMinMax(restrictlist, hittable, mincuts, maxcuts);


	/* make the Show Object */
	ss = embShowNew(seq, begin, end, width, length, margin, html, offset);

	if(html)
	    ajFmtPrintF(outfile, "<PRE>");

	/* create the format to display */
	embShowAddBlank(ss);
	embShowAddRE(ss, 1, restrictlist, plasmid, flat);
	embShowAddSeq(ss, numberseq, threeletter, uppercase, highlight);

	if(!numberseq)
	    embShowAddTicknum(ss);
	embShowAddTicks(ss);

	if(reverse)
	{
	    embShowAddComp(ss, numberseq);
	    embShowAddRE(ss, -1, restrictlist, plasmid, flat);
	}


	if(translation)
	{
	    if(reverse)
		embShowAddBlank(ss);

            if(frames[0])	    
	      embShowAddTran(ss, trnTable, 1, threeletter,
			     numberseq, NULL, orfminsize,
			     AJFALSE, AJFALSE, AJFALSE, AJFALSE);
            if(frames[1])
	      embShowAddTran(ss, trnTable, 2, threeletter,
			     numberseq, NULL, orfminsize,
			     AJFALSE, AJFALSE, AJFALSE, AJFALSE);
            if(frames[2])
	      embShowAddTran(ss, trnTable, 3, threeletter,
			     numberseq, NULL, orfminsize,
			     AJFALSE, AJFALSE, AJFALSE, AJFALSE);
	    
	    if(reverse)
	    {
		embShowAddTicks(ss);
                if(frames[5])
		  embShowAddTran(ss, trnTable, -3, threeletter,
			         numberseq, NULL, orfminsize,
			         AJFALSE, AJFALSE, AJFALSE, AJFALSE);
                if(frames[4])
		  embShowAddTran(ss, trnTable, -2, threeletter,
			         numberseq, NULL, orfminsize,
			         AJFALSE, AJFALSE, AJFALSE, AJFALSE);
                if(frames[3])
		  embShowAddTran(ss, trnTable, -1, threeletter,
			         numberseq, NULL, orfminsize,
			         AJFALSE, AJFALSE, AJFALSE, AJFALSE);
	    }
	}

	embShowPrint(outfile, ss);

	/* display a list of the Enzymes that cut and don't cut */
	if(cutlist)
	{
	    remap_CutList(outfile, hittable,
	    		limit, html, mincuts, maxcuts);
	    remap_NoCutList(outfile, hittable, html, enzymes, blunt,
			sticky, sitelen, commercial, ambiguity, 
			limit, retable);
	}

	/* add a gratuitous newline at the end of the sequence */
	ajFmtPrintF(outfile, "\n");

	/* tidy up */
	embShowDel(&ss);

	while(ajListPop(restrictlist,(void **)&mm))
	    embMatMatchDel(&mm);
	ajListFree(&restrictlist);

        remap_DelTable(&hittable);

	ajTablestrFree(&retable);
    }


    ajTrnDel(&trnTable);

    ajSeqallDel(&seqall);
    ajSeqDel(&seq);
    ajFileClose(&outfile);
    ajStrDel(&tablename);
    ajStrDel(&enzymes);
    ajStrDelarray(&framelist);

    ajRangeDel(&uppercase);
    ajRangeDel(&highlight);

    embExit();

    return 0;
}
示例#2
0
int main(int argc, char **argv)
{
    AjPSeqall nucseq;		/* input nucleic sequences */
    AjPSeqset protseq;		/* input aligned protein sequences */
    AjPSeqout seqout;
    AjPSeq nseq;		/* next nucleic sequence to align */
    const AjPSeq pseq;		/* next protein sequence use in alignment */
    AjPTrn trnTable;
    AjPSeq pep;			/* translation of nseq */
    AjPStr tablelist;
    ajint table;
    AjPSeqset outseqset;	/* set of aligned nucleic sequences */
    ajint proteinseqcount = 0;
    AjPStr degapstr = NULL;
    /* used to check if it matches with START removed */
    AjPStr degapstr2 = NULL;
    AjPStr codon = NULL;	/* holds temporary codon to check if is START */
    char aa;			/* translated putative START codon */
    ajint type;			/* returned type of the putative START codon */
    /* start position of guide protein in translation */
    ajlong pos = 0;
    AjPSeq newseq = NULL;	/* output aligned nucleic sequence */
    ajint frame;

    embInit("tranalign", argc, argv);

    nucseq    = ajAcdGetSeqall("asequence");
    protseq   = ajAcdGetSeqset("bsequence");
    tablelist = ajAcdGetListSingle("table");
    seqout    = ajAcdGetSeqoutset("outseq");

    outseqset = ajSeqsetNew();
    degapstr  = ajStrNew();

    /* initialise the translation table */
    ajStrToInt(tablelist, &table);
    trnTable = ajTrnNewI(table);

    ajSeqsetFill(protseq);

    while(ajSeqallNext(nucseq, &nseq))
    {
    	if((pseq = ajSeqsetGetseqSeq(protseq, proteinseqcount++)) == NULL)
    	    ajErr("No guide protein sequence available for "
		  "nucleic sequence %S",
		  ajSeqGetNameS(nseq));

	ajDebug("Aligning %S and %S\n",
		ajSeqGetNameS(nseq), ajSeqGetNameS(pseq));

        /* get copy of pseq string with no gaps */
        ajStrAssignS(&degapstr, ajSeqGetSeqS(pseq));
        ajStrRemoveGap(&degapstr);

        /*
	** for each translation frame look for subset of pep that
	** matches pseq
	*/
        for(frame = 1; frame <4; frame++)
	{
	    ajDebug("trying frame %d\n", frame);
            pep = ajTrnSeqOrig(trnTable, nseq, frame);
            degapstr2 = ajStrNew();
            ajStrAssignRef(&degapstr2, degapstr);
            pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr);

            /* 
            ** we might have a START codon that should be translated as 'M'
            ** we need to check if there is a match after a possible START
            ** codon 
            */
            if(pos == -1 && ajStrGetLen(degapstr) > 1 && 
                (ajStrGetPtr(degapstr)[0] == 'M' ||
		 ajStrGetPtr(degapstr)[0] == 'm'))
	      {
                /* see if pep minus the first character is a match */
                ajStrCutStart(&degapstr2, 1);
                pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr2); 

                /*
		** pos is >= 1 if we have a match that is after the first
		** residue
		*/
                if(pos >= 1)
		{
                    /* point back at the putative START Methionine */
                    pos--;
                    /* test if first codon is a START */
                    codon = ajStrNew();
                    ajStrAssignSubS(&codon, ajSeqGetSeqS(nseq), 
                                (pos*3)+frame-1, (pos*3)+frame+2);
                    type = ajTrnCodonstrTypeS(trnTable, codon, &aa);

                    if(type != 1)
                    {
                        /* first codon is not a valid START, force a mismatch */
                        pos = -1;
                    }
                    ajStrDel(&codon);
                
            	}
		else
		{
                    /* force 'pos == 0' to be treated as a mismatch */
            	    pos = -1;
		}
            }

            ajStrDel(&degapstr2);
            ajSeqDel(&pep);

            if(pos != -1)
            	break;
        }

        if(pos == -1)
	    ajErr("Guide protein sequence %S not found in nucleic sequence %S",
		  ajSeqGetNameS(pseq), ajSeqGetNameS(nseq));
	else
	{
	    ajDebug("got a match with frame=%d\n", frame);
            /* extract the coding region of nseq with gaps */
            newseq = ajSeqNew();
            ajSeqSetNuc(newseq);
            ajSeqAssignNameS(newseq, ajSeqGetNameS(nseq));
            ajSeqAssignDescS(newseq, ajSeqGetDescS(nseq));
            tranalign_AddGaps(newseq, nseq, pseq, (pos*3)+frame-1);

            /* output the gapped nucleic sequence */
            ajSeqsetApp(outseqset, newseq);

            ajSeqDel(&newseq);
        }

        ajStrRemoveWhiteExcess(&degapstr);
    }

    ajSeqoutWriteSet(seqout, outseqset);
    ajSeqoutClose(seqout);

    ajTrnDel(&trnTable);
    ajSeqsetDel(&outseqset);
    ajStrDel(&degapstr);
    ajStrDel(&degapstr2);

    ajSeqallDel(&nucseq);
    ajSeqDel(&nseq);
    ajSeqoutDel(&seqout);
    ajSeqsetDel(&protseq);
    ajStrDel(&tablelist);

    embExit();

    return 0;
}
示例#3
0
static AjPList silent_mismatch(const AjPStr sstr, AjPList relist,
			       AjPStr* tailstr,
			       const AjPStr sname, ajint RStotal, ajint begin,
			       ajint radj,AjBool rev, ajint end,
			       AjBool tshow)
{
    PSilent res;
    AjPList results;
    AjPStr str;                          /*holds RS patterns*/
    AjPStr tstr;
    AjBool dummy = ajFalse;              /*need a bool for ajPatClassify*/
    ajint mm;                            /*number of mismatches*/
    ajint hits;                          /*number of pattern matches found*/
    ajint i;
    ajint aw;
    ajint start;
    AjPList patlist = NULL;            /*a list for pattern matches.
                                             a list of ..*/
    EmbPMatMatch match;                /*..AjMatMatch structures*/
    PRinfo rlp = NULL;
    AjPStr pep   = NULL;               /*string to hold protein*/
    AjPTrn table = NULL;               /*object to hold translation table*/

    str   = ajStrNew();
    tstr  = ajStrNew();
    pep   = ajStrNew();
    table = ajTrnNewI(0);                /*0 stands for standard DNA-
                                         see fuzztran.acd*/
    results = ajListNew();
    start = 1;
    if(!rev)                           /* forward strand */
    {
	ajTrnSeqFrameS(table,sstr,1,&pep); /*1 stands for frame number*/
    	if(tshow)
    	{
                silent_fmt_sequence("TRANSLATED SEQUENCE",
				    pep,tailstr,start,ajFalse);
     	}
    }
    else                               /* reverse strand */
    {
	ajStrAssignC(&tstr,ajStrGetPtr(sstr)+(end-begin+1)%3);
	ajTrnSeqFrameS(table,tstr,1,&pep);
        if(tshow)
	{
		silent_fmt_sequence("REVERSE TRANSLATED SEQUENCE",
				    pep,tailstr,start,ajFalse);
	}
    }

    for(aw=0;aw<RStotal;aw++)          /* loop to read in restriction
					  enzyme patterns */
    {
	/* Pop off the first RE */
	ajListPop(relist,(void **)&rlp);
        /* ignore unknown cut sites and zero cutters */
       	if(*ajStrGetPtr(rlp->site)=='?'||!rlp->ncuts)
        {
	    ajListPushAppend(relist,(void*)rlp);
	    continue;
	}
	ajStrFmtUpper(&rlp->site);   /* convert all RS to upper case */
	ajStrAssignS(&str,rlp->site);      /* str now holds RS patterns */

	patlist = ajListNew();
        if(!embPatClassify(str,&str,
			   &dummy,&dummy,&dummy,&dummy,&dummy,&dummy,0))
	    continue;


	mm = 0;
	hits=embPatBruteForce(sstr,str,ajFalse,ajFalse,patlist,begin+1,mm,
                              sname);

	if(hits)
	    while(ajListPop(patlist,(void**)&match))
		embMatMatchDel(&match);
 	else
        {
	    mm = 1;
	    hits = embPatBruteForce(sstr,str,ajFalse,ajFalse,patlist,
				  begin+1,mm,sname);

	    if(hits)
		for(i=0;i<hits;++i)
		{
		    ajListPop(patlist,(void**)&match);
		    res = silent_checktrans(sstr,match,rlp,begin,radj,
					    rev,end);
		    if (res)
			ajListPushAppend(results,(void *)res);
		    embMatMatchDel(&match);
         	}
   	}

        ajListPushAppend(relist,(void *)rlp);

	ajListFree(&patlist);
    }
    
    ajStrDel(&str);
    ajStrDel(&tstr);
    ajStrDel(&pep);
    ajTrnDel(&table);
    return (results);
}
示例#4
0
static PSilent silent_checktrans(const AjPStr seq,const EmbPMatMatch match,
				const PRinfo rlp, ajint begin, ajint radj,
				AjBool rev, ajint end)
{
    PSilent ret;
    const char *p = NULL;
    const char *q = NULL;
    const char *s = NULL;
    char *t;
    const char *u;
    ajint matchpos;
    ajint framep;

    ajint count;
    AjPTrn table = NULL;
    AjPStr s1 = NULL;
    AjPStr s2 = NULL;
    char c;
    char rc;
    ajint  min = INT_MAX;          /* Reverse sense intentional! */
    ajint  max = -INT_MAX;
    ajint fpos;
    ajint rpos;
    ajint x;
    AjPStr tstr = NULL;

    matchpos = match->start;
    fpos = matchpos;
    rpos=radj-fpos-match->len;

    tstr = ajStrNewS(seq);
    t = ajStrGetuniquePtr(&tstr);

    p = t+fpos-(begin+1);

    u = q = ajStrGetPtr(rlp->site);

    /* Test here for whether cut site is within sequence substring */
    if(rlp->ncuts==4)
    {
	min = AJMIN(rlp->cut1,rlp->cut2);
	max = AJMAX(rlp->cut3,rlp->cut4);
    }
    else if(rlp->ncuts==2)
    {
	min = AJMIN(rlp->cut1,rlp->cut2);
	max = AJMAX(rlp->cut1,rlp->cut2);
    }
    else
    {
        ajWarn("Possibly corrupt RE file");
	ajStrDel(&tstr);
        return NULL;
    }

    if(!rev)                  /* forward strand */
    {
	if(matchpos+min<0||matchpos+max>end+1)
	{
	    /*Cut site not in sequence range*/
	    ajStrDel(&tstr);
	    return NULL;
	}
    }
    else                       /* reverse strand */
    {
	if(radj-matchpos-1-min>end+1||radj-matchpos-1-max<begin)
	{
	    /*Cut site not in sequence range*/
	    ajStrDel(&tstr);
	    return NULL;
	}
    }


    count=0;
    while(ajBaseAlphaToBin(*q++) & ajBaseAlphaToBin(*p++))
          ++count;

    /* Changed base postion */
    x = fpos+count-(begin+1);
    /* Where the frame starts on the reverse strand */
    framep = (end-begin+1)%3;

    c  = t[x];
    rc = u[count];

    if(!rev)            /* forward strand */
	s = t+x-x%3;
    else                /* reverse strand */
        s = t+x-(x-framep)%3;

    table = ajTrnNewI(0);

    /* translates codon pointed to by s (original seq) */
    s1 = ajStrNewK(ajTrnCodonC(table,s));

    t[x] = rc;

    /*  translates codon pointed to by s (mutated base from RS pattern */
    s2 = ajStrNewK(ajTrnCodonC(table,s));

    t[x] = c;  /* changes mutated base in seq back to original base */

    AJNEW(ret);
    ret->obase = c;
    ret->nbase = rc;
    ret->code  = ajStrNewC(ajStrGetPtr(rlp->code));
    ret->site  = ajStrNewC(ajStrGetPtr(rlp->site));
    ret->seqaa = ajStrNewC(ajStrGetPtr(s1));
    ret->reaa  = ajStrNewC(ajStrGetPtr(s2));
    if(ajStrMatchS(s1,s2))
	ret->issilent = ajTrue;
    else
	ret->issilent = ajFalse;
    if(!rev)
    {
       	ret->match = matchpos;
        ret->base  = matchpos+count;
    }
    else
    {
	ret->match = rpos;
	ret->base  = rpos+match->len-1-count;
    }

    ajStrDel(&tstr);
    ajStrDel(&s1);
    ajStrDel(&s2);
    ajTrnDel(&table);

    return ret;
}
示例#5
0
AjPSeqout get_orf(
  AjPSeqout seqout,
  AjPSeqall seqall,
  AjPStr tablestr,
  ajuint minsize,
  ajuint maxsize,
  AjPStr findstr,
  AjBool methionine,
  AjBool circular,
  AjBool reverse,
  ajint around
) {
  ajint table;
  ajint find;
  AjPSeq seq = NULL;
  AjPTrn trnTable;
  AjPStr sseq = NULL;    /* sequence string */

  /* ORF number to append to name of sequence to create unique name */
  ajint orf_no;


  AjBool sense;    /* ajTrue = forward sense */
  ajint len;

  /* initialise the translation table */
  ajStrToInt(tablestr, &table);
  trnTable = ajTrnNewI(table);

  /* what sort of ORF are we looking for */
  ajStrToInt(findstr, &find);

  /*
  ** get the minimum size converted to protein length if storing
  ** protein sequences
  */
  if (find == P_STOP2STOP || find == P_START2STOP || find == AROUND_START) {
    minsize /= 3;
    maxsize /= 3;
  }

  while (ajSeqallNext(seqall, &seq)) {
    orf_no = 1;           /* number of the next ORF */
    sense = ajTrue;           /* forward sense initially */

    /* get the length of the sequence */
    len = ajSeqGetLen(seq);

    /*
    ** if the sequence is circular, append it to itself to triple its
    **  length so can deal easily with wrapped ORFs, but don't update
    ** len
    */
    if (circular) {
      ajStrAssignS(&sseq, ajSeqGetSeqS(seq));
      ajStrAppendS(&sseq, ajSeqGetSeqS(seq));
      ajStrAppendS(&sseq, ajSeqGetSeqS(seq));
      ajSeqAssignSeqS(seq, sseq);
    }

    /* find the ORFs */
    getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense,
                    circular, find, &orf_no, methionine, around, &record);

    /* now reverse complement the sequence and do it again */
    if (reverse) {
      sense = ajFalse;
      ajSeqReverseForce(seq);
      getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense,
                      circular, find, &orf_no, methionine,
                      around);
    }
  }
  ajTrnDel(&trnTable);
  ajSeqDel(&seq);
  ajStrDel(&sseq);
}