コード例 #1
0
/* @funcstatic seqwords_keysearch  ********************************************
**
** Search swissprot with terms structure and writes a hitlist structure
**
** @param [r] inf   [AjPFile]     File pointer to swissprot database
** @param [r] terms [AjPTerms]    Terms object pointer
** @param [w] hits  [EmbPHitlist*] Hitlist object pointer
**
** @return [AjBool] True on success
** @@
******************************************************************************/
static AjBool seqwords_keysearch(AjPFile inf, 
				 AjPTerms terms,
				 EmbPHitlist *hits)
{
    AjPStr   line           =NULL;	/* Line of text. */
    AjPStr   id             =NULL;	/* Line of text. */
    AjPStr   temp           =NULL;
    ajint    s              =0;         /* Temp. start of hit value. */
    ajint    e              =0;         /* Temp. end of hit value. */
    AjPInt   start          =NULL;      /* Array of start of hit(s). */
    AjPInt   end            =NULL;      /* Array of end of hit(s). */
    ajint    nhits          =0;         /* Number of hits. */
    ajint    x              =0;         
    AjBool   foundkw        =ajFalse;
    AjBool   foundft        =ajFalse;


    /* Check for valid args. */
    if(!inf)
	return ajFalse;
    

    

    /* Allocate strings and arrays. */
    line       = ajStrNew();
    id         = ajStrNew();
    start      = ajIntNew();
    end        = ajIntNew();



    /* Start of main loop. */
    while((ajReadlineTrim(inf,&line)))
    {
	/* Parse the AC line. */
	if(ajStrPrefixC(line,"AC"))
	{
	    /* Copy accesion number and remove the ';' from the end. */
	    ajFmtScanS(line, "%*s %S", &id);
	    ajStrExchangeCC(&id, ";", "\0");
	    
	    
	    /* Reset flags & no. hits. */
	    foundkw=ajFalse;
	    foundft=ajFalse;
	    nhits=0;
	}
	
	
	/* Search the description and keyword lines with search terms. */
	else if((ajStrPrefixC(line,"DE") || (ajStrPrefixC(line,"KW"))))
	{
	    /* 
	     ** Search terms have a leading and trailing space to prevent 
	     ** them being found as substrings within other words.  To 
	     ** catch cases where a DE or KW line begins with a search 
	     ** term, we must add a leading and trailing space to line.
	     ** We must first remove punctation from the line to be parsed.
	     */
	    ajStrExchangeSetCC(&line, ".,;:", "    ");
	    ajStrAppendK(&line, ' ');
	    ajStrInsertC(&line, 0, " ");


	    for (x = 0; x < terms->N; x++)
		/* Search term is found. */
		if((ajStrFindCaseS(line, terms->Keywords[x])!=-1))
		{	
		    foundkw=ajTrue;
		    break;
		}
	}

	
	/* Search the feature table line with search terms. */
	else if((ajStrPrefixC(line,"FT   DOMAIN")))
	{
	    /*	
	     ** Search terms have a leading and trailing space to prevent 
	     ** them being found as substrings within other words.  To 
	     ** catch cases where a FT line ends with a search 
	     ** term, we must add a  trailing space to line 
	     ** We must first remove punctation from the line to be parsed.
	     */
	    ajStrExchangeSetCC(&line, ".,;:", "    ");
	    ajStrAppendK(&line, ' ');
	    

	    for (x = 0; x < terms->N; x++)
		if((ajStrFindCaseS(line, terms->Keywords[x])!=-1))
		{
		    /* Search term is found. */
		    foundft = ajTrue;
		    nhits++;
		    
		    /* Assign start and end of hit. */
		    ajFmtScanS(line, "%*s %*s %d %d", &s, &e);


		    ajIntPut(&start, nhits-1, s);
		    ajIntPut(&end, nhits-1, e);
		    break;
		}
	}
	

	/* Parse the sequence. */
	else if((ajStrPrefixC(line,"SQ") && ((foundkw == ajTrue) ||
					     (foundft == ajTrue))))
	{
	    /* Allocate memory for temp. sequence. */
	    temp       = ajStrNew();


	    /* Read the sequence into hitlist structure. */
	    while((ajReadlineTrim(inf,&line)) && !ajStrPrefixC(line,"//"))
		/* Read sequence line into temp. */
		ajStrAppendC(&temp,ajStrGetPtr(line)+3);
 

	    /* Clean up temp. sequence. */
	    ajStrRemoveWhite(&temp);


	    /*Priority is given to domain (rather than full length) sequence.*/
	    if(foundft)
	    {
		for(x=0;x<nhits;x++)
		{
		    /* Increment counter of hits for subsequent hits*/
		    (*hits)->N++;

		    
		    /* Reallocate memory for array of hits in hitlist
                       structure. */
		    AJCRESIZE((*hits)->hits, (*hits)->N);
		    (*hits)->hits[(*hits)->N-1]=embHitNew();
		    ajStrAssignC(&(*hits)->hits[(*hits)->N-1]->Model,
				 "KEYWORD");
		    

		    /* Assign start and end of hit. */
		    (*hits)->hits[(*hits)->N-1]->Start = ajIntGet(start, x);
		    (*hits)->hits[(*hits)->N-1]->End = ajIntGet(end, x);
				

		    /* Extract sequence within specified range */
		    ajStrAssignSubS(&(*hits)->hits[(*hits)->N - 1]->Seq, temp, 
				(*hits)->hits[(*hits)->N - 1]->Start - 1, 
				(*hits)->hits[(*hits)->N - 1]->End - 1);
		    

		    /* Put id into structure */
		    ajStrAssignRef(&(*hits)->hits[(*hits)->N - 1]->Acc, id);
		}
	    }
	    else
	    {
		/* Increment counter of hits */
		(*hits)->N++;

		    
		/* Reallocate memory for array of hits in hitlist structure */
		AJCRESIZE((*hits)->hits, (*hits)->N);
		(*hits)->hits[(*hits)->N-1]=embHitNew();
		ajStrAssignC(&(*hits)->hits[(*hits)->N-1]->Model, "KEYWORD");

		/* Extract whole sequence */
		ajStrAssignRef(&(*hits)->hits[(*hits)->N - 1]->Seq, temp); 
		(*hits)->hits[(*hits)->N - 1]->Start = 1; 
		(*hits)->hits[(*hits)->N - 1]->End =
		    ajStrGetLen((*hits)->hits[(*hits)->N - 1]->Seq); 


		/* Put id into structure */
		ajStrAssignRef(&(*hits)->hits[(*hits)->N - 1]->Acc, id);
	    }

	    /* Free temp. sequence */
	    ajStrDel(&temp);
	}
    }


    /* Clean up */
    ajStrDel(&line);
    ajStrDel(&id);
    ajIntDel(&start);
    ajIntDel(&end);

    return ajTrue;
}
コード例 #2
0
ファイル: tranalign.c プロジェクト: WenchaoLin/JAMg
int main(int argc, char **argv)
{
    AjPSeqall nucseq;		/* input nucleic sequences */
    AjPSeqset protseq;		/* input aligned protein sequences */
    AjPSeqout seqout;
    AjPSeq nseq;		/* next nucleic sequence to align */
    const AjPSeq pseq;		/* next protein sequence use in alignment */
    AjPTrn trnTable;
    AjPSeq pep;			/* translation of nseq */
    AjPStr tablelist;
    ajint table;
    AjPSeqset outseqset;	/* set of aligned nucleic sequences */
    ajint proteinseqcount = 0;
    AjPStr degapstr = NULL;
    /* used to check if it matches with START removed */
    AjPStr degapstr2 = NULL;
    AjPStr codon = NULL;	/* holds temporary codon to check if is START */
    char aa;			/* translated putative START codon */
    ajint type;			/* returned type of the putative START codon */
    /* start position of guide protein in translation */
    ajlong pos = 0;
    AjPSeq newseq = NULL;	/* output aligned nucleic sequence */
    ajint frame;

    embInit("tranalign", argc, argv);

    nucseq    = ajAcdGetSeqall("asequence");
    protseq   = ajAcdGetSeqset("bsequence");
    tablelist = ajAcdGetListSingle("table");
    seqout    = ajAcdGetSeqoutset("outseq");

    outseqset = ajSeqsetNew();
    degapstr  = ajStrNew();

    /* initialise the translation table */
    ajStrToInt(tablelist, &table);
    trnTable = ajTrnNewI(table);

    ajSeqsetFill(protseq);

    while(ajSeqallNext(nucseq, &nseq))
    {
    	if((pseq = ajSeqsetGetseqSeq(protseq, proteinseqcount++)) == NULL)
    	    ajErr("No guide protein sequence available for "
		  "nucleic sequence %S",
		  ajSeqGetNameS(nseq));

	ajDebug("Aligning %S and %S\n",
		ajSeqGetNameS(nseq), ajSeqGetNameS(pseq));

        /* get copy of pseq string with no gaps */
        ajStrAssignS(&degapstr, ajSeqGetSeqS(pseq));
        ajStrRemoveGap(&degapstr);

        /*
	** for each translation frame look for subset of pep that
	** matches pseq
	*/
        for(frame = 1; frame <4; frame++)
	{
	    ajDebug("trying frame %d\n", frame);
            pep = ajTrnSeqOrig(trnTable, nseq, frame);
            degapstr2 = ajStrNew();
            ajStrAssignRef(&degapstr2, degapstr);
            pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr);

            /* 
            ** we might have a START codon that should be translated as 'M'
            ** we need to check if there is a match after a possible START
            ** codon 
            */
            if(pos == -1 && ajStrGetLen(degapstr) > 1 && 
                (ajStrGetPtr(degapstr)[0] == 'M' ||
		 ajStrGetPtr(degapstr)[0] == 'm'))
	      {
                /* see if pep minus the first character is a match */
                ajStrCutStart(&degapstr2, 1);
                pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr2); 

                /*
		** pos is >= 1 if we have a match that is after the first
		** residue
		*/
                if(pos >= 1)
		{
                    /* point back at the putative START Methionine */
                    pos--;
                    /* test if first codon is a START */
                    codon = ajStrNew();
                    ajStrAssignSubS(&codon, ajSeqGetSeqS(nseq), 
                                (pos*3)+frame-1, (pos*3)+frame+2);
                    type = ajTrnCodonstrTypeS(trnTable, codon, &aa);

                    if(type != 1)
                    {
                        /* first codon is not a valid START, force a mismatch */
                        pos = -1;
                    }
                    ajStrDel(&codon);
                
            	}
		else
		{
                    /* force 'pos == 0' to be treated as a mismatch */
            	    pos = -1;
		}
            }

            ajStrDel(&degapstr2);
            ajSeqDel(&pep);

            if(pos != -1)
            	break;
        }

        if(pos == -1)
	    ajErr("Guide protein sequence %S not found in nucleic sequence %S",
		  ajSeqGetNameS(pseq), ajSeqGetNameS(nseq));
	else
	{
	    ajDebug("got a match with frame=%d\n", frame);
            /* extract the coding region of nseq with gaps */
            newseq = ajSeqNew();
            ajSeqSetNuc(newseq);
            ajSeqAssignNameS(newseq, ajSeqGetNameS(nseq));
            ajSeqAssignDescS(newseq, ajSeqGetDescS(nseq));
            tranalign_AddGaps(newseq, nseq, pseq, (pos*3)+frame-1);

            /* output the gapped nucleic sequence */
            ajSeqsetApp(outseqset, newseq);

            ajSeqDel(&newseq);
        }

        ajStrRemoveWhiteExcess(&degapstr);
    }

    ajSeqoutWriteSet(seqout, outseqset);
    ajSeqoutClose(seqout);

    ajTrnDel(&trnTable);
    ajSeqsetDel(&outseqset);
    ajStrDel(&degapstr);
    ajStrDel(&degapstr2);

    ajSeqallDel(&nucseq);
    ajSeqDel(&nseq);
    ajSeqoutDel(&seqout);
    ajSeqsetDel(&protseq);
    ajStrDel(&tablelist);

    embExit();

    return 0;
}