struct bed *pslToBed(struct psl *psl)
/* Convert a psl record to a newly allocated bed, very similar to
 * customTrack.c::customTrackPsl.
 *
 * Aborts with errAbort if the query or target range of the psl is empty
 * or runs past the recorded sequence size.  The chrom, name and block
 * arrays of the result are clones, so the psl itself is not modified.
 * The returned chromStarts are relative to bed->chromStart, and when the
 * psl target strand is '-' the blocks are converted to plus strand
 * coordinates and order. */
{
struct bed *bed;
int i, blockCount, *chromStarts, chromStart;

/* A tiny bit of error checking on the psl. */
if (psl->qStart >= psl->qEnd || psl->qEnd > psl->qSize
    || psl->tStart >= psl->tEnd || psl->tEnd > psl->tSize)
    {
    errAbort("mangled psl format for %s", psl->qName);
    }

/* Allocate bed and fill in from psl. */
AllocVar(bed);
bed->chrom = cloneString(psl->tName);
bed->chromStart = bed->thickStart = chromStart = psl->tStart;
bed->chromEnd = bed->thickEnd = psl->tEnd;
bed->score = 1000 - 2*pslCalcMilliBad(psl, TRUE);
if (bed->score < 0) bed->score = 0;
/* strncpy leaves the destination unterminated when the source fills it
 * (e.g. a two character psl strand such as "+-"), so terminate by hand. */
strncpy(bed->strand, psl->strand, sizeof(bed->strand));
bed->strand[sizeof(bed->strand) - 1] = 0;
bed->blockCount = blockCount = psl->blockCount;
bed->blockSizes = (int *)cloneMem(psl->blockSizes,(sizeof(int)*psl->blockCount));
bed->chromStarts = chromStarts = (int *)cloneMem(psl->tStarts, (sizeof(int)*psl->blockCount));
bed->name = cloneString(psl->qName);

/* Switch minus target strand to plus strand. */
if (psl->strand[1] == '-')
    {
    int chromSize = psl->tSize;
    reverseInts(bed->blockSizes, blockCount);
    reverseInts(chromStarts, blockCount);
    /* A block starting at s with size z on the minus strand starts at
     * chromSize - (s + z) on the plus strand; both arrays have already
     * been reversed so index i refers to the same block in each. */
    for (i=0; i<blockCount; ++i)
	chromStarts[i] = chromSize - (chromStarts[i] + bed->blockSizes[i]);
    /* Target is now reported on the plus strand, so the relative
     * orientation kept in the bed is the query strand inverted. */
    if (bed->strand[0] == '-')
	bed->strand[0] = '+';
    else
	bed->strand[0] = '-';
    }

/* Convert coordinates to relative. */
for (i=0; i<blockCount; ++i)
    chromStarts[i] -= chromStart;
return bed;
}
struct bed *bedFromRow(
	char *chrom, 		  /* Chromosome bed is on. */
	char **row,  		  /* Row with other data for bed. */
	int fieldCount,		  /* Number of fields in final bed. */
	boolean isPsl, 		  /* True if in PSL format. */
	boolean isGenePred,	  /* True if in GenePred format. */
	boolean isBedWithBlocks,  /* True if BED with block list. */
	boolean *pslKnowIfProtein,/* Have we figured out if psl is protein? */
	boolean *pslIsProtein,    /* True if we know psl is protein. */
	struct lm *lm)		  /* Local memory pool */
/* Create bed from a database row when we already understand
 * the format pretty well.  The bed is allocated inside of
 * the local memory pool lm.  Generally use this in conjunction
 * with the results of a SQL query constructed with the aid
 * of the bedSqlFieldsExceptForChrom function.
 *
 * Row layout as read here: row[0]/row[1] start and end, row[2] name,
 * row[3] score, row[4] strand, row[5]/row[6] thick start and end,
 * row[7] block count, row[8] block sizes (block ends for genePred),
 * row[9] block starts, and for psl rows with a '-' target strand
 * row[10] is read as the target size.  Only as many fields as
 * fieldCount asks for are filled in.  The chrom pointer is stored
 * as is, not cloned.
 *
 * For psl rows the first call decides whether the alignments are
 * protein and records the answer through pslKnowIfProtein and
 * pslIsProtein so that later calls can reuse it. */
{
char *strand, tStrand, qStrand;
struct bed *bed;
int i, blockCount;

lmAllocVar(lm, bed);
bed->chrom = chrom;
bed->chromStart = sqlUnsigned(row[0]);
bed->chromEnd = sqlUnsigned(row[1]);

if (fieldCount < 4)
    return bed;
bed->name = lmCloneString(lm, row[2]);
if (fieldCount < 5)
    return bed;
bed->score = atoi(row[3]);
if (fieldCount < 6)
    return bed;
strand = row[4];
qStrand = strand[0];
/* Only look at the second character when the first one is not already
 * the terminator, otherwise we would read past the end of the string. */
tStrand = (qStrand != 0) ? strand[1] : 0;
if (tStrand == 0)
    bed->strand[0] = qStrand;
else
    {
    /* psl: use XOR of qStrand,tStrand if both are given. */
    if (tStrand == qStrand)
	bed->strand[0] = '+';
    else
	bed->strand[0] = '-';
    }
if (fieldCount < 8)
    return bed;
bed->thickStart = sqlUnsigned(row[5]);
bed->thickEnd   = sqlUnsigned(row[6]);
if (fieldCount < 12)
    return bed;
bed->blockCount = blockCount = sqlUnsigned(row[7]);
lmAllocArray(lm, bed->blockSizes, blockCount);
sqlUnsignedArray(row[8], bed->blockSizes, blockCount);
lmAllocArray(lm, bed->chromStarts, blockCount);
sqlUnsignedArray(row[9], bed->chromStarts, blockCount);
if (isGenePred)
    {
    /* Translate end coordinates to sizes. */
    for (i=0; i<bed->blockCount; ++i)
	bed->blockSizes[i] -= bed->chromStarts[i];
    }
else if (isPsl && blockCount > 0)
    {
    /* Everything below looks at the last block or loops over blocks, so
     * a row without blocks is skipped rather than indexing element -1. */
    boolean tIsMinus = (tStrand == '-');
    int lastBlock = blockCount - 1;
    int tSize = 0;

    /* Target size is only needed (and only parsed) for minus target. */
    if (tIsMinus)
	tSize = sqlUnsigned(row[10]);

    if (!*pslKnowIfProtein)
	{
	/* Figure out if is protein using a rather elaborate but
	 * working test I think Angie or Brian must have figured out:
	 * the last block only reaches the edge of the alignment when
	 * its size is counted three times over. */
	int lastEndIfProtein = 3*bed->blockSizes[lastBlock] +
		bed->chromStarts[lastBlock];
	if (tIsMinus)
	    *pslIsProtein = (bed->chromStart == tSize - lastEndIfProtein);
	else
	    *pslIsProtein = (bed->chromEnd == lastEndIfProtein);
	*pslKnowIfProtein = TRUE;
	}
    if (*pslIsProtein)
	{
	/* if protein then blockSizes are in protein space */
	for (i=0; i<blockCount; ++i)
	    bed->blockSizes[i] *= 3;
	}
    if (tIsMinus)
	{
	/* psl: if target strand is '-', flip the coords.
	 * (this is the target part of pslRcBoth from src/lib/psl.c) */
	for (i=0; i<blockCount; ++i)
	    {
	    bed->chromStarts[i] = tSize -
		    (bed->chromStarts[i] + bed->blockSizes[i]);
	    }
	reverseInts(bed->chromStarts, bed->blockCount);
	reverseInts(bed->blockSizes, bed->blockCount);
	}
    }
if (!isBedWithBlocks)
    {
    /* non-bed: translate absolute starts to relative starts */
    for (i=0;  i < bed->blockCount;  i++)
	bed->chromStarts[i] -= bed->chromStart;
    }
return bed;
}
Beispiel #3
0
static void pslShowAlignmentStranded2(struct psl *psl, boolean isProt,
	char *qName, bioSeq *qSeq, int qStart, int qEnd,
	char *tName, bioSeq *tSeq, int tStart, int tEnd, int exnStarts[], int exnEnds[], int exnCnt, FILE *f)
/* Show stamper gene and stamp elements alignment using genomic sequence.
 * The aligned exons' sequence of stamper gene are shown in colors as usual, but the
 * the unaligned exon's sequence of stamper gene are shown in red color.
 *
 * Writes three HTML sections to f: the query sequence, the target
 * sequence and a side by side alignment.  qStart/qEnd and tStart/tEnd
 * are the offsets of qSeq and tSeq within the full query and target;
 * qName/tName default to the names in the psl when NULL.  exnStarts and
 * exnEnds hold exnCnt exon ranges in query coordinates.
 *
 * Side effects: qSeq->dna and tSeq->dna are reverse complemented in
 * place for minus strand alignments and complemented back before
 * returning.  When the query is on the minus strand exnStarts/exnEnds
 * are rewritten in place to reverse strand coordinates and are NOT
 * restored. */
{
boolean tIsRc = (psl->strand[1] == '-');
boolean qIsRc = (psl->strand[0] == '-');
int mulFactor = (isProt ? 3 : 1);
DNA *dna = NULL;	/* Mixed case version of genomic DNA. */
int qSize = qSeq->size;
char *qLetters = cloneString(qSeq->dna);
int qbafStart, tbafStart;	/* Start coordinates handed to bafInit. */
int qcfmStart, tcfmStart;	/* Start coordinates handed to cfmNew. */

tbafStart = psl->tStart;
tcfmStart = psl->tStart;

qbafStart = qStart;
qcfmStart = qStart;

/* Deal with minus strand. */
if (tIsRc)
    {
    int temp;
    reverseComplement(tSeq->dna, tSeq->size);
    temp = psl->tSize - tEnd;
    tEnd = psl->tSize - tStart;
    tStart = temp;
    
    /* Displayed numbering starts from the far end. */
    tbafStart = psl->tEnd;
    tcfmStart = psl->tEnd;
    }
if (qIsRc)
    {
    int temp, j;
    reverseComplement(qSeq->dna, qSeq->size);
    reverseComplement(qLetters, qSeq->size);

    /* Uses the qEnd passed in, so must come before qEnd is flipped. */
    qcfmStart = qEnd;
    qbafStart = qEnd;
    
    temp = psl->qSize - qEnd;
    qEnd = psl->qSize - qStart;
    qStart = temp;
    /* Move the exon ranges to reverse strand coordinates and order. */
    for(j = 0; j < exnCnt; j++)
	{
	temp = psl->qSize - exnStarts[j];
	exnStarts[j] = psl->qSize - exnEnds[j];
	exnEnds[j] = temp;
	}
    reverseInts(exnEnds, exnCnt);
    reverseInts(exnStarts, exnCnt);
    }

dna = cloneString(tSeq->dna);

if (qName == NULL) 
    qName = psl->qName;
if (tName == NULL)
    tName = psl->tName;


fputs("Matching bases are colored blue and capitalized. " 
      "Light blue bases mark the boundaries of gaps in either aligned sequence. "
      "Red bases are unaligned exons' bases of the query gene. \n", f);

fprintf(f, "<H4><A NAME=cDNA></A>%s%s</H4>\n", qName, (qIsRc  ? " (reverse complemented)" : ""));
fprintf(f, "<PRE><TT>");
tolowers(qLetters);

/* Display query sequence. */
    {
    struct cfm *cfm;
    char *colorFlags = needMem(qSeq->size);
    int i = 0, j = 0, exnIdx = 0;
    int preStop = 0;	/* Where the red/blue scan for the next block resumes. */
    
    for (i=0; i<psl->blockCount; ++i)
	{
	int qs = psl->qStarts[i] - qStart;
	int ts = psl->tStarts[i] - tStart;
	int sz = psl->blockSizes[i]-1;	/* Offset of last base in block. */
	int end = 0;
	bool omitExon = FALSE;
	/* Step over exons that end before this block.  The first one
	 * stepped over is left alone; any further ones are painted
	 * red in full. */
	while(exnIdx < exnCnt && psl->qStarts[i] > exnEnds[exnIdx])
	    {
	    if(omitExon)
		{
		for( j = exnStarts[exnIdx] - qStart; j < exnEnds[exnIdx]-qStart; j++)
		    {
		    colorFlags[j] = socRed;
		    }
		}
	    exnIdx++;
	    /* Do not read past the last exon once we run out of them. */
	    if (exnIdx < exnCnt)
		preStop = exnStarts[exnIdx] - qStart;
	    omitExon = TRUE;
	    }

	/*mark the boundary bases */
	colorFlags[qs] = socBrightBlue;
	qLetters[qs] = toupper(qLetters[qs]);
	colorFlags[qs+sz] = socBrightBlue;
	qLetters[qs+sz] = toupper(qLetters[qs+sz]);
	
	/* determine block end: the start of the next block or the end of
	 * the current exon, whichever comes first.  With no exon left
	 * only the next block start limits the scan. */
	if( i < psl->blockCount -1)
	    {
	    end = psl->qStarts[i+1] - qStart;
	    if (exnIdx < exnCnt && exnEnds[exnIdx] <= psl->qStarts[i+1])
		end = exnEnds[exnIdx] - qStart;
	    }
	else
	    end = qs + sz;
	    
	for (j=preStop; j < end; j++)
	    {
	    if (j > qs && j < qs+sz)
		{
		/* Inside the block: matches become blue capitals. */
		if (qSeq->dna[j] == tSeq->dna[ts+j-qs])
		    {
		    colorFlags[j] = socBlue;
		    qLetters[j] = toupper(qLetters[j]);
		    }		
		}
	    else if(colorFlags[j] != socBrightBlue && colorFlags[j] != socBlue)
		colorFlags[j] = socRed;
	    }
	preStop = end;
	}
    cfm = cfmNew(10, 60, TRUE, qIsRc, f, qcfmStart);
    for (i=0; i<qSize; ++i)
	cfmOut(cfm, qLetters[i], seqOutColorLookup[(int)colorFlags[i]]);
    cfmFree(&cfm);
    freez(&colorFlags);
    htmHorizontalLine(f);
    }
fprintf(f, "</TT></PRE>\n");
fprintf(f, "<H4><A NAME=genomic></A>%s %s:</H4>\n", 
	tName, (tIsRc ? "(reverse strand)" : ""));
fprintf(f, "<PRE><TT>");

/* Display DNA sequence. */
    {
    struct cfm *cfm;
    char *colorFlags = needMem(tSeq->size);
    int i,j;
    int curBlock = 0;

    for (i=0; i<psl->blockCount; ++i)
	{
	int qs = psl->qStarts[i] - qStart;
	int ts = psl->tStarts[i] - tStart;
	int sz = psl->blockSizes[i];
	if (isProt)
	    {
	    /* Each query residue lines up with three target bases. */
	    for (j=0; j<sz; ++j)
		{
		AA aa = qSeq->dna[qs+j];
		int codonStart = ts + 3*j;
		DNA *codon = &tSeq->dna[codonStart];
		AA trans = lookupCodon(codon);
		if (trans != 'X' && trans == aa)
		    {
		    colorFlags[codonStart] = socBlue;
		    colorFlags[codonStart+1] = socBlue;
		    colorFlags[codonStart+2] = socBlue;
		    toUpperN(dna+codonStart, 3);
		    }
		}
	    }
	else
	    {
	    for (j=0; j<sz; ++j)
		{
		if (qSeq->dna[qs+j] == tSeq->dna[ts+j])
		    {
		    colorFlags[ts+j] = socBlue;
		    dna[ts+j] = toupper(dna[ts+j]);
		    }
		}
	    }
	/* First and last target base of the block mark the gap edges. */
	colorFlags[ts] = socBrightBlue;
	colorFlags[ts+sz*mulFactor-1] = socBrightBlue;
	}

    cfm = cfmNew(10, 60, TRUE, tIsRc, f, tcfmStart);
	
    for (i=0; i<tSeq->size; ++i)
	{
	/* Put down "anchor" on first match position in haystack
	 * so user can hop here with a click on the needle. */
	if (curBlock < psl->blockCount && psl->tStarts[curBlock] == (i + tStart) )
	    {
	    fprintf(f, "<A NAME=%d></A>", ++curBlock);
	    /* Watch out for (rare) out-of-order tStarts! */
	    while (curBlock < psl->blockCount &&
		   psl->tStarts[curBlock] <= tStart + i)
		curBlock++;
	    }
	cfmOut(cfm, dna[i], seqOutColorLookup[(int)colorFlags[i]]);
	}
    cfmFree(&cfm);
    freez(&colorFlags);
    htmHorizontalLine(f);
    }

/* Display side by side. */
fprintf(f, "</TT></PRE>\n");
fprintf(f, "<H4><A NAME=ali></A>Side by Side Alignment*</H4>\n");
fprintf(f, "<PRE><TT>");
    {
    struct baf baf;
    int i,j;

    bafInit(&baf, qSeq->dna, qbafStart, qIsRc,
	    tSeq->dna, tbafStart, tIsRc, f, 60, isProt);
		
    if (isProt)
	{
	for (i=0; i<psl->blockCount; ++i)
	    {
	    int qs = psl->qStarts[i] - qStart;
	    int ts = psl->tStarts[i] - tStart;
	    int sz = psl->blockSizes[i];

	    bafSetPos(&baf, qs, ts);
	    bafStartLine(&baf);
	    for (j=0; j<sz; ++j)
		{
		/* Residue is written against the middle base of its codon. */
		AA aa = qSeq->dna[qs+j];
		int codonStart = ts + 3*j;
		DNA *codon = &tSeq->dna[codonStart];
		bafOut(&baf, ' ', codon[0]);
		bafOut(&baf, aa, codon[1]);
		bafOut(&baf, ' ', codon[2]);
		}
	    bafFlushLine(&baf);
	    }
	fprintf( f, "<I>*when aa is different, BLOSUM positives are in green, BLOSUM negatives in red</I>\n");
	}
    else
	{
	int lastQe = psl->qStarts[0] - qStart;
	int lastTe = psl->tStarts[0] - tStart;
	int maxSkip = 20;	/* Largest one-sided gap still shown inline. */
	bafSetPos(&baf, lastQe, lastTe);
	bafStartLine(&baf);
	for (i=0; i<psl->blockCount; ++i)
	    {
	    int qs = psl->qStarts[i] - qStart;
	    int ts = psl->tStarts[i] - tStart;
	    int sz = psl->blockSizes[i];
	    boolean doBreak = TRUE;
	    int qSkip = qs - lastQe;
	    int tSkip = ts - lastTe;

	    /* A short gap on exactly one side is filled with '-' so the
	     * two blocks share a line; anything else starts a new line. */
	    if (qSkip >= 0 && qSkip <= maxSkip && tSkip == 0)
		{
		for (j=0; j<qSkip; ++j)
		    bafOut(&baf, qSeq->dna[lastQe+j], '-');
		doBreak = FALSE;
		}
	    else if (tSkip > 0 && tSkip <= maxSkip && qSkip == 0)
		{
		for (j=0; j<tSkip; ++j)
		    bafOut(&baf, '-', tSeq->dna[lastTe+j]);
		doBreak = FALSE;
		}
	    if (doBreak)
		{
		bafFlushLine(&baf);
		bafSetPos(&baf, qs, ts);
		bafStartLine(&baf);
		}
	    for (j=0; j<sz; ++j)
		bafOut(&baf, qSeq->dna[qs+j], tSeq->dna[ts+j]);
	    lastQe = qs + sz;
	    lastTe = ts + sz;
	    }
	bafFlushLine(&baf);

	fprintf( f, "<I>*Aligned Blocks with gaps <= %d bases are merged for this display</I>\n", maxSkip);
	}
    }
fprintf(f, "</TT></PRE>");
/* Put the caller's sequences back the way they were handed in. */
if (qIsRc)
    reverseComplement(qSeq->dna, qSeq->size);
if (tIsRc)
    reverseComplement(tSeq->dna, tSeq->size);
freeMem(dna);
freeMem(qLetters);
}