Пример #1
0
void writeGap(struct dyString *aRes, int aGap, char *aSeq, struct dyString *bRes, int bGap, char *bSeq)
/* Write a gap that is open on both sequences at once.  Something like:
 *     ....123.... or --c
 *     ...4123....    ag-
 * The left form is the abbreviated one, the right form the full one.
 * In the full form aRes receives bGap dashes followed by aGap characters
 * of aSeq, and bRes receives bGap characters of bSeq followed by aGap
 * dashes. */
{
char shortForm[16];
const int threshold = 16;	/* Smallest gap that gets abbreviated. */
int abbreviate = (doShort && (aGap >= threshold || bGap >= threshold));

if (!abbreviate)
    {
    /* Full form: each side's bases are written opposite dashes. */
#ifdef OLD
    dyStringAppendMultiC(aRes, '-', aGap);
    dyStringAppendN(bRes, bSeq, aGap);
    dyStringAppendN(aRes, aSeq, bGap);
    dyStringAppendMultiC(bRes, '-', bGap);
#endif /* OLD */
    dyStringAppendMultiC(aRes, '-', bGap);
    dyStringAppendN(bRes, bSeq, bGap);
    dyStringAppendN(aRes, aSeq, aGap);
    dyStringAppendMultiC(bRes, '-', aGap);
    return;
    }

/* Abbreviated form: the same scratch buffer is reused for both sides.
 * The trailing 13 is presumably the maximum width fillShortGapString
 * may produce - confirm against its definition. */
fillShortGapString(shortForm, aGap, '.', 13);
dyStringAppend(aRes, shortForm);
fillShortGapString(shortForm, bGap, '.', 13);
dyStringAppend(bRes, shortForm);
}
Пример #2
0
void writeGap(struct dyString *aRes, int aGap, char *aSeq, struct dyString *bRes, int bGap, char *bSeq)
/* Write a gap that is open on both sequences at once.  Something like:
 *         --c
 *         ag-
 * where the top row goes to aRes and the bottom row to bRes.  The bases
 * of b come first (opposite dashes in a), then the bases of a (opposite
 * dashes in b). */
{
const int dashesForA = bGap;	/* a is padded while b's bases are shown */
const int dashesForB = aGap;	/* b is padded while a's bases are shown */

dyStringAppendMultiC(aRes, '-', dashesForA);
dyStringAppendN(bRes, bSeq, bGap);

dyStringAppendN(aRes, aSeq, aGap);
dyStringAppendMultiC(bRes, '-', dashesForB);
}
static void fillInMissing(struct oneOrg *nativeOrg, struct oneOrg *orgList,
	struct dnaSeq *native, int seqStart, int curPos, int aliStart)
/* Pad every organism's alignment string across the uncovered stretch
 * curPos..aliStart.  nativeOrg receives the matching slice of the native
 * sequence; every other entry of orgList receives the same number of
 * dots.  Does nothing when nativeOrg is NULL. */
{
struct oneOrg *cur;
int padSize, nativeOffset;

if (nativeOrg == NULL)
    return;

padSize = aliStart - curPos;
nativeOffset = curPos - seqStart;	/* native->dna begins at seqStart */
dyStringAppendN(nativeOrg->dy, &native->dna[nativeOffset], padSize);

cur = orgList;
while (cur != NULL)
    {
    if (cur != nativeOrg)
	dyStringAppendMultiC(cur->dy, '.', padSize);
    cur = cur->next;
    }
}
Пример #4
0
void writeInsert(struct dyString *aRes, struct dyString *bRes, char *aSeq, int gapSize)
/* Write an insert of gapSize characters: sequence goes to aRes and gap
 * characters go to bRes.  When the global doShort is set and the insert
 * is at least 16 long, abbreviated forms are written instead. */
{
char shortSeq[16];
char shortGap[16];
const int threshold = 16;	/* Smallest insert that gets abbreviated. */

if (!doShort || gapSize < threshold)
    {
    /* Full form: the sequence itself opposite a run of dashes. */
    dyStringAppendN(aRes, aSeq, gapSize);
    dyStringAppendMultiC(bRes, '-', gapSize);
    return;
    }

/* Abbreviated form.  The trailing 15 is presumably the maximum text
 * width, leaving one byte of each 16-byte buffer for the terminator -
 * confirm against fillSpliceSites/fillShortGapString. */
fillSpliceSites(shortSeq, gapSize, aSeq, 15);
dyStringAppend(aRes, shortSeq);
fillShortGapString(shortGap, gapSize, '-', 15);
dyStringAppend(bRes, shortGap);
}
Пример #5
0
static void saveAxtBundle(char *chromName, int chromSize, int chromOffset,
                          struct ffAli *ali,
                          struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq,
                          boolean qIsRc, boolean tIsRc,
                          enum ffStringency stringency, int minMatch, struct gfOutput *out)
/* Convert the ffAli chain 'ali' into axt records, gather them in a new
 * axtBundle and push that bundle onto the bundle list held in out->data.
 * The chain is cut wherever ffNextBreak() reports a break, and each piece
 * becomes one axt.  The stringency and minMatch parameters are not
 * referenced in the body. */
{
    struct axtData *axtOut = out->data;
    struct ffAli *runStart, *runEnd, *blk, *lastBlk = NULL;
    struct dyString *qText = newDyString(1024), *tText = newDyString(1024);
    struct axtBundle *bundle;
    struct trans3 *t3List = NULL;

    if (t3Hash != NULL)
        t3List = hashMustFindVal(t3Hash, tSeq->name);

    AllocVar(bundle);
    bundle->tSize = chromSize;
    bundle->qSize = qSeq->size;

    runStart = ali;
    while (runStart != NULL)
    {
        struct axt *rec;

        /* One run of blocks [runStart, runEnd) becomes one axt. */
        runEnd = ffNextBreak(runStart, 8, tSeq, t3List);
        dyStringClear(qText);
        dyStringClear(tText);

        for (blk = runStart; blk != runEnd; blk = blk->right)
        {
            struct ffAli *nextBlk = blk->right;

            /* The aligned block itself. */
            dyStringAppendN(qText, blk->nStart, blk->nEnd - blk->nStart);
            dyStringAppendN(tText, blk->hStart, blk->hEnd - blk->hStart);

            /* The gap between this block and the next one in the run. */
            if (nextBlk != runEnd)
            {
                int qGap = nextBlk->nStart - blk->nEnd;
                int nextTStart = trans3GenoPos(nextBlk->hStart, tSeq, t3List, FALSE)
                                 + chromOffset;
                int thisTEnd = trans3GenoPos(blk->hEnd, tSeq, t3List, TRUE)
                               + chromOffset;
                int tGap = nextTStart - thisTEnd;

                if (qGap < 0 || tGap < 0)
                {
                    errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name);
                }

                /* Only the larger of the two gaps is written; on a tie
                 * the query side wins. */
                if (qGap >= tGap)
                {
                    dyStringAppendN(qText, blk->nEnd, qGap);
                    dyStringAppendMultiC(tText, '-', qGap);
                }
                else
                {
                    dyStringAppendN(tText, blk->hEnd, tGap);
                    dyStringAppendMultiC(qText, '-', tGap);
                }
            }
            lastBlk = blk;	/* Remember the final block of the run. */
        }
        assert(tText->stringSize == qText->stringSize);

        /* Fill in the axt; coordinates span runStart through lastBlk. */
        AllocVar(rec);
        rec->qName = cloneString(qSeq->name);
        rec->qStart = runStart->nStart - qSeq->dna;
        rec->qEnd = lastBlk->nEnd - qSeq->dna;
        rec->qStrand = (qIsRc ? '-' : '+');
        rec->tName = cloneString(chromName);
        rec->tStart = trans3GenoPos(runStart->hStart, tSeq, t3List, FALSE) + chromOffset;
        rec->tEnd = trans3GenoPos(lastBlk->hEnd, tSeq, t3List, TRUE) + chromOffset;
        rec->tStrand = (tIsRc ? '-' : '+');
        rec->symCount = tText->stringSize;
        rec->qSym = cloneString(qText->string);
        rec->tSym = cloneString(tText->string);
        rec->frame = trans3Frame(runStart->hStart, t3List);
        rec->score = (out->qIsProt ? axtScoreProteinDefault(rec)
                                   : axtScoreDnaDefault(rec));
        slAddHead(&bundle->axtList, rec);

        runStart = runEnd;
    }

    /* Records were pushed on the head, so restore chain order. */
    slReverse(&bundle->axtList);
    dyStringFree(&qText);
    dyStringFree(&tText);
    slAddHead(&axtOut->bundleList, bundle);
}
struct mafAli *hgMafFrag(
	char *database,     /* Database, must already have hSetDb to this */
	char *track, 	    /* Name of MAF track */
	char *chrom, 	    /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand, 	    /* Chromosome strand. */
	char *outName, 	    /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* mafFrag- Extract maf sequences for a region from database.
 * This creates a somewhat unusual MAF that extends from start
 * to end whether or not there are actually alignments.  Where
 * there are no alignments (or alignments missing a species)
 * a . character fills in.   The score is always zero, and
 * the sources just indicate the species.  You can mafFree this
 * as normal.
 *
 * If orderList is given, every organism found in the alignments
 * must appear in it (errAbort otherwise) and its first entry is
 * treated as the native organism.  Without an orderList the native
 * organism is 'database' and others are added as they are seen. */
{
int chromSize = hChromSize(database, chrom);
struct sqlConnection *conn = hAllocConn(database);
struct dnaSeq *native = hChromSeq(database, chrom, start, end);
struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end);
char masterSrc[128];
struct hash *orgHash = newHash(10);
struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL;
int curPos = start, symCount = 0;
struct slName *name;
int order = 0;

/* Check that the mafs are really copacetic, the particular
 * subtype we think is in the database that this (relatively)
 * simple code can handle. */
safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom);
mafCheckFirstComponentSrc(mafList, masterSrc);
mafCheckFirstComponentStrand(mafList, '+');
slSort(&mafList, mafCmp);

/* Prebuild organisms if possible from input orderList. */
for (name = orderList; name != NULL; name = name->next)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, name->name, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    org->order = order++;
    if (nativeOrg == NULL)
        nativeOrg = org;
    }
/* With no orderList, seed the list with the database itself so there
 * is always a native organism to receive the reference sequence. */
if (orderList == NULL)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, database, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    if (nativeOrg == NULL)
        nativeOrg = org;
    }

/* Go through all mafs in window, mostly building up
 * org->dy strings. */
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    struct mafComp *mc, *mcMaster = maf->components;
    struct mafAli *subMaf = NULL;
    order = 0;
    /* Uncovered stretch before this maf: native bases plus dots. */
    if (curPos < mcMaster->start)
	{
	fillInMissing(nativeOrg, orgList, native, start,
		curPos, mcMaster->start);
	symCount += mcMaster->start - curPos;
	}
    if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst
    						    * backtracking */
	{
	/* Clip the maf to curPos..end when needed; a clipped copy is
	 * owned here and freed at the bottom of this block. */
	if (mafNeedSubset(maf, masterSrc, curPos, end))
	    {
	    subMaf = mafSubset(maf, masterSrc, curPos, end);
	    if (subMaf == NULL)
	        continue;
	    }
	else
	    subMaf = maf;
	for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order)
	    {
	    /* Extract name up to dot into 'orgName' */
	    char buf[128], *e, *orgName;

	    if ((mc->size == 0) || (mc->srcSize == 0)) /* skip over components without sequence */
		continue;

	    mc->leftStatus = mc->rightStatus = 0; /* squash annotation */

	    e = strchr(mc->src, '.');
	    if (e == NULL)
		orgName = mc->src;
	    else
		{
		int len = e - mc->src;
		if (len >= sizeof(buf))
		    errAbort("organism/database name %s too long", mc->src);
		memcpy(buf, mc->src, len);
		buf[len] = 0;
		orgName = buf;
		}

	    /* Look up dyString corresponding to  org, and create a
	     * new one if necessary. */
	    org = hashFindVal(orgHash, orgName);
	    if (org == NULL)
		{
		if (orderList != NULL)
		   errAbort("%s is not in orderList", orgName);
		AllocVar(org);
		slAddHead(&orgList, org);
		hashAddSaveName(orgHash, orgName, org, &org->name);
		org->dy = dyStringNew(native->size*1.5);
		/* A late arrival is back-filled with dots so its string
		 * lines up with the columns already written. */
		dyStringAppendMultiC(org->dy, '.', symCount);
		if (nativeOrg == NULL)
		    nativeOrg = org;
		}
	    if (orderList == NULL && order > org->order)
		org->order = order;
	    org->hit = TRUE;

	    /* Fill it up with alignment. */
	    dyStringAppendN(org->dy, mc->text, subMaf->textSize);
	    }
	/* Organisms absent from this maf get dots for its columns. */
	for (org = orgList; org != NULL; org = org->next)
	    {
	    if (!org->hit)
		dyStringAppendMultiC(org->dy, '.', subMaf->textSize);
	    org->hit = FALSE;
	    }
	symCount += subMaf->textSize;
	curPos = mcMaster->start + mcMaster->size;
	if (subMaf != maf)
	    mafAliFree(&subMaf);
	}
    }
/* Uncovered stretch after the last maf. */
if (curPos < end)
    {
    fillInMissing(nativeOrg, orgList, native, start, curPos, end);
    symCount += end - curPos;
    }
mafAliFreeList(&mafList);

slSort(&orgList, oneOrgCmp);
if (strand == '-')
    {
    for (org = orgList; org != NULL; org = org->next)
	reverseComplement(org->dy->string, org->dy->stringSize);
    }

/* Construct our maf */
AllocVar(maf);
maf->textSize = symCount;
for (org = orgList; org != NULL; org = org->next)
    {
    struct mafComp *mc;
    AllocVar(mc);
    if (org == orgList)
        {
	/* First component after sorting: described either under the
	 * caller's outName (coordinates relative to the fragment) or
	 * under masterSrc (coordinates in the chromosome). */
	if (outName != NULL)
	    {
	    mc->src = cloneString(outName);
	    mc->srcSize = native->size;
	    mc->strand = '+';
	    mc->start = 0;
	    mc->size = native->size;
	    }
	else
	    {
	    mc->src = cloneString(masterSrc);
	    mc->srcSize = chromSize;
	    mc->strand = strand;
	    if (strand == '-')
	       reverseIntRange(&start, &end, chromSize);
	    mc->start = start;
	    mc->size = end-start;
	    }
	}
    else
        {
	/* Other organisms: size is taken from countAlpha() on the text. */
	int size = countAlpha(org->dy->string);
	mc->src = cloneString(org->name);
	mc->srcSize = size;
	mc->strand = '+';
	mc->start = 0;
	mc->size = size;
	}
    mc->text = cloneString(org->dy->string);
    dyStringFree(&org->dy);
    slAddHead(&maf->components, mc);
    }
slReverse(&maf->components);

/* Clean up.  The native sequence was loaded by hChromSeq() above and is
 * last read in the loop just finished, so release it here rather than
 * leaking it on every call. */
freeDnaSeq(&native);
slFreeList(&orgList);
freeHash(&orgHash);
hFreeConn(&conn);
return maf;
}
Пример #7
0
void writeInsert(struct dyString *aRes, struct dyString *bRes, char *aSeq, int gapSize)
/* Write an insert in full: gapSize characters of aSeq go to aRes and
 * the same number of dashes go to bRes.  This variant never
 * abbreviates. */
{
const int dashCount = gapSize;	/* bRes is padded to match the insert */

dyStringAppendN(aRes, aSeq, gapSize);
dyStringAppendMultiC(bRes, '-', dashCount);
}
Пример #8
0
void outputBlocks(struct lineFile *lf,
	struct block *blockList, int score, FILE *f, boolean isRc, 
	char *qName, int qSize, char *qNibDir, struct dlList *qCache,
	char *tName, int tSize, char *tNibDir, struct dlList *tCache,
	boolean rescore)
/* Output block list as an axt to file f.
 *
 * The query and target sequence covering the blocks is taken from the
 * in-memory lists qFaList/tFaList when the globals qIsFa/tIsFa are set,
 * and from readFromCache() otherwise.  Gaps between consecutive blocks
 * are written as dashes on the opposite side; a gap on both sides at
 * once is an error.  When rescore is set the passed-in score is replaced
 * by one computed with axtScoreSym().  lf is only used for the line
 * number and file name in error messages.  Does nothing when blockList
 * is NULL. */
{
int qStart = BIGNUM, qEnd = 0, tStart = BIGNUM, tEnd = 0;
struct block *lastBlock = NULL;
struct block *block;
struct dyString *qSym = NULL;
struct dyString *tSym = NULL;
struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seq = NULL;
struct axt axt;
boolean qIsTwoBit = twoBitIsFile(qNibDir);
boolean tIsTwoBit = twoBitIsFile(tNibDir);

if (blockList == NULL)
    return;

/* Allocate only once we know there is something to write, so the early
 * return above has nothing to clean up. */
qSym = newDyString(16*1024);
tSym = newDyString(16*1024);

/* Figure overall dimensions. */
for (block = blockList; block != NULL; block = block->next)
    {
    if (qStart > block->qStart) qStart = block->qStart;
    if (qEnd < block->qEnd) qEnd = block->qEnd;
    if (tStart > block->tStart) tStart = block->tStart;
    if (tEnd < block->tEnd) tEnd = block->tEnd;
    }

/* Load query sequence covering alignment. */
if (isRc)
    {
    /* Fetch in forward coordinates, then flip both the range and the
     * sequence back to the reverse strand. */
    reverseIntRange(&qStart, &qEnd, qSize);
    if (qIsFa)
        {
        for (seq = qFaList ; seq != NULL ; seq = seq->next)
            if (sameString(qName, seq->name))
                break;
        if (seq != NULL)
            {
            AllocVar(qSeq);
            qSeq->size = qEnd - qStart;
            qSeq->name = cloneString(qName);
            /* Private copy: it is reverse-complemented in place below. */
            qSeq->dna = cloneMem((seq->dna)+qStart, qSeq->size);
            }
        else
            errAbort("sequence not found %s\n",qName);
        }
    else
        qSeq = readFromCache(qCache, qNibDir, qName, qStart, qEnd - qStart, qSize, qIsTwoBit);
    reverseIntRange(&qStart, &qEnd, qSize);
    reverseComplement(qSeq->dna, qSeq->size);
    }
else
    {
    if (qIsFa)
        {
        for (seq = qFaList ; seq != NULL ; seq = seq->next)
            {
            if (sameString(qName, seq->name))
                break;
            }
        if (seq != NULL)
            {
            AllocVar(qSeq);
            qSeq->size = qEnd - qStart;
            qSeq->name = cloneString(qName);
            /* Borrowed: points straight into the qFaList entry. */
            qSeq->dna = (seq->dna)+qStart;
            }
        else
            errAbort("sequence not found %s\n",qName);
        }
    else
        qSeq = readFromCache(qCache, qNibDir, qName, qStart, qEnd - qStart, qSize, qIsTwoBit);
    }

/* Load target sequence covering alignment. */
if (tIsFa)
    {
    for (seq = tFaList ; seq != NULL ; seq = seq->next)
        if (sameString(tName, seq->name))
            break;
    if (seq != NULL)
        {
        AllocVar(tSeq);
        tSeq->size = tEnd - tStart;
        tSeq->name = cloneString(tName);
        tSeq->dna = cloneMem((seq->dna)+tStart, tSeq->size);
        }
    else
        errAbort("sequence not found %s\n",tName);
    }
else
    tSeq = readFromCache(tCache, tNibDir, tName, tStart, tEnd - tStart, tSize, tIsTwoBit);

/* Loop through blocks copying sequence into dynamic strings. */
for (block = blockList; block != NULL; block = block->next)
    {
    if (lastBlock != NULL)
        {
	int qGap = block->qStart - lastBlock->qEnd;
	int tGap = block->tStart - lastBlock->tEnd;
	if (qGap != 0 && tGap != 0)
	    {
	    errAbort("Gaps in both strand on alignment ending line %d of %s",
	    	lf->lineIx, lf->fileName);
	    }
	/* Unaligned query bases go opposite dashes in the target ... */
	if (qGap > 0)
	    {
	    dyStringAppendMultiC(tSym, '-', qGap);
	    dyStringAppendN(qSym, qSeq->dna + lastBlock->qEnd - qStart, qGap);
	    }
	/* ... and unaligned target bases opposite dashes in the query. */
	if (tGap > 0)
	    {
	    dyStringAppendMultiC(qSym, '-', tGap);
	    dyStringAppendN(tSym, tSeq->dna + lastBlock->tEnd - tStart, tGap);
	    }
	}
    if (qSeq->size < block->qStart - qStart)
        {
        errAbort("read past end of sequence %s size =%d block->qStart-qstart=%d block->qStart=%d qEnd=%d \n", qName, qSeq->size, block->qStart-qStart,block->qStart, block->qEnd );
        }
    dyStringAppendN(qSym, qSeq->dna + block->qStart - qStart,
    	block->qEnd - block->qStart);
    if (tSeq->size < block->tStart - tStart)
        {
        errAbort("read past end of sequence %s size =%d block->tStart-tstart=%d\n", tName, tSeq->size, block->tStart-tStart);
        }
    dyStringAppendN(tSym, tSeq->dna + block->tStart - tStart,
    	block->tEnd - block->tStart);
    lastBlock = block;
    }
if (qSym->stringSize != tSym->stringSize)
    errAbort("qSize and tSize don't agree in alignment ending line %d of %s",
	    lf->lineIx, lf->fileName);

if (rescore)
    score = axtScoreSym(scoreScheme, qSym->stringSize,
			qSym->string, tSym->string);

/* Fill in an axt and write it to output.  The axt only borrows the
 * names and symbol strings; nothing in it is freed separately. */
ZeroVar(&axt);
axt.qName = qName;
axt.qStart = qStart;
axt.qEnd = qEnd;
axt.qStrand = (isRc ? '-' : '+');
axt.tName = tName;
axt.tStart = tStart;
axt.tEnd = tEnd;
axt.tStrand = '+';
axt.score = score;
axt.symCount = qSym->stringSize;
axt.qSym = qSym->string;
axt.tSym = tSym->string;
axtWrite(&axt, f);

/* Clean up.  On the forward-strand fasta path qSeq->dna points into
 * qFaList and must not be freed, so detach it first; the struct and its
 * cloned name are ours in every case.  This relies on freeDnaSeq()
 * ignoring NULL members, as the kent library's freeMem() does. */
if (qIsFa && !isRc)
    qSeq->dna = NULL;
freeDnaSeq(&qSeq);
freeDnaSeq(&tSeq);
dyStringFree(&qSym);
dyStringFree(&tSym);
}