void writeGap(struct dyString *aRes, int aGap, char *aSeq, struct dyString *bRes, int bGap, char *bSeq)
/* Append a double-sided gap to the two result strings.  Output looks
 * something like:
 *     ....123....    or    --c
 *     ...4123....          ag-
 * The left form is used when the global doShort is set and at least one of
 * the gaps is 16 or longer; in that case each result gets a string built by
 * fillShortGapString() with '.' as the fill character.  Otherwise aRes gets
 * bGap dashes followed by aGap characters of aSeq, and bRes gets bGap
 * characters of bSeq followed by aGap dashes. */
{
int minToAbbreviate = 16;
char abbrev[16];

if (!doShort || (aGap < minToAbbreviate && bGap < minToAbbreviate))
    {
    /* Full-length form. */
#ifdef OLD
    dyStringAppendMultiC(aRes, '-', aGap);
    dyStringAppendN(bRes, bSeq, aGap);
    dyStringAppendN(aRes, aSeq, bGap);
    dyStringAppendMultiC(bRes, '-', bGap);
#endif /* OLD */
    dyStringAppendMultiC(aRes, '-', bGap);
    dyStringAppendN(bRes, bSeq, bGap);
    dyStringAppendN(aRes, aSeq, aGap);
    dyStringAppendMultiC(bRes, '-', aGap);
    return;
    }

/* Abbreviated form; the scratch buffer is reused for both sides. */
fillShortGapString(abbrev, aGap, '.', 13);
dyStringAppend(aRes, abbrev);
fillShortGapString(abbrev, bGap, '.', 13);
dyStringAppend(bRes, abbrev);
}
void writeGap(struct dyString *aRes, int aGap, char *aSeq, struct dyString *bRes, int bGap, char *bSeq)
/* Append a double-sided gap to the two result strings.  Output looks
 * something like:
 *     --c
 *     ag-
 * aRes receives bGap dashes and then aGap characters of aSeq; bRes receives
 * bGap characters of bSeq and then aGap dashes. */
{
int leadLen = bGap;     /* Width of the first column group: b's bases over dashes in a. */
int trailLen = aGap;    /* Width of the second column group: a's bases over dashes in b. */

dyStringAppendMultiC(aRes, '-', leadLen);
dyStringAppendN(bRes, bSeq, leadLen);
dyStringAppendN(aRes, aSeq, trailLen);
dyStringAppendMultiC(bRes, '-', trailLen);
}
static void fillInMissing(struct oneOrg *nativeOrg, struct oneOrg *orgList,
	struct dnaSeq *native, int seqStart, int curPos, int aliStart)
/* Pad every organism's alignment string across the unaligned stretch from
 * curPos up to aliStart.  The native organism gets the matching slice of
 * native->dna (indexed relative to seqStart); every other organism in
 * orgList gets the same number of '.' characters.  Does nothing when
 * nativeOrg is NULL. */
{
struct oneOrg *cur = orgList;
int padLen, nativeOffset;

if (nativeOrg == NULL)
    return;

padLen = aliStart - curPos;
nativeOffset = curPos - seqStart;
dyStringAppendN(nativeOrg->dy, native->dna + nativeOffset, padLen);

while (cur != NULL)
    {
    if (cur != nativeOrg)
	dyStringAppendMultiC(cur->dy, '.', padLen);
    cur = cur->next;
    }
}
void writeInsert(struct dyString *aRes, struct dyString *bRes, char *aSeq, int gapSize)
/* Append an insert of gapSize to aRes with the corresponding gap in bRes.
 * Normally aRes gets gapSize characters of aSeq and bRes gets gapSize
 * dashes.  When the global doShort is set and gapSize is 16 or more, aRes
 * instead gets the string produced by fillSpliceSites() and bRes the
 * string produced by fillShortGapString() with '-' as fill character. */
{
int minToAbbreviate = 16;
char abbrevSeq[16];
char abbrevGap[16];

if (!doShort || gapSize < minToAbbreviate)
    {
    /* Full-length form. */
    dyStringAppendN(aRes, aSeq, gapSize);
    dyStringAppendMultiC(bRes, '-', gapSize);
    return;
    }

/* Abbreviated form. */
fillSpliceSites(abbrevSeq, gapSize, aSeq, 15);
dyStringAppend(aRes, abbrevSeq);
fillShortGapString(abbrevGap, gapSize, '-', 15);
dyStringAppend(bRes, abbrevGap);
}
static void saveAxtBundle(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, boolean qIsRc, boolean tIsRc, enum ffStringency stringency, int minMatch, struct gfOutput *out)
/* Save alignment to axtBundle.
 *
 * Splits the ffAli list starting at ali into runs using ffNextBreak(),
 * builds one axt per run, collects the axts in a newly allocated
 * axtBundle, and pushes that bundle onto the bundleList of out->data.
 *
 * chromName/chromSize/chromOffset - target name stored in each axt, target
 *     size stored in the bundle, and offset added to every target position.
 * tSeq, qSeq - target and query sequences; the ffAli n* pointers are
 *     turned into query coordinates by subtracting qSeq->dna.
 * t3Hash - if non-NULL, looked up by tSeq->name to get the trans3 list
 *     passed to trans3GenoPos()/trans3Frame(); otherwise NULL is passed.
 * qIsRc, tIsRc - select '-' versus '+' for the axt strands.
 * stringency, minMatch - not referenced in this function body.
 * out - output handle; out->data is used as a struct axtData and
 *     out->qIsProt selects the scoring function. */
{
struct axtData *ad = out->data;
struct ffAli *sAli, *eAli, *ff, *rt, *eFf = NULL;
struct axt *axt;
struct dyString *q = newDyString(1024), *t = newDyString(1024);
struct axtBundle *gab;
struct trans3 *t3List = NULL;

if (t3Hash != NULL)
    t3List = hashMustFindVal(t3Hash, tSeq->name);
AllocVar(gab);
gab->tSize = chromSize;
gab->qSize = qSeq->size;
/* Each pass handles the run of blocks [sAli, eAli) and emits one axt. */
for (sAli = ali; sAli != NULL; sAli = eAli)
    {
    eAli = ffNextBreak(sAli, 8, tSeq, t3List);
    /* The q and t scratch strings are reused for every run. */
    dyStringClear(q);
    dyStringClear(t);
    for (ff = sAli; ff != eAli; ff = ff->right)
        {
	/* Copy the aligned block itself to both sides. */
        dyStringAppendN(q, ff->nStart, ff->nEnd - ff->nStart);
        dyStringAppendN(t, ff->hStart, ff->hEnd - ff->hStart);
        rt = ff->right;
        if (rt != eAli)
            {
	    /* Another block follows inside this run: write the gap between
	     * them.  The query gap is a plain pointer difference; the target
	     * gap is measured after mapping through trans3GenoPos(). */
            int nGap = rt->nStart - ff->nEnd;
            int nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE) + chromOffset;
            int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) + chromOffset;
            int hGap = nhStart - ohEnd;
            int gap = max(nGap, hGap);
            if (nGap < 0 || hGap < 0)
                {
                errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name);
                }
	    /* Only the larger of the two gaps is written: sequence on the
	     * side that has it, dashes on the other side. */
            if (nGap == gap)
                {
                dyStringAppendN(q, ff->nEnd, gap);
                dyStringAppendMultiC(t, '-', gap);
                }
            else
                {
                dyStringAppendN(t, ff->hEnd, gap);
                dyStringAppendMultiC(q, '-', gap);
                }
            }
        eFf = ff;       /* Keep track of last block in bunch */
        }
    assert(t->stringSize == q->stringSize);
    /* Fill in the axt.  Start coordinates come from the first block of the
     * run (sAli), end coordinates from the last block (eFf).
     * NOTE(review): eFf is dereferenced here, so this relies on the inner
     * loop having run at least once, i.e. ffNextBreak() not returning
     * sAli itself - confirm against ffNextBreak. */
    AllocVar(axt);
    axt->qName = cloneString(qSeq->name);
    axt->qStart = sAli->nStart - qSeq->dna;
    axt->qEnd = eFf->nEnd - qSeq->dna;
    axt->qStrand = (qIsRc ? '-' : '+');
    axt->tName = cloneString(chromName);
    axt->tStart = trans3GenoPos(sAli->hStart, tSeq, t3List, FALSE) + chromOffset;
    axt->tEnd = trans3GenoPos(eFf->hEnd, tSeq, t3List, TRUE) + chromOffset;
    axt->tStrand = (tIsRc ? '-' : '+');
    axt->symCount = t->stringSize;
    axt->qSym = cloneString(q->string);
    axt->tSym = cloneString(t->string);
    axt->frame = trans3Frame(sAli->hStart, t3List);
    if (out->qIsProt)
        axt->score = axtScoreProteinDefault(axt);
    else
        axt->score = axtScoreDnaDefault(axt);
    slAddHead(&gab->axtList, axt);
    }
/* Axts were pushed on the head, so reverse to restore run order. */
slReverse(&gab->axtList);
dyStringFree(&q);
dyStringFree(&t);
slAddHead(&ad->bundleList, gab);
}
struct mafAli *hgMafFrag(
	char *database, 	/* Database, must already have hSetDb to this */
	char *track, 		/* Name of MAF track */
	char *chrom, 		/* Chromosome (in database genome) */
	int start, int end, 	/* start/end in chromosome */
	char strand, 		/* Chromosome strand. */
	char *outName, 		/* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* mafFrag- Extract maf sequences for a region from database.
 * This creates a somewhat unusual MAF that extends from start
 * to end whether or not there are actually alignments.  Where
 * there are no alignments (or alignments missing a species)
 * a . character fills in.   The score is always zero, and
 * the sources just indicate the species.  You can mafFree this
 * as normal.
 *
 * When orderList is given, one organism is pre-created per entry and the
 * first entry is treated as the native organism; a maf component whose
 * organism is not in orderList causes an errAbort.  When orderList is NULL
 * an organism named after database is pre-created as the native one and
 * further organisms are added as they are encountered.
 *
 * NOTE(review): the dnaSeq returned by hChromSeq() ('native') is not
 * released anywhere in this function - confirm whether that is intended. */
{
int chromSize = hChromSize(database, chrom);
struct sqlConnection *conn = hAllocConn(database);
struct dnaSeq *native = hChromSeq(database, chrom, start, end);
struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end);
char masterSrc[128];
struct hash *orgHash = newHash(10);
struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL;
int curPos = start, symCount = 0;	/* Next chrom position to cover; columns written so far. */
struct slName *name;
int order = 0;

/* Check that the mafs are really copacetic, the particular
 * subtype we think is in the database that this (relatively)
 * simple code can handle. */
safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom);
mafCheckFirstComponentSrc(mafList, masterSrc);
mafCheckFirstComponentStrand(mafList, '+');
slSort(&mafList, mafCmp);

/* Prebuild organisms if possible from input orderList. */
for (name = orderList; name != NULL; name = name->next)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, name->name, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    org->order = order++;
    if (nativeOrg == NULL)
        nativeOrg = org;
    }
/* Without an orderList, start with a single organism keyed by database. */
if (orderList == NULL)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, database, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    if (nativeOrg == NULL)
        nativeOrg = org;
    }

/* Go through all mafs in window, mostly building up
 * org->dy strings. */
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    struct mafComp *mc, *mcMaster = maf->components;
    struct mafAli *subMaf = NULL;
    order = 0;
    /* Fill the uncovered stretch before this maf with native sequence
     * and dots. */
    if (curPos < mcMaster->start)
	{
	fillInMissing(nativeOrg, orgList, native, start, curPos,
		mcMaster->start);
	symCount += mcMaster->start - curPos;
	}
    if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst
    						    * backtracking */
	{
	/* Clip the maf to [curPos, end) when needed; subMaf is a separate
	 * object only in that case and is freed at the bottom of this
	 * block. */
	if (mafNeedSubset(maf, masterSrc, curPos, end))
	    {
	    subMaf = mafSubset(maf, masterSrc, curPos, end);
	    if (subMaf == NULL)
	        continue;
	    }
	else
	    subMaf = maf;
	for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order)
	    {
	    /* Extract name up to dot into 'orgName' */
	    char buf[128], *e, *orgName;

	    if ((mc->size == 0) || (mc->srcSize == 0)) /* skip over components without sequence */
		continue;

	    mc->leftStatus = mc->rightStatus = 0; /* squash annotation */

	    e = strchr(mc->src, '.');
	    if (e == NULL)
		orgName = mc->src;
	    else
		{
		int len = e - mc->src;
		if (len >= sizeof(buf))
		    errAbort("organism/database name %s too long", mc->src);
		memcpy(buf, mc->src, len);
		buf[len] = 0;
		orgName = buf;
		}

	    /* Look up dyString corresponding to  org, and create a
	     * new one if necessary.  A newly created organism is first
	     * padded with symCount dots so that it lines up with the
	     * columns already written for the others. */
	    org = hashFindVal(orgHash, orgName);
	    if (org == NULL)
		{
		if (orderList != NULL)
		   errAbort("%s is not in orderList", orgName);
		AllocVar(org);
		slAddHead(&orgList, org);
		hashAddSaveName(orgHash, orgName, org, &org->name);
		org->dy = dyStringNew(native->size*1.5);
		dyStringAppendMultiC(org->dy, '.', symCount);
		if (nativeOrg == NULL)
		    nativeOrg = org;
		}
	    /* Without an orderList, remember the largest component index
	     * at which this organism has been seen. */
	    if (orderList == NULL && order > org->order)
		org->order = order;
	    org->hit = TRUE;

	    /* Fill it up with alignment. */
	    dyStringAppendN(org->dy, mc->text, subMaf->textSize);
	    }
	/* Organisms absent from this maf get dots; hit flags are reset
	 * for the next maf. */
	for (org = orgList; org != NULL; org = org->next)
	    {
	    if (!org->hit)
		dyStringAppendMultiC(org->dy, '.', subMaf->textSize);
	    org->hit = FALSE;
	    }
	symCount += subMaf->textSize;
	curPos = mcMaster->start + mcMaster->size;
	if (subMaf != maf)
	    mafAliFree(&subMaf);
	}
    }
/* Fill whatever remains between the last maf and the end of the region. */
if (curPos < end)
    {
    fillInMissing(nativeOrg, orgList, native, start, curPos, end);
    symCount += end - curPos;
    }
mafAliFreeList(&mafList);

slSort(&orgList, oneOrgCmp);
if (strand == '-')
    {
    for (org = orgList; org != NULL; org = org->next)
	reverseComplement(org->dy->string, org->dy->stringSize);
    }

/* Construct our maf */
AllocVar(maf);
maf->textSize = symCount;
for (org = orgList; org != NULL; org = org->next)
    {
    struct mafComp *mc;
    AllocVar(mc);
    /* The first organism after sorting becomes the first component; it is
     * labelled either with outName (as a stand-alone sequence starting at
     * 0) or with masterSrc and real chromosome coordinates. */
    if (org == orgList)
        {
	if (outName != NULL)
	    {
	    mc->src = cloneString(outName);
	    mc->srcSize = native->size;
	    mc->strand = '+';
	    mc->start = 0;
	    mc->size = native->size;
	    }
	else
	    {
	    mc->src = cloneString(masterSrc);
	    mc->srcSize = chromSize;
	    mc->strand = strand;
	    if (strand == '-')
	       reverseIntRange(&start, &end, chromSize);
	    mc->start = start;
	    mc->size = end-start;
	    }
	}
    else
        {
	/* Other organisms are described only by name; size is taken from
	 * countAlpha() on the assembled text. */
	int size = countAlpha(org->dy->string);
	mc->src = cloneString(org->name);
	mc->srcSize = size;
	mc->strand = '+';
	mc->start = 0;
	mc->size = size;
	}
    mc->text = cloneString(org->dy->string);
    dyStringFree(&org->dy);
    slAddHead(&maf->components, mc);
    }
/* Components were pushed on the head, so reverse to restore orgList order. */
slReverse(&maf->components);

slFreeList(&orgList);
freeHash(&orgHash);
hFreeConn(&conn);
return maf;
}
void writeInsert(struct dyString *aRes, struct dyString *bRes, char *aSeq, int gapSize)
/* Append an insert to aRes and the matching gap to bRes: aRes receives
 * gapSize characters taken from aSeq, bRes receives gapSize dashes.
 * This version never abbreviates. */
{
int insertLen = gapSize;

dyStringAppendN(aRes, aSeq, insertLen);		/* Inserted bases. */
dyStringAppendMultiC(bRes, '-', insertLen);	/* Dashes opposite them. */
}
void outputBlocks(struct lineFile *lf, struct block *blockList, int score, FILE *f, boolean isRc, char *qName, int qSize, char *qNibDir, struct dlList *qCache, char *tName, int tSize, char *tNibDir, struct dlList *tCache, boolean rescore) /* Output block list as an axt to file f. */ { int qStart = BIGNUM, qEnd = 0, tStart = BIGNUM, tEnd = 0; struct block *lastBlock = NULL; struct block *block; struct dyString *qSym = newDyString(16*1024); struct dyString *tSym = newDyString(16*1024); struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seq = NULL; struct axt axt; boolean qIsTwoBit = twoBitIsFile(qNibDir); boolean tIsTwoBit = twoBitIsFile(tNibDir); if (blockList == NULL) return; /* Figure overall dimensions. */ for (block = blockList; block != NULL; block = block->next) { if (qStart > block->qStart) qStart = block->qStart; if (qEnd < block->qEnd) qEnd = block->qEnd; if (tStart > block->tStart) tStart = block->tStart; if (tEnd < block->tEnd) tEnd = block->tEnd; } /* Load sequence covering alignment from nib files. 
*/ if (isRc) { reverseIntRange(&qStart, &qEnd, qSize); if (qIsFa) { for (seq = qFaList ; seq != NULL ; seq = seq->next) if (sameString(qName, seq->name)) break; if (seq != NULL) { AllocVar(qSeq); qSeq->size = qEnd - qStart; qSeq->name = cloneString(qName); qSeq->dna = cloneMem((seq->dna)+qStart, qSeq->size); } else errAbort("sequence not found %s\n",qName); } else qSeq = readFromCache(qCache, qNibDir, qName, qStart, qEnd - qStart, qSize, qIsTwoBit); reverseIntRange(&qStart, &qEnd, qSize); reverseComplement(qSeq->dna, qSeq->size); } else { if (qIsFa) { for (seq = qFaList ; seq != NULL ; seq = seq->next) { if (sameString(qName, seq->name)) break; } if (seq != NULL) { AllocVar(qSeq); qSeq->size = qEnd - qStart; qSeq->name = cloneString(qName); qSeq->dna = (seq->dna)+qStart; } else errAbort("sequence not found %s\n",qName); } else qSeq = readFromCache(qCache, qNibDir, qName, qStart, qEnd - qStart, qSize, qIsTwoBit); } if (tIsFa) { for (seq = tFaList ; seq != NULL ; seq = seq->next) if (sameString(tName, seq->name)) break; if (seq != NULL) { AllocVar(tSeq); tSeq->size = tEnd - tStart; tSeq->name = cloneString(tName); tSeq->dna = cloneMem((seq->dna)+tStart, tSeq->size); } else errAbort("sequence not found %s\n",tName); } else tSeq = readFromCache(tCache, tNibDir, tName, tStart, tEnd - tStart, tSize, tIsTwoBit); /* Loop through blocks copying sequence into dynamic strings. 
*/ for (block = blockList; block != NULL; block = block->next) { if (lastBlock != NULL) { int qGap = block->qStart - lastBlock->qEnd; int tGap = block->tStart - lastBlock->tEnd; if (qGap != 0 && tGap != 0) { errAbort("Gaps in both strand on alignment ending line %d of %s", lf->lineIx, lf->fileName); } if (qGap > 0) { dyStringAppendMultiC(tSym, '-', qGap); dyStringAppendN(qSym, qSeq->dna + lastBlock->qEnd - qStart, qGap); } if (tGap > 0) { dyStringAppendMultiC(qSym, '-', tGap); dyStringAppendN(tSym, tSeq->dna + lastBlock->tEnd - tStart, tGap); } } if (qSeq->size < block->qStart - qStart) { errAbort("read past end of sequence %s size =%d block->qStart-qstart=%d block->qStart=%d qEnd=%d \n", qName, qSeq->size, block->qStart-qStart,block->qStart, block->qEnd ); } dyStringAppendN(qSym, qSeq->dna + block->qStart - qStart, block->qEnd - block->qStart); if (tSeq->size < block->tStart - tStart) { errAbort("read past end of sequence %s size =%d block->tStart-tstart=%d\n", tName, tSeq->size, block->tStart-tStart); } dyStringAppendN(tSym, tSeq->dna + block->tStart - tStart, block->tEnd - block->tStart); lastBlock = block; } if (qSym->stringSize != tSym->stringSize) errAbort("qSize and tSize don't agree in alignment ending line %d of %s", lf->lineIx, lf->fileName); if (rescore) score = axtScoreSym(scoreScheme, qSym->stringSize, qSym->string, tSym->string); /* Fill in an axt and write it to output. */ ZeroVar(&axt); axt.qName = qName; axt.qStart = qStart; axt.qEnd = qEnd; axt.qStrand = (isRc ? '-' : '+'); axt.tName = tName; axt.tStart = tStart; axt.tEnd = tEnd; axt.tStrand = '+'; axt.score = score; axt.symCount = qSym->stringSize; axt.qSym = qSym->string; axt.tSym = tSym->string; axtWrite(&axt, f); /* Clean up. */ if (!qIsFa) freeDnaSeq(&qSeq); freeDnaSeq(&tSeq); dyStringFree(&qSym); dyStringFree(&tSym); }