struct axt *pslToAxt(struct psl *psl, struct hash *qHash, char *tNibDir, struct dlList *fileCache) { static char *tName = NULL, *qName = NULL; static struct dnaSeq *tSeq = NULL; struct dyString *q = newDyString(16*1024); struct dyString *t = newDyString(16*1024); int blockIx; int qs, ts ; int lastQ = 0, lastT = 0, size; int qOffset = 0; int tOffset = 0; struct axt *axt = NULL; boolean qIsNib = FALSE; boolean tIsNib = FALSE; int cnt = 0; //struct dnaSeq *tSeq = NULL; struct nibInfo *tNib = NULL; struct dnaSeq *qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0); // hGenBankGetMrna(psl->qName, NULL); /* freeDnaSeq(&qSeq); freez(&qName); assert(mrnaList != NULL); for (mrna = mrnaList; mrna != NULL ; mrna = mrna->next) { assert(mrna != NULL); cnt++; if (sameString(mrna->name, psl->qName)) { qSeq = cloneDnaSeq(mrna); assert(qSeq != NULL); break; } } */ if (qSeq == NULL) { warn("mrna sequence data not found %s, searched %d sequences\n",psl->qName,cnt); dyStringFree(&q); dyStringFree(&t); dnaSeqFree(&tSeq); dnaSeqFree(&qSeq); return NULL; } if (qSeq->size != psl->qSize) { warn("sequence %s aligned is different size %d from mrna.fa file %d \n",psl->qName,psl->qSize,qSeq->size); dyStringFree(&q); dyStringFree(&t); dnaSeqFree(&tSeq); dnaSeqFree(&qSeq); return NULL; } qName = cloneString(psl->qName); if (qIsNib && psl->strand[0] == '-') qOffset = psl->qSize - psl->qEnd; else qOffset = 0; verbose(5,"qString len = %d qOffset = %d\n",qSeq->size,qOffset); if (tName == NULL || !sameString(tName, psl->tName) || tIsNib) { freeDnaSeq(&tSeq); freez(&tName); tName = cloneString(psl->tName); tNib = nibInfoFromCache(nibHash, tNibDir, tName); assert(tNib !=NULL); tSeq = nibInfoLoadStrand(tNib, psl->tStart, psl->tEnd, '+'); assert(tSeq !=NULL); tOffset = psl->tStart; //readCachedSeqPart(tName, psl->tStart, psl->tEnd-psl->tStart, // tHash, fileCache, &tSeq, &tOffset, &tIsNib); } verbose(4,"strand t %s \n",psl->strand); if (tSeq != NULL) verbose(5,"tString len = %d tOffset = 
%d\n",tSeq->size,tOffset); else errAbort("tSeq is NULL\n"); if (psl->strand[0] == '-') reverseComplement(qSeq->dna, qSeq->size); //if (strlen(psl->strand) > 1 ) // if (psl->strand[1] == '-') // reverseComplement(tSeq->dna, tSeq->size); for (blockIx=0; blockIx < psl->blockCount; ++blockIx) { qs = psl->qStarts[blockIx] - qOffset; ts = psl->tStarts[blockIx] - tOffset; if (blockIx != 0) { int qGap, tGap, minGap; qGap = qs - lastQ; tGap = ts - lastT; minGap = min(qGap, tGap); if (minGap > 0) { writeGap(q, qGap, qSeq->dna + lastQ, t, tGap, tSeq->dna + lastT); } else if (qGap > 0) { writeInsert(q, t, qSeq->dna + lastQ, qGap); } else if (tGap > 0) { writeInsert(t, q, tSeq->dna + lastT, tGap); } } size = psl->blockSizes[blockIx]; assert(qSeq != NULL); dyStringAppendN(q, qSeq->dna + qs, size); lastQ = qs + size; dyStringAppendN(t, tSeq->dna + ts, size); lastT = ts + size; } if (strlen(q->string) != strlen(t->string)) warn("Symbol count(t) %d != %d inconsistent at t %s:%d and qName %s\n%s\n%s\n", (int)strlen(t->string), (int)strlen(q->string), psl->tName, psl->tStart, psl->qName, t->string, q->string); if (psl->strand[0] == '-') { reverseComplement(q->string, q->stringSize); reverseComplement(t->string, t->stringSize); } axt = axtCreate(q->string, t->string, min(q->stringSize,t->stringSize), psl); dyStringFree(&q); dyStringFree(&t); //dnaSeqFree(&tSeq); dnaSeqFree(&qSeq); if (qIsNib) freez(&qName); //if (tIsNib) // freez(&tName); return axt; }
struct gapInfo *findLargeGaps(struct xaAli *xa, struct gapInfo *oldList) /* Find large gaps in alignment and classify them. */ { struct gdfGene *gdfList; struct gapInfo *gapList = NULL, *gap; int ceIx=0, cbIx=0, symIx=0; int ceStart=0, cbStart=0, symStart=0; int runSize = 0; char sym, lastSym = 0; int symCount = xa->symCount; /* Fetch C. elegans region. */ gdfList = wormGdfGenesInRange(xa->target, xa->tStart, xa->tEnd, &wormSangerGdfCache); /* Run a little state machine that does something at the end of each solid run * of a symbol. */ for (symIx = 0; symIx <= symCount; ++symIx) { sym = xa->hSym[symIx]; if (sym != lastSym) { if (runSize > 32) /* Introns need to be at least this long. */ { /* We're at end of a solid run. */ if (lastSym == 'Q' || lastSym == 'T') { int ceGapStart = xa->tStart + ceStart; int ceGapEnd = xa->tStart + ceIx; struct gdfGene *gdf; char hBefore = xa->hSym[symStart-1]; char hAfter = sym; char strand = '.'; AllocVar(gap); gap->query = cloneString(xa->query); gap->qStart = xa->qStart + cbStart; gap->qEnd = xa->qStart + cbIx; gap->target = cloneString(xa->target); gap->tStart = ceGapStart; gap->tEnd = ceGapEnd; gap->name = cloneString(xa->name); gap->size = runSize; gap->hSym = lastSym; if (uniqueGap(oldList, gap)) { slAddHead(&gapList, gap); classifyGap(gdfList, xa->target, ceGapStart, ceGapEnd, lastSym, &gap->type, &gdf); if (gdf != NULL) strand = gdf->strand; gap->hasIntronEnds = isIntron(xa, symStart, symIx, lastSym, strand, &gap->slideCount, &gap->isRc); if (gap->hasIntronEnds) slideGap(gap, xa, lastSym, symStart, symIx); if (isConserved(hBefore) && isConserved(hAfter)) gap->hasStrongHomology = TRUE; if (gap->hasStrongHomology) { if (lastSym == 'T') writeGap(gap, xa, symStart+gap->slideCount, symIx+gap->slideCount, strand, out); } } } } runSize = 0; ceStart = ceIx; cbStart = cbIx; symStart = symIx; lastSym = sym; } ++runSize; if (xa->qSym[symIx] != '-') ++cbIx; if (xa->tSym[symIx] != '-') ++ceIx; } gdfFreeGeneList(&gdfList); 
slReverse(&gapList); return gapList; }
void prettyOne(struct psl *psl, struct hash *qHash, struct hash *tHash,
	struct dlList *fileCache, FILE *f, boolean axt, FILE *checkFile)
/* Write one psl to f as a base-by-base alignment.  The query and target
 * sequences are fetched with readCachedSeqPart() (query via qHash, target via
 * tHash), the aligned blocks and the stretches between them are assembled
 * into two parallel strings, and the result is emitted with axtOutString()
 * when axt is set, otherwise with prettyOutString().  When checkFile is
 * non-NULL, outputCheck() is also called on the loaded sequences.
 *
 * The most recently loaded query and target are held in static variables and
 * reused when the next psl names the same sequence.  A sequence reported as
 * partial has its held name dropped at the end so it is fetched again. */
{
static char *heldQName = NULL, *heldTName = NULL;
static struct dnaSeq *heldQSeq = NULL, *heldTSeq = NULL;
struct dyString *qText = newDyString(16*1024);
struct dyString *tText = newDyString(16*1024);
boolean qPartial = FALSE, tPartial = FALSE;
boolean qMinus = (psl->strand[0] == '-');
boolean tMinus = (psl->strand[1] == '-');
int qShift = 0, tShift = 0;	/* Subtracted from psl starts to index the loaded dna. */
int qDone = 0, tDone = 0;	/* End of the previous block on each side. */
int i;

/* Fetch the query unless the held one already has this name. */
if (!(heldQName != NULL && sameString(heldQName, psl->qName)))
    {
    freeDnaSeq(&heldQSeq);
    freez(&heldQName);
    heldQName = cloneString(psl->qName);
    readCachedSeqPart(heldQName, psl->qStart, psl->qEnd-psl->qStart,
	qHash, fileCache, &heldQSeq, &qShift, &qPartial);
    if (qPartial && qMinus)
	qShift = psl->qSize - psl->qEnd;
    }

/* Same for the target. */
if (!(heldTName != NULL && sameString(heldTName, psl->tName)))
    {
    freeDnaSeq(&heldTSeq);
    freez(&heldTName);
    heldTName = cloneString(psl->tName);
    readCachedSeqPart(heldTName, psl->tStart, psl->tEnd-psl->tStart,
	tHash, fileCache, &heldTSeq, &tShift, &tPartial);
    }
if (tPartial && tMinus)
    tShift = psl->tSize - psl->tEnd;

/* Flip minus-strand sequences in place so block starts index them directly. */
if (qMinus)
    reverseComplement(heldQSeq->dna, heldQSeq->size);
if (tMinus)
    reverseComplement(heldTSeq->dna, heldTSeq->size);

for (i = 0; i < psl->blockCount; ++i)
    {
    int blockLen = psl->blockSizes[i];
    int qPos = psl->qStarts[i] - qShift;
    int tPos = psl->tStarts[i] - tShift;

    /* Everything after the first block is preceded by its gap, if any. */
    if (i > 0)
	{
	int qSkip = qPos - qDone;
	int tSkip = tPos - tDone;
	if (qSkip > 0 && tSkip > 0)
	    writeGap(qText, qSkip, heldQSeq->dna + qDone, tText, tSkip, heldTSeq->dna + tDone);
	else if (qSkip > 0)
	    writeInsert(qText, tText, heldQSeq->dna + qDone, qSkip);
	else if (tSkip > 0)
	    writeInsert(tText, qText, heldTSeq->dna + tDone, tSkip);
	}

    /* Then the aligned bases themselves. */
    dyStringAppendN(qText, heldQSeq->dna + qPos, blockLen);
    qDone = qPos + blockLen;
    dyStringAppendN(tText, heldTSeq->dna + tPos, blockLen);
    tDone = tPos + blockLen;
    }

if (checkFile != NULL)
    outputCheck(psl, heldQSeq, qShift, heldTSeq, tShift, checkFile);

/* Held sequences that stay cached are flipped back to their original
 * orientation; partial ones are left as they are. */
if (qMinus && !qPartial)
    reverseComplement(heldQSeq->dna, heldQSeq->size);
if (tMinus && !tPartial)
    reverseComplement(heldTSeq->dna, heldTSeq->size);

if (axt)
    axtOutString(qText->string, tText->string,
	min(qText->stringSize,tText->stringSize), 60, psl, f);
else
    prettyOutString(qText->string, tText->string,
	min(qText->stringSize,tText->stringSize), 60, psl, f);

dyStringFree(&qText);
dyStringFree(&tText);

/* Dropping the name makes the next call fetch this sequence again. */
if (qPartial)
    freez(&heldQName);
if (tPartial)
    freez(&heldTName);
}