static void bioCycLink(struct pathwayLink *pl, struct sqlConnection *conn,
    char *geneId)
/* Print out bioCyc database link.
 * Queries bioCycPathway joined with bioCycMapDesc for geneId and prints one
 * HTML anchor per distinct map id.  The URL path uses the externally defined
 * genome name converted to upper case.  The pl parameter is not referenced. */
{
char query[512], **row;
struct sqlResult *sr;
char *oldMapId = cloneString("");
char *upperCaseGenome = cloneString(genome);

toUpperN(upperCaseGenome, strlen(upperCaseGenome));
sqlSafef(query, sizeof(query),
    "select bioCycPathway.mapId,description"
    " from bioCycPathway,bioCycMapDesc"
    " where bioCycPathway.kgId='%s'"
    " and bioCycPathway.mapId = bioCycMapDesc.mapId order by bioCycPathway.mapId"
    , geneId);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    /* Rows are ordered by mapId, so repeats are adjacent: only print new ones. */
    if (!sameWord(oldMapId, row[0]))
        {
        hPrintf("<A HREF=\"http://biocyc.org/%s/new-image?type=PATHWAY&object=%s&detail-level=2\" TARGET=_blank>",
            upperCaseGenome, row[0]);
        hPrintf("%s</A> - %s<BR>\n", row[0], row[1]);
        }
    /* Release the previous copy before remembering this row's id, so the
     * loop no longer leaks one string per result row. */
    freeMem(oldMapId);
    oldMapId = cloneString(row[0]);
    }
sqlFreeResult(&sr);
freeMem(oldMapId);
freeMem(upperCaseGenome);
}
void correctOne(struct dnaSeq *est, struct psl *psl, char *nibDir, struct hash *nibHash, FILE *f) /* Write one corrected EST to file. */ { struct dnaSeq *geno = readCachedNib(nibHash, nibDir, psl->tName, psl->tStart, psl->tEnd - psl->tStart); struct dyString *t = newDyString(est->size+20); int qSize = psl->qSize; int tSize = psl->tSize; int qLastEnd = 0; int blockIx; struct mrnaBlock *mbList, *mb; int genoOffset = psl->tStart; boolean isRc = FALSE; /* Load sequence and alignment blocks, coping with reverse * strand as necessary. */ toUpperN(geno->dna, geno->size); /* This helps debug... */ mbList = mrnaBlockFromPsl(psl); if (psl->strand[0] == '-') { reverseComplement(geno->dna, geno->size); genoOffset = tSize - psl->tEnd; for (mb = mbList; mb != NULL; mb = mb->next) { reverseIntRange(&mb->tStart, &mb->tEnd, tSize); reverseIntRange(&mb->qStart, &mb->qEnd, qSize); } slReverse(&mbList); isRc = TRUE; } /* Make t have corrected sequence. */ for (mb = mbList; mb != NULL; mb = mb->next) { int qStart = mb->qStart; int qEnd = mb->qEnd; int uncovSize = qStart - qLastEnd; if (uncovSize > 0) dyStringAppendN(t, est->dna + qLastEnd, uncovSize); dyStringAppendN(t, geno->dna + mb->tStart - genoOffset, mb->tEnd - mb->tStart); qLastEnd = qEnd; } if (qLastEnd != qSize) { int uncovSize = qSize - qLastEnd; dyStringAppendN(t, est->dna + qLastEnd, uncovSize); } /* Output */ faWriteNext(f, est->name, t->string, t->stringSize); /* Clean up time. */ slFreeList(&mbList); freeDyString(&t); freeDnaSeq(&geno); }
void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, int start, int end)
/* Output sequence.
 * Reads the start-end fragment of seqSpec from tbf and writes it to f as a
 * fasta record, upper-casing it first when the noMask flag is set. */
{
struct dnaSeq *frag = twoBitReadSeqFrag(tbf, seqSpec, start, end);

if (noMask)
    {
    toUpperN(frag->dna, frag->size);
    }
faWriteNext(f, frag->name, frag->dna, frag->size);
dnaSeqFree(&frag);
}
boolean wormFixupOrfName(char *name)
/* Turn something into a proper cosmid.# style name.  Return FALSE if it can't
 * be done, which here means name contains no '.' (name is then untouched).
 * Otherwise name is edited in place: everything before the last '.' is upper
 * cased, and the part after it is lower cased when it starts with a digit. */
{
char *dot = strrchr(name, '.');
if (dot == NULL)
    return FALSE;
toUpperN(name, dot-name);   /* First part always upper case. */
/* Cast before isdigit(): passing a negative plain char is undefined. */
if (isdigit((unsigned char)dot[1]))
    tolowers(dot+1);        /* Suffix is lower case. */
/* A suffix that does not start with a digit is left exactly as given. */
return TRUE;
}
static void blatzClient(char *input, char *output) /* Send query message and dna to server and print result. */ { struct dnaLoad *dl = dnaLoadOpen(input); struct dnaSeq *seq; FILE *f = mustOpen(output, "w"); static struct optionSpec options[] = { BZP_CLIENT_OPTIONS }; int i; while ((seq = dnaLoadNext(dl)) != NULL) { /* Connect */ int sd = netMustConnect(host, port); FILE *sf = NULL; /* Send query command. */ netSendString(sd, "query"); /* Send options. */ for (i=0; i<ArraySize(options); ++i) sendOption(sd, options[i].name); /* Send sequence. */ if (optionExists("rna") || optionExists("unmask")) toUpperN(seq->dna, seq->size); else { if (seqIsLower(seq)) warn("Sequence %s is all lower case, and thus ignored. Use -unmask " "flag to unmask lower case sequence.", seq->name); } netSendString(sd, "seq"); netSendString(sd, seq->name); netSendHugeString(sd, seq->dna); verbose(1, "%s\n", seq->name); dnaSeqFree(&seq); /* Get and save response. */ sf = netFileFromSocket(sd); copyOpenFile(sf, f); carefulClose(&sf); /* Close connection */ close(sd); } dnaLoadClose(&dl); carefulClose(&f); }
void mafStats(char *twoBitFile, char *mafDir, char *outFile) /* mafStats - Calculate basic stats on maf file including species-by-species * coverage and percent ID. */ { struct twoBitFile *tbf = twoBitOpen(twoBitFile); FILE *f = mustOpen(outFile, "w"); struct twoBitIndex *ix; long genomeSize = 0; struct hash *speciesHash = hashNew(0); struct speciesAcc *speciesList = NULL, *species; for (ix = tbf->indexList; ix != NULL; ix = ix->next) { unsigned chromSize = twoBitSeqSizeNoNs(tbf, ix->name); genomeSize += chromSize; char mafFileName[PATH_LEN]; safef(mafFileName, sizeof(mafFileName), "%s/%s.maf", mafDir, ix->name); struct mafFile *mf = mafMayOpen(mafFileName); verbose(1, "processing %s\n", ix->name); if (mf == NULL) { warn("%s doesn't exist", mafFileName); continue; } struct mafAli *maf; while ((maf = mafNext(mf)) != NULL) { struct mafComp *mc; for (mc = maf->components; mc != NULL; mc = mc->next) { if (mc->text != NULL) toUpperN(mc->text, maf->textSize); } addCounts(maf, speciesHash, &speciesList); mafAliFree(&maf); } mafFileFree(&mf); } slReverse(&speciesList); for (species = speciesList; species != NULL; species = species->next) { fprintf(f, "counts: %s\t%ld\t%ld\t%ld\n", species->name, species->covCount, species->aliCount, species->idCount); fprintf(f, "precents: %s\t%4.2f%%\t%4.2f%%\t%4.2f%%\n", species->name, 100.0 * species->covCount/genomeSize, 100.0 * species->aliCount/genomeSize, 100.0 * species->idCount/species->aliCount); } carefulClose(&f); }
boolean getWormGeneDna(char *name, DNA **retDna, boolean upcExons)
/* Get the DNA associated with a gene.  Optionally upper case exons.
 * Returns FALSE, leaving *retDna untouched, when no gene is found for name.
 * On success *retDna receives the buffer returned by wormChromPart, reverse
 * complemented when the gene is on the '-' strand. */
{
struct gdfGene *g;
struct slName *syn = NULL;
long lstart, lend;
int start, end;
int dnaSize;
DNA *dna;
struct wormGdfCache *gdfCache;

/* Translate biologist type name to cosmid.N name */
if (wormIsGeneName(name))
    {
    syn = wormGeneToOrfNames(name);
    if (syn != NULL)
        name = syn->name;
    }
if (strncmp(name, "g-", 2) == 0)
    gdfCache = &wormGenieGdfCache;
else
    gdfCache = &wormSangerGdfCache;
g = wormGetSomeGdfGene(name, gdfCache);
/* name may point into syn, and it is not used past this point, so the
 * synonym list can be released here for both the failure and success
 * paths (it was previously leaked).
 * NOTE(review): assumes wormGeneToOrfNames hands ownership of the list to
 * the caller - confirm against its definition. */
slFreeList(&syn);
if (g == NULL)
    return FALSE;

gdfGeneExtents(g, &lstart, &lend);
start = lstart;
end = lend;
/* wormClipRangeToChrom(chromIds[g->chromIx], &start, &end); */
dnaSize = end-start;
*retDna = dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);

/* Make the gene's coordinates relative to the fetched DNA. */
gdfOffsetGene(g, -start);
if (g->strand == '-')
    {
    reverseComplement(dna, dnaSize);
    gdfRcGene(g, dnaSize);
    }
if (upcExons)
    {
    int i;
    struct gdfDataPoint *pt = g->dataPoints;
    /* Data points are consumed in pairs: [pt[i].start, pt[i+1].start). */
    for (i=0; i<g->dataCount; i += 2)
        {
        toUpperN(dna + pt[i].start, pt[i+1].start - pt[i].start);
        }
    }
gdfFreeGene(g);
return TRUE;
}
static void capAliTextOnTrack(struct mafAli *maf, char *db, char *chrom,
    char *track, boolean onlyCds)
/* Capitalize exons in alignment.
 * Gene predictions overlapping the first component of maf are fetched from
 * track; each exon (clipped to the coding region when onlyCds is set, and to
 * the component's range) is mapped into alignment columns and those columns
 * are upper-cased in every component that has text. */
{
struct sqlConnection *conn = sqlConnect(db);
struct mafComp *selfMc = maf->components;
int start = selfMc->start;
int end = start + selfMc->size;
int rowOffset;
struct sqlResult *sr = hRangeQuery(conn, track, chrom, start, end, NULL, &rowOffset);
char **row;

for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    struct genePred *gp = genePredLoad(row+rowOffset);
    int exonIx;
    for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
        {
        struct mafComp *mc;
        int s = gp->exonStarts[exonIx];
        int e = gp->exonEnds[exonIx];

        if (onlyCds)
            {
            if (s < gp->cdsStart)
                s = gp->cdsStart;
            if (e > gp->cdsEnd)
                e = gp->cdsEnd;
            }
        if (s < start)
            s = start;
        if (e > end)
            e = end;

        /* On success s and e are overwritten with the alignment range. */
        if (!findAliRange(selfMc->text, maf->textSize, s-start, e-start, &s, &e))
            continue;
        for (mc = maf->components; mc != NULL; mc = mc->next)
            {
            if (mc->text)
                toUpperN(mc->text + s, e-s);
            }
        }
    genePredFree(&gp);
    }
sqlFreeResult(&sr);
sqlDisconnect(&conn);
}
struct hash *loadChroms(char *dir) /* Load zipped chromosome files into memory. */ { FILE *f; char fastaScan[16]; safef(fastaScan, sizeof(fastaScan), "*.%s", faExtn); struct fileInfo *chromEl, *chromList = listDirX(dir, fastaScan, TRUE); struct hash *chromHash = newHash(0); struct dnaSeq *seq; char chrom[128]; char *faName; int count = 0; verbose(2, "# scanning '%s/%s'\n", dir, fastaScan); for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next) { char *fileName = chromEl->name; splitPath(fileName, NULL, chrom, NULL); chopSuffix(chrom); if (startsWith("chr0", chrom)) /* Convert chr01 to chr1, etc. */ stripChar(chrom, '0'); if (sameString(chrom, "chrmt")) strcpy(chrom, "chr17"); f = fopen(fileName, "r"); AllocVar(seq); seq->name = cloneString(chrom); if (!faFastReadNext(f, &seq->dna, &seq->size, &faName)) errAbort("Couldn't load sequence from %s", fileName); seq->dna = cloneMem(seq->dna, seq->size+1); toUpperN(seq->dna, seq->size); hashAdd(chromHash, chrom, seq); verbose(3, "# loadChrom %s '%s'\n", fileName, chrom); fclose(f); f = NULL; count++; } if (0 == count) errAbort("not fasta files found in '%s/%s'\n", dir, fastaScan); return chromHash; }
static void processSeqsFromBed(struct twoBitFile *tbf, char *bedFileName, FILE *outFile) /* Get sequences defined by beds. Exclude introns. */ { struct bed *bed, *bedList = bedLoadAll(bedFileName); for (bed = bedList; bed != NULL; bed = bed->next) { struct dnaSeq *seq = twoBitAndBedToSeq(tbf, bed); char* seqName = NULL; if (clBedPos) { char buf[1024]; safef(buf, 1024, "%s:%d-%d", bed->chrom, bed->chromStart, bed->chromEnd); seqName = buf; } else seqName = seq->name; if (noMask) toUpperN(seq->dna, seq->size); faWriteNext(outFile, seqName, seq->dna, seq->size); dnaSeqFree(&seq); } }
void gdfUpcExons(struct gdfGene *gene, int geneOffset, DNA *dna, int dnaSize,
    int dnaOffset)
/* Uppercase exons in DNA.
 * Data points are read in (start, end) pairs, shifted into the coordinate
 * space of dna, clipped to [0, dnaSize), and the surviving range is
 * upper-cased.  Pairs that fall entirely outside the buffer are skipped. */
{
struct gdfDataPoint *dp = gene->dataPoints;
int count = gene->dataCount;
long gffStart, gffEnd;
int combinedOffset;
int pairIx;

gdfGeneExtents(gene, &gffStart, &gffEnd);
combinedOffset = -gffStart + geneOffset - dnaOffset;
for (pairIx = 0; pairIx < count; pairIx += 2)
    {
    int start = dp[pairIx].start + combinedOffset;
    int end = dp[pairIx+1].start + combinedOffset;
    if (end > 0 && start < dnaSize)
        {
        if (start < 0)
            start = 0;
        if (end > dnaSize)
            end = dnaSize;
        toUpperN(dna+start, end-start);
        }
    }
}
void bigBlat(struct dnaSeq *untransList, int queryCount, char *queryFiles[],
    char *outFile, boolean transQuery, boolean qIsDna, FILE *out,
    boolean showStatus)
/* Run query against translated DNA database (3 frames on each strand).
 * Two passes are made over the database: after each pass every database
 * sequence is reverse complemented in place, so the second pass searches
 * the opposite strand.  The out stream is closed before returning; outFile
 * is not referenced here. */
{
struct genoFind *gfs[3];
aaSeq *dbSeqLists[3];
struct trans3 *t3List = NULL;
struct hash *t3Hash = NULL;
struct lineFile *lf = NULL;
struct dnaSeq *dbSeq, trimmedSeq;
int frame, fileIx, isRc;

/* Figure out how to manage query case.  Proteins want to be in
 * upper case, generally, nucleotides in lower case.  But there
 * may be repeatMasking based on case as well. */
boolean forceUpper = !transQuery;
boolean forceLower = (transQuery && qMask == NULL);
boolean maskUpper = (transQuery && qMask != NULL);
boolean toggle = (maskUpper && !sameString(qMask, "upper"));

ZeroVar(&trimmedSeq);
if (showStatus)
    printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount);

if (gvo->fileHead != NULL)
    gvo->fileHead(gvo, out);

for (isRc = FALSE; isRc <= 1; ++isRc)
    {
    /* Initialize local pointer arrays to NULL to prevent surprises. */
    for (frame = 0; frame < 3; ++frame)
        {
        gfs[frame] = NULL;
        dbSeqLists[frame] = NULL;
        }

    /* Translate the database and index each of the three frames. */
    t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash);
    for (frame = 0; frame < 3; ++frame)
        {
        gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize,
            repMatch, ooc, TRUE, oneOff, FALSE, stepSize);
        }

    for (fileIx = 0; fileIx < queryCount; ++fileIx)
        {
        aaSeq qSeq;

        lf = lineFileOpen(queryFiles[fileIx], TRUE);
        while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name))
            {
            dotOut();

            /* Put it into right case and optionally mask on case. */
            if (forceLower)
                toLowerN(qSeq.dna, qSeq.size);
            else if (forceUpper)
                toUpperN(qSeq.dna, qSeq.size);
            else if (maskUpper)
                {
                if (toggle)
                    toggleCase(qSeq.dna, qSeq.size);
                upperToN(qSeq.dna, qSeq.size);
                }

            if (qSeq.size > qWarnSize)
                {
                warn("Query sequence %s has size %d, it might take a while.",
                     qSeq.name, qSeq.size);
                }

            trimSeq(&qSeq, &trimmedSeq);
            if (transQuery)
                transTripleSearch(&trimmedSeq, gfs, t3Hash, isRc, qIsDna, out);
            else
                tripleSearch(&trimmedSeq, gfs, t3Hash, isRc, out);
            gfOutputQuery(gvo, out);
            }
        lineFileClose(&lf);
        }

    /* Clean up time. */
    trans3FreeList(&t3List);
    freeHash(&t3Hash);
    for (frame = 0; frame < 3; ++frame)
        {
        genoFindFree(&gfs[frame]);
        }

    /* Flip the database for the next pass; the second flip restores it. */
    for (dbSeq = untransList; dbSeq != NULL; dbSeq = dbSeq->next)
        {
        reverseComplement(dbSeq->dna, dbSeq->size);
        }
    }
carefulClose(&out);
}
static void alignAll(struct bzp *bzp, struct blatzIndex *indexList, struct dnaLoad *queryDl, char *outFile) /* Make up neighorhood index for queryList, and use it to scan * targetList. Put output in outFile */ { FILE *f = mustOpen(outFile, "w"); struct dnaSeq *query; // LX BEG int b, bend, printing; FILE *bedfp = NULL; // See if bed file output of the mask was requested if (differentString(bzp->dynaBedFileQ, "")) bedfp = mustOpen(bzp->dynaBedFileQ, "w"); // Counts all the query-target hits encountered by the program inside the // loops of gapless.c dynaHits = 0; // Counts how many target and query positions reached the limit dynaCountTarget = 0; dynaCountQuery = 0; // This is the limit used by the program, currently just bzp->dynaLimit(QT) // but should be useful for scaling to sequence size targetHitDLimit = VERY_LARGE_NUMBER; // perhaps unnecessary default queryHitDLimit = VERY_LARGE_NUMBER; // perhaps unnecessary default // LX END while ((query = dnaLoadNext(queryDl)) != NULL) { double bestScore = 0; struct chain *chainList; // LX BEG if (bzp->dynaLimitQ<VERY_LARGE_NUMBER) { queryHitDLimit = bzp->dynaLimitQ; // allocate zeroed memory for hit counters AllocArray(dynaCountQ, query->size); } // LX END if (bzp->unmask || bzp->rna) toUpperN(query->dna, query->size); if (bzp->rna) maskTailPolyA(query->dna, query->size); chainList = blatzAlign(bzp, indexList, query); if (chainList != NULL) bestScore = chainList->score; else { if (seqIsLower(query)) warn("Sequence %s is all lower case, and thus ignored. 
Use -unmask " "flag to unmask lower case sequence.", query->name); } verbose(1, "%s (%d bases) score %2.0f\n", query->name, query->size, bestScore); blatzWriteChains(bzp, &chainList, query, dnaLoadCurStart(queryDl), dnaLoadCurEnd(queryDl), dnaLoadCurSize(queryDl), indexList, f); // LX BEG // This prints the contents of the mask into the .bed file opened above if (bedfp != NULL) { if (bzp->dynaLimitQ<VERY_LARGE_NUMBER) { printing = 0; for (b=0;b<query->size;b++) { if (dynaCountQ[b] > queryHitDLimit) { if (printing == 0) { printing = 1; fprintf(bedfp,"%s %d ",query->name,b); } } if (dynaCountQ[b] <= queryHitDLimit) { if (printing == 1) { printing = 0; bend = b-1; fprintf(bedfp,"%d\n",bend); } } } } else { fprintf(bedfp,"#No dynamic masking data to print.\n"); } } // LX END dnaSeqFree(&query); } // LX BEG // Statistics to print about how many hits were dropped (ignored) dynaDrops = dynaCountTarget + dynaCountQuery; dynaDropsPerc = (float)100*dynaDrops/dynaHits+0.5; verbose(2, "%d dynaDrops (%f%%) at T=%d Q=%d \n", dynaDrops, (double)dynaDropsPerc, targetHitDLimit, queryHitDLimit); // Free dynamic memory used for the sequence-length-dependent counter arrays freeMem(dynaCountQ); if (bedfp != NULL) carefulClose(&bedfp); freeMem(dynaWordCount); // LX END carefulClose(&f); }
static void hgSeqConcatRegionsDb(char *db, char *chrom, int chromSize,
    char strand, char *name, int rCount, unsigned *rStarts, unsigned *rSizes,
    boolean *exonFlags, boolean *cdsFlags)
/* Concatenate and print out dna for a series of regions.
 * Casing, repeat masking, padding and granularity are read from the
 * hgSeq.* CGI variables.  The result is written to stdout as one fasta
 * record, reverse complemented when strand is '-' or hgSeq.revComp is set. */
{
// Note: this code used to generate different sequence ids if the global
// database in hdb was different than the db parameter.  This functionality
// has been removed since the global database was removed and it didn't
// appear to be used.
struct dnaSeq *rSeq = NULL;
struct dnaSeq *cSeq = NULL;
char recName[256];
int seqStart, seqEnd;
int offset, cSize;
int i;
boolean isRc = (strand == '-') || cgiBoolean("hgSeq.revComp");
boolean maskRep = cgiBoolean("hgSeq.maskRepeats");
int padding5 = cgiOptionalInt("hgSeq.padding5", 0);
int padding3 = cgiOptionalInt("hgSeq.padding3", 0);
char *casing = cgiString("hgSeq.casing");
char *repMasking = cgiString("hgSeq.repMasking");
char *granularity = cgiOptionalString("hgSeq.granularity");
boolean concatRegions = granularity && sameString("gene", granularity);
/* Padding on the low-coordinate and high-coordinate side of the range:
 * which of padding5/padding3 that is depends on orientation. */
int *lowPad = (isRc ? &padding3 : &padding5);
int *highPad = (isRc ? &padding5 : &padding3);

if (rCount < 1)
    return;

/* Don't support padding if granularity is gene (i.e. concat'ing all). */
if (concatRegions)
    {
    padding5 = padding3 = 0;
    }

i = rCount - 1;
seqStart = rStarts[0] - *lowPad;
seqEnd = rStarts[i] + rSizes[i] + *highPad;
/* Padding might push us off the edge of the chrom; if so, truncate: */
if (seqStart < 0)
    {
    *lowPad += seqStart;
    seqStart = 0;
    }

/* if we know the chromSize, don't pad out beyond it */
if ((chromSize > 0) && (seqEnd > chromSize))
    {
    *highPad += (chromSize - seqEnd);
    seqEnd = chromSize;
    }
if (seqEnd <= seqStart)
    {
    printf("# Null range for %s_%s (range=%s:%d-%d 5'pad=%d 3'pad=%d) (may indicate a query-side insert)\n",
           db, name, chrom, seqStart+1, seqEnd, padding5, padding3);
    return;
    }

/* Fetch the whole padded range in the base case the options call for. */
if (maskRep)
    {
    rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaMixed);
    if (sameString(repMasking, "N"))
        lowerToN(rSeq->dna, strlen(rSeq->dna));
    if (!sameString(casing, "upper"))
        tolowers(rSeq->dna);
    }
else if (sameString(casing, "upper"))
    rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaUpper);
else
    rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaLower);

/* Handle casing and compute size of concatenated sequence */
cSize = 0;
for (i = 0; i < rCount; i++)
    {
    boolean upcase = (sameString(casing, "exon") && exonFlags[i]) ||
                     (sameString(casing, "cds") && cdsFlags[i]);
    if (upcase)
        {
        int rStart = rStarts[i] - seqStart;
        toUpperN(rSeq->dna+rStart, rSizes[i]);
        }
    cSize += rSizes[i];
    }
cSize += (padding5 + padding3);
AllocVar(cSeq);
cSeq->dna = needLargeMem(cSize+1);
cSeq->size = cSize;

/* Copy the regions back to back; the first and last also carry padding. */
offset = 0;
for (i = 0; i < rCount; i++)
    {
    int start = rStarts[i] - seqStart;
    int size = rSizes[i];
    if (i == 0)
        {
        start -= *lowPad;
        assert(start == 0);
        size += *lowPad;
        }
    if (i == rCount-1)
        {
        size += *highPad;
        }
    memcpy(cSeq->dna+offset, rSeq->dna+start, size);
    offset += size;
    }
assert(offset == cSeq->size);
cSeq->dna[offset] = 0;
freeDnaSeq(&rSeq);

if (isRc)
    reverseComplement(cSeq->dna, cSeq->size);

safef(recName, sizeof(recName),
      "%s_%s range=%s:%d-%d 5'pad=%d 3'pad=%d "
      "strand=%c repeatMasking=%s",
      db, name, chrom, seqStart+1, seqEnd,
      padding5, padding3,
      (isRc ? '-' : '+'),
      (maskRep ? repMasking : "none"));
faWriteNext(stdout, recName, cSeq->dna, cSeq->size);
freeDnaSeq(&cSeq);
}
void axtHiQualDiffs(char *axtFile, struct hash *qacHash, FILE *f) /* Write out high quality diffs in axtFile to f. */ { char *qName = cloneString(""); UBYTE *qQuals = NULL; UBYTE *quals = NULL; struct qac *qac = NULL; struct axt *axt = NULL; struct lineFile *lf = lineFileOpen(axtFile, TRUE); int qStart, qDir, qPos, qWinStart, qWinEnd, tPos; int qWinSize = optionInt("winSize", 11); int qQualMin = optionInt("diffQualMin", 30); int qWinQualMin = optionInt("winQualMin", 25); int qWinMaxDiff = optionInt("winMaxDiff", 2); boolean qIndelOk = optionExists("indelOk"); boolean qIgnore98 = optionExists("ignore98"); boolean chimpPos = optionExists("chimpPos"); int qHalfWinSize = qWinSize/2; while ((axt = axtRead(lf)) != NULL) { char *qSym = axt->qSym, *tSym = axt->tSym; int symIx, symCount = axt->symCount; char qc,tc; toUpperN(qSym, symCount); toUpperN(tSym, symCount); if (!sameString(axt->qName, qName)) { freez(&qName); qName = cloneString(axt->qName); qac = hashMustFindVal(qacHash, qName); freez(&qQuals); qQuals = needHugeMem(qac->uncSize); rleUncompress(qac->data, qac->compSize, qQuals, qac->uncSize); } if (axt->qStrand == '+') { qStart = axt->qStart; qDir = 1; } else { qStart = qac->uncSize - axt->qStart - 1; qDir = -1; } qPos = qStart; tPos = axt->tStart; for (symIx = 0; symIx < symCount; ++symIx) { qc = qSym[symIx]; tc = tSym[symIx]; if (qc == '-') tPos += 1; else if (tc == '-') qPos += qDir; else { if (qc != tc) { qWinStart = qPos - qHalfWinSize; qWinEnd = qWinStart + qWinSize; if (qWinStart >= 0 && qWinEnd < qac->uncSize) { if (qQuals[qPos] >= qQualMin) { int i; boolean ok = TRUE; for (i = qWinStart; i<qWinEnd; ++i) if (qQuals[i] < qWinQualMin) { ok = FALSE; break; } if (ok) { int diffCount = 0; int symWinStart = symIx - qHalfWinSize; int symWinEnd = symWinStart + qWinSize; for (i=symWinStart; i < symWinEnd; ++i) { qc = qSym[i]; tc = tSym[i]; if (qc == '-' || tc == '-') { ok = FALSE; break; } if (qc != tc) ++diffCount; } if (ok && diffCount <= qWinMaxDiff && 
(!qIgnore98 || qQuals[qPos] != 98) ) { if (chimpPos) fprintf(f, "%s\t%d\t%d\t%c\t%c\t%s\t%d\t%d\n", axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx], axt->qName, qPos, qPos+1); else fprintf(f, "%s\t%d\t%d\t%c\t%c\n", axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx]); } } } } } qPos += qDir; tPos += 1; } } axtFree(&axt); } lineFileClose(&lf); }
void doTypeHeader(char *type, char *cellOrg,boolean sortable)
/* Print the <THEAD> row for the table of the given vocabulary type and open
 * the <TBODY>.  Aborts when an organism option was given and it does not
 * match cellOrg.  When sortable is FALSE the sort order passed to each
 * column header starts at -999. */
{
int sortOrder;

if ((organismOptLower != NULL) && !sameWord(cellOrg, organismOptLower))
    errAbort("specified organism %s not consistent with cell type which is org %s\n",
        organismOpt, cellOrg);

// NOTE: All tables must have the same number of columns so that 'control'
// can be swapped in.  Use colspan on the description column to pad.
printf("<THEAD><TR valign='bottom' style='background:%s;'>\n",COLOR_BG_HEADER_LTBLUE);

// hint: starting at -999 keeps sortOrder++ below zero for every column
if (sortable)
    sortOrder = 1;
else
    sortOrder = -999;

if (sameWord(type,CV_TERM_CELL))
    {
    printf("<!-- Cell Line table: contains links to protocol file and vendor description page -->");

    /* Venkat: human and other (mouse) cell line tables use different columns */
    if (sameWord(cellOrg,ORG_HUMAN))
        {
        printColHeader(FALSE,type,         sortOrder++,NULL,1);
        printColHeader(FALSE,"Tier",       sortOrder++,NULL,1);
        printColHeader(FALSE,"Description",sortOrder++,NULL,1);
        printColHeader(FALSE,"Lineage",    sortOrder++,NULL,1);
        printColHeader(FALSE,"Tissue",     sortOrder++,NULL,1);
        printColHeader(FALSE,"Karyotype",  sortOrder++,NULL,1);
        printColHeader(FALSE,"Sex",        sortOrder++,NULL,1);
        printColHeader(FALSE,"Documents",  sortOrder++,NULL,1);
        printColHeader(FALSE,"Vendor ID",  sortOrder++,NULL,1);
        printColHeader(FALSE,"Term ID",    sortOrder++,NULL,1);
        printColHeader(TRUE ,"Label",      sortOrder++,NULL,1);
        }
    else
        {
        printColHeader(FALSE,"Source",     sortOrder++,NULL,1);
        printColHeader(FALSE,"Description",sortOrder++,NULL,TABLE_COLS_AVAILABLE(8));
        printColHeader(FALSE,"Category",   sortOrder++,NULL,1);
        printColHeader(FALSE,"Tissue",     sortOrder++,NULL,1);
        printColHeader(FALSE,"Sex",        sortOrder++,NULL,1);
        printColHeader(FALSE,"Documents",  sortOrder++,NULL,1);
        printColHeader(FALSE,"Source Lab", sortOrder++,NULL,1);
        printColHeader(FALSE,"Term ID",    sortOrder++,NULL,1);
        printColHeader(TRUE ,"Label",      sortOrder++,NULL,1);
        }
    }
else if (sameWord(type,CV_TERM_ANTIBODY))
    {
    printColHeader(FALSE,type,                  sortOrder++,NULL,1);
    printColHeader(FALSE,"Antibody Description",sortOrder++,NULL,TABLE_COLS_AVAILABLE(9));
    printColHeader(FALSE,"Target",              sortOrder++,NULL,1);
    printColHeader(FALSE,"Target Description",  sortOrder++,"style='min-width:600px;'",1);
    printColHeader(FALSE,"Vendor ID",           sortOrder++,NULL,1);
    printColHeader(FALSE,"Lab",                 sortOrder++,NULL,1);
    printColHeader(FALSE,"Documents",           sortOrder++,NULL,1);
    printColHeader(FALSE,"Lots",                sortOrder++,NULL,1);
    printColHeader(FALSE,"Target Link",         sortOrder++,NULL,1);
    printColHeader(TRUE ,"Label",               sortOrder++,NULL,1);
    }
else
    {
    /* First column is titled after the type itself: a fixed title for the
     * data type term, otherwise the type with its first letter upper-cased. */
    char *heading = NULL;
    if (sameWord(type,CV_TERM_DATA_TYPE))
        heading = cloneString("Data Type");
    else
        {
        heading = cloneString(type);
        toUpperN(heading,1);
        }
    printColHeader(FALSE,heading,sortOrder++,NULL,1);

    if (sameWord(type,CV_TERM_LOCALIZATION))
        {
        printColHeader(FALSE,"Description",sortOrder++,NULL,TABLE_COLS_AVAILABLE(3));
        printColHeader(FALSE,"GO ID",      sortOrder++,NULL,1);
        }
    else if (sameWord(type,CV_TERM_LAB))
        {
        printColHeader(FALSE,"Institution",sortOrder++,NULL,TABLE_COLS_AVAILABLE(5));
        printColHeader(FALSE,"Lab PI",     sortOrder++,NULL,1);
        printColHeader(FALSE,"Grant PI",   sortOrder++,NULL,1);
        printColHeader(FALSE,"Organism",   sortOrder++,NULL,1);
        }
    else
        printColHeader(FALSE,"Description",sortOrder++,NULL,TABLE_COLS_AVAILABLE(2));

    printColHeader(TRUE ,"Label",sortOrder++,NULL,1);
    freeMem(heading);
    }
puts("</TR></THEAD><TBODY>");
}
void blatSeq(char *userSeq, char *organism) /* Blat sequence user pasted in. */ { FILE *f; struct dnaSeq *seqList = NULL, *seq; struct tempName pslTn, faTn; int maxSingleSize, maxTotalSize, maxSeqCount; int minSingleSize = minMatchShown; char *genome, *db; char *type = cgiString("type"); char *seqLetters = cloneString(userSeq); struct serverTable *serve; int conn; int oneSize, totalSize = 0, seqCount = 0; boolean isTx = FALSE; boolean isTxTx = FALSE; boolean txTxBoth = FALSE; struct gfOutput *gvo; boolean qIsProt = FALSE; enum gfType qType, tType; struct hash *tFileCache = gfFileCacheNew(); boolean feelingLucky = cgiBoolean("Lucky"); getDbAndGenome(cart, &db, &genome, oldVars); if(!feelingLucky) cartWebStart(cart, db, "%s BLAT Results", trackHubSkipHubName(organism)); /* Load user sequence and figure out if it is DNA or protein. */ if (sameWord(type, "DNA")) { seqList = faSeqListFromMemText(seqLetters, TRUE); uToT(seqList); isTx = FALSE; } else if (sameWord(type, "translated RNA") || sameWord(type, "translated DNA")) { seqList = faSeqListFromMemText(seqLetters, TRUE); uToT(seqList); isTx = TRUE; isTxTx = TRUE; txTxBoth = sameWord(type, "translated DNA"); } else if (sameWord(type, "protein")) { seqList = faSeqListFromMemText(seqLetters, FALSE); isTx = TRUE; qIsProt = TRUE; } else { seqList = faSeqListFromMemTextRaw(seqLetters); isTx = !seqIsDna(seqList); if (!isTx) { for (seq = seqList; seq != NULL; seq = seq->next) { seq->size = dnaFilteredSize(seq->dna); dnaFilter(seq->dna, seq->dna); toLowerN(seq->dna, seq->size); subChar(seq->dna, 'u', 't'); } } else { for (seq = seqList; seq != NULL; seq = seq->next) { seq->size = aaFilteredSize(seq->dna); aaFilter(seq->dna, seq->dna); toUpperN(seq->dna, seq->size); } qIsProt = TRUE; } } if (seqList != NULL && seqList->name[0] == 0) { freeMem(seqList->name); seqList->name = cloneString("YourSeq"); } trimUniq(seqList); /* If feeling lucky only do the first on. 
*/ if(feelingLucky && seqList != NULL) { seqList->next = NULL; } /* Figure out size allowed. */ maxSingleSize = (isTx ? 10000 : 75000); maxTotalSize = maxSingleSize * 2.5; #ifdef LOWELAB maxSeqCount = 200; #else maxSeqCount = 25; #endif /* Create temporary file to store sequence. */ trashDirFile(&faTn, "hgSs", "hgSs", ".fa"); faWriteAll(faTn.forCgi, seqList); /* Create a temporary .psl file with the alignments against genome. */ trashDirFile(&pslTn, "hgSs", "hgSs", ".pslx"); f = mustOpen(pslTn.forCgi, "w"); gvo = gfOutputPsl(0, qIsProt, FALSE, f, FALSE, TRUE); serve = findServer(db, isTx); /* Write header for extended (possibly protein) psl file. */ if (isTx) { if (isTxTx) { qType = gftDnaX; tType = gftDnaX; } else { qType = gftProt; tType = gftDnaX; } } else { qType = gftDna; tType = gftDna; } pslxWriteHead(f, qType, tType); if (qType == gftProt) { minSingleSize = 14; } else if (qType == gftDnaX) { minSingleSize = 36; } /* Loop through each sequence. */ for (seq = seqList; seq != NULL; seq = seq->next) { printf(" "); fflush(stdout); /* prevent apache cgi timeout by outputting something */ oneSize = realSeqSize(seq, !isTx); if ((seqCount&1) == 0) // Call bot delay every 2nd time starting with first time hgBotDelay(); if (++seqCount > maxSeqCount) { warn("More than 25 input sequences, stopping at %s.", seq->name); break; } if (oneSize > maxSingleSize) { warn("Sequence %s is %d letters long (max is %d), skipping", seq->name, oneSize, maxSingleSize); continue; } if (oneSize < minSingleSize) { warn("Warning: Sequence %s is only %d letters long (%d is the recommended minimum)", seq->name, oneSize, minSingleSize); // we could use "continue;" here to actually enforce skipping, // but let's give the short sequence a chance, it might work. // minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein. 
if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0 continue; } totalSize += oneSize; if (totalSize > maxTotalSize) { warn("Sequence %s would take us over the %d letter limit, stopping here.", seq->name, maxTotalSize); break; } conn = gfConnect(serve->host, serve->port); if (isTx) { gvo->reportTargetStrand = TRUE; if (isTxTx) { gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5, tFileCache, gvo, !txTxBoth); if (txTxBoth) { reverseComplement(seq->dna, seq->size); conn = gfConnect(serve->host, serve->port); gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5, tFileCache, gvo, FALSE); } } else { gfAlignTrans(&conn, serve->nibDir, seq, 5, tFileCache, gvo); } } else { gfAlignStrand(&conn, serve->nibDir, seq, FALSE, minMatchShown, tFileCache, gvo); reverseComplement(seq->dna, seq->size); conn = gfConnect(serve->host, serve->port); gfAlignStrand(&conn, serve->nibDir, seq, TRUE, minMatchShown, tFileCache, gvo); } gfOutputQuery(gvo, f); } carefulClose(&f); showAliPlaces(pslTn.forCgi, faTn.forCgi, serve->db, qType, tType, organism, feelingLucky); if(!feelingLucky) cartWebEnd(); gfFileCacheFree(&tFileCache); }
void itsaMake(int inCount, char *inputs[], char *output) /* itsaMake - Make a suffix array file out of input DNA sequences.. */ { verboseTimeInit(); bits64 maxGenomeSize = 1024LL*1024*1024*4; itsaBaseToValInit(); /* Load all DNA, make sure names are unique, and alphabetize by name. */ struct dnaSeq *seqList = NULL, *seq; struct hash *uniqSeqHash = hashNew(0); bits64 totalDnaSize = 1; /* FOr space between. */ int inputIx; for (inputIx=0; inputIx<inCount; ++inputIx) { char * input = inputs[inputIx]; struct dnaLoad *dl = dnaLoadOpen(input); while ((seq = dnaLoadNext(dl)) != NULL) { verbose(1, "read %s with %d bases\n", seq->name, seq->size); if (hashLookup(uniqSeqHash, seq->name)) errAbort("Input sequence name %s repeated, all must be unique.", seq->name); totalDnaSize += seq->size + 1; if (totalDnaSize > maxGenomeSize) errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize); slAddHead(&seqList, seq); } dnaLoadClose(&dl); } slSort(&seqList, dnaSeqCmpName); verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList)); /* Allocate big buffer for all DNA. */ DNA *allDna = globalAllDna = needHugeMem(totalDnaSize); allDna[0] = 0; bits64 chromOffset = 1; /* Have zeroes between each chrom, and before and after. */ /* Copy DNA to a single big buffer, and create chromInfo on each sequence. */ struct chromInfo *chrom, *chromList = NULL; for (seq = seqList; seq != NULL; seq = seq->next) { AllocVar(chrom); chrom->name = cloneString(seq->name); chrom->size = seq->size; chrom->offset = chromOffset; slAddHead(&chromList, chrom); toUpperN(seq->dna, seq->size); memcpy(allDna + chromOffset, seq->dna, seq->size + 1); chromOffset += seq->size + 1; } slReverse(&chromList); /* Free up separate dna sequences because we're going to need a lot of RAM soon. */ /* Allocate index array, and offset and list arrays. 
*/ dnaSeqFreeList(&seqList); bits32 *index13; AllocArray(index13, itsaSlotCount); bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32)); bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32)); verboseTime(1, "Allocated buffers %lld bytes total", (long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32))); /* Where normally we'd keep some sort of structure with a next element to form a list * of matching positions in each slot of our index, to conserve memory we'll do this * with two parallel arrays. Because we're such cheapskates in terms of memory we'll * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different * purposes. * In the first phase they will together be used to form linked lists of * offsets, and the 13mer index will point to the first item in each list. In this * phase the offsetArray contains offsets into the allDna structure, and the listArray * contains the next pointers for the list. After the first phase we write out the * suffix array to disk. * In the second phase we read the suffix array back into the offsetArray, and * use the listArray for the traverseArray. We write out the traverse array to finish * things up. */ /* Load up all DNA buffer. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, " About to do first pass index\n"); indexChromPass1(chrom, allDna, offsetArray, listArray, index13); verbose(2, " Done first pass index\n"); } verboseTime(1, "Done big bucket sort"); slReverse(&chromList); itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output); }
static void pslShowAlignmentStranded2(struct psl *psl, boolean isProt,
        char *qName, bioSeq *qSeq, int qStart, int qEnd,
        char *tName, bioSeq *tSeq, int tStart, int tEnd,
        int exnStarts[], int exnEnds[], int exnCnt, FILE *f)
/* Show stamper gene and stamp elements alignment using genomic sequence.
 * The aligned exons' sequence of the stamper gene is shown in colors as usual, but
 * the unaligned exons' sequence of the stamper gene is shown in red.
 *   psl        - alignment; strand[0] is the query strand, strand[1] the target strand.
 *   isProt     - when TRUE each query letter is compared with a translated
 *                three-base codon of the target.
 *   qName/tName - headings; NULL falls back to psl->qName / psl->tName.
 *   qSeq/tSeq  - sequences.  Reverse-complemented in place for a minus strand and
 *                reverse-complemented back before returning.
 *   qStart..tEnd - offsets subtracted from psl coordinates to index the sequences.
 *   exnStarts/exnEnds/exnCnt - exon ranges in query coordinates.  When the query is
 *                on the minus strand these arrays are flipped in place and are
 *                NOT flipped back on return.
 *   f          - where the HTML is written. */
{
boolean tIsRc = (psl->strand[1] == '-');
boolean qIsRc = (psl->strand[0] == '-');
int mulFactor = (isProt ? 3 : 1);
DNA *dna = NULL;        /* Mixed case version of genomic DNA. */
int qSize = qSeq->size;
char *qLetters = cloneString(qSeq->dna);
int qbafStart, tbafStart;   /* Start positions handed to bafInit. */
int qcfmStart, tcfmStart;   /* Start positions handed to cfmNew. */

tbafStart = psl->tStart;
tcfmStart = psl->tStart;

qbafStart = qStart;
qcfmStart = qStart;

/* Deal with minus strand. */
if (tIsRc)
    {
    int temp;
    reverseComplement(tSeq->dna, tSeq->size);
    temp = psl->tSize - tEnd;
    tEnd = psl->tSize - tStart;
    tStart = temp;

    tbafStart = psl->tEnd;
    tcfmStart = psl->tEnd;
    }
if (qIsRc)
    {
    int temp, j;
    reverseComplement(qSeq->dna, qSeq->size);
    reverseComplement(qLetters, qSeq->size);

    qcfmStart = qEnd;
    qbafStart = qEnd;

    temp = psl->qSize - qEnd;
    qEnd = psl->qSize - qStart;
    qStart = temp;
    /* Flip every exon range to the reverse strand, then reverse the order. */
    for (j = 0; j < exnCnt; j++)
        {
        temp = psl->qSize - exnStarts[j];
        exnStarts[j] = psl->qSize - exnEnds[j];
        exnEnds[j] = temp;
        }
    reverseInts(exnEnds, exnCnt);
    reverseInts(exnStarts, exnCnt);
    }
dna = cloneString(tSeq->dna);

if (qName == NULL)
    qName = psl->qName;
if (tName == NULL)
    tName = psl->tName;

fputs("Matching bases are colored blue and capitalized. "
      "Light blue bases mark the boundaries of gaps in either aligned sequence. "
      "Red bases are unaligned exons' bases of the query gene. \n", f);

fprintf(f, "<H4><A NAME=cDNA></A>%s%s</H4>\n", qName, (qIsRc  ? " (reverse complemented)" : ""));
fprintf(f, "<PRE><TT>");
tolowers(qLetters);

/* Display query sequence. */
    {
    struct cfm *cfm;
    /* Entries are read before being written below, so this relies on needMem
     * returning zero-filled memory (presumably it does - confirm). */
    char *colorFlags = needMem(qSeq->size);
    int i = 0, j = 0, exnIdx = 0;
    int preStop = 0;    /* Where coloring for the next block starts. */

    for (i=0; i<psl->blockCount; ++i)
        {
        int qs = psl->qStarts[i] - qStart;
        int ts = psl->tStarts[i] - tStart;
        int sz = psl->blockSizes[i]-1;
        int end = 0;
        bool omitExon = FALSE;

        /* Skip exons that end before this block.  Every exon skipped after the
         * first one in this pass is painted red. */
        while (exnIdx < exnCnt && psl->qStarts[i] > exnEnds[exnIdx])
            {
            if (omitExon)
                {
                for (j = exnStarts[exnIdx] - qStart;  j < exnEnds[exnIdx]-qStart; j++)
                    {
                    colorFlags[j] = socRed;
                    }
                }
            exnIdx++;
            /* Only move preStop when there is a next exon; reading
             * exnStarts[exnCnt] would run off the end of the array. */
            if (exnIdx < exnCnt)
                preStop = exnStarts[exnIdx] - qStart;
            omitExon = TRUE;
            }

        /* Mark the boundary bases. */
        colorFlags[qs] = socBrightBlue;
        qLetters[qs] = toupper(qLetters[qs]);
        colorFlags[qs+sz] = socBrightBlue;
        qLetters[qs+sz] = toupper(qLetters[qs+sz]);

        /* Determine block end: the earlier of the next block's start and the
         * end of the current exon. */
        if (i < psl->blockCount - 1)
            {
            if (exnIdx < exnCnt)
                end = psl->qStarts[i+1] < exnEnds[exnIdx] ? psl->qStarts[i+1] - qStart : exnEnds[exnIdx] - qStart;
            else
                /* No exon left to clip against (exnEnds[exnIdx] would be out
                 * of bounds), so run up to the next block. */
                end = psl->qStarts[i+1] - qStart;
            }
        else
            end = qs + sz;

        for (j=preStop; j < end; j++)
            {
            if (j > qs && j < qs+sz)
                {
                /* Inside the block: highlight matches. */
                if (qSeq->dna[j] == tSeq->dna[ts+j-qs])
                    {
                    colorFlags[j] = socBlue;
                    qLetters[j] = toupper(qLetters[j]);
                    }
                }
            else if (colorFlags[j] != socBrightBlue && colorFlags[j] != socBlue)
                colorFlags[j] = socRed;     /* Outside the block and not yet colored. */
            }
        preStop = end;
        }
    cfm = cfmNew(10, 60, TRUE, qIsRc, f, qcfmStart);
    for (i=0; i<qSize; ++i)
        cfmOut(cfm, qLetters[i], seqOutColorLookup[(int)colorFlags[i]]);
    cfmFree(&cfm);
    freez(&colorFlags);
    htmHorizontalLine(f);
    }
fprintf(f, "</TT></PRE>\n");
fprintf(f, "<H4><A NAME=genomic></A>%s %s:</H4>\n",
        tName, (tIsRc ? "(reverse strand)" : ""));
fprintf(f, "<PRE><TT>");

/* Display DNA sequence. */
    {
    struct cfm *cfm;
    char *colorFlags = needMem(tSeq->size);
    int i,j;
    int curBlock = 0;

    for (i=0; i<psl->blockCount; ++i)
        {
        int qs = psl->qStarts[i] - qStart;
        int ts = psl->tStarts[i] - tStart;
        int sz = psl->blockSizes[i];
        if (isProt)
            {
            for (j=0; j<sz; ++j)
                {
                AA aa = qSeq->dna[qs+j];
                int codonStart = ts + 3*j;
                DNA *codon = &tSeq->dna[codonStart];
                AA trans = lookupCodon(codon);
                if (trans != 'X' && trans == aa)
                    {
                    colorFlags[codonStart] = socBlue;
                    colorFlags[codonStart+1] = socBlue;
                    colorFlags[codonStart+2] = socBlue;
                    toUpperN(dna+codonStart, 3);
                    }
                }
            }
        else
            {
            for (j=0; j<sz; ++j)
                {
                if (qSeq->dna[qs+j] == tSeq->dna[ts+j])
                    {
                    colorFlags[ts+j] = socBlue;
                    dna[ts+j] = toupper(dna[ts+j]);
                    }
                }
            }
        /* First and last target base of the block mark the gap boundaries. */
        colorFlags[ts] = socBrightBlue;
        colorFlags[ts+sz*mulFactor-1] = socBrightBlue;
        }

    cfm = cfmNew(10, 60, TRUE, tIsRc, f, tcfmStart);

    for (i=0; i<tSeq->size; ++i)
        {
        /* Put down "anchor" on first match position in haystack
         * so user can hop here with a click on the needle. */
        if (curBlock < psl->blockCount && psl->tStarts[curBlock] == (i + tStart) )
            {
            fprintf(f, "<A NAME=%d></A>", ++curBlock);
            /* Watch out for (rare) out-of-order tStarts! */
            while (curBlock < psl->blockCount &&
                   psl->tStarts[curBlock] <= tStart + i)
                curBlock++;
            }
        cfmOut(cfm, dna[i], seqOutColorLookup[(int)colorFlags[i]]);
        }
    cfmFree(&cfm);
    freez(&colorFlags);
    htmHorizontalLine(f);
    }

/* Display side by side. */
fprintf(f, "</TT></PRE>\n");
fprintf(f, "<H4><A NAME=ali></A>Side by Side Alignment*</H4>\n");
fprintf(f, "<PRE><TT>");
    {
    struct baf baf;
    int i,j;

    bafInit(&baf, qSeq->dna, qbafStart, qIsRc,
            tSeq->dna, tbafStart, tIsRc, f, 60, isProt);

    if (isProt)
        {
        for (i=0; i<psl->blockCount; ++i)
            {
            int qs = psl->qStarts[i] - qStart;
            int ts = psl->tStarts[i] - tStart;
            int sz = psl->blockSizes[i];

            bafSetPos(&baf, qs, ts);
            bafStartLine(&baf);
            for (j=0; j<sz; ++j)
                {
                AA aa = qSeq->dna[qs+j];
                int codonStart = ts + 3*j;
                DNA *codon = &tSeq->dna[codonStart];
                /* The amino acid is paired with the middle base of its codon. */
                bafOut(&baf, ' ', codon[0]);
                bafOut(&baf, aa, codon[1]);
                bafOut(&baf, ' ', codon[2]);
                }
            bafFlushLine(&baf);
            }
        fprintf( f, "<I>*when aa is different, BLOSUM positives are in green, BLOSUM negatives in red</I>\n");
        }
    else
        {
        int lastQe = psl->qStarts[0] - qStart;
        int lastTe = psl->tStarts[0] - tStart;
        int maxSkip = 20;   /* Largest one-sided gap that is still merged into one line. */
        bafSetPos(&baf, lastQe, lastTe);
        bafStartLine(&baf);
        for (i=0; i<psl->blockCount; ++i)
            {
            int qs = psl->qStarts[i] - qStart;
            int ts = psl->tStarts[i] - tStart;
            int sz = psl->blockSizes[i];
            boolean doBreak = TRUE;
            int qSkip = qs - lastQe;
            int tSkip = ts - lastTe;

            if (qSkip >= 0 && qSkip <= maxSkip && tSkip == 0)
                {
                /* Small insert in the query only: show it against dashes. */
                for (j=0; j<qSkip; ++j)
                    bafOut(&baf, qSeq->dna[lastQe+j], '-');
                doBreak = FALSE;
                }
            else if (tSkip > 0 && tSkip <= maxSkip && qSkip == 0)
                {
                /* Small insert in the target only. */
                for (j=0; j<tSkip; ++j)
                    bafOut(&baf, '-', tSeq->dna[lastTe+j]);
                doBreak = FALSE;
                }
            if (doBreak)
                {
                bafFlushLine(&baf);
                bafSetPos(&baf, qs, ts);
                bafStartLine(&baf);
                }
            for (j=0; j<sz; ++j)
                bafOut(&baf, qSeq->dna[qs+j], tSeq->dna[ts+j]);
            lastQe = qs + sz;
            lastTe = ts + sz;
            }
        bafFlushLine(&baf);

        fprintf( f, "<I>*Aligned Blocks with gaps <= %d bases are merged for this display</I>\n", maxSkip);
        }
    }
fprintf(f, "</TT></PRE>");

/* Put the caller's sequences back on their original strand. */
if (qIsRc)
    reverseComplement(qSeq->dna, qSeq->size);
if (tIsRc)
    reverseComplement(tSeq->dna, tSeq->size);
freeMem(dna);
freeMem(qLetters);
}
static void pslShowAlignmentStranded(struct psl *psl, boolean isProt,
        char *qName, bioSeq *qSeq, int qStart, int qEnd,
        char *tName, bioSeq *tSeq, int tStart, int tEnd, FILE *f)
/* Show protein/DNA alignment or translated DNA alignment in HTML format.
 * Three sections are written to f: the query with matches highlighted, the
 * target with matches highlighted and one anchor per block, and a side by side
 * view.  A sequence whose strand in psl->strand is '-' is reverse-complemented
 * in place while the page is built and reverse-complemented back before
 * returning.  NULL qName/tName fall back to the names stored in psl. */
{
boolean qIsRc = (psl->strand[0] == '-');
boolean tIsRc = (psl->strand[1] == '-');
int basesPerLetter = (isProt ? 3 : 1);  /* Target bases covered by one query letter. */
int lineWidth = isProt ? 60 : 50;
int qSize = qSeq->size;
int qLabelStart = qStart;   /* Start position given to the query formatters. */
int tLabelStart = tStart;   /* Start position given to the target formatters. */
char *qLetters = cloneString(qSeq->dna);
DNA *dna = NULL;            /* Mixed case version of genomic DNA. */

/* Minus strand: flip the sequence, label from the far end, and move the
 * start offset into reverse-strand coordinates. */
if (tIsRc)
    {
    reverseComplement(tSeq->dna, tSeq->size);
    tLabelStart = tEnd;
    tStart = psl->tSize - tEnd;
    }
if (qIsRc)
    {
    reverseComplement(qSeq->dna, qSeq->size);
    reverseComplement(qLetters, qSeq->size);
    qLabelStart = qEnd;
    qStart = psl->qSize - qEnd;
    }
dna = cloneString(tSeq->dna);

if (!qName)
    qName = psl->qName;
if (!tName)
    tName = psl->tName;

fputs("Matching bases are colored blue and capitalized. "
      "Light blue bases mark the boundaries of gaps in either sequence.\n", f);

fprintf(f, "<H4><A NAME=cDNA></A>%s%s</H4>\n", qName, (qIsRc  ? " (reverse complemented)" : ""));
fputs("<PRE><TT>", f);
tolowers(qLetters);

/* Section 1: the query sequence. */
    {
    char *colorFlags = needMem(qSeq->size);
    struct cfm *cfm;
    int blockIx, pos;

    for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
        {
        int qOff = psl->qStarts[blockIx] - qStart;
        int tOff = psl->tStarts[blockIx] - tStart;
        int last = psl->blockSizes[blockIx]-1;   /* Offset of the block's final letter. */
        int k;

        /* Both ends of the block are always flagged as boundaries. */
        colorFlags[qOff] = socBrightBlue;
        qLetters[qOff] = toupper(qLetters[qOff]);
        colorFlags[qOff+last] = socBrightBlue;
        qLetters[qOff+last] = toupper(qLetters[qOff+last]);

        /* Interior letters are highlighted only where they match. */
        for (k = 1; k < last; ++k)
            {
            boolean isMatch;
            if (isProt)
                {
                AA aa = qSeq->dna[qOff+k];
                DNA *codon = &tSeq->dna[tOff + 3*k];
                AA trans = lookupCodon(codon);
                isMatch = (trans != 'X' && trans == aa);
                }
            else
                isMatch = (qSeq->dna[qOff+k] == tSeq->dna[tOff+k]);
            if (isMatch)
                {
                colorFlags[qOff+k] = socBlue;
                qLetters[qOff+k] = toupper(qLetters[qOff+k]);
                }
            }
        }

    cfm = cfmNew(10, lineWidth, TRUE, qIsRc, f, qLabelStart);
    for (pos = 0; pos < qSize; ++pos)
        cfmOut(cfm, qLetters[pos], seqOutColorLookup[(int)colorFlags[pos]]);
    cfmFree(&cfm);
    freez(&colorFlags);
    htmHorizontalLine(f);
    }
fputs("</TT></PRE>\n", f);
fprintf(f, "<H4><A NAME=genomic></A>%s %s:</H4>\n",
        tName, (tIsRc ? "(reverse strand)" : ""));
fputs("<PRE><TT>", f);

/* Section 2: the target (genomic) sequence. */
    {
    char *colorFlags = needMem(tSeq->size);
    struct cfm *cfm;
    int blockIx, pos;
    int nextBlock = 0;      /* Next block still waiting for its anchor. */

    for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
        {
        int qOff = psl->qStarts[blockIx] - qStart;
        int tOff = psl->tStarts[blockIx] - tStart;
        int len = psl->blockSizes[blockIx];
        int k;

        for (k = 0; k < len; ++k)
            {
            if (isProt)
                {
                AA aa = qSeq->dna[qOff+k];
                int codonStart = tOff + 3*k;
                DNA *codon = &tSeq->dna[codonStart];
                AA trans = lookupCodon(codon);
                if (trans != 'X' && trans == aa)
                    {
                    /* The whole codon is highlighted together. */
                    colorFlags[codonStart] = socBlue;
                    colorFlags[codonStart+1] = socBlue;
                    colorFlags[codonStart+2] = socBlue;
                    toUpperN(dna+codonStart, 3);
                    }
                }
            else if (qSeq->dna[qOff+k] == tSeq->dna[tOff+k])
                {
                colorFlags[tOff+k] = socBlue;
                dna[tOff+k] = toupper(dna[tOff+k]);
                }
            }
        /* Boundary flags go on last so they win over the match color. */
        colorFlags[tOff] = socBrightBlue;
        colorFlags[tOff+len*basesPerLetter-1] = socBrightBlue;
        }

    cfm = cfmNew(10, lineWidth, TRUE, tIsRc, f, tLabelStart);

    for (pos = 0; pos < tSeq->size; ++pos)
        {
        /* Drop a numbered anchor at the first base of each block so the
         * page can link straight to it. */
        if (nextBlock < psl->blockCount && psl->tStarts[nextBlock] == (pos + tStart) )
            {
            fprintf(f, "<A NAME=%d></A>", ++nextBlock);
            /* tStarts are occasionally out of order; skip any that are
             * already behind the current position. */
            while (nextBlock < psl->blockCount &&
                   psl->tStarts[nextBlock] <= tStart + pos)
                nextBlock++;
            }
        cfmOut(cfm, dna[pos], seqOutColorLookup[(int)colorFlags[pos]]);
        }
    cfmFree(&cfm);
    freez(&colorFlags);
    htmHorizontalLine(f);
    }

/* Section 3: side by side. */
fputs("</TT></PRE>\n", f);
fputs("<H4><A NAME=ali></A>Side by Side Alignment*</H4>\n", f);
fputs("<PRE><TT>", f);
    {
    struct baf baf;
    int blockIx, k;

    bafInit(&baf, qSeq->dna, qLabelStart, qIsRc,
            tSeq->dna, tLabelStart, tIsRc, f, lineWidth, isProt);

    if (!isProt)
        {
        int maxSkip = 8;    /* Largest one-sided gap that is still merged into one line. */
        int prevQEnd = psl->qStarts[0] - qStart;
        int prevTEnd = psl->tStarts[0] - tStart;

        bafSetPos(&baf, prevQEnd, prevTEnd);
        bafStartLine(&baf);
        for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
            {
            int qOff = psl->qStarts[blockIx] - qStart;
            int tOff = psl->tStarts[blockIx] - tStart;
            int len = psl->blockSizes[blockIx];
            int qGap = qOff - prevQEnd;
            int tGap = tOff - prevTEnd;
            boolean merged = FALSE;

            if (tGap == 0 && qGap >= 0 && qGap <= maxSkip)
                {
                /* Short query-only insert: pair it with dashes. */
                for (k = 0; k < qGap; ++k)
                    bafOut(&baf, qSeq->dna[prevQEnd+k], '-');
                merged = TRUE;
                }
            else if (qGap == 0 && tGap > 0 && tGap <= maxSkip)
                {
                /* Short target-only insert. */
                for (k = 0; k < tGap; ++k)
                    bafOut(&baf, '-', tSeq->dna[prevTEnd+k]);
                merged = TRUE;
                }
            if (!merged)
                {
                /* Anything else starts a fresh line at this block. */
                bafFlushLine(&baf);
                bafSetPos(&baf, qOff, tOff);
                bafStartLine(&baf);
                }
            for (k = 0; k < len; ++k)
                bafOut(&baf, qSeq->dna[qOff+k], tSeq->dna[tOff+k]);
            prevQEnd = qOff + len;
            prevTEnd = tOff + len;
            }
        bafFlushLine(&baf);

        fprintf( f, "<I>*Aligned Blocks with gaps <= %d bases are merged for this display</I>\n", maxSkip);
        }
    else
        {
        for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
            {
            int qOff = psl->qStarts[blockIx] - qStart;
            int tOff = psl->tStarts[blockIx] - tStart;
            int len = psl->blockSizes[blockIx];

            bafSetPos(&baf, qOff, tOff);
            bafStartLine(&baf);
            for (k = 0; k < len; ++k)
                {
                AA aa = qSeq->dna[qOff+k];
                DNA *codon = &tSeq->dna[tOff + 3*k];
                /* The amino acid is paired with the middle base of its codon. */
                bafOut(&baf, ' ', codon[0]);
                bafOut(&baf, aa, codon[1]);
                bafOut(&baf, ' ', codon[2]);
                }
            bafFlushLine(&baf);
            }
        fputs("<I>*When the translated amino acid in the genomic sequence differs from the \n"
              "corresponding amino acid in the protein, the coloring indicates the\n"
              "similarity of the two amino acids. Similar amino acids are green, \n"
              "dissimilar amino acids are red. The sign of the corresponding entry in\n"
              "the BLOSUM 62 matrix is used as the basis of this coloring.</I>\n", f);
        }
    }
fputs("</TT></PRE>", f);

/* Hand the sequences back on their original strand. */
if (qIsRc)
    reverseComplement(qSeq->dna, qSeq->size);
if (tIsRc)
    reverseComplement(tSeq->dna, tSeq->size);
freeMem(dna);
freeMem(qLetters);
}
void upperCaseCds(struct dnaSeq *dna, struct genbankCds *cds)
/* Put the sequence in lower case everywhere except the CDS, which is
 * upper-cased.  The CDS covers cds->end - cds->start letters beginning
 * at offset cds->start. */
{
/* Start from an all-lowercase sequence ... */
tolowers(dna->dna);
/* ... then raise just the coding region. */
toUpperN(&dna->dna[cds->start], cds->end - cds->start);
}