boolean checkEnds(struct dnaSeq *chrom, int start, int end, char *ends, char *strand)
/* Return TRUE if the ends of intron match the input ends.
 * The four-letter signature is built from the two bases at chrom->dna[start]
 * and the two bases just before chrom->dna[end].  It is lower-cased, and
 * reverse-complemented when strand begins with '-', before being compared
 * against ends with sameString. */
{
char *dna = chrom->dna;
char signature[5];

/* Two bases from the start of the range, then the last two bases of it. */
signature[0] = dna[start];
signature[1] = dna[start + 1];
signature[2] = dna[end - 2];
signature[3] = dna[end - 1];
signature[4] = 0;

toLowerN(signature, 4);
if (strand[0] == '-')
    reverseComplement(signature, 4);
return sameString(ends, signature);
}
void blatSeq(char *userSeq, char *organism) /* Blat sequence user pasted in. */ { FILE *f; struct dnaSeq *seqList = NULL, *seq; struct tempName pslTn, faTn; int maxSingleSize, maxTotalSize, maxSeqCount; int minSingleSize = minMatchShown; char *genome, *db; char *type = cgiString("type"); char *seqLetters = cloneString(userSeq); struct serverTable *serve; int conn; int oneSize, totalSize = 0, seqCount = 0; boolean isTx = FALSE; boolean isTxTx = FALSE; boolean txTxBoth = FALSE; struct gfOutput *gvo; boolean qIsProt = FALSE; enum gfType qType, tType; struct hash *tFileCache = gfFileCacheNew(); boolean feelingLucky = cgiBoolean("Lucky"); getDbAndGenome(cart, &db, &genome, oldVars); if(!feelingLucky) cartWebStart(cart, db, "%s BLAT Results", trackHubSkipHubName(organism)); /* Load user sequence and figure out if it is DNA or protein. */ if (sameWord(type, "DNA")) { seqList = faSeqListFromMemText(seqLetters, TRUE); uToT(seqList); isTx = FALSE; } else if (sameWord(type, "translated RNA") || sameWord(type, "translated DNA")) { seqList = faSeqListFromMemText(seqLetters, TRUE); uToT(seqList); isTx = TRUE; isTxTx = TRUE; txTxBoth = sameWord(type, "translated DNA"); } else if (sameWord(type, "protein")) { seqList = faSeqListFromMemText(seqLetters, FALSE); isTx = TRUE; qIsProt = TRUE; } else { seqList = faSeqListFromMemTextRaw(seqLetters); isTx = !seqIsDna(seqList); if (!isTx) { for (seq = seqList; seq != NULL; seq = seq->next) { seq->size = dnaFilteredSize(seq->dna); dnaFilter(seq->dna, seq->dna); toLowerN(seq->dna, seq->size); subChar(seq->dna, 'u', 't'); } } else { for (seq = seqList; seq != NULL; seq = seq->next) { seq->size = aaFilteredSize(seq->dna); aaFilter(seq->dna, seq->dna); toUpperN(seq->dna, seq->size); } qIsProt = TRUE; } } if (seqList != NULL && seqList->name[0] == 0) { freeMem(seqList->name); seqList->name = cloneString("YourSeq"); } trimUniq(seqList); /* If feeling lucky only do the first on. 
*/ if(feelingLucky && seqList != NULL) { seqList->next = NULL; } /* Figure out size allowed. */ maxSingleSize = (isTx ? 10000 : 75000); maxTotalSize = maxSingleSize * 2.5; #ifdef LOWELAB maxSeqCount = 200; #else maxSeqCount = 25; #endif /* Create temporary file to store sequence. */ trashDirFile(&faTn, "hgSs", "hgSs", ".fa"); faWriteAll(faTn.forCgi, seqList); /* Create a temporary .psl file with the alignments against genome. */ trashDirFile(&pslTn, "hgSs", "hgSs", ".pslx"); f = mustOpen(pslTn.forCgi, "w"); gvo = gfOutputPsl(0, qIsProt, FALSE, f, FALSE, TRUE); serve = findServer(db, isTx); /* Write header for extended (possibly protein) psl file. */ if (isTx) { if (isTxTx) { qType = gftDnaX; tType = gftDnaX; } else { qType = gftProt; tType = gftDnaX; } } else { qType = gftDna; tType = gftDna; } pslxWriteHead(f, qType, tType); if (qType == gftProt) { minSingleSize = 14; } else if (qType == gftDnaX) { minSingleSize = 36; } /* Loop through each sequence. */ for (seq = seqList; seq != NULL; seq = seq->next) { printf(" "); fflush(stdout); /* prevent apache cgi timeout by outputting something */ oneSize = realSeqSize(seq, !isTx); if ((seqCount&1) == 0) // Call bot delay every 2nd time starting with first time hgBotDelay(); if (++seqCount > maxSeqCount) { warn("More than 25 input sequences, stopping at %s.", seq->name); break; } if (oneSize > maxSingleSize) { warn("Sequence %s is %d letters long (max is %d), skipping", seq->name, oneSize, maxSingleSize); continue; } if (oneSize < minSingleSize) { warn("Warning: Sequence %s is only %d letters long (%d is the recommended minimum)", seq->name, oneSize, minSingleSize); // we could use "continue;" here to actually enforce skipping, // but let's give the short sequence a chance, it might work. // minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein. 
if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0 continue; } totalSize += oneSize; if (totalSize > maxTotalSize) { warn("Sequence %s would take us over the %d letter limit, stopping here.", seq->name, maxTotalSize); break; } conn = gfConnect(serve->host, serve->port); if (isTx) { gvo->reportTargetStrand = TRUE; if (isTxTx) { gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5, tFileCache, gvo, !txTxBoth); if (txTxBoth) { reverseComplement(seq->dna, seq->size); conn = gfConnect(serve->host, serve->port); gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5, tFileCache, gvo, FALSE); } } else { gfAlignTrans(&conn, serve->nibDir, seq, 5, tFileCache, gvo); } } else { gfAlignStrand(&conn, serve->nibDir, seq, FALSE, minMatchShown, tFileCache, gvo); reverseComplement(seq->dna, seq->size); conn = gfConnect(serve->host, serve->port); gfAlignStrand(&conn, serve->nibDir, seq, TRUE, minMatchShown, tFileCache, gvo); } gfOutputQuery(gvo, f); } carefulClose(&f); showAliPlaces(pslTn.forCgi, faTn.forCgi, serve->db, qType, tType, organism, feelingLucky); if(!feelingLucky) cartWebEnd(); gfFileCacheFree(&tFileCache); }
void bigBlat(struct dnaSeq *untransList, int queryCount, char *queryFiles[], char *outFile,
	boolean transQuery, boolean qIsDna, FILE *out, boolean showStatus)
/* Run query against translated DNA database (3 frames on each strand).
 *   untransList - database sequences; each is reverse-complemented in place
 *                 at the end of every strand pass.
 *   queryCount, queryFiles - query fasta files, each read once per pass.
 *   outFile     - not referenced in this function body.
 *   transQuery  - selects transTripleSearch rather than tripleSearch and
 *                 decides how the case of the query is handled.
 *   qIsDna      - forwarded to transTripleSearch.
 *   out         - output stream; it is closed before returning.
 *   showStatus  - when set, print a one-line summary to stdout first. */
{
int frameIx, fileIx;
int isRc;
struct dnaSeq *dbSeq, trimmedSeq;
struct genoFind *gfs[3];
aaSeq *dbSeqLists[3];
struct trans3 *t3List = NULL;
struct hash *t3Hash = NULL;
struct lineFile *lf = NULL;
boolean forceUpper = FALSE, forceLower = FALSE;
boolean maskUpper = FALSE, toggle = FALSE;

ZeroVar(&trimmedSeq);
if (showStatus)
    printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount);

/* Decide on query case handling.  Untranslated (protein) queries are forced
 * to upper case.  Translated queries are forced to lower case unless qMask
 * is set, in which case masking is done on upper case, with the case
 * toggled first when qMask is anything other than "upper". */
if (!transQuery)
    forceUpper = TRUE;
else if (qMask == NULL)
    forceLower = TRUE;
else
    {
    maskUpper = TRUE;
    toggle = !sameString(qMask, "upper");
    }

if (gvo->fileHead != NULL)
    gvo->fileHead(gvo, out);

/* Two passes: isRc FALSE, then isRc 1. */
for (isRc = FALSE; isRc < 2; ++isRc)
    {
    /* Start every pass with cleared per-frame pointers. */
    gfs[0] = gfs[1] = gfs[2] = NULL;
    dbSeqLists[0] = dbSeqLists[1] = dbSeqLists[2] = NULL;

    /* Translate the database and build one index per frame. */
    t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash);
    frameIx = 0;
    while (frameIx < 3)
	{
	gfs[frameIx] = gfIndexSeq(dbSeqLists[frameIx], minMatch, maxGap, tileSize,
		repMatch, ooc, TRUE, oneOff, FALSE, stepSize);
	++frameIx;
	}

    /* Run every sequence of every query file against the three indexes. */
    for (fileIx = 0; fileIx < queryCount; ++fileIx)
	{
	aaSeq qSeq;

	lf = lineFileOpen(queryFiles[fileIx], TRUE);
	for (;;)
	    {
	    if (!faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name))
		break;
	    dotOut();

	    /* Put it into right case and optionally mask on case. */
	    if (forceLower)
		toLowerN(qSeq.dna, qSeq.size);
	    else if (forceUpper)
		toUpperN(qSeq.dna, qSeq.size);
	    else if (maskUpper)
		{
		if (toggle)
		    toggleCase(qSeq.dna, qSeq.size);
		upperToN(qSeq.dna, qSeq.size);
		}

	    if (qSeq.size > qWarnSize)
		warn("Query sequence %s has size %d, it might take a while.", qSeq.name, qSeq.size);

	    trimSeq(&qSeq, &trimmedSeq);
	    if (transQuery)
		transTripleSearch(&trimmedSeq, gfs, t3Hash, isRc, qIsDna, out);
	    else
		tripleSearch(&trimmedSeq, gfs, t3Hash, isRc, out);
	    gfOutputQuery(gvo, out);
	    }
	lineFileClose(&lf);
	}

    /* Clean up time. */
    trans3FreeList(&t3List);
    freeHash(&t3Hash);
    frameIx = 0;
    while (frameIx < 3)
	{
	genoFindFree(&gfs[frameIx]);
	++frameIx;
	}

    /* Flip every database sequence to the other strand for the next pass.
     * NOTE(review): after the second pass this presumably restores the
     * original orientation - confirm against reverseComplement. */
    dbSeq = untransList;
    while (dbSeq != NULL)
	{
	reverseComplement(dbSeq->dna, dbSeq->size);
	dbSeq = dbSeq->next;
	}
    }
carefulClose(&out);
}