Пример #1
0
struct gfOutput *gfOutputAny(char *format,
                             int goodPpt, boolean qIsProt, boolean tIsProt,
                             boolean noHead, char *databaseName,
                             int databaseSeqCount, double databaseLetters,
                             double minIdentity,
                             FILE *f)
/* Initialize output in a variety of formats in file or memory.
 * Parameters:
 *    format - either 'psl', 'pslx', 'sim4', 'blast', 'wublast', 'axt', 'xml'
 *    goodPpt - minimum identity of alignments to output in parts per thousand
 *    qIsProt - true if query side is a protein.
 *    tIsProt - true if target (database) side is a protein.
 *    noHead - if true suppress header in psl/pslx output.
 *    databaseName - name of database.  Only used for blast output
 *    databaseSeq - number of sequences in database - only for blast
 *    databaseLetters - number of bases/aas in database - only blast
 *    minIdentity - minimum identity - only blast
 *    FILE *f - file.
 */
{
    struct gfOutput *out = NULL;
    static char *blastTypes[] = {"blast", "wublast", "blast8", "blast9", "xml"};

    if (format == NULL)
        format = "psl";
    if (sameWord(format, "psl"))
        out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, FALSE, noHead);
    else if (sameWord(format, "pslx"))
        out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, TRUE, noHead);
    else if (sameWord(format, "sim4"))
        out = gfOutputSim4(goodPpt, qIsProt, tIsProt, databaseName);
    else if (stringArrayIx(format, blastTypes, ArraySize(blastTypes)) >= 0)
        out = gfOutputBlast(goodPpt, qIsProt, tIsProt,
                            databaseName, databaseSeqCount, databaseLetters, format,
                            minIdentity, f);
    else if (sameWord(format, "axt"))
        out = gfOutputAxt(goodPpt, qIsProt, tIsProt, f);
    else if (sameWord(format, "maf"))
        out = gfOutputMaf(goodPpt, qIsProt, tIsProt, f);
    else
        errAbort("Unrecognized output format '%s'", format);
    return out;
}
struct psl* doDnaAlignment(struct dnaSeq *seq, char *db, char *blatHost, 
	char *port, char *nibDir, struct hash *tFileCache) 
/* get the alignment from the blat host for this sequence */
{
struct psl *pslList = NULL;
int conn =0;
struct tempName pslTn;
FILE *f = NULL;
struct gfOutput *gvo;


if(seq == NULL || db == NULL)
    errAbort("coordConv::doDnaAlignment() - dnaSeq and/or db can't be NULL.");
if(strlen(seq->dna) != seq->size)
    errAbort("coordConv::doDnaAlignment() - there seems to be something fishy about %s: the size doesn't equal the length", seq->name);
/* if there are too many n's it can cause the blat server to hang */
if(strstr(seq->dna, "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn") )
  return NULL; 
makeTempName(&pslTn,"ccR", ".psl");
f = mustOpen(pslTn.forCgi, "w");
gvo = gfOutputPsl(920, FALSE, FALSE, f, FALSE, FALSE);
gfOutputHead(gvo, f);

/* align to genome, both strands */
conn = gfConnect(blatHost, port);
gfAlignStrand(&conn, nibDir, seq, FALSE, 20, tFileCache, gvo);
reverseComplement(seq->dna, seq->size);
conn = gfConnect(blatHost, port);
gfAlignStrand(&conn, nibDir, seq, TRUE, 20 , tFileCache, gvo);
gfOutputQuery(gvo, f);
carefulClose(&f);
pslList = pslLoadAll(pslTn.forCgi);
remove(pslTn.forCgi);
gfOutputFree(&gvo);
return pslList;
}
Пример #3
0
void blatSeq(char *userSeq, char *organism)
/* Blat sequence user pasted in. */
{
FILE *f;
struct dnaSeq *seqList = NULL, *seq;
struct tempName pslTn, faTn;
int maxSingleSize, maxTotalSize, maxSeqCount;
int minSingleSize = minMatchShown;
char *genome, *db;
char *type = cgiString("type");
char *seqLetters = cloneString(userSeq);
struct serverTable *serve;
int conn;
int oneSize, totalSize = 0, seqCount = 0;
boolean isTx = FALSE;
boolean isTxTx = FALSE;
boolean txTxBoth = FALSE;
struct gfOutput *gvo;
boolean qIsProt = FALSE;
enum gfType qType, tType;
struct hash *tFileCache = gfFileCacheNew();
boolean feelingLucky = cgiBoolean("Lucky");

getDbAndGenome(cart, &db, &genome, oldVars);
if(!feelingLucky)
    cartWebStart(cart, db, "%s BLAT Results",  trackHubSkipHubName(organism));
/* Load user sequence and figure out if it is DNA or protein. */
if (sameWord(type, "DNA"))
    {
    seqList = faSeqListFromMemText(seqLetters, TRUE);
    uToT(seqList);
    isTx = FALSE;
    }
else if (sameWord(type, "translated RNA") || sameWord(type, "translated DNA"))
    {
    seqList = faSeqListFromMemText(seqLetters, TRUE);
    uToT(seqList);
    isTx = TRUE;
    isTxTx = TRUE;
    txTxBoth = sameWord(type, "translated DNA");
    }
else if (sameWord(type, "protein"))
    {
    seqList = faSeqListFromMemText(seqLetters, FALSE);
    isTx = TRUE;
    qIsProt = TRUE;
    }
else 
    {
    seqList = faSeqListFromMemTextRaw(seqLetters);
    isTx = !seqIsDna(seqList);
    if (!isTx)
	{
	for (seq = seqList; seq != NULL; seq = seq->next)
	    {
	    seq->size = dnaFilteredSize(seq->dna);
	    dnaFilter(seq->dna, seq->dna);
	    toLowerN(seq->dna, seq->size);
	    subChar(seq->dna, 'u', 't');
	    }
	}
    else
	{
	for (seq = seqList; seq != NULL; seq = seq->next)
	    {
	    seq->size = aaFilteredSize(seq->dna);
	    aaFilter(seq->dna, seq->dna);
	    toUpperN(seq->dna, seq->size);
	    }
	qIsProt = TRUE;
	}
    }
if (seqList != NULL && seqList->name[0] == 0)
    {
    freeMem(seqList->name);
    seqList->name = cloneString("YourSeq");
    }
trimUniq(seqList);

/* If feeling lucky only do the first on. */
if(feelingLucky && seqList != NULL)
    {
    seqList->next = NULL;
    }

/* Figure out size allowed. */
maxSingleSize = (isTx ? 10000 : 75000);
maxTotalSize = maxSingleSize * 2.5;
#ifdef LOWELAB
maxSeqCount = 200;
#else
maxSeqCount = 25;
#endif

/* Create temporary file to store sequence. */
trashDirFile(&faTn, "hgSs", "hgSs", ".fa");
faWriteAll(faTn.forCgi, seqList);

/* Create a temporary .psl file with the alignments against genome. */
trashDirFile(&pslTn, "hgSs", "hgSs", ".pslx");
f = mustOpen(pslTn.forCgi, "w");
gvo = gfOutputPsl(0, qIsProt, FALSE, f, FALSE, TRUE);
serve = findServer(db, isTx);
/* Write header for extended (possibly protein) psl file. */
if (isTx)
    {
    if (isTxTx)
        {
	qType = gftDnaX;
	tType = gftDnaX;
	}
    else
        {
	qType = gftProt;
	tType = gftDnaX;
	}
    }
else
    {
    qType = gftDna;
    tType = gftDna;
    }
pslxWriteHead(f, qType, tType);

if (qType == gftProt)
    {
    minSingleSize = 14;
    }
else if (qType == gftDnaX)
    {
    minSingleSize = 36;
    }


/* Loop through each sequence. */
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    printf(" "); fflush(stdout);  /* prevent apache cgi timeout by outputting something */
    oneSize = realSeqSize(seq, !isTx);
    if ((seqCount&1) == 0)	// Call bot delay every 2nd time starting with first time
	hgBotDelay();
    if (++seqCount > maxSeqCount)
        {
	warn("More than 25 input sequences, stopping at %s.",
	    seq->name);
	break;
	}
    if (oneSize > maxSingleSize)
	{
	warn("Sequence %s is %d letters long (max is %d), skipping",
	    seq->name, oneSize, maxSingleSize);
	continue;
	}
    if (oneSize < minSingleSize)
        {
	warn("Warning: Sequence %s is only %d letters long (%d is the recommended minimum)", 
		seq->name, oneSize, minSingleSize);
	// we could use "continue;" here to actually enforce skipping, 
	// but let's give the short sequence a chance, it might work.
	// minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein.
	if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0
	    continue;
	}
    totalSize += oneSize;
    if (totalSize > maxTotalSize)
        {
	warn("Sequence %s would take us over the %d letter limit, stopping here.",
	     seq->name, maxTotalSize);
	break;
	}
    conn = gfConnect(serve->host, serve->port);
    if (isTx)
	{
	gvo->reportTargetStrand = TRUE;
	if (isTxTx)
	    {
	    gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5, 
	    	tFileCache, gvo, !txTxBoth);
	    if (txTxBoth)
		{
		reverseComplement(seq->dna, seq->size);
		conn = gfConnect(serve->host, serve->port);
		gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5, 
			tFileCache, gvo, FALSE);
		}
	    }
	else
	    {
	    gfAlignTrans(&conn, serve->nibDir, seq, 5, tFileCache, gvo);
	    }
	}
    else
	{
	gfAlignStrand(&conn, serve->nibDir, seq, FALSE, minMatchShown, tFileCache, gvo);
	reverseComplement(seq->dna, seq->size);
	conn = gfConnect(serve->host, serve->port);
	gfAlignStrand(&conn, serve->nibDir, seq, TRUE, minMatchShown, tFileCache, gvo);
	}
    gfOutputQuery(gvo, f);
    }
carefulClose(&f);
showAliPlaces(pslTn.forCgi, faTn.forCgi, serve->db, qType, tType, 
	      organism, feelingLucky);
if(!feelingLucky)
    cartWebEnd();
gfFileCacheFree(&tFileCache);
}