Exemplo n.º 1
0
static boolean localizationExists(struct section *section,
	struct sqlConnection *conn, char *geneId)
/* Return TRUE if any of the localization/abundance tables present in the
 * database has at least one row for this gene: mitopred is looked up by the
 * global swissProtAcc (accession or display ID), the SGD tables by geneId. */
{
char sql[256];

/* mitopred - prediction of nuclear-encoded mitochondrial proteins */
if (swissProtAcc != NULL && sqlTableExists(conn, "mitopred"))
    {
    char *displayId = spAnyAccToId(spConn, swissProtAcc);
    sqlSafef(sql, sizeof(sql),
	  "select count(*) from mitopred where name = '%s' or name = '%s'",
	  swissProtAcc, displayId);
    if (sqlQuickNum(conn, sql) > 0)
	return TRUE;
    }

/* SGD (Saccharomyces Genome Database) localization & abundance data.
 * Both tables must be present before either one is consulted. */
if (!sqlTablesExist(conn, "sgdLocalization sgdAbundance"))
    return FALSE;

sqlSafef(sql, sizeof(sql),
      "select count(*) from sgdLocalization where name = '%s'", geneId);
if (sqlQuickNum(conn, sql) > 0)
    return TRUE;

sqlSafef(sql, sizeof(sql),
      "select count(*) from sgdAbundance where name = '%s'", geneId);
if (sqlQuickNum(conn, sql) > 0)
    return TRUE;

return FALSE;
}
static void synonymPrint(struct section *section, 
	struct sqlConnection *conn, char *id)
/* Print the identifier/synonym lines for gene id:
 *   - aliases via printAlias() when the kgAlias table exists,
 *   - the organism-specific ID line (ENSEMBL / WormBase / UCSC),
 *   - the RefSeq accession or, failing that, the representative mRNA,
 *   - the UniProt protein link(s) plus display ID(s),
 * and then call printCcds().
 * When isRgdGene(conn) is true everything is delegated to
 * rgdGene2SynonymPrint() instead; section is only passed through to it.
 * Calls errAbort() if the protein accession has no display ID according to
 * spAnyAccToId(). */
{
char *protAcc = getSwissProtAcc(conn, spConn, id);
char *spDisplayId;
char *refSeqAcc = "";
char *mrnaAcc = "";
char *oldDisplayId;
char condStr[255];
char *kgProteinID;
char *parAcc; /* parent accession of a variant splice protein */
char *chp;

if (isRgdGene(conn))
    {
    rgdGene2SynonymPrint(section,conn, id);
    return;
    }
if (sqlTablesExist(conn, "kgAlias"))
    printAlias(id, conn);
if (sameWord(genome, "Zebrafish"))
    {
    char *xrefTable = "ensXRefZfish";
    char *geneIdCol = "ensGeneId";
    /* get Gene Symbol and RefSeq accession from Zebrafish-specific */
    /* cross-reference table */
    printGeneSymbol(id, xrefTable, geneIdCol, conn);
    refSeqAcc = getRefSeqAcc(id, xrefTable, geneIdCol, conn);
    hPrintf("<B>ENSEMBL ID:</B> %s", id);
    }
else
    {
    /* NOTE(review): these queries are assembled with plain safef();
     * consider the escaping SQL formatter used elsewhere in the code base. */
    char query[256];
    char *toRefTable = genomeOptionalSetting("knownToRef");
    if (toRefTable != NULL && sqlTableExists(conn, toRefTable))
        {
	safef(query, sizeof(query), "select value from %s where name='%s'", toRefTable,
		id);
	refSeqAcc = emptyForNull(sqlQuickString(conn, query));
	}
    if (sqlTableExists(conn, "kgXref"))
	{
	safef(query, sizeof(query), "select mRNA from kgXref where kgID='%s'", id);
	mrnaAcc = emptyForNull(sqlQuickString(conn, query));
	}
    if (sameWord(genome, "C. elegans"))
	hPrintf("<B>WormBase ID:</B> %s<BR>", id);
    else
	hPrintf("<B>UCSC ID:</B> %s<BR>", id);
    }
    
if (refSeqAcc[0] != 0)
    {
    hPrintf("<B>RefSeq Accession: </B> <A HREF=\"");
    printOurRefseqUrl(stdout, refSeqAcc);
    hPrintf("\">%s</A><BR>\n", refSeqAcc);
    }
else if (mrnaAcc[0] != 0)
    {
    safef(condStr, sizeof(condStr), "acc = '%s'", mrnaAcc);
    if (sqlGetField(database, "gbCdnaInfo", "acc", condStr) != NULL)
        {
    	hPrintf("<B>Representative RNA: </B> <A HREF=\"");
    	printOurMrnaUrl(stdout, mrnaAcc);
    	hPrintf("\">%s</A><BR>\n", mrnaAcc);
    	}
    else
    /* do not show URL link if it is not found in gbCdnaInfo */
    	{
    	hPrintf("<B>Representative RNA: %s </B>", mrnaAcc);
    	}
    }
if (protAcc != NULL)
    {
    char *chromName = cartOptionalString(cart, hggChrom);
    char *startStr = cartOptionalString(cart, hggStart);
    char *endStr = cartOptionalString(cart, hggEnd);

    /* Empty string means "no knownGene protein ID found"; it is only read,
     * never modified, so a literal is sufficient. */
    kgProteinID = "";

    /* Only query knownGene when the cart supplies a complete position;
     * a missing start/end must not be formatted into the condition. */
    if (hTableExists(sqlGetDatabase(conn), "knownGene")
        && isNotEmpty(chromName) && differentWord(chromName, "none")
        && isNotEmpty(startStr) && isNotEmpty(endStr))
    	{
	char *found;
	/* NOTE(review): start/end come from the cart and are inserted
	 * unquoted; confirm they are validated as numbers upstream. */
    	safef(condStr, sizeof(condStr), "name = '%s' and chrom = '%s' and txStart=%s and txEnd=%s", 
	        id, chromName, startStr, endStr);
    	found = sqlGetField(database, "knownGene", "proteinID", condStr);
	/* No matching row: keep the empty default rather than handing
	 * NULL to strstr() below. */
	if (found != NULL)
	    kgProteinID = found;
    	}

    hPrintf("<B>Protein: ");
    if (strstr(kgProteinID, "-") != NULL)
        {
	/* Variant splice protein: the parent accession is everything
	 * before the first '-'. */
	parAcc = cloneString(kgProteinID);
	chp = strstr(parAcc, "-");
	*chp = '\0';
	
        /* show variant splice protein and the UniProt link here */
	hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" "
	    "TARGET=_blank>%s</A></B>, splice isoform of ",
	    kgProteinID, kgProteinID);
        hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" "
	    "TARGET=_blank>%s</A></B>\n",
	    parAcc, parAcc);
	}
    else
        {
        hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" "
	    "TARGET=_blank>%s</A></B>\n",
	    protAcc, protAcc);
	}
    /* show SWISS-PROT display ID if it is different than the accession ID */
    /* but, if display name is like: Q03399 | Q03399_HUMAN, then don't show display name */
    spDisplayId = spAnyAccToId(spConn, protAcc);
    if (spDisplayId == NULL) 
    	{
	errAbort("<br>%s seems to no longer be a valid protein ID in our latest UniProtKB DB.", protAcc);
	}
	
    if (strstr(spDisplayId, protAcc) == NULL)
	{
	hPrintf(" (aka %s", spDisplayId);
	/* show once if the new and old displayId are the same */
 	oldDisplayId = oldSpDisplayId(spDisplayId);
	if (oldDisplayId != NULL)
 	    {
            if (!sameWord(spDisplayId, oldDisplayId)
                && !sameWord(protAcc, oldDisplayId))
	    	{
	    	hPrintf(" or %s", oldDisplayId);
	    	}
	    }
	hPrintf(")<BR>\n");
	}
    }
printCcds(id, conn);

}
static void rgdGene2SynonymPrint(struct section *section,
        struct sqlConnection *conn, char *rgdGeneId)
/* Print synonym lines for an RGD gene: old symbol and old name from
 * genes_rat, the RefSeq accession from rgdGene2ToRefSeq and the UniProt
 * protein (with display IDs) from rgdGene2ToUniProt.  Only the first row
 * of each query is used.  Does nothing when rgdGeneId is NULL.
 * Calls errAbort() if the UniProt accession has no display ID according to
 * spAnyAccToId(). */
{
char *geneName = NULL;
char query[256], **row;
struct sqlResult *sr;
if (rgdGeneId != NULL)
    {
    /* genes_rat is keyed on the ID with its first four characters skipped
     * (presumably an "RGD:" prefix -- TODO confirm).
     * NOTE(review): assumes rgdGeneId is at least 4 characters long. */
    safef(query, sizeof(query), 
	    "select old_symbol, old_name from genes_rat where gene_rgd_id = '%s'", 
	    rgdGeneId+4L);
    sr = sqlGetResult(conn, query);
    if ((row = sqlNextRow(sr)) != NULL)
	{
	/* Symbol and name are each shown only when non-empty and not the
	 * "n/a" placeholder; the two columns are tested independently. */
	if (row[0][0] != 0 && !sameString(row[0], "n/a"))
	    {
	    hPrintf("<B>Symbol:</B> %s ", addComma(row[0]));
	    }
	if (row[1][0] != 0 && !sameString(row[1], "n/a"))
	    {
	    geneName = cloneString(row[1]);
	    hPrintf("<BR><B>Name:</B> %s ", addComma(geneName));
	    hPrintf("<BR>\n");
	    }
	}
    sqlFreeResult(&sr);

    safef(query, sizeof(query), 
	    "select value from rgdGene2ToRefSeq where name= '%s'", rgdGeneId);
    sr = sqlGetResult(conn, query);
    if ((row = sqlNextRow(sr)) != NULL)
	{
        hPrintf("<B>RefSeq Accession: </B> <A HREF=\"");
	printOurRefseqUrl(stdout, row[0]);
	hPrintf("\">%s</A><BR>\n", row[0]);
	}
    sqlFreeResult(&sr);
    
    safef(query, sizeof(query), 
	    "select value from rgdGene2ToUniProt where name= '%s'", rgdGeneId);
    sr = sqlGetResult(conn, query);
    if ((row = sqlNextRow(sr)) != NULL)
    	{
	char *spId, *spDisplayId, *oldDisplayId;
	spId = row[0];
	hPrintf("<B>Protein: </B>");
        hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" "
	    "TARGET=_blank>%s</A>\n",
	    spId, spId);
    
        /* show SWISS-PROT display ID if it is different than the accession ID */
        /* but, if display name is like: Q03399 | Q03399_HUMAN, then don't show display name */
        spDisplayId = spAnyAccToId(spConn, spId);
        if (spDisplayId == NULL) 
    	    {
	    errAbort("<br>The corresponding protein %s of this gene is not found in our current UniProtKB DB.", spId);
	    }   
	
        if (strstr(spDisplayId, spId) == NULL)
	    {
	    hPrintf(" (aka %s", spDisplayId);
	    /* show once if the new and old displayId are the same */
 	    oldDisplayId = oldSpDisplayId(spDisplayId);
	    if (oldDisplayId != NULL)
 	    	{
            	if (!sameWord(spDisplayId, oldDisplayId)
                    && !sameWord(spId, oldDisplayId))
	    	    {
	    	    hPrintf(" or %s", oldDisplayId);
	    	    }  
	        }	
	    hPrintf(")<BR>\n");
	    }  
   	}
    sqlFreeResult(&sr);
    }
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.
 * For every record in infoFile write one tab-separated line to outFile:
 *   kgID, mRNA, spID, spDisplayID, geneSymbol, refseq, protAcc, description
 * Gene symbol and description are taken from the first source that yields
 * them: refLink (via the picked RefSeq), UniProt, then the GenBank RNA
 * records listed in evFile, and finally the mRNA accession itself.
 * Records whose category is "antibodyParts" always get a fixed symbol and
 * description.  Calls errAbort() when a non-antibody record has no entry
 * in evFile.
 * NOTE(review): the pick lineFile and both database connections are not
 * released here; confirm that relying on process exit is intended. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
       {
       /* Fill in the relatively straightforward fields. */
       refseq = pick->refSeq;
       if (info->orfSize > 0)
	    {
	    protAcc = pick->refProt;
	    spID = proteinId;
	    /* If the gene's protein is the RefSeq protein itself, use the
	     * picked UniProt accession as the SwissProt ID instead. */
	    if (sameString(protAcc, spID))
		spID = pick->uniProt;
	    if (spID[0] != 0)
	       {
	       spDisplayID = spAnyAccToId(uConn, spID);
	       /* Unknown accession: emit an empty column rather than
	        * passing NULL to fprintf() below. */
	       if (spDisplayID == NULL)
		   spDisplayID = "";
	       }
	    }

       /* Fill in gene symbol and description from refseq if possible. */
       if (refseq[0] != 0)
           {
	   struct sqlResult *sr;
	   char **linkRow;
	   safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'",
	   	refseq);
	   sr = sqlGetResult(gConn, query);
	   linkRow = sqlNextRow(sr);
	   if (linkRow != NULL)
	       {
	       geneSymbol = cloneString(linkRow[0]);
	       if (!sameWord("unknown protein", linkRow[1]))
		   description = cloneString(linkRow[1]);
	       }
	   sqlFreeResult(&sr);
	   }

       /* If need be try uniProt for gene symbol and description. */
       if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
           {
	   char *acc = spLookupPrimaryAcc(uConn, spID);
	   if (description == NULL)
	       description = spDescription(uConn, acc);
	   if (geneSymbol == NULL)
	       {
	       struct slName *nameList = spGenes(uConn, acc);
	       if (nameList != NULL)
		   geneSymbol = cloneString(nameList->name);
	       slFreeList(&nameList);
	       }
	   }

       }

    /* If it's an antibody fragment use that as name. */
    if (isAb)
        {
	geneSymbol = cloneString("abParts");
	description = cloneString("Parts of antibodies, mostly variable regions.");
	}

    if (ev == NULL)
	{
	/* Cloned (not a literal) because the fallback values derived from
	 * mRNA may be edited in place further down. */
	mRNA = cloneString("");
	if (!isAb)
	    {
	    errAbort("%s is in %s but not %s\n", info->name, infoFile, evFile);
	    }
	}
    else
	{
	mRNA = cloneString(ev->primary);
	chopSuffix(mRNA);
	}

    /* Still no joy? Try genbank RNA records. */
    if (geneSymbol == NULL || description == NULL)
	{
	if (ev != NULL)
	    {
	    int i;
	    for (i=0; i<ev->accCount; ++i)
		{
		char *acc = ev->accs[i];
		chopSuffix(acc);
		if (geneSymbol == NULL)
		    {
		    safef(query, sizeof(query), 
			"select geneName.name from gbCdnaInfo,geneName "
			"where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
		    geneSymbol = sqlQuickString(gConn, query);
		    /* "n/a" is a placeholder, not a real symbol. */
		    if (geneSymbol != NULL)
			{
			if (sameString(geneSymbol, "n/a"))
			   geneSymbol = NULL;
			}
		    }
		if (description == NULL)
		    {
		    safef(query, sizeof(query), 
			"select description.name from gbCdnaInfo,description "
			"where description.id=gbCdnaInfo.description "
			"and gbCdnaInfo.acc = '%s'", acc);
		    description = sqlQuickString(gConn, query);
		    if (description != NULL)
			{
			if (sameString(description, "n/a"))
			   description = NULL;
			}
		    }
		}
	    }
	}
    /* Last resort: fall back to the mRNA accession.  The symbol gets its
     * own copy so the in-place clean-up below cannot alter the mRNA (or
     * description) column. */
    if (geneSymbol == NULL)
        geneSymbol = cloneString(mRNA);
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long: keep 37 chars plus "..." */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
}
Exemplo n.º 5
0
static boolean localizationPrintList(struct sqlConnection *conn, char *query,
	char *heading)
/* Run query and print heading followed by the first column of every row,
 * separated by ", ".  Prints nothing and returns FALSE when the query
 * yields no rows; returns TRUE otherwise. */
{
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
boolean gotAny = FALSE;

while ((row = sqlNextRow(sr)) != NULL)
    {
    hPrintf("%s", gotAny ? ", " : heading);
    gotAny = TRUE;
    hPrintf("%s", row[0]);
    }
sqlFreeResult(&sr);
return gotAny;
}

static void localizationPrint(struct section *section, 
	struct sqlConnection *conn, char *geneId)
/* Print localization and abundance information: mitopred confidence levels
 * for the global swissProtAcc, then SGD localization values and abundance
 * for geneId, each followed by its literature citation. */
{
char sql[256];

/* mitopred - prediction of nuclear-encoded mitochondrial proteins */
if (swissProtAcc != NULL && sqlTableExists(conn, "mitopred"))
    {
    char *displayId = spAnyAccToId(spConn, swissProtAcc);
    sqlSafef(sql, sizeof(sql), 
	  "select confidence from mitopred where name = '%s' or name = '%s'",
	  swissProtAcc, displayId);
    /* The citation is only shown when at least one confidence was printed. */
    if (localizationPrintList(conn, sql,
	    "<B>Mitopred:</B> mitochondrion, confidence level: "))
	{
	hPrintf("<BR>");
	hPrintf("Prediction of nuclear-encoded mitochondrial proteins from "
	        "Guda et al., Bioinformatics. 2004 Jul 22;20(11):1785-94.<BR>"
	        "For more information see "
	        "<A HREF=\"http://mitopred.sdsc.edu/\" TARGET=_blank>"
	        "http://mitopred.sdsc.edu/</A>.<P>");
	}
    }

/* SGD (Saccharomyces Genome Database) localization & abundance data;
 * nothing more to do unless both tables are present. */
if (!sqlTablesExist(conn, "sgdLocalization sgdAbundance"))
    return;

sqlSafef(sql, sizeof(sql), 
      "select value from sgdLocalization where name = '%s'", geneId);
if (localizationPrintList(conn, sql, "<B>SGD Localization:</B> "))
    hPrintf("<BR>");

sqlSafef(sql, sizeof(sql), 
      "select abundance from sgdAbundance where name = '%s'", geneId);
char *abundance = sqlQuickString(conn, sql);
if (abundance != NULL)
    {
    hPrintf("<B>SGD Abundance:</B> %s (range from 41 to 1590000)<BR>\n",
	    abundance);
    freez(&abundance);
    }

/* The SGD citation is printed whether or not any rows were found. */
hPrintf("Protein localization data from "
	"Huh et al. (2003), Nature 425:686-691<BR>"
	"Protein abundance data from "
	"Ghaemmaghami et al. (2003) Nature 425:737-741<BR>"
	"For more information see "
	"<A HREF=\"http://yeastgfp.yeastgenome.org\" TARGET=_blank>"
	"http://yeastgfp.yeastgenome.org</A>.");
}