예제 #1
0
void txGeneColor(char *uniProtDb, char *infoFile, char *pickFile, char *outFile)
/* txGeneColor - Figure out color to draw gene in.. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }

/* Open uniprot database connection. */
struct sqlConnection *uConn = sqlConnect(uniProtDb);

#ifdef OLD
/* Figure out our light and medium colors. */
mediumBlue.r = (6*trueBlue.r + 4*255)/10;
mediumBlue.g = (6*trueBlue.g + 4*255)/10;
mediumBlue.b = (6*trueBlue.b + 4*255)/10;
lightBlue.r = (1*trueBlue.r + 2*255)/3;
lightBlue.g = (1*trueBlue.g + 2*255)/3;
lightBlue.b = (1*trueBlue.b + 2*255)/3;
#endif /* OLD */

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    struct rgbColor *col;
    pick = hashFindVal(pickHash, info->name);
    if (pick != NULL)
        {
	char *source = pick->source;
	if (sameString(source, "RefPepValidated"))
	    col = &trueBlue;
	else if (sameString(source, "ccds"))
	    col = &trueBlue;
	else if (sameString(source, "RefPepReviewed"))
	    col = &trueBlue;
	else if (sameString(source, "RefSeqValidated"))
	    col = &trueBlue;
	else if (sameString(source, "RefSeqReviewed"))
	    col = &trueBlue;
	else if (sameString(source, "swissProt"))
	    col = &trueBlue;
	else if (startsWith("Ref", source))
	    col = &mediumBlue;
	else
	    col = &lightBlue;
	if (pick->swissProt[0] != 0)
	    {
	    char *acc = spLookupPrimaryAcc(uConn, pick->swissProt);
	    struct slName *pdbList = spPdbAccs(uConn, acc);
	    if (pdbList != NULL)
	        col = &black;
	    slFreeList(&pdbList);
	    }
	}
    else
        col = &lightBlue;
    fprintf(f, "%s\t%d\t%d\t%d\n", info->name, col->r, col->g, col->b);
    }
carefulClose(&f);
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
       {
       /* Fill in the relatively straightforward fields. */
       refseq = pick->refSeq;
       if (info->orfSize > 0)
	    {
	    protAcc = pick->refProt;
	    spID = proteinId;
	    if (sameString(protAcc, spID))
		spID = pick->uniProt;
	    if (spID[0] != 0)
	       spDisplayID = spAnyAccToId(uConn, spID);
	    }

       /* Fill in gene symbol and description from refseq if possible. */
       if (refseq[0] != 0)
           {
	   struct sqlResult *sr;
	   safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'",
	   	refseq);
	   sr = sqlGetResult(gConn, query);
	   char **row = sqlNextRow(sr);
	   if (row != NULL)
	       {
	       geneSymbol = cloneString(row[0]);
	       if (!sameWord("unknown protein", row[1]))
		   description = cloneString(row[1]);
	       }
	    sqlFreeResult(&sr);
	   }

       /* If need be try uniProt for gene symbol and description. */
       if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
           {
	   char *acc = spLookupPrimaryAcc(uConn, spID);
	   if (description == NULL)
	       description = spDescription(uConn, acc);
	   if (geneSymbol == NULL)
	       {
	       struct slName *nameList = spGenes(uConn, acc);
	       if (nameList != NULL)
		   geneSymbol = cloneString(nameList->name);
	       slFreeList(&nameList);
	       }
	   }

       }

    /* If it's an antibody fragment use that as name. */
    if (isAb)
        {
	geneSymbol = cloneString("abParts");
	description = cloneString("Parts of antibodies, mostly variable regions.");
	isAb = TRUE;
	}

    if (ev == NULL)
	{
	mRNA = cloneString("");
	if (!isAb)
	    {
	    errAbort("%s is %s but not %s\n", info->name, infoFile, evFile);
	    }
	}
    else
	{
	mRNA = cloneString(ev->primary);
	chopSuffix(mRNA);
	}

    /* Still no joy? Try genbank RNA records. */
    if (geneSymbol == NULL || description == NULL)
	{
	if (ev != NULL)
	    {
	    int i;
	    for (i=0; i<ev->accCount; ++i)
		{
		char *acc = ev->accs[i];
		chopSuffix(acc);
		if (geneSymbol == NULL)
		    {
		    safef(query, sizeof(query), 
			"select geneName.name from gbCdnaInfo,geneName "
			"where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
		    geneSymbol = sqlQuickString(gConn, query);
		    if (geneSymbol != NULL)
			{
			if (sameString(geneSymbol, "n/a"))
			   geneSymbol = NULL;
			}
		    }
		if (description == NULL)
		    {
		    safef(query, sizeof(query), 
			"select description.name from gbCdnaInfo,description "
			"where description.id=gbCdnaInfo.description "
			"and gbCdnaInfo.acc = '%s'", acc);
		    description = sqlQuickString(gConn, query);
		    if (description != NULL)
			{
			if (sameString(description, "n/a"))
			   description = NULL;
			}
		    }
		}
	    }
	}
    if (geneSymbol == NULL)
        geneSymbol = mRNA;
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
}
예제 #3
0
void spTest(char *database, char *someAcc)
/* spTest - Test out sp library.. */
{
struct sqlConnection *conn = sqlConnect(database);
char *acc, *id, *binomial, *common;
struct slName *geneList, *gene, *accList, *n, *list;
struct slName *nameList, *name, *keyList, *key, *typeList, *type;
struct spFeature *featList, *feat;
struct spCitation *citeList, *cite;
char *ret = NULL;
int taxon;
int classId = 0, typeId = 0, refId = 0;

printf("input: %s\n", someAcc);
acc = spLookupPrimaryAcc(conn, someAcc);
printf("primary accession: %s\n", acc);
id = spAccToId(conn, acc);
printf("SwissProt id: %s\n", id);
printf("acc from id: %s\n", spIdToAcc(conn, id));
ret = spOrganelle(conn, acc);
printf("organelle: %s\n", (ret == NULL) ? "(null)" : ret);
printf("isCurated: %d\n", spIsCurated(conn, acc));
printf("aaSize: %d\n", spAaSize(conn,acc));
printf("molWeight: %d\n", spMolWeight(conn,acc));
printf("createDate: %s\n", spCreateDate(conn,acc));
printf("seqDate: %s\n", spSeqDate(conn,acc));
printf("annDate: %s\n", spAnnDate(conn,acc));
printf("description: %s\n", spDescription(conn, acc));
taxon = spTaxon(conn, acc);
printf("taxon: %d\n", taxon);
binomial = spTaxonToBinomial(conn, taxon);
printf("first scientific name: %s\n", binomial);
common = spTaxonToCommon(conn, taxon);
printf("first common name: %s\n", common);
printf("taxon from sci: %d\n", spBinomialToTaxon(conn, binomial));
printf("taxon from common: %d\n", spCommonToTaxon(conn, common));
printf("all scientific names:");
nameList = spBinomialNames(conn, acc);
for (name = nameList; name != NULL; name = name->next)
    printf(" %s,", name->name);
printf("\n");
printf("gene(s):");
geneList = spGenes(conn,acc);
for (gene=geneList; gene != NULL; gene = gene->next)
    printf(" %s,", gene->name);
printf("\n");
for (gene=geneList; gene != NULL; gene = gene->next)
    {
    accList = spGeneToAccs(conn, gene->name, 0);
    printf(" any %s:", gene->name);
    for (n = accList; n != NULL; n = n->next)
        printf(" %s,", n->name);
    printf("\n");
    slFreeList(&accList);
    printf(" %s %s:", common, gene->name);
    accList = spGeneToAccs(conn, gene->name, taxon);
    for (n = accList; n != NULL; n = n->next)
        printf(" %s,", n->name);
    printf("\n");
    slFreeList(&accList);
    }
slFreeList(&geneList);
printf("keyword(s):");
keyList = spKeywords(conn, acc);
for (key = keyList; key != NULL; key = key->next)
    printf(" %s,", key->name);
printf("\n");
for (key = keyList; key != NULL; key = key->next)
    {
    accList = spKeywordSearch(conn, key->name, taxon);
    printPartialList(common, key->name, accList, 4);
    slFreeList(&accList);
    break;	/* This is a little slow, once is enough. */
    }
for (key = keyList; key != NULL; key = key->next)
    {
    accList = spKeywordSearch(conn, key->name, 0);
    printPartialList("all", key->name, accList, 4);
    slFreeList(&accList);
    break;	/* This is a little slow, once is enough. */
    }
slFreeList(&keyList);

printf("All comments:\n");
list = slComments(conn, acc, NULL);
for (n = list; n != NULL; n = n->next)
    printf(" %s\n", n->name);
slFreeList(&list);

typeList = slCommentTypes(conn);
for (type = typeList; type != NULL; type = type->next)
    {
    list = slComments(conn, acc, type->name);
    if (list != NULL)
	{
	printf("%s comments:\n", type->name);
	for (n = list; n != NULL; n = n->next)
	    printf(" %s\n", n->name);
	slFreeList(&list);
	}
    }
slFreeList(&typeList);

list = spEmblAccs(conn, acc);
printf("GenBank/EMBL:");
for (n = list; n != NULL; n = n->next)
    printf(" %s,", n->name);
printf("\n");
if (list != NULL)
    printf("acc from %s: %s\n", 
    	list->name, spAccFromEmbl(conn, list->name));
slFreeList(&list);

list = spPdbAccs(conn, acc);
printf("PDB:");
for (n = list; n != NULL; n = n->next)
    printf(" %s,", n->name);
printf("\n");

featList = spFeatures(conn, acc, 0, 0);
printf("All features:\n");
for (feat = featList; feat != NULL; feat = feat->next)
    {
    printFeat(conn, feat);
    classId = feat->featureClass;
    typeId = feat->featureType;
    }
slFreeList(&featList);
if (classId != 0 && typeId != 0)
    {
    printf("%s class features:\n", spFeatureClassName(conn, classId));
    featList = spFeatures(conn, acc, classId, 0);
    for (feat = featList; feat != NULL; feat = feat->next)
	printFeat(conn, feat);
    slFreeList(&featList);
    printf("%s type features:\n", spFeatureTypeName(conn, typeId));
    featList = spFeatures(conn, acc, 0, typeId);
    for (feat = featList; feat != NULL; feat = feat->next)
	printFeat(conn, feat);
    slFreeList(&featList);
    printf("same class & type features:\n");
    featList = spFeatures(conn, acc, classId, typeId);
    for (feat = featList; feat != NULL; feat = feat->next)
	printFeat(conn, feat);
    slFreeList(&featList);
    printf("class loop: %d->%s->%d\n", classId, 
    	spFeatureClassName(conn, classId),
	spFeatureClassId(conn, spFeatureClassName(conn, classId)));
    printf("type loop: %d->%s->%d\n", typeId, 
    	spFeatureTypeName(conn, typeId),
	spFeatureTypeId(conn, spFeatureTypeName(conn, typeId)));
    }

citeList = spCitations(conn, acc);
for (cite = citeList; cite != NULL; cite = cite->next)
    {
    refId = cite->reference;
    printf("title: %s\n", spRefTitle(conn, refId));
    printf("authors:");
    list = spRefAuthors(conn, refId);
    for (n = list; n != NULL; n = n->next)
        printf(" %s, ", n->name);
    printf("\n");
    slFreeList(&list);
    printf("location: %s\n", spRefCite(conn, refId));
    printf("pubMed: %s\n", spRefPubMed(conn, refId));
    }
if (refId != 0)
    {
    printf("other accs associated with last reference:\n\t");
    list = spRefToAccs(conn, refId);
    printPartialList("", "", list, 6);
    slFreeList(&list);
    }
sqlDisconnect(&conn);
}
예제 #4
0
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, 
	char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile)
/* txGeneAlias - Make kgAlias and kgProtAlias tables.. */
{
/* Read and hash oldToNew */
struct hash *newToOldHash = loadNewToOldHash(oldToNew);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);
struct sqlResult *sr;
char **row;
char query[256];

/* Open files. */
struct lineFile *lf = lineFileOpen(xrefFile, TRUE);
FILE *fAlias = mustOpen(aliasFile, "w");
FILE *fProt = mustOpen(protAliasFile, "w");

/* Stream through xref file, which has much of the info we need,
 * and which contains a line for each gene. */
char *words[KGXREF_NUM_COLS];
while (lineFileRowTab(lf, words))
    {
    /* Load the xref, and output most of it's fields as aliases. */
    struct kgXref *x = kgXrefLoad(words);
    char *id = x->kgID;
    outAlias(fAlias, id, x->kgID);
    outAlias(fAlias, id, x->mRNA);
    outAlias(fAlias, id, x->spID);
    outAlias(fAlias, id, x->spDisplayID);
    outAlias(fAlias, id, x->geneSymbol);
    outAlias(fAlias, id, x->refseq);
    outAlias(fAlias, id, x->protAcc);
    char *old = hashFindVal(newToOldHash, id);
    if (old != NULL)
        outAlias(fAlias, id, old);

    /* If we've got a uniProt ID, use that to get more info from uniProt. */
    char *acc = x->spID;
    if (acc[0] != 0)
        {
	/* Get current accession and output a bunch of easy protein aliases. */
	acc = spLookupPrimaryAcc(uConn, acc);
	outProt(fProt, id, acc, acc);
	outProt(fProt, id, acc, x->spDisplayID);
	outProt(fProt, id, acc, x->geneSymbol);
	outProt(fProt, id, acc, x->protAcc);
	if (old != NULL)
	    outProt(fProt, id, acc, old);

	/* Throw in old swissProt accessions. */
	sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc);
	sr = sqlGetResult(uConn, query);
	while ((row = sqlNextRow(sr)) != NULL)
	    {
	    outAlias(fAlias, id, row[0]);
	    outProt(fProt, id, acc, row[0]);
	    }

	/* Throw in gene names that SwissProt knows about */
	struct slName *gene, *geneList = spGenes(uConn, acc);
	for (gene = geneList; gene != NULL; gene = gene->next)
	    {
	    outAlias(fAlias, id, gene->name);
	    outProt(fProt, id, acc, gene->name);
	    }
	slFreeList(&geneList);
	}
    /* Throw in gene names from genbank. */
    /* At some point we may want to restrict this to the primary transcript in a cluster. */
    ev = hashFindVal(evHash,  id);
    if (ev != NULL)
	{
	int i;
	for (i=0; i<ev->accCount; ++i)
	    {
	    sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", acc);
	    int nameId = sqlQuickNum(gConn, query);
	    if (nameId != 0)
		{
		char name[64];
		sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId);
		if (sqlQuickQuery(gConn, query, name, sizeof(name)))
		    outAlias(fAlias, id, name);
		}
	    }
	}

    kgXrefFree(&x);
    }

carefulClose(&fAlias);
carefulClose(&fProt);
}