Exemple #1
0
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile,
	char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile)
/* txGeneAlias - Make kgAlias and kgProtAlias tables.
 *   genomeDb      - database queried for the gbCdnaInfo and geneName tables.
 *   uniProtDb     - database queried for otherAcc and through the sp* helpers.
 *   xrefFile      - tab-separated kgXref rows; aliases are emitted for each row.
 *   evFile        - txRnaAccs records, hashed by name and looked up by gene id.
 *   oldToNew      - handed to loadNewToOldHash() to find the old id of a gene.
 *   aliasFile     - output file, written through outAlias().
 *   protAliasFile - output file, written through outProt(). */
{
/* Read and hash oldToNew */
struct hash *newToOldHash = loadNewToOldHash(oldToNew);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases.
 * NOTE(review): neither connection nor lf is released before returning. */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);
struct sqlResult *sr;
char **row;
char query[256];

/* Open files. */
struct lineFile *lf = lineFileOpen(xrefFile, TRUE);
FILE *fAlias = mustOpen(aliasFile, "w");
FILE *fProt = mustOpen(protAliasFile, "w");

/* Stream through xref file, which has much of the info we need,
 * and which contains a line for each gene. */
char *words[KGXREF_NUM_COLS];
while (lineFileRowTab(lf, words))
    {
    /* Load the xref, and output most of its fields as aliases. */
    struct kgXref *x = kgXrefLoad(words);
    char *id = x->kgID;
    outAlias(fAlias, id, x->kgID);
    outAlias(fAlias, id, x->mRNA);
    outAlias(fAlias, id, x->spID);
    outAlias(fAlias, id, x->spDisplayID);
    outAlias(fAlias, id, x->geneSymbol);
    outAlias(fAlias, id, x->refseq);
    outAlias(fAlias, id, x->protAcc);
    char *old = hashFindVal(newToOldHash, id);
    if (old != NULL)
        outAlias(fAlias, id, old);

    /* If we've got a uniProt ID, use that to get more info from uniProt.
     * The lookup is only attempted for a non-empty spID; acc stays NULL when
     * there is no ID or the lookup finds nothing. */
    char *acc = NULL;
    if (x->spID[0] != 0)
        acc = spLookupPrimaryAccMaybe(uConn, x->spID);
    if (acc != NULL)
        {
        /* Get current accession and output a bunch of easy protein aliases. */
        outProt(fProt, id, acc, acc);
        outProt(fProt, id, acc, x->spDisplayID);
        outProt(fProt, id, acc, x->geneSymbol);
        outProt(fProt, id, acc, x->protAcc);
        if (old != NULL)
            outProt(fProt, id, acc, old);

        /* Throw in old swissProt accessions.
         * NOTE(review): sr is not released after the rows are consumed. */
        sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc);
        sr = sqlGetResult(uConn, query);
        while ((row = sqlNextRow(sr)) != NULL)
            {
            outAlias(fAlias, id, row[0]);
            outProt(fProt, id, acc, row[0]);
            }

        /* Throw in gene names that SwissProt knows about */
        struct slName *gene, *geneList = spGenes(uConn, acc);
        for (gene = geneList; gene != NULL; gene = gene->next)
            {
            outAlias(fAlias, id, gene->name);
            outProt(fProt, id, acc, gene->name);
            }
        slFreeList(&geneList);
        }

    /* Throw in gene names from genbank. */
    /* At some point we may want to restrict this to the primary transcript in a cluster. */
    ev = hashFindVal(evHash,  id);
    if (ev != NULL)
        {
        int i;
        for (i=0; i<ev->accCount; ++i)
            {
            /* Look up each RNA accession of the evidence record in turn.  The
             * query must use the i-th evidence accession, not the uniProt
             * accession handled above (which may also be NULL here).
             * NOTE(review): txGeneXref strips a version suffix with
             * chopSuffix() before the same gbCdnaInfo lookup; confirm whether
             * these accessions need the same treatment. */
            char *rnaAcc = ev->accs[i];
            sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", rnaAcc);
            int nameId = sqlQuickNum(gConn, query);
            if (nameId != 0)
                {
                char name[64];
                sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId);
                if (sqlQuickQuery(gConn, query, name, sizeof(name)))
                    outAlias(fAlias, id, name);
                }
            }
        }

    kgXrefFree(&x);
    }

carefulClose(&fAlias);
carefulClose(&fProt);
}
Exemple #2
0
void spTest(char *database, char *someAcc)
/* spTest - Test out sp library.. */
{
struct sqlConnection *conn = sqlConnect(database);
char *acc, *id, *binomial, *common;
struct slName *geneList, *gene, *accList, *n, *list;
struct slName *nameList, *name, *keyList, *key, *typeList, *type;
struct spFeature *featList, *feat;
struct spCitation *citeList, *cite;
char *ret = NULL;
int taxon;
int classId = 0, typeId = 0, refId = 0;

printf("input: %s\n", someAcc);
acc = spLookupPrimaryAcc(conn, someAcc);
printf("primary accession: %s\n", acc);
id = spAccToId(conn, acc);
printf("SwissProt id: %s\n", id);
printf("acc from id: %s\n", spIdToAcc(conn, id));
ret = spOrganelle(conn, acc);
printf("organelle: %s\n", (ret == NULL) ? "(null)" : ret);
printf("isCurated: %d\n", spIsCurated(conn, acc));
printf("aaSize: %d\n", spAaSize(conn,acc));
printf("molWeight: %d\n", spMolWeight(conn,acc));
printf("createDate: %s\n", spCreateDate(conn,acc));
printf("seqDate: %s\n", spSeqDate(conn,acc));
printf("annDate: %s\n", spAnnDate(conn,acc));
printf("description: %s\n", spDescription(conn, acc));
taxon = spTaxon(conn, acc);
printf("taxon: %d\n", taxon);
binomial = spTaxonToBinomial(conn, taxon);
printf("first scientific name: %s\n", binomial);
common = spTaxonToCommon(conn, taxon);
printf("first common name: %s\n", common);
printf("taxon from sci: %d\n", spBinomialToTaxon(conn, binomial));
printf("taxon from common: %d\n", spCommonToTaxon(conn, common));
printf("all scientific names:");
nameList = spBinomialNames(conn, acc);
for (name = nameList; name != NULL; name = name->next)
    printf(" %s,", name->name);
printf("\n");
printf("gene(s):");
geneList = spGenes(conn,acc);
for (gene=geneList; gene != NULL; gene = gene->next)
    printf(" %s,", gene->name);
printf("\n");
for (gene=geneList; gene != NULL; gene = gene->next)
    {
    accList = spGeneToAccs(conn, gene->name, 0);
    printf(" any %s:", gene->name);
    for (n = accList; n != NULL; n = n->next)
        printf(" %s,", n->name);
    printf("\n");
    slFreeList(&accList);
    printf(" %s %s:", common, gene->name);
    accList = spGeneToAccs(conn, gene->name, taxon);
    for (n = accList; n != NULL; n = n->next)
        printf(" %s,", n->name);
    printf("\n");
    slFreeList(&accList);
    }
slFreeList(&geneList);
printf("keyword(s):");
keyList = spKeywords(conn, acc);
for (key = keyList; key != NULL; key = key->next)
    printf(" %s,", key->name);
printf("\n");
for (key = keyList; key != NULL; key = key->next)
    {
    accList = spKeywordSearch(conn, key->name, taxon);
    printPartialList(common, key->name, accList, 4);
    slFreeList(&accList);
    break;	/* This is a little slow, once is enough. */
    }
for (key = keyList; key != NULL; key = key->next)
    {
    accList = spKeywordSearch(conn, key->name, 0);
    printPartialList("all", key->name, accList, 4);
    slFreeList(&accList);
    break;	/* This is a little slow, once is enough. */
    }
slFreeList(&keyList);

printf("All comments:\n");
list = slComments(conn, acc, NULL);
for (n = list; n != NULL; n = n->next)
    printf(" %s\n", n->name);
slFreeList(&list);

typeList = slCommentTypes(conn);
for (type = typeList; type != NULL; type = type->next)
    {
    list = slComments(conn, acc, type->name);
    if (list != NULL)
	{
	printf("%s comments:\n", type->name);
	for (n = list; n != NULL; n = n->next)
	    printf(" %s\n", n->name);
	slFreeList(&list);
	}
    }
slFreeList(&typeList);

list = spEmblAccs(conn, acc);
printf("GenBank/EMBL:");
for (n = list; n != NULL; n = n->next)
    printf(" %s,", n->name);
printf("\n");
if (list != NULL)
    printf("acc from %s: %s\n", 
    	list->name, spAccFromEmbl(conn, list->name));
slFreeList(&list);

list = spPdbAccs(conn, acc);
printf("PDB:");
for (n = list; n != NULL; n = n->next)
    printf(" %s,", n->name);
printf("\n");

featList = spFeatures(conn, acc, 0, 0);
printf("All features:\n");
for (feat = featList; feat != NULL; feat = feat->next)
    {
    printFeat(conn, feat);
    classId = feat->featureClass;
    typeId = feat->featureType;
    }
slFreeList(&featList);
if (classId != 0 && typeId != 0)
    {
    printf("%s class features:\n", spFeatureClassName(conn, classId));
    featList = spFeatures(conn, acc, classId, 0);
    for (feat = featList; feat != NULL; feat = feat->next)
	printFeat(conn, feat);
    slFreeList(&featList);
    printf("%s type features:\n", spFeatureTypeName(conn, typeId));
    featList = spFeatures(conn, acc, 0, typeId);
    for (feat = featList; feat != NULL; feat = feat->next)
	printFeat(conn, feat);
    slFreeList(&featList);
    printf("same class & type features:\n");
    featList = spFeatures(conn, acc, classId, typeId);
    for (feat = featList; feat != NULL; feat = feat->next)
	printFeat(conn, feat);
    slFreeList(&featList);
    printf("class loop: %d->%s->%d\n", classId, 
    	spFeatureClassName(conn, classId),
	spFeatureClassId(conn, spFeatureClassName(conn, classId)));
    printf("type loop: %d->%s->%d\n", typeId, 
    	spFeatureTypeName(conn, typeId),
	spFeatureTypeId(conn, spFeatureTypeName(conn, typeId)));
    }

citeList = spCitations(conn, acc);
for (cite = citeList; cite != NULL; cite = cite->next)
    {
    refId = cite->reference;
    printf("title: %s\n", spRefTitle(conn, refId));
    printf("authors:");
    list = spRefAuthors(conn, refId);
    for (n = list; n != NULL; n = n->next)
        printf(" %s, ", n->name);
    printf("\n");
    slFreeList(&list);
    printf("location: %s\n", spRefCite(conn, refId));
    printf("pubMed: %s\n", spRefPubMed(conn, refId));
    }
if (refId != 0)
    {
    printf("other accs associated with last reference:\n\t");
    list = spRefToAccs(conn, refId);
    printPartialList("", "", list, 6);
    slFreeList(&list);
    }
sqlDisconnect(&conn);
}
Exemple #3
0
void spDbAddVarSplice(char *database, char *inFile, char *outDir)
/* spDbAddVarSplice - This adds information on the variant splices to the sp/uniProt database.
 * Reads peptide records from inFile, whose names are expected to look like
 * accession-N|DISP_ID (optionally prefixed with sp|), and writes one
 * tab-separated file per table into outDir: varProtein.txt, varAcc.txt,
 * varDisplayId.txt, varAccToTaxon.txt, varDescription.txt, varGene.txt and
 * varGeneLogic.txt.  Within this function database is only read, to copy
 * taxon, description, gene logic and genes from the base accession.
 * Aborts via errAbort() on a record name that fails the sanity checks. */
{
struct sqlConnection *conn = sqlConnect(database);
char query[256];
makeDir(outDir);
FILE *varProtein = openToWrite(outDir, "varProtein.txt");
FILE *varAcc = openToWrite(outDir, "varAcc.txt");
FILE *varDisplayId = openToWrite(outDir, "varDisplayId.txt");
FILE *varAccToTaxon = openToWrite(outDir, "varAccToTaxon.txt");
FILE *varDescription = openToWrite(outDir, "varDescription.txt");
FILE *varGene = openToWrite(outDir, "varGene.txt");
FILE *varGeneLogic = openToWrite(outDir, "varGeneLogic.txt");
/* NOTE(review): lf is not closed before returning. */
struct lineFile *lf = lineFileOpen(inFile, TRUE);
aaSeq seq;
ZeroVar(&seq);
while (faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    char *row[4];
    char *name = seq.name;
    if (startsWith("sp|", name))	// Skip over sp| introduced Aug 2009
        name += 3;

    /* Split the name on '-' and '|' into accession, version and display id. */
    int rowSize = chopString(name, "-|", row, ArraySize(row));
    if (rowSize != 3)
        errAbort("Expecting name to be in format accession-N|DISP_ID, got %s\n", name);
    char *acc = row[0];
    char *version = row[1];
    char *displayId = row[2];
    int accLen = strlen(acc);
    int verLen = strlen(version);
    int displayIdLen = strlen(displayId);

    /* Do some tests.  The <ctype.h> calls get an unsigned char so that a byte
     * with the high bit set cannot become a negative argument. */
    if ((accLen != 6 && accLen != 10) || isdigit((unsigned char)acc[0])
        || !isdigit((unsigned char)acc[accLen-1]))
        errAbort("wierd accession %s before line %d of %s", acc, lf->lineIx, lf->fileName);
    if (!isdigit((unsigned char)version[0]) || verLen > 4)
        errAbort("wierd version %s before line %d of %s", version, lf->lineIx, lf->fileName);
    if (countChars(displayId, '_') != 1 || displayIdLen < 6 || displayIdLen > 16)
        errAbort("wierd displayId %s before line %d of %s", displayId, lf->lineIx, lf->fileName);
    /* "acc-version" plus its terminator has to fit in an SpAcc.  sizeof yields
     * a size_t, so it is cast to match the %lu conversion. */
    if (accLen + 1 + verLen >= sizeof(SpAcc))
        errAbort("Need to increase size of SpAcc in spDb.h because of %s-%s - need %d characters but only have %lu", acc, version, accLen + 1 + verLen, (unsigned long)sizeof(SpAcc));

    /* Print out parsed results.  The display id column is written as
     * acc-version; the parsed displayId is only validated above. */
    fprintf(varAcc, "%s-%s\t%s\t%s\n", acc, version, acc, version);
    fprintf(varProtein, "%s-%s\t%s\n", acc, version, seq.dna);
    fprintf(varDisplayId, "%s-%s\t%s-%s\n", acc, version, acc, version);

    /* Look up taxon of base protein and use it to write to varAccToTaxon table. */
    int taxon = spTaxon(conn, acc);
    fprintf(varAccToTaxon, "%s-%s\t%d\n", acc, version, taxon);

    /* Transfer description. */
    char *description = spDescription(conn, acc);
    fprintf(varDescription, "%s-%s\t%s\n", acc, version, description);
    freez(&description);

    /* Transfer gene logic. */
    sqlSafef(query, sizeof(query), "select val from geneLogic where acc = '%s'", acc);
    char *geneLogic = sqlQuickString(conn, query);
    if (geneLogic != NULL)
        fprintf(varGeneLogic, "%s-%s\t%s\n", acc, version, geneLogic);
    freez(&geneLogic);

    /* Transfer genes. */
    struct slName *gene, *geneList = spGenes(conn, acc);
    for (gene = geneList; gene != NULL; gene = gene->next)
        fprintf(varGene, "%s-%s\t%s\n", acc, version, gene->name);
    slFreeList(&geneList);
    }
carefulClose(&varAcc);
carefulClose(&varProtein);
carefulClose(&varDisplayId);
carefulClose(&varAccToTaxon);
carefulClose(&varDescription);
carefulClose(&varGene);
carefulClose(&varGeneLogic);
sqlDisconnect(&conn);
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.
 * For every record of infoFile one tab-separated line is written to outFile
 * with the columns kgID, mRNA, spID, spDisplayID, geneSymbol, refseq, protAcc
 * and description.  Gene symbol and description are taken, in order of
 * preference, from refLink in genomeDb, from uniProtDb, and from the genbank
 * RNA records of the evidence in evFile; the mRNA accession is the fallback.
 * Records whose category is "antibodyParts" always get the fixed "abParts"
 * symbol and description.  Aborts via errAbort() when a non-antibody record
 * has no entry in evFile. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
        {
        /* Fill in the relatively straightforward fields. */
        refseq = pick->refSeq;
        if (info->orfSize > 0)
            {
            protAcc = pick->refProt;
            spID = proteinId;
            /* When the protein id is just the RefSeq protein, use the
             * pick's uniProt accession as the SwissProt id instead. */
            if (sameString(protAcc, spID))
                spID = pick->uniProt;
            if (spID[0] != 0)
                spDisplayID = spAnyAccToId(uConn, spID);
            }

        /* Fill in gene symbol and description from refseq if possible.
         * NOTE(review): the other SQL builders in this file use sqlSafef();
         * confirm whether plain safef() is intended for the queries here. */
        if (refseq[0] != 0)
            {
            struct sqlResult *sr;
            safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'",
                refseq);
            sr = sqlGetResult(gConn, query);
            /* Named refRow so it does not shadow the pick-file row above. */
            char **refRow = sqlNextRow(sr);
            if (refRow != NULL)
                {
                geneSymbol = cloneString(refRow[0]);
                if (!sameWord("unknown protein", refRow[1]))
                    description = cloneString(refRow[1]);
                }
            sqlFreeResult(&sr);
            }

        /* If need be try uniProt for gene symbol and description. */
        if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
            {
            char *acc = spLookupPrimaryAcc(uConn, spID);
            if (description == NULL)
                description = spDescription(uConn, acc);
            if (geneSymbol == NULL)
                {
                struct slName *nameList = spGenes(uConn, acc);
                if (nameList != NULL)
                    geneSymbol = cloneString(nameList->name);
                slFreeList(&nameList);
                }
            }
        }

    /* If it's an antibody fragment use that as name.  This replaces whatever
     * symbol and description were found above. */
    if (isAb)
        {
        geneSymbol = cloneString("abParts");
        description = cloneString("Parts of antibodies, mostly variable regions.");
        }

    /* Only antibody parts are allowed to lack an evidence record. */
    if (ev == NULL)
        {
        mRNA = cloneString("");
        if (!isAb)
            {
            errAbort("%s is in %s but not %s\n", info->name, infoFile, evFile);
            }
        }
    else
        {
        mRNA = cloneString(ev->primary);
        chopSuffix(mRNA);
        }

    /* Still no joy? Try genbank RNA records. */
    if (geneSymbol == NULL || description == NULL)
        {
        if (ev != NULL)
            {
            int i;
            for (i=0; i<ev->accCount; ++i)
                {
                /* chopSuffix() edits the accession stored in ev in place. */
                char *acc = ev->accs[i];
                chopSuffix(acc);
                if (geneSymbol == NULL)
                    {
                    safef(query, sizeof(query), 
                        "select geneName.name from gbCdnaInfo,geneName "
                        "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
                    geneSymbol = sqlQuickString(gConn, query);
                    /* "n/a" counts as no answer, so later accessions are tried. */
                    if (geneSymbol != NULL)
                        {
                        if (sameString(geneSymbol, "n/a"))
                            geneSymbol = NULL;
                        }
                    }
                if (description == NULL)
                    {
                    safef(query, sizeof(query), 
                        "select description.name from gbCdnaInfo,description "
                        "where description.id=gbCdnaInfo.description "
                        "and gbCdnaInfo.acc = '%s'", acc);
                    description = sqlQuickString(gConn, query);
                    if (description != NULL)
                        {
                        if (sameString(description, "n/a"))
                            description = NULL;
                        }
                    }
                }
            }
        }

    /* Last resort: fall back to the mRNA accession.  Note that geneSymbol
     * then shares mRNA's buffer, so the clean-up below edits both. */
    if (geneSymbol == NULL)
        geneSymbol = mRNA;
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long: keep 37 characters and add "...". */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
}