void txGeneColor(char *uniProtDb, char *infoFile, char *pickFile, char *outFile) /* txGeneColor - Figure out color to draw gene in.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } /* Open uniprot database connection. */ struct sqlConnection *uConn = sqlConnect(uniProtDb); #ifdef OLD /* Figure out our light and medium colors. */ mediumBlue.r = (6*trueBlue.r + 4*255)/10; mediumBlue.g = (6*trueBlue.g + 4*255)/10; mediumBlue.b = (6*trueBlue.b + 4*255)/10; lightBlue.r = (1*trueBlue.r + 2*255)/3; lightBlue.g = (1*trueBlue.g + 2*255)/3; lightBlue.b = (1*trueBlue.b + 2*255)/3; #endif /* OLD */ /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { struct rgbColor *col; pick = hashFindVal(pickHash, info->name); if (pick != NULL) { char *source = pick->source; if (sameString(source, "RefPepValidated")) col = &trueBlue; else if (sameString(source, "ccds")) col = &trueBlue; else if (sameString(source, "RefPepReviewed")) col = &trueBlue; else if (sameString(source, "RefSeqValidated")) col = &trueBlue; else if (sameString(source, "RefSeqReviewed")) col = &trueBlue; else if (sameString(source, "swissProt")) col = &trueBlue; else if (startsWith("Ref", source)) col = &mediumBlue; else col = &lightBlue; if (pick->swissProt[0] != 0) { char *acc = spLookupPrimaryAcc(uConn, pick->swissProt); struct slName *pdbList = spPdbAccs(uConn, acc); if (pdbList != NULL) col = &black; slFreeList(&pdbList); } } else col = &lightBlue; fprintf(f, "%s\t%d\t%d\t%d\n", info->name, col->r, col->g, col->b); } carefulClose(&f); }
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, char *evFile, char *outFile) /* txGeneXref - Make kgXref type table for genes.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct hash *geneToProtHash = makeGeneToProtHash(genePredFile); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); removePickVersions(pick); hashAdd(pickHash, pick->name, pick); } /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { char *kgID = info->name; char *mRNA = ""; char *spID = ""; char *spDisplayID = ""; char *geneSymbol = NULL; char *refseq = ""; char *protAcc = ""; char *description = NULL; char query[256]; char *proteinId = hashMustFindVal(geneToProtHash, info->name); boolean isAb = sameString(info->category, "antibodyParts"); pick = hashFindVal(pickHash, info->name); ev = hashFindVal(evHash, info->name); if (pick != NULL) { /* Fill in the relatively straightforward fields. */ refseq = pick->refSeq; if (info->orfSize > 0) { protAcc = pick->refProt; spID = proteinId; if (sameString(protAcc, spID)) spID = pick->uniProt; if (spID[0] != 0) spDisplayID = spAnyAccToId(uConn, spID); } /* Fill in gene symbol and description from refseq if possible. */ if (refseq[0] != 0) { struct sqlResult *sr; safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'", refseq); sr = sqlGetResult(gConn, query); char **row = sqlNextRow(sr); if (row != NULL) { geneSymbol = cloneString(row[0]); if (!sameWord("unknown protein", row[1])) description = cloneString(row[1]); } sqlFreeResult(&sr); } /* If need be try uniProt for gene symbol and description. */ if (spID[0] != 0 && (geneSymbol == NULL || description == NULL)) { char *acc = spLookupPrimaryAcc(uConn, spID); if (description == NULL) description = spDescription(uConn, acc); if (geneSymbol == NULL) { struct slName *nameList = spGenes(uConn, acc); if (nameList != NULL) geneSymbol = cloneString(nameList->name); slFreeList(&nameList); } } } /* If it's an antibody fragment use that as name. */ if (isAb) { geneSymbol = cloneString("abParts"); description = cloneString("Parts of antibodies, mostly variable regions."); isAb = TRUE; } if (ev == NULL) { mRNA = cloneString(""); if (!isAb) { errAbort("%s is %s but not %s\n", info->name, infoFile, evFile); } } else { mRNA = cloneString(ev->primary); chopSuffix(mRNA); } /* Still no joy? Try genbank RNA records. */ if (geneSymbol == NULL || description == NULL) { if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { if (sameString(geneSymbol, "n/a")) geneSymbol = NULL; } } if (description == NULL) { safef(query, sizeof(query), "select description.name from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } if (geneSymbol == NULL) geneSymbol = mRNA; if (description == NULL) description = mRNA; /* Get rid of some characters that will cause havoc downstream. */ stripChar(geneSymbol, '\''); subChar(geneSymbol, '<', '['); subChar(geneSymbol, '>', ']'); /* Abbreviate geneSymbol if too long */ if (strlen(geneSymbol) > 40) strcpy(geneSymbol+37, "..."); fprintf(f, "%s\t", kgID); fprintf(f, "%s\t", mRNA); fprintf(f, "%s\t", spID); fprintf(f, "%s\t", spDisplayID); fprintf(f, "%s\t", geneSymbol); fprintf(f, "%s\t", refseq); fprintf(f, "%s\t", protAcc); fprintf(f, "%s\n", description); } carefulClose(&f); }
void spTest(char *database, char *someAcc) /* spTest - Test out sp library.. */ { struct sqlConnection *conn = sqlConnect(database); char *acc, *id, *binomial, *common; struct slName *geneList, *gene, *accList, *n, *list; struct slName *nameList, *name, *keyList, *key, *typeList, *type; struct spFeature *featList, *feat; struct spCitation *citeList, *cite; char *ret = NULL; int taxon; int classId = 0, typeId = 0, refId = 0; printf("input: %s\n", someAcc); acc = spLookupPrimaryAcc(conn, someAcc); printf("primary accession: %s\n", acc); id = spAccToId(conn, acc); printf("SwissProt id: %s\n", id); printf("acc from id: %s\n", spIdToAcc(conn, id)); ret = spOrganelle(conn, acc); printf("organelle: %s\n", (ret == NULL) ? "(null)" : ret); printf("isCurated: %d\n", spIsCurated(conn, acc)); printf("aaSize: %d\n", spAaSize(conn,acc)); printf("molWeight: %d\n", spMolWeight(conn,acc)); printf("createDate: %s\n", spCreateDate(conn,acc)); printf("seqDate: %s\n", spSeqDate(conn,acc)); printf("annDate: %s\n", spAnnDate(conn,acc)); printf("description: %s\n", spDescription(conn, acc)); taxon = spTaxon(conn, acc); printf("taxon: %d\n", taxon); binomial = spTaxonToBinomial(conn, taxon); printf("first scientific name: %s\n", binomial); common = spTaxonToCommon(conn, taxon); printf("first common name: %s\n", common); printf("taxon from sci: %d\n", spBinomialToTaxon(conn, binomial)); printf("taxon from common: %d\n", spCommonToTaxon(conn, common)); printf("all scientific names:"); nameList = spBinomialNames(conn, acc); for (name = nameList; name != NULL; name = name->next) printf(" %s,", name->name); printf("\n"); printf("gene(s):"); geneList = spGenes(conn,acc); for (gene=geneList; gene != NULL; gene = gene->next) printf(" %s,", gene->name); printf("\n"); for (gene=geneList; gene != NULL; gene = gene->next) { accList = spGeneToAccs(conn, gene->name, 0); printf(" any %s:", gene->name); for (n = accList; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); slFreeList(&accList); printf(" %s %s:", common, gene->name); accList = spGeneToAccs(conn, gene->name, taxon); for (n = accList; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); slFreeList(&accList); } slFreeList(&geneList); printf("keyword(s):"); keyList = spKeywords(conn, acc); for (key = keyList; key != NULL; key = key->next) printf(" %s,", key->name); printf("\n"); for (key = keyList; key != NULL; key = key->next) { accList = spKeywordSearch(conn, key->name, taxon); printPartialList(common, key->name, accList, 4); slFreeList(&accList); break; /* This is a little slow, once is enough. */ } for (key = keyList; key != NULL; key = key->next) { accList = spKeywordSearch(conn, key->name, 0); printPartialList("all", key->name, accList, 4); slFreeList(&accList); break; /* This is a little slow, once is enough. */ } slFreeList(&keyList); printf("All comments:\n"); list = slComments(conn, acc, NULL); for (n = list; n != NULL; n = n->next) printf(" %s\n", n->name); slFreeList(&list); typeList = slCommentTypes(conn); for (type = typeList; type != NULL; type = type->next) { list = slComments(conn, acc, type->name); if (list != NULL) { printf("%s comments:\n", type->name); for (n = list; n != NULL; n = n->next) printf(" %s\n", n->name); slFreeList(&list); } } slFreeList(&typeList); list = spEmblAccs(conn, acc); printf("GenBank/EMBL:"); for (n = list; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); if (list != NULL) printf("acc from %s: %s\n", list->name, spAccFromEmbl(conn, list->name)); slFreeList(&list); list = spPdbAccs(conn, acc); printf("PDB:"); for (n = list; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); featList = spFeatures(conn, acc, 0, 0); printf("All features:\n"); for (feat = featList; feat != NULL; feat = feat->next) { printFeat(conn, feat); classId = feat->featureClass; typeId = feat->featureType; } slFreeList(&featList); if (classId != 0 && typeId != 0) { printf("%s class features:\n", spFeatureClassName(conn, classId)); featList = spFeatures(conn, acc, classId, 0); for (feat = featList; feat != NULL; feat = feat->next) printFeat(conn, feat); slFreeList(&featList); printf("%s type features:\n", spFeatureTypeName(conn, typeId)); featList = spFeatures(conn, acc, 0, typeId); for (feat = featList; feat != NULL; feat = feat->next) printFeat(conn, feat); slFreeList(&featList); printf("same class & type features:\n"); featList = spFeatures(conn, acc, classId, typeId); for (feat = featList; feat != NULL; feat = feat->next) printFeat(conn, feat); slFreeList(&featList); printf("class loop: %d->%s->%d\n", classId, spFeatureClassName(conn, classId), spFeatureClassId(conn, spFeatureClassName(conn, classId))); printf("type loop: %d->%s->%d\n", typeId, spFeatureTypeName(conn, typeId), spFeatureTypeId(conn, spFeatureTypeName(conn, typeId))); } citeList = spCitations(conn, acc); for (cite = citeList; cite != NULL; cite = cite->next) { refId = cite->reference; printf("title: %s\n", spRefTitle(conn, refId)); printf("authors:"); list = spRefAuthors(conn, refId); for (n = list; n != NULL; n = n->next) printf(" %s, ", n->name); printf("\n"); slFreeList(&list); printf("location: %s\n", spRefCite(conn, refId)); printf("pubMed: %s\n", spRefPubMed(conn, refId)); } if (refId != 0) { printf("other accs associated with last reference:\n\t"); list = spRefToAccs(conn, refId); printPartialList("", "", list, 6); slFreeList(&list); } sqlDisconnect(&conn); }
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile) /* txGeneAlias - Make kgAlias and kgProtAlias tables.. */ { /* Read and hash oldToNew */ struct hash *newToOldHash = loadNewToOldHash(oldToNew); /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); struct sqlResult *sr; char **row; char query[256]; /* Open files. */ struct lineFile *lf = lineFileOpen(xrefFile, TRUE); FILE *fAlias = mustOpen(aliasFile, "w"); FILE *fProt = mustOpen(protAliasFile, "w"); /* Stream through xref file, which has much of the info we need, * and which contains a line for each gene. */ char *words[KGXREF_NUM_COLS]; while (lineFileRowTab(lf, words)) { /* Load the xref, and output most of it's fields as aliases. */ struct kgXref *x = kgXrefLoad(words); char *id = x->kgID; outAlias(fAlias, id, x->kgID); outAlias(fAlias, id, x->mRNA); outAlias(fAlias, id, x->spID); outAlias(fAlias, id, x->spDisplayID); outAlias(fAlias, id, x->geneSymbol); outAlias(fAlias, id, x->refseq); outAlias(fAlias, id, x->protAcc); char *old = hashFindVal(newToOldHash, id); if (old != NULL) outAlias(fAlias, id, old); /* If we've got a uniProt ID, use that to get more info from uniProt. */ char *acc = x->spID; if (acc[0] != 0) { /* Get current accession and output a bunch of easy protein aliases. */ acc = spLookupPrimaryAcc(uConn, acc); outProt(fProt, id, acc, acc); outProt(fProt, id, acc, x->spDisplayID); outProt(fProt, id, acc, x->geneSymbol); outProt(fProt, id, acc, x->protAcc); if (old != NULL) outProt(fProt, id, acc, old); /* Throw in old swissProt accessions. */ sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc); sr = sqlGetResult(uConn, query); while ((row = sqlNextRow(sr)) != NULL) { outAlias(fAlias, id, row[0]); outProt(fProt, id, acc, row[0]); } /* Throw in gene names that SwissProt knows about */ struct slName *gene, *geneList = spGenes(uConn, acc); for (gene = geneList; gene != NULL; gene = gene->next) { outAlias(fAlias, id, gene->name); outProt(fProt, id, acc, gene->name); } slFreeList(&geneList); } /* Throw in gene names from genbank. */ /* At some point we may want to restrict this to the primary transcript in a cluster. */ ev = hashFindVal(evHash, id); if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", acc); int nameId = sqlQuickNum(gConn, query); if (nameId != 0) { char name[64]; sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId); if (sqlQuickQuery(gConn, query, name, sizeof(name))) outAlias(fAlias, id, name); } } } kgXrefFree(&x); } carefulClose(&fAlias); carefulClose(&fProt); }