void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile) /* txGeneAlias - Make kgAlias and kgProtAlias tables.. */ { /* Read and hash oldToNew */ struct hash *newToOldHash = loadNewToOldHash(oldToNew); /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); struct sqlResult *sr; char **row; char query[256]; /* Open files. */ struct lineFile *lf = lineFileOpen(xrefFile, TRUE); FILE *fAlias = mustOpen(aliasFile, "w"); FILE *fProt = mustOpen(protAliasFile, "w"); /* Stream through xref file, which has much of the info we need, * and which contains a line for each gene. */ char *words[KGXREF_NUM_COLS]; while (lineFileRowTab(lf, words)) { /* Load the xref, and output most of it's fields as aliases. */ struct kgXref *x = kgXrefLoad(words); char *id = x->kgID; outAlias(fAlias, id, x->kgID); outAlias(fAlias, id, x->mRNA); outAlias(fAlias, id, x->spID); outAlias(fAlias, id, x->spDisplayID); outAlias(fAlias, id, x->geneSymbol); outAlias(fAlias, id, x->refseq); outAlias(fAlias, id, x->protAcc); char *old = hashFindVal(newToOldHash, id); if (old != NULL) outAlias(fAlias, id, old); /* If we've got a uniProt ID, use that to get more info from uniProt. */ char *acc = x->spID; if ((acc[0] != 0) && (acc = spLookupPrimaryAccMaybe(uConn, acc)) != NULL) { /* Get current accession and output a bunch of easy protein aliases. */ outProt(fProt, id, acc, acc); outProt(fProt, id, acc, x->spDisplayID); outProt(fProt, id, acc, x->geneSymbol); outProt(fProt, id, acc, x->protAcc); if (old != NULL) outProt(fProt, id, acc, old); /* Throw in old swissProt accessions. */ sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc); sr = sqlGetResult(uConn, query); while ((row = sqlNextRow(sr)) != NULL) { outAlias(fAlias, id, row[0]); outProt(fProt, id, acc, row[0]); } /* Throw in gene names that SwissProt knows about */ struct slName *gene, *geneList = spGenes(uConn, acc); for (gene = geneList; gene != NULL; gene = gene->next) { outAlias(fAlias, id, gene->name); outProt(fProt, id, acc, gene->name); } slFreeList(&geneList); } /* Throw in gene names from genbank. */ /* At some point we may want to restrict this to the primary transcript in a cluster. */ ev = hashFindVal(evHash, id); if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", acc); int nameId = sqlQuickNum(gConn, query); if (nameId != 0) { char name[64]; sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId); if (sqlQuickQuery(gConn, query, name, sizeof(name))) outAlias(fAlias, id, name); } } } kgXrefFree(&x); } carefulClose(&fAlias); carefulClose(&fProt); }
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, char *evFile, char *outFile) /* txGeneXref - Make kgXref type table for genes.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct hash *geneToProtHash = makeGeneToProtHash(genePredFile); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); removePickVersions(pick); hashAdd(pickHash, pick->name, pick); } /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { char *kgID = info->name; char *mRNA = ""; char *spID = ""; char *spDisplayID = ""; char *geneSymbol = NULL; char *refseq = ""; char *protAcc = ""; char *description = NULL; char query[256]; char *proteinId = hashMustFindVal(geneToProtHash, info->name); boolean isAb = sameString(info->category, "antibodyParts"); pick = hashFindVal(pickHash, info->name); ev = hashFindVal(evHash, info->name); if (pick != NULL) { /* Fill in the relatively straightforward fields. */ refseq = pick->refSeq; if (info->orfSize > 0) { protAcc = pick->refProt; spID = proteinId; if (sameString(protAcc, spID)) spID = pick->uniProt; if (spID[0] != 0) spDisplayID = spAnyAccToId(uConn, spID); } /* Fill in gene symbol and description from refseq if possible. */ if (refseq[0] != 0) { struct sqlResult *sr; safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'", refseq); sr = sqlGetResult(gConn, query); char **row = sqlNextRow(sr); if (row != NULL) { geneSymbol = cloneString(row[0]); if (!sameWord("unknown protein", row[1])) description = cloneString(row[1]); } sqlFreeResult(&sr); } /* If need be try uniProt for gene symbol and description. */ if (spID[0] != 0 && (geneSymbol == NULL || description == NULL)) { char *acc = spLookupPrimaryAcc(uConn, spID); if (description == NULL) description = spDescription(uConn, acc); if (geneSymbol == NULL) { struct slName *nameList = spGenes(uConn, acc); if (nameList != NULL) geneSymbol = cloneString(nameList->name); slFreeList(&nameList); } } } /* If it's an antibody fragment use that as name. */ if (isAb) { geneSymbol = cloneString("abParts"); description = cloneString("Parts of antibodies, mostly variable regions."); isAb = TRUE; } if (ev == NULL) { mRNA = cloneString(""); if (!isAb) { errAbort("%s is %s but not %s\n", info->name, infoFile, evFile); } } else { mRNA = cloneString(ev->primary); chopSuffix(mRNA); } /* Still no joy? Try genbank RNA records. */ if (geneSymbol == NULL || description == NULL) { if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { if (sameString(geneSymbol, "n/a")) geneSymbol = NULL; } } if (description == NULL) { safef(query, sizeof(query), "select description.name from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } if (geneSymbol == NULL) geneSymbol = mRNA; if (description == NULL) description = mRNA; /* Get rid of some characters that will cause havoc downstream. */ stripChar(geneSymbol, '\''); subChar(geneSymbol, '<', '['); subChar(geneSymbol, '>', ']'); /* Abbreviate geneSymbol if too long */ if (strlen(geneSymbol) > 40) strcpy(geneSymbol+37, "..."); fprintf(f, "%s\t", kgID); fprintf(f, "%s\t", mRNA); fprintf(f, "%s\t", spID); fprintf(f, "%s\t", spDisplayID); fprintf(f, "%s\t", geneSymbol); fprintf(f, "%s\t", refseq); fprintf(f, "%s\t", protAcc); fprintf(f, "%s\n", description); } carefulClose(&f); }