void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile) /* txGeneAlias - Make kgAlias and kgProtAlias tables.. */ { /* Read and hash oldToNew */ struct hash *newToOldHash = loadNewToOldHash(oldToNew); /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); struct sqlResult *sr; char **row; char query[256]; /* Open files. */ struct lineFile *lf = lineFileOpen(xrefFile, TRUE); FILE *fAlias = mustOpen(aliasFile, "w"); FILE *fProt = mustOpen(protAliasFile, "w"); /* Stream through xref file, which has much of the info we need, * and which contains a line for each gene. */ char *words[KGXREF_NUM_COLS]; while (lineFileRowTab(lf, words)) { /* Load the xref, and output most of it's fields as aliases. */ struct kgXref *x = kgXrefLoad(words); char *id = x->kgID; outAlias(fAlias, id, x->kgID); outAlias(fAlias, id, x->mRNA); outAlias(fAlias, id, x->spID); outAlias(fAlias, id, x->spDisplayID); outAlias(fAlias, id, x->geneSymbol); outAlias(fAlias, id, x->refseq); outAlias(fAlias, id, x->protAcc); char *old = hashFindVal(newToOldHash, id); if (old != NULL) outAlias(fAlias, id, old); /* If we've got a uniProt ID, use that to get more info from uniProt. */ char *acc = x->spID; if ((acc[0] != 0) && (acc = spLookupPrimaryAccMaybe(uConn, acc)) != NULL) { /* Get current accession and output a bunch of easy protein aliases. */ outProt(fProt, id, acc, acc); outProt(fProt, id, acc, x->spDisplayID); outProt(fProt, id, acc, x->geneSymbol); outProt(fProt, id, acc, x->protAcc); if (old != NULL) outProt(fProt, id, acc, old); /* Throw in old swissProt accessions. */ sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc); sr = sqlGetResult(uConn, query); while ((row = sqlNextRow(sr)) != NULL) { outAlias(fAlias, id, row[0]); outProt(fProt, id, acc, row[0]); } /* Throw in gene names that SwissProt knows about */ struct slName *gene, *geneList = spGenes(uConn, acc); for (gene = geneList; gene != NULL; gene = gene->next) { outAlias(fAlias, id, gene->name); outProt(fProt, id, acc, gene->name); } slFreeList(&geneList); } /* Throw in gene names from genbank. */ /* At some point we may want to restrict this to the primary transcript in a cluster. */ ev = hashFindVal(evHash, id); if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", acc); int nameId = sqlQuickNum(gConn, query); if (nameId != 0) { char name[64]; sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId); if (sqlQuickQuery(gConn, query, name, sizeof(name))) outAlias(fAlias, id, name); } } } kgXrefFree(&x); } carefulClose(&fAlias); carefulClose(&fProt); }
void spTest(char *database, char *someAcc) /* spTest - Test out sp library.. */ { struct sqlConnection *conn = sqlConnect(database); char *acc, *id, *binomial, *common; struct slName *geneList, *gene, *accList, *n, *list; struct slName *nameList, *name, *keyList, *key, *typeList, *type; struct spFeature *featList, *feat; struct spCitation *citeList, *cite; char *ret = NULL; int taxon; int classId = 0, typeId = 0, refId = 0; printf("input: %s\n", someAcc); acc = spLookupPrimaryAcc(conn, someAcc); printf("primary accession: %s\n", acc); id = spAccToId(conn, acc); printf("SwissProt id: %s\n", id); printf("acc from id: %s\n", spIdToAcc(conn, id)); ret = spOrganelle(conn, acc); printf("organelle: %s\n", (ret == NULL) ? "(null)" : ret); printf("isCurated: %d\n", spIsCurated(conn, acc)); printf("aaSize: %d\n", spAaSize(conn,acc)); printf("molWeight: %d\n", spMolWeight(conn,acc)); printf("createDate: %s\n", spCreateDate(conn,acc)); printf("seqDate: %s\n", spSeqDate(conn,acc)); printf("annDate: %s\n", spAnnDate(conn,acc)); printf("description: %s\n", spDescription(conn, acc)); taxon = spTaxon(conn, acc); printf("taxon: %d\n", taxon); binomial = spTaxonToBinomial(conn, taxon); printf("first scientific name: %s\n", binomial); common = spTaxonToCommon(conn, taxon); printf("first common name: %s\n", common); printf("taxon from sci: %d\n", spBinomialToTaxon(conn, binomial)); printf("taxon from common: %d\n", spCommonToTaxon(conn, common)); printf("all scientific names:"); nameList = spBinomialNames(conn, acc); for (name = nameList; name != NULL; name = name->next) printf(" %s,", name->name); printf("\n"); printf("gene(s):"); geneList = spGenes(conn,acc); for (gene=geneList; gene != NULL; gene = gene->next) printf(" %s,", gene->name); printf("\n"); for (gene=geneList; gene != NULL; gene = gene->next) { accList = spGeneToAccs(conn, gene->name, 0); printf(" any %s:", gene->name); for (n = accList; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); slFreeList(&accList); printf(" %s %s:", common, gene->name); accList = spGeneToAccs(conn, gene->name, taxon); for (n = accList; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); slFreeList(&accList); } slFreeList(&geneList); printf("keyword(s):"); keyList = spKeywords(conn, acc); for (key = keyList; key != NULL; key = key->next) printf(" %s,", key->name); printf("\n"); for (key = keyList; key != NULL; key = key->next) { accList = spKeywordSearch(conn, key->name, taxon); printPartialList(common, key->name, accList, 4); slFreeList(&accList); break; /* This is a little slow, once is enough. */ } for (key = keyList; key != NULL; key = key->next) { accList = spKeywordSearch(conn, key->name, 0); printPartialList("all", key->name, accList, 4); slFreeList(&accList); break; /* This is a little slow, once is enough. */ } slFreeList(&keyList); printf("All comments:\n"); list = slComments(conn, acc, NULL); for (n = list; n != NULL; n = n->next) printf(" %s\n", n->name); slFreeList(&list); typeList = slCommentTypes(conn); for (type = typeList; type != NULL; type = type->next) { list = slComments(conn, acc, type->name); if (list != NULL) { printf("%s comments:\n", type->name); for (n = list; n != NULL; n = n->next) printf(" %s\n", n->name); slFreeList(&list); } } slFreeList(&typeList); list = spEmblAccs(conn, acc); printf("GenBank/EMBL:"); for (n = list; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); if (list != NULL) printf("acc from %s: %s\n", list->name, spAccFromEmbl(conn, list->name)); slFreeList(&list); list = spPdbAccs(conn, acc); printf("PDB:"); for (n = list; n != NULL; n = n->next) printf(" %s,", n->name); printf("\n"); featList = spFeatures(conn, acc, 0, 0); printf("All features:\n"); for (feat = featList; feat != NULL; feat = feat->next) { printFeat(conn, feat); classId = feat->featureClass; typeId = feat->featureType; } slFreeList(&featList); if (classId != 0 && typeId != 0) { printf("%s class features:\n", spFeatureClassName(conn, classId)); featList = spFeatures(conn, acc, classId, 0); for (feat = featList; feat != NULL; feat = feat->next) printFeat(conn, feat); slFreeList(&featList); printf("%s type features:\n", spFeatureTypeName(conn, typeId)); featList = spFeatures(conn, acc, 0, typeId); for (feat = featList; feat != NULL; feat = feat->next) printFeat(conn, feat); slFreeList(&featList); printf("same class & type features:\n"); featList = spFeatures(conn, acc, classId, typeId); for (feat = featList; feat != NULL; feat = feat->next) printFeat(conn, feat); slFreeList(&featList); printf("class loop: %d->%s->%d\n", classId, spFeatureClassName(conn, classId), spFeatureClassId(conn, spFeatureClassName(conn, classId))); printf("type loop: %d->%s->%d\n", typeId, spFeatureTypeName(conn, typeId), spFeatureTypeId(conn, spFeatureTypeName(conn, typeId))); } citeList = spCitations(conn, acc); for (cite = citeList; cite != NULL; cite = cite->next) { refId = cite->reference; printf("title: %s\n", spRefTitle(conn, refId)); printf("authors:"); list = spRefAuthors(conn, refId); for (n = list; n != NULL; n = n->next) printf(" %s, ", n->name); printf("\n"); slFreeList(&list); printf("location: %s\n", spRefCite(conn, refId)); printf("pubMed: %s\n", spRefPubMed(conn, refId)); } if (refId != 0) { printf("other accs associated with last reference:\n\t"); list = spRefToAccs(conn, refId); printPartialList("", "", list, 6); slFreeList(&list); } sqlDisconnect(&conn); }
void spDbAddVarSplice(char *database, char *inFile, char *outDir) /* spDbAddVarSplice - This adds information on the varient splices to the sp/uniProt database. */ { struct sqlConnection *conn = sqlConnect(database); char query[256]; makeDir(outDir); FILE *varProtein = openToWrite(outDir, "varProtein.txt"); FILE *varAcc = openToWrite(outDir, "varAcc.txt"); FILE *varDisplayId = openToWrite(outDir, "varDisplayId.txt"); FILE *varAccToTaxon = openToWrite(outDir, "varAccToTaxon.txt"); FILE *varDescription = openToWrite(outDir, "varDescription.txt"); FILE *varGene = openToWrite(outDir, "varGene.txt"); FILE *varGeneLogic = openToWrite(outDir, "varGeneLogic.txt"); struct lineFile *lf = lineFileOpen(inFile, TRUE); aaSeq seq; ZeroVar(&seq); while (faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { char *row[4]; char *name = seq.name; if (startsWith("sp|", name)) // Skip over sp| introduced Aug 2009 name += 3; int rowSize = chopString(name, "-|", row, ArraySize(row)); if (rowSize != 3) errAbort("Expecting name to be in format accession-N|DISP_ID, got %s\n", name); char *acc = row[0]; char *version = row[1]; char *displayId = row[2]; int accLen = strlen(acc); int verLen = strlen(version); int displayIdLen = strlen(displayId); /* Do some tests. */ if ((accLen != 6 && accLen != 10) || isdigit(acc[0]) || !isdigit(acc[accLen-1])) errAbort("wierd accession %s before line %d of %s", acc, lf->lineIx, lf->fileName); if (!isdigit(version[0]) || verLen > 4) errAbort("wierd version %s before line %d of %s", version, lf->lineIx, lf->fileName); if (countChars(displayId, '_') != 1 || displayIdLen < 6 || displayIdLen > 16) errAbort("wierd displayId %s before line %d of %s", displayId, lf->lineIx, lf->fileName); if (accLen + 1 + verLen >= sizeof(SpAcc)) errAbort("Need to increase size of SpAcc in spDb.h because of %s-%s - need %d characters but only have %lu", acc, version, accLen + 1 + verLen, sizeof(SpAcc)); /* Print out parsed results. */ fprintf(varAcc, "%s-%s\t%s\t%s\n", acc, version, acc, version); fprintf(varProtein, "%s-%s\t%s\n", acc, version, seq.dna); fprintf(varDisplayId, "%s-%s\t%s-%s\n", acc, version, acc, version); /* Look up taxon of base protein and use it to write to varAccToTaxon table. */ int taxon = spTaxon(conn, acc); fprintf(varAccToTaxon, "%s-%s\t%d\n", acc, version, taxon); /*Transfer description. */ char *description = spDescription(conn, acc); fprintf(varDescription, "%s-%s\t%s\n", acc, version, description); freez(&description); /* Transfer gene logic. */ sqlSafef(query, sizeof(query), "select val from geneLogic where acc = '%s'", acc); char *geneLogic = sqlQuickString(conn, query); if (geneLogic != NULL) fprintf(varGeneLogic, "%s-%s\t%s\n", acc, version, geneLogic); freez(&geneLogic); /* Transfer genes. */ struct slName *gene, *geneList = spGenes(conn, acc); for (gene = geneList; gene != NULL; gene = gene->next) fprintf(varGene, "%s-%s\t%s\n", acc, version, gene->name); slFreeList(&geneList); } carefulClose(&varAcc); carefulClose(&varProtein); carefulClose(&varDisplayId); carefulClose(&varAccToTaxon); carefulClose(&varDescription); carefulClose(&varGene); carefulClose(&varGeneLogic); sqlDisconnect(&conn); }
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, char *evFile, char *outFile) /* txGeneXref - Make kgXref type table for genes.. */ { /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct hash *geneToProtHash = makeGeneToProtHash(genePredFile); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(pickFile, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); removePickVersions(pick); hashAdd(pickHash, pick->name, pick); } /* Load evidence into hash */ struct hash *evHash = newHash(18); struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile); for (ev = evList; ev != NULL; ev = ev->next) hashAdd(evHash, ev->name, ev); /* Open connections to our databases */ struct sqlConnection *gConn = sqlConnect(genomeDb); struct sqlConnection *uConn = sqlConnect(uniProtDb); /* Read in info file, and loop through it to make out file. */ struct txInfo *info, *infoList = txInfoLoadAll(infoFile); FILE *f = mustOpen(outFile, "w"); for (info = infoList; info != NULL; info = info->next) { char *kgID = info->name; char *mRNA = ""; char *spID = ""; char *spDisplayID = ""; char *geneSymbol = NULL; char *refseq = ""; char *protAcc = ""; char *description = NULL; char query[256]; char *proteinId = hashMustFindVal(geneToProtHash, info->name); boolean isAb = sameString(info->category, "antibodyParts"); pick = hashFindVal(pickHash, info->name); ev = hashFindVal(evHash, info->name); if (pick != NULL) { /* Fill in the relatively straightforward fields. */ refseq = pick->refSeq; if (info->orfSize > 0) { protAcc = pick->refProt; spID = proteinId; if (sameString(protAcc, spID)) spID = pick->uniProt; if (spID[0] != 0) spDisplayID = spAnyAccToId(uConn, spID); } /* Fill in gene symbol and description from refseq if possible. */ if (refseq[0] != 0) { struct sqlResult *sr; safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'", refseq); sr = sqlGetResult(gConn, query); char **row = sqlNextRow(sr); if (row != NULL) { geneSymbol = cloneString(row[0]); if (!sameWord("unknown protein", row[1])) description = cloneString(row[1]); } sqlFreeResult(&sr); } /* If need be try uniProt for gene symbol and description. */ if (spID[0] != 0 && (geneSymbol == NULL || description == NULL)) { char *acc = spLookupPrimaryAcc(uConn, spID); if (description == NULL) description = spDescription(uConn, acc); if (geneSymbol == NULL) { struct slName *nameList = spGenes(uConn, acc); if (nameList != NULL) geneSymbol = cloneString(nameList->name); slFreeList(&nameList); } } } /* If it's an antibody fragment use that as name. */ if (isAb) { geneSymbol = cloneString("abParts"); description = cloneString("Parts of antibodies, mostly variable regions."); isAb = TRUE; } if (ev == NULL) { mRNA = cloneString(""); if (!isAb) { errAbort("%s is %s but not %s\n", info->name, infoFile, evFile); } } else { mRNA = cloneString(ev->primary); chopSuffix(mRNA); } /* Still no joy? Try genbank RNA records. */ if (geneSymbol == NULL || description == NULL) { if (ev != NULL) { int i; for (i=0; i<ev->accCount; ++i) { char *acc = ev->accs[i]; chopSuffix(acc); if (geneSymbol == NULL) { safef(query, sizeof(query), "select geneName.name from gbCdnaInfo,geneName " "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc); geneSymbol = sqlQuickString(gConn, query); if (geneSymbol != NULL) { if (sameString(geneSymbol, "n/a")) geneSymbol = NULL; } } if (description == NULL) { safef(query, sizeof(query), "select description.name from gbCdnaInfo,description " "where description.id=gbCdnaInfo.description " "and gbCdnaInfo.acc = '%s'", acc); description = sqlQuickString(gConn, query); if (description != NULL) { if (sameString(description, "n/a")) description = NULL; } } } } } if (geneSymbol == NULL) geneSymbol = mRNA; if (description == NULL) description = mRNA; /* Get rid of some characters that will cause havoc downstream. */ stripChar(geneSymbol, '\''); subChar(geneSymbol, '<', '['); subChar(geneSymbol, '>', ']'); /* Abbreviate geneSymbol if too long */ if (strlen(geneSymbol) > 40) strcpy(geneSymbol+37, "..."); fprintf(f, "%s\t", kgID); fprintf(f, "%s\t", mRNA); fprintf(f, "%s\t", spID); fprintf(f, "%s\t", spDisplayID); fprintf(f, "%s\t", geneSymbol); fprintf(f, "%s\t", refseq); fprintf(f, "%s\t", protAcc); fprintf(f, "%s\n", description); } carefulClose(&f); }