char *newSpDisplayId(char *oldSpDisplayId)
/* Map an old Swiss-Prot display ID to its replacement.
 * Returns the value sqlGetField() yields for spOldNew.newDisplayId in
 * PROTEOME_DB_NAME where oldDisplayId matches, or NULL when the proteome
 * database connection could not be opened. */
{
/* Opened on the first call and kept for later calls.  It only serves as a
 * "database is reachable" check; the lookup below goes by database name. */
static struct sqlConnection *proteomeConn = NULL;
char where[255];

if (proteomeConn == NULL && (proteomeConn = sqlConnect(PROTEOME_DB_NAME)) == NULL)
    return NULL;

safef(where, sizeof(where), "oldDisplayId='%s'", oldSpDisplayId);
return sqlGetField(PROTEOME_DB_NAME, "spOldNew", "newDisplayId", where);
}
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; char query2[256]; struct sqlResult *sr2; char **row2; char *chp0, *chp; char *kgID; FILE *o1, *o2; char cond_str[256]; char *database; char *proteinDB; boolean doingAlias, bothDone; char *answer; char *symbol, *alias, *aliases; if (argc != 3) usage(); database = cloneString(argv[1]); proteinDB = cloneString(argv[2]); conn = hAllocConn(database); conn2= hAllocConn(database); o1 = fopen("j.dat", "w"); o2 = fopen("jj.dat", "w"); doingAlias = TRUE; bothDone = FALSE; while (!bothDone) { if (doingAlias) { sqlSafef(query2, sizeof query2, "select symbol, aliases from %s.hgnc;", proteinDB); } else { sqlSafef(query2, sizeof query2, "select symbol, prvSymbols from %s.hgnc;", proteinDB); } sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { symbol = row2[0]; aliases = row2[1]; if ( (symbol != NULL) && (strlen(symbol) != 0) ) { sqlSafefFrag(cond_str, sizeof cond_str, "geneSymbol = '%s'", symbol); answer = sqlGetField(database, "kgXref", "kgID", cond_str); if (answer != NULL) { kgID = strdup(answer); fprintf(o2, "%s\t%s\n", kgID, symbol); } if ( (aliases != NULL) && (strlen(aliases) != 0) && (answer != NULL) ) { kgID = strdup(answer); chp0 = aliases; while (chp0 != NULL) { while (*chp0 == ' ') chp0++; chp = strstr(chp0, ","); if (chp == NULL) { alias = strdup(chp0); /* get rid of quote character in some aliases */ if (*alias == '"') { *(alias + strlen(alias) - 1) = '\0'; alias++; printf("%s\n", alias);fflush(stdout); } chp0 = NULL; } else { *chp = '\0'; /* get rid of quote character in some aliases */ if (*chp0 == '"') { *(chp0 + strlen(chp0) - 1) = '\0'; chp0++; printf("%s\n", chp0);fflush(stdout); } alias = strdup(chp0); chp0 = chp+1; } if (kgID != NULL) { fprintf(o1, "%s\t%s\t%s\n", kgID, symbol, alias); fprintf(o2, "%s\t%s\n", kgID, alias); } } } } row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); if (doingAlias) { doingAlias = FALSE; } else { bothDone = TRUE; } } 
fclose(o1); fclose(o2); /* geneAlias.tab has 3 columns, the 2nd is HUGO.symbol and 3rd contains aliases and withdraws */ mustSystem("cat j.dat|sort|uniq >geneAlias.tab"); /* kgAliasM.tab has 2 columns, all entries from HUGO.symbol, HUGO.aliass, and HUGO.withdraws are listed in the 2nd column. */ mustSystem("cat jj.dat|sort|uniq >kgAliasM.tab"); mustSystem("rm j.dat"); mustSystem("rm jj.dat"); return(0); }
void doMiddle(struct cart *theCart) /* Print the body of an html file. */ { char cond_str[255]; struct sqlConnection *conn; char *proteinAC; char *chp, *chp1, *chp9; char *debugTmp = NULL; char *chromStr, *cdsStartStr, *cdsEndStr, posStr[255]; char *supportedGenomeDatabase; char *answer; char *queryID; /* Initialize layout and database. */ cart = theCart; /* Uncomment this to see parameters for debugging. */ /* Be careful though, it breaks if custom track * is more than 4k */ /* { struct dyString *state = cgiUrlString(); hPrintf("State: %s\n", state->string); } */ queryID = cartOptionalString(cart, "proteinID"); if (sameString(queryID, "")) { hUserAbort("Please go back and enter a gene symbol or a Swiss-Prot/TrEMBL protein ID.\n"); } if (cgiVarExists("db")) { /* if db is known, get key variables set */ proteinInSupportedGenome = TRUE; database = cgiOptionalString("db"); organism = hDbOrganism(database); protDbName = hPdbFromGdb(database); proteinID = strdup(queryID); } else { protCntInSwissByGene = searchProteinsInSwissProtByGene(queryID); /* no CGI 'db' variable means it did not come in from GB but from pbGateway */ /* search existing GB databases to see if this protein can be found */ protCntInSupportedGenomeDb = searchProteinsInSupportedGenomes(queryID, &supportedGenomeDatabase); if ((protCntInSupportedGenomeDb > 1) || protCntInSwissByGene >= 1) { /* more than 1 proteins match the query ID, present selection web page */ proteinInSupportedGenome = 1; presentProteinSelections(queryID, protCntInSwissByGene, protCntInSupportedGenomeDb); return; } else { if (protCntInSupportedGenomeDb == 1) { /* one and only one protein found in a genome DB that support KG and PB */ proteinInSupportedGenome = TRUE; database = strdup(supportedGenomeDatabase); organism = hDbOrganism(database); protDbName = hPdbFromGdb(database); proteinID=strdup(queryID); } else { /* not found in genome DBs that support KG/PB */ /* now search PROTEOME_DB_NAMES to see if this protein is there. 
*/ answer = uniProtFindPrimAcc(queryID); if (answer == NULL) { hUserAbort("'%s' does not seem to be a valid UniProtKB protein ID or a gene " "symbol.<br><br>Click <A HREF=\"../cgi-bin/pbGateway\">here</A> " "to start another query.", queryID); } proteinInSupportedGenome = FALSE; database = strdup(GLOBAL_PB_DB); organism = strdup(""); protDbName = strdup(PROTEOME_DB_NAME); proteinID = strdup(answer); } } if (proteinInSupportedGenome) { spConn = sqlConnect(database); sqlSafefFrag(cond_str, sizeof(cond_str), "alias='%s'", queryID); proteinID = sqlGetField(database, "kgSpAlias", "spID", cond_str); sqlSafefFrag(cond_str, sizeof(cond_str), "spID='%s'", proteinID); answer = sqlGetField(database, "kgXref", "spDisplayID", cond_str); sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s'", answer); chromStr = sqlGetField(database, "knownGene", "chrom", cond_str); if (chromStr) { cdsStartStr = sqlGetField(database, "knownGene", "cdsStart", cond_str); cdsEndStr = sqlGetField( database, "knownGene", "cdsEnd", cond_str); safef(posStr, sizeof(posStr), "%s:%s-%s", chromStr, cdsStartStr, cdsEndStr); positionStr = strdup(posStr); cartSetString(cart, "position", positionStr); cartSetString(cart, "organism", organism); } } } /* print out key variables for debugging */ /* printf("<br>before enter main section: <br>proteinInSupportedGenome=%d<br>proteinID=%s <br>database=%s <br>organism=%s <br>protDbName=%s\n", proteinInSupportedGenome, proteinID, database, organism, protDbName);fflush(stdout); */ if (hTableExists(database, "kgProtMap2")) { kgVersion = KG_III; strcpy(kgProtMapTableName, "kgProtMap2"); } debugTmp = cartUsualString(cart, "hgDebug", "off"); if(sameString(debugTmp, "on")) hgDebug = TRUE; else hgDebug = FALSE; conn = hAllocConn(database); hgsid = cartOptionalString(cart, "hgsid"); if (hgsid != NULL) { safef(hgsidStr, sizeof(hgsidStr), "&hgsid=%s", hgsid); } else { strcpy(hgsidStr, ""); } /* check proteinID to see if it is a valid SWISS-PROT/TrEMBL accession or display ID 
*/ /* then assign the accession number to global variable proteinID */ sqlSafefFrag(cond_str, sizeof(cond_str), "accession='%s'", proteinID); proteinAC = sqlGetField(protDbName, "spXref3", "accession", cond_str); if (proteinAC == NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "displayID='%s'", proteinID); proteinAC = sqlGetField(protDbName, "spXref3", "accession", cond_str); if (proteinAC == NULL) { hUserAbort("'%s' does not seem to be a valid Swiss-Prot/TrEMBL protein ID or gene symbol.<br><br>Click <A HREF=\"../cgi-bin/pbGateway\">here</A> to start another query." , proteinID); } else { protDisplayID = proteinID; proteinID = proteinAC; } } else { sqlSafefFrag(cond_str, sizeof(cond_str), "accession='%s'", proteinID); protDisplayID = sqlGetField(protDbName, "spXref3", "displayID", cond_str); } if (proteinInSupportedGenome) { if (kgVersion == KG_III) { sqlSafefFrag(cond_str, sizeof(cond_str), "spId='%s'", proteinID); mrnaID = sqlGetField(database, "kgXref", "kgId", cond_str); } else { sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s'", protDisplayID); mrnaID = sqlGetField(database, "knownGene", "name", cond_str); } } else { mrnaID = NULL; positionStr = NULL; } sqlSafefFrag(cond_str, sizeof(cond_str), "accession='%s'", proteinID); description = sqlGetField(protDbName, "spXref3", "description", cond_str); if (positionStr != NULL) { chp = strstr(positionStr, ":"); *chp = '\0'; prevGBChrom = cloneString(positionStr); chp1 = chp + 1; chp9 = strstr(chp1, "-"); *chp9 = '\0'; prevGBStartPos = atoi(chp1); chp1 = chp9 + 1; prevGBEndPos = atoi(chp1); } else { prevGBChrom = NULL; prevGBStartPos = -1; prevGBEndPos = -1; } /* Do main display. */ if (cgiVarExists("pbt.psOutput")) handlePostscript(); else { doTrackForm(NULL, NULL); } }
static void synonymPrint(struct section *section, struct sqlConnection *conn, char *id) /* Print out SwissProt comments - looking up typeId/commentVal. */ { char *protAcc = getSwissProtAcc(conn, spConn, id); char *spDisplayId; char *refSeqAcc = ""; char *mrnaAcc = ""; char *oldDisplayId; char condStr[255]; char *kgProteinID; char *parAcc; /* parent accession of a variant splice protein */ char *chp; if (isRgdGene(conn)) { rgdGene2SynonymPrint(section,conn, id); return; } if (sqlTablesExist(conn, "kgAlias")) printAlias(id, conn); if (sameWord(genome, "Zebrafish")) { char *xrefTable = "ensXRefZfish"; char *geneIdCol = "ensGeneId"; /* get Gene Symbol and RefSeq accession from Zebrafish-specific */ /* cross-reference table */ printGeneSymbol(id, xrefTable, geneIdCol, conn); refSeqAcc = getRefSeqAcc(id, xrefTable, geneIdCol, conn); hPrintf("<B>ENSEMBL ID:</B> %s", id); } else { char query[256]; char *toRefTable = genomeOptionalSetting("knownToRef"); if (toRefTable != NULL && sqlTableExists(conn, toRefTable)) { safef(query, sizeof(query), "select value from %s where name='%s'", toRefTable, id); refSeqAcc = emptyForNull(sqlQuickString(conn, query)); } if (sqlTableExists(conn, "kgXref")) { safef(query, sizeof(query), "select mRNA from kgXref where kgID='%s'", id); mrnaAcc = emptyForNull(sqlQuickString(conn, query)); } if (sameWord(genome, "C. 
elegans")) hPrintf("<B>WormBase ID:</B> %s<BR>", id); else hPrintf("<B>UCSC ID:</B> %s<BR>", id); } if (refSeqAcc[0] != 0) { hPrintf("<B>RefSeq Accession: </B> <A HREF=\""); printOurRefseqUrl(stdout, refSeqAcc); hPrintf("\">%s</A><BR>\n", refSeqAcc); } else if (mrnaAcc[0] != 0) { safef(condStr, sizeof(condStr), "acc = '%s'", mrnaAcc); if (sqlGetField(database, "gbCdnaInfo", "acc", condStr) != NULL) { hPrintf("<B>Representative RNA: </B> <A HREF=\""); printOurMrnaUrl(stdout, mrnaAcc); hPrintf("\">%s</A><BR>\n", mrnaAcc); } else /* do not show URL link if it is not found in gbCdnaInfo */ { hPrintf("<B>Representative RNA: %s </B>", mrnaAcc); } } if (protAcc != NULL) { kgProteinID = cloneString(""); if (hTableExists(sqlGetDatabase(conn), "knownGene") && (isNotEmpty(cartOptionalString(cart, hggChrom)) && differentWord(cartOptionalString(cart, hggChrom),"none"))) { safef(condStr, sizeof(condStr), "name = '%s' and chrom = '%s' and txStart=%s and txEnd=%s", id, cartOptionalString(cart, hggChrom), cartOptionalString(cart, hggStart), cartOptionalString(cart, hggEnd)); kgProteinID = sqlGetField(database, "knownGene", "proteinID", condStr); } hPrintf("<B>Protein: "); if (strstr(kgProteinID, "-") != NULL) { parAcc = cloneString(kgProteinID); chp = strstr(parAcc, "-"); *chp = '\0'; /* show variant splice protein and the UniProt link here */ hPrintf("<A HREF=\"http://www.uniprot.org/uniprot%s\" " "TARGET=_blank>%s</A></B>, splice isoform of ", kgProteinID, kgProteinID); hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" " "TARGET=_blank>%s</A></B>\n", parAcc, parAcc); } else { hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" " "TARGET=_blank>%s</A></B>\n", protAcc, protAcc); } /* show SWISS-PROT display ID if it is different than the accession ID */ /* but, if display name is like: Q03399 | Q03399_HUMAN, then don't show display name */ spDisplayId = spAnyAccToId(spConn, protAcc); if (spDisplayId == NULL) { errAbort("<br>%s seems to no longer be a valid protein ID in our 
latest UniProtKB DB.", protAcc); } if (strstr(spDisplayId, protAcc) == NULL) { hPrintf(" (aka %s", spDisplayId); /* show once if the new and old displayId are the same */ oldDisplayId = oldSpDisplayId(spDisplayId); if (oldDisplayId != NULL) { if (!sameWord(spDisplayId, oldDisplayId) && !sameWord(protAcc, oldDisplayId)) { hPrintf(" or %s", oldDisplayId); } } hPrintf(")<BR>\n"); } } printCcds(id, conn); }
int getSuperfamilies(char *proteinID) /* preserved here for previous older genomes. Newer genomes should be using getSuperfamilies2(). 6/16/04 Fan*/ { struct sqlConnection *conn, *conn2; char query[MAXNAMELEN]; struct sqlResult *sr; char **row; char cond_str[255]; char *genomeID, *seqID, *modelID, *eValue, *sfID, *sfDesc; char *region; int done; char *ensPep; char *transcriptName; char *chp, *chp2; int ii = 0; int int_start, int_end; if (!hTableExists(database, "sfAssign")) return(0); conn = hAllocConn(database); conn2 = hAllocConn(database); if (hTableExists(database, "ensemblXref3")) { /* use ensemblXref3 for Ensembl data release after ensembl34d */ sqlSafefFrag(cond_str, sizeof(cond_str), "tremblAcc='%s'", proteinID); ensPep = sqlGetField(database, "ensemblXref3", "protein", cond_str); if (ensPep == NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "swissAcc='%s'", proteinID); ensPep = sqlGetField(database, "ensemblXref3", "protein", cond_str); if (ensPep == NULL) return(0); } } else { if (! 
(hTableExists(database, "ensemblXref") || hTableExists(database, "ensTranscript") ) ) return(0); /* two steps query needed because the recent Ensembl gene_xref 11/2003 table does not have valid translation_name */ sqlSafefFrag(cond_str, sizeof(cond_str), "external_name='%s'", protDisplayID); transcriptName = sqlGetField(database, "ensGeneXref", "transcript_name", cond_str); if (transcriptName == NULL) { return(0); } else { sqlSafefFrag(cond_str, sizeof(cond_str), "transcript_name='%s';", transcriptName); ensPep = sqlGetField(database, "ensTranscript", "translation_name", cond_str); if (ensPep == NULL) { hFreeConn(&conn); return(0); } } } ensPepName = ensPep; sqlSafef(query, sizeof(query), "select * from %s.sfAssign where seqID='%s' and evalue <= 0.02;", database, ensPep); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); if (row == NULL) return(0); while (row != NULL) { genomeID = row[0]; seqID = row[1]; modelID = row[2]; region = row[3]; eValue = row[4]; sfID = row[5]; /* sfDesc = row[6]; */ /* !!! the recent Suprefamily sfAssign table does not have valid sf description */ sqlSafefFrag(cond_str, sizeof(cond_str), "id=%s;", sfID); sfDesc = sqlGetField(database, "sfDes", "description", cond_str); /* !!! refine logic here later to be defensive against illegal syntax */ chp = region; done = 0; while (!done) { chp2 = strstr(chp, "-"); *chp2 = '\0'; chp2++; sscanf(chp, "%d", &int_start); chp = chp2; chp2 = strstr(chp, ","); if (chp2 != NULL) { *chp2 = '\0'; } else { done = 1; } chp2++; sscanf(chp, "%d", &int_end); sfId[ii] = atoi(sfID); sfStart[ii] = int_start; sfEnd[ii] = int_end; strncpy(superfam_name[ii], sfDesc, MAXNAMELEN-1); ii++; chp = chp2; } row = sqlNextRow(sr); } sqlFreeResult(&sr); hFreeConn(&conn); hFreeConn(&conn2); return(ii); }
void doAnomalies(char *aa, int len, int *yOffp) /* draw the AA Anomalies track */ { char res; int index; char cond_str[255]; char *answer; int xx, yy; int i, j; char *chp; int aaResCnt[20]; double aaResFreqDouble[20]; int abnormal; int ia = -1; double pctLow[20], pctHi[20]; /* count frequency for each residue for current protein */ chp = aa; for (j=0; j<20; j++) { aaResCnt[j] = 0; /* get cutoff threshold value pairs */ sqlSafefFrag(cond_str, sizeof(cond_str), "AA='%c'", aaAlphabet[j]); answer = sqlGetField(database, "pbAnomLimit", "pctLow", cond_str); pctLow[j] = (double)(atof(answer)); answer = sqlGetField(database, "pbAnomLimit", "pctHi", cond_str); pctHi[j] = (double)(atof(answer)); } for (i=0; i<len; i++) { for (j=0; j<20; j++) { if (*chp == aaChar[j]) { aaResCnt[j] ++; break; } } chp++; } for (j=0; j<20; j++) { aaResFreqDouble[j] = ((double)aaResCnt[j])/((double)len); } currentYoffset = *yOffp; for (index=0; index < len; index++) { res = aa[index]; ia = -1; for (j=0; j<20; j++) { if (res == aaChar[j]) { ia = j; break; } } /* skip non-standard AA alphabets */ if (ia == -1) continue; calxy(index, *yOffp, &xx, &yy); abnormalColor = pbRed; abnormal = chkAnomaly(aaResFreqDouble[ia], pctLow[ia], pctHi[ia]); if (abnormal > 0) { vgBox(g_vg, xx, yy-5, 1*pbScale, 5, abnormalColor); } else { if (abnormal < 0) { vgBox(g_vg, xx, yy, 1*pbScale, 5, abnormalColor); } } vgBox(g_vg, xx, yy, 1*pbScale, 1, MG_BLACK); } calxy0(0, *yOffp, &xx, &yy); vgBox(g_vg, 0, yy-10, xx, 20, bkgColor); trackTitle = cloneString("AA Anomalies"); vgTextRight(g_vg, xx-25, yy-4, 10, 10, MG_BLACK, g_font, trackTitle); trackTitleLen = strlen(trackTitle); mapBoxTrackTitle(xx-25-trackTitleLen*6, yy-6, trackTitleLen*6+12, 14, trackTitle, "pepAnom"); /* update y offset */ *yOffp = *yOffp + 15; }
int main(int argc, char *argv[])
/* Extract gene-name aliases from a Swiss-Prot style flat file.
 *   argv[1] - genome database holding knownGene
 *   argv[2] - input protein flat file
 *   argv[3] - output file, one "kgID<TAB>alias" line per alias
 * Every record must start with an ID line and ends at a "//" line.  The
 * names on its GN lines, separated by " OR " / " AND ", are written for
 * records whose ID is a knownGene proteinID.  Exits with status 1 on a
 * malformed ID line. */
{
struct sqlConnection *conn;
FILE *inf;
FILE *o1;
char cond_str[256];
char *database;
char *proteinFileName;
char *outputFileName;
char *alias;
char *id;
char *chp0, *chp1, *chp2, *chp;
char *kgID;
char line[2000];
size_t lineLen;

if (argc != 4) usage();

database        = cloneString(argv[1]);
proteinFileName = cloneString(argv[2]);
outputFileName  = cloneString(argv[3]);

conn = hAllocConn(database);

o1 = mustOpen(outputFileName, "w");

if ((inf = mustOpen(proteinFileName, "r")) == NULL)
    {
    fprintf(stderr, "Can't open file %s.\n", proteinFileName);
    exit(8);
    }

while (fgets(line, sizeof line, inf) != NULL)
    {
    chp = strstr(line, "ID ");
    if (chp != line)
        {
        fprintf(stderr, "expected ID line, but got: %s\n", line);
        exit(1);
        }

    /* the ID is the text between the "ID " prefix and the next space */
    id = line + strlen("ID ");
    chp = strstr(id, " ");
    if (chp == NULL)
        {
        fprintf(stderr, "malformed ID line: %s\n", line);
        exit(1);
        }
    *chp = '\0';

    /* id points into line; it is consumed here, before line is overwritten */
    sqlSafefFrag(cond_str, sizeof cond_str, "proteinID = '%s'", id);
    kgID = sqlGetField(database, "knownGene", "name", cond_str);

    if (fgets(line, sizeof line, inf) == NULL)
        break;

    do
        {
        /* "//" signal end of a record */
        if ((line[0] == '/') && (line[1] == '/')) break;

        /* work on GN (Gene Name) line only */
        chp = strstr(line, "GN ");
        if (chp != NULL)
            {
            /* drop the trailing newline, if any, and then a final '.' */
            lineLen = strlen(line);
            if (lineLen > 0 && line[lineLen - 1] == '\n')
                line[--lineLen] = '\0';
            if (lineLen > 0 && line[lineLen - 1] == '.')
                line[--lineLen] = '\0';

            /* names start at column 5; a shorter line carries none */
            chp0 = (lineLen >= 5) ? line + 5 : NULL;
            while (chp0 != NULL)
                {
                while (*chp0 == ' ') chp0++;

                /* pick whichever separator comes first, if any */
                chp1 = strstr(chp0, " OR ");
                chp2 = strstr(chp0, " AND ");
                if (chp1 != NULL && (chp2 == NULL || chp1 < chp2))
                    chp = chp1;
                else
                    chp = chp2;

                alias = chp0;
                if (chp == NULL)
                    {
                    chp0 = NULL;
                    }
                else
                    {
                    *chp = '\0';
                    /* +4 steps over " OR "; after " AND " it lands on the
                     * trailing blank, which the skip loop above consumes */
                    chp0 = chp + 4;
                    }

                if (kgID != NULL)
                    {
                    /* clean up "(XXXX" or "XXXX)" */
                    if (*alias == '(') alias++;
                    chp = strstr(alias, ")");
                    if (chp != NULL) *chp = '\0';

                    fprintf(o1, "%s\t%s\n", kgID, alias);
                    }
                }
            }
        } while (fgets(line, sizeof line, inf) != NULL);
    }

fclose(inf);
fclose(o1);
hFreeConn(&conn);
return(0);
}
static void gadPrint(struct section *section, struct sqlConnection *conn, char *geneId) /* Print out GAD section. */ { int refPrinted = 0; boolean showCompleteGadList; char condStr[256]; char query[256]; struct sqlResult *sr; char **row; struct dyString *currentCgiUrl; char *upperDisease; char *url = cloneString("http://geneticassociationdb.nih.gov/cgi-bin/tableview.cgi?table=allview&cond=gene="); char *itemName; if (url != NULL && url[0] != 0) { safef(condStr, sizeof(condStr), "k.kgId='%s' and k.geneSymbol = g.geneSymbol", geneId); itemName = sqlGetField(database, "kgXref k, gadAll g", "k.geneSymbol", condStr); showCompleteGadList = FALSE; if (cgiOptionalString("showAllRef") != NULL) { if (sameWord(cgiOptionalString("showAllRef"), "Y") || sameWord(cgiOptionalString("showAllRef"), "y") ) { showCompleteGadList = TRUE; } } currentCgiUrl = cgiUrlString(); printf("<B>Genetic Association Database: "); printf("<A HREF=\"%s'%s'\" target=_blank>", url, itemName); printf("%s</B></A>\n", itemName); printf("<BR><B>CDC HuGE Published Literature: "); printf("<A HREF=\"%s%s%s\" target=_blank>", "http://hugenavigator.net/HuGENavigator/searchSummary.do?firstQuery=", itemName, "&publitSearchType=now&whichContinue=firststart&check=n&dbType=publit&Mysubmit=go"); printf("%s</B></A>\n", itemName); /* List diseases associated with the gene */ safef(query, sizeof(query), "select distinct broadPhen from gadAll where geneSymbol='%s' and association = 'Y' order by broadPhen", itemName); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); if (row != NULL) { upperDisease = replaceChars(row[0], "'", "''"); touppers(upperDisease); printf("<BR><B>Positive Disease Associations: </B>"); printf("<A HREF=\"%s%s%s%s%s\" target=_blank>", "http://geneticassociationdb.nih.gov/cgi-bin/tableview.cgi?table=allview&cond=upper(DISEASE)%20like%20'%25", cgiEncode(upperDisease), "%25'%20AND%20upper(GENE)%20%20like%20'%25", itemName, "%25'"); printf("%s</B></A>\n", row[0]); row = sqlNextRow(sr); } while 
(row != NULL) { upperDisease = replaceChars(row[0], "'", "''"); touppers(upperDisease); printf(", <A HREF=\"%s%s%s%s%s\" target=_blank>", "http://geneticassociationdb.nih.gov/cgi-bin/tableview.cgi?table=allview&cond=upper(DISEASE)%20like%20'%25", cgiEncode(upperDisease), "%25'%20AND%20upper(GENE)%20%20like%20'%25", itemName, "%25'"); printf("%s</B></A>\n", row[0]); row = sqlNextRow(sr); } sqlFreeResult(&sr); refPrinted = 0; safef(query, sizeof(query), "select broadPhen,reference,title,journal, pubMed, conclusion from gadAll where geneSymbol='%s' and association = 'Y' order by broadPhen", itemName); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); if (row != NULL) printf("<BR><B>Related Studies: </B><OL>"); while (row != NULL) { printf("<LI><B>%s </B>", row[0]); printf("<br>%s, %s, %s.\n", row[1], row[2], row[3]); if (!sameWord(row[4], "")) { printf(" [PubMed "); printf("<A HREF=\"%s%s%s'\" target=_blank>", "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=pubmed&cmd=Retrieve&dopt=Abstract&list_uids=", row[4],"&query_hl=1&itool=genome.ucsc.edu"); printf("%s</B></A>]\n", row[4]); } printf("<br><i>%s</i>\n", row[5]); printf("</LI>\n"); refPrinted++; if ((!showCompleteGadList) && (refPrinted >= 3)) break; row = sqlNextRow(sr); } sqlFreeResult(&sr); printf("</OL>"); if ((!showCompleteGadList) && (row != NULL)) { printf("<B>   more ... </B>"); printf( "<A HREF=\"%s?showAllRef=Y&%s#gad\">click here to view the complete list</A> ", "hgGene", currentCgiUrl->string); } } }
int main(int argc, char *argv[]) { struct sqlConnection *conn2, *conn3, *conn4; char query2[256], query3[256]; struct sqlResult *sr2, *sr3; char **row2, **row3; char *accession; char *extDB; char *extAC; char condStr[255]; char *id, *subId, *avStr, *pos; char *baseAAStr, *subsAAStr; char baseAA, subsAA; char *genomeDb; char *aaSeq; char ch; int aaPos, aaLen; int nTotal = 0; int nOK = 0; int nBase = 0; int nErr = 0; int nSubs = 0; boolean gotAMatch = FALSE; FILE *outf; if (argc != 3) usage(); genomeDb = argv[1]; outf = fopen(argv[2], "w"); conn2= hAllocConn(); conn3= hAllocConn(); conn4= hAllocConn(); /* loop thru all recordd in the omimAvPos table */ sqlSafef(query2, sizeof query2, "select * from %s.omimAvPos", genomeDb); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { id = row2[0]; subId = row2[1]; avStr = row2[2]; pos = row2[3]; baseAAStr = row2[4]; subsAAStr = row2[5]; baseAA = *baseAAStr; subsAA = *subsAAStr; aaPos = atoi(pos); /* find corresponding protein for each OMIM record */ sqlSafef(query3, sizeof query3, "select distinct accession, extDB, extAC from %s.spXref2 where extAC='%s' and extDB='MIM';", PROTEOME_DB_NAME, id); sr3 = sqlMustGetResult(conn3, query3); row3 = sqlNextRow(sr3); while (row3 != NULL) { accession = row3[0]; extDB = row3[1]; extAC = row3[2]; nTotal++; gotAMatch = FALSE; /* get protein sequence */ sqlSafefFrag(condStr, sizeof condStr, "acc='%s'", accession); aaSeq = sqlGetField(UNIPROT_DB_NAME, "protein", "val", condStr); aaLen = strlen(aaSeq); /* check AA (both base and substitition) of the AV entry against AA in the protein sequence */ if (aaPos <= aaLen) { ch = *(aaSeq+aaPos-1); if (ch == baseAA) { gotAMatch = TRUE; nOK++; nBase++; } else { if (ch == subsAA) { gotAMatch = TRUE; nOK++; nSubs++; } } if (gotAMatch) { fprintf(outf, "%s\t%s\t%s\t%s\n", id, subId, accession, pos); } else { nErr++; } } else { nErr++; } row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); row2 = sqlNextRow(sr2); } 
sqlFreeResult(&sr2); hFreeConn(&conn2); hFreeConn(&conn3); fclose(outf); fprintf(stderr, "nTotal\t= %6d\n", nTotal); fprintf(stderr, "nOk\t= %6d\n", nOK); fprintf(stderr, "nBase\t= %6d\n", nBase); fprintf(stderr, "nSub\t= %6d\n", nSubs); fprintf(stderr, "nErr\t= %6d\n", nErr); return(0); }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2, *conn3, *conn5; char query2[256], query3[256], query5[256]; struct sqlResult *sr2, *sr3, *sr5; char **row2, **row3, **row5; char cond_str[512]; char *chp; FILE *o1, *o2; boolean hasKGmRNA; char *proteinDisplayID; char *gbAC; char *locusID; /* LocusLink ID */ char *refAC; /* Refseq accession.version */ char *giNCBI2; /* NCBI gi for the protein record associated with the CDS */ char *revStatus; /* review status */ char *proteinAC2; /* protein accession.version */ char *taxID2; /* tax id */ char *protDbName; char *refSeq; char *name, *chrom, *strand, *txStart, *txEnd, *cdsStart, *cdsEnd, *exonCount, *exonStarts, *exonEnds; char *gseq, *hseq, *swissprot; int alignmentID=0; if (argc != 4) usage(); dbName = argv[1]; protDbName = argv[2]; genomeReadOnly = argv[3]; sprintf(tempDbName, "%sTemp", dbName); hSetDb(genomeReadOnly); conn = hAllocConn(); conn2= hAllocConn(); conn3= hAllocConn(); conn5= hAllocConn(); o1 = fopen("dnaGene.tab", "w"); o2 = fopen("j.dat", "w"); // scan all RefSeq entries sqlSafef(query2, sizeof query2, "select * from %s.locus2Ref0;", tempDbName); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { locusID = row2[0]; refAC = row2[1]; giNCBI2 = row2[2]; revStatus = row2[3]; proteinAC2 = row2[4]; taxID2 = row2[5]; refSeq = strdup(refAC); chp = strstr(refAC, "."); if (chp != NULL) *chp = '\0'; proteinDisplayID = NULL; /* check if the locusID of this RefSeq points to a KG mRNA */ hasKGmRNA = checkMrna(locusID); /* check if this RefSeq has 'g' type sequence(s) referenced */ sqlSafefFrag(cond_str, sizeof cond_str, "locusID=%s and seqType='g';", locusID); gseq = sqlGetField(tempDbName, "locus2Acc0", "gbac", cond_str); /* process only 'g' type record which does not have corresponding KG entry */ if ((!hasKGmRNA) && (gseq != NULL)) { sqlSafefFrag(cond_str, sizeof cond_str, "name='%s'", refAC); hseq = sqlGetField(genomeReadOnly, "refGene", "name", cond_str); 
if (hseq != NULL) { sqlSafefFrag(cond_str, sizeof cond_str, "refseq='%s';", refAC); swissprot = sqlGetField(protDbName, "hugo", "swissprot", cond_str); if (swissprot != NULL) { if (strlen(swissprot) >0) { // HUGO has an entry with swissprot ID, get display ID sqlSafefFrag(cond_str, sizeof cond_str, "accession='%s';", swissprot); proteinDisplayID = sqlGetField(protDbName, "spXref2", "displayID", cond_str); if (proteinDisplayID == NULL) { fprintf(stderr, "%s: a HUGO.swissprot, ", swissprot); fprintf(stderr, "but not a SP Primary AC.\n"); fflush(stdout); } } else { //printf("HGNC has a non-NULL but empty swissprot field "); //printf("for %s\n", refAC);fflush(stdout); } } // not finding it in HUGO does not mean not a valid one for sure if (proteinDisplayID == NULL) { // get gbAC and check if spXref2 actually has it sqlSafef(query3, sizeof query3, "select gbAC from %s.locus2Acc0 where locusID=%s;", tempDbName, locusID); sr3 = sqlMustGetResult(conn3, query3); row3 = sqlNextRow(sr3); while (row3 != NULL) { gbAC = row3[0]; chp = strstr(gbAC, "."); if (chp != NULL) *chp = '\0'; sqlSafefFrag(cond_str, sizeof cond_str, "extAC='%s'", gbAC); proteinDisplayID = sqlGetField(protDbName, "spXref2", "displayID", cond_str); if (proteinDisplayID == NULL) { //printf("%s %s is in refGene, but has no SWISS-PROT.\n", // locusID, refAC); //fflush(stdout); } else { //printf("%s %s got 2nd chance.\n", refAC, gbAC);fflush(stdout); break; } row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); } if (proteinDisplayID != NULL) { // generate KG entry sqlSafef(query5, sizeof query5, "select * from %s.refGene where name='%s';", genomeReadOnly, refAC); sr5 = sqlMustGetResult(conn5, query5); row5 = sqlNextRow(sr5); while (row5 != NULL) { name = row5[0]; chrom = row5[1]; strand = row5[2]; txStart = row5[3]; txEnd = row5[4]; cdsStart = row5[5]; cdsEnd = row5[6]; exonCount = row5[7]; exonStarts = row5[8]; exonEnds = row5[9]; fprintf(o1, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tdna%d\n", name, chrom, 
strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, proteinDisplayID, alignmentID); alignmentID++; fprintf(o2, "%s\t%c\t%s\n", name, 'g', proteinAC2); row5 = sqlNextRow(sr5); } sqlFreeResult(&sr5); } } } row2 = sqlNextRow(sr2); } fclose(o1); fclose(o2); sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); hFreeConn(&conn5); system("sort j.dat|uniq >dnaLink.tab"); system("rm j.dat"); return(0); }
void doStamps(char *proteinID, char *mrnaID, char *aa, struct vGfx *vg, int *yOffp) /* draw proteome browser stamps */ { int i,j,l; char cond_str[200]; char *valStr; char valStr2[50]; char *answer; double pI=0.0; double exonCount; char *chp; int len; int cCnt; int xPosition; int yPosition; int stampWidth, stampHeight; int aaResCnt[30]; double aaResFreqDouble[30]; int aaResFound; int totalResCnt; double molWeight=0.0; double hydroSum; struct pbStamp *stampDataPtr; for (j=0; j<23; j++) { aaResCnt[j] = 0; } l=len = strlen(aa); chp = aa; for (i=0; i<l; i++) { aaResFound = 0; for (j=0; j<23; j++) { if (*chp == aaAlphabet[j]) { aaResFound = 1; aaResCnt[j] ++; } } chp++; } totalResCnt = 0; for (i=0; i<23; i++) { totalResCnt = totalResCnt + aaResCnt[i]; } for (i=0; i<20; i++) { aaResFreqDouble[i] = ((double)aaResCnt[i])/((double)totalResCnt); } AllocVar(stampPictPtr); stampWidth = 75*(1+pbScale/3); stampHeight = 60*(1+pbScale/3); xPosition = 15; yPosition = *yOffp + 135; if (pbScale >= 6) yPosition = yPosition + 20; boundaryColor = vgFindColorIx(g_vg, 170, 170, 170); /* draw pI stamp */ safef(cond_str, sizeof(cond_str), "accession='%s'", proteinID); answer = sqlGetField(database, "pepPi", "count(*)", cond_str); /* either 0 or multiple rows are not valid */ if (strcmp(answer, "1") == 0) { answer = sqlGetField(database, "pepPi", "pI", cond_str); pI = (double)atof(answer); stampDataPtr = getStampData("pepPi"); setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScaleNonInt(stampDataPtr, stampPictPtr, 2); safef(valStr2, sizeof(valStr2), "%.1f", pI); markStamp(stampDataPtr, stampPictPtr, pI, valStr2, tx, ty); pbStampFree(&stampDataPtr); } else { stampDataPtr = getStampData("pepPi"); setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScale(stampDataPtr, stampPictPtr, 2); safef(valStr2, sizeof(valStr2), "N/A"); 
markStamp0(stampDataPtr, stampPictPtr, pI, valStr2, tx, ty); pbStampFree(&stampDataPtr); } /* skip Mol Wt, if it is GSID */ if (!hIsGsidServer()) { /* draw Mol Wt stamp */ safef(cond_str, sizeof(cond_str), "accession='%s'", proteinID); answer = sqlGetField(database, "pepMwAa", "MolWeight", cond_str); if (answer != NULL) { safef(valStr2, sizeof(valStr2), "%s Da", answer); molWeight = (double)atof(answer); stampDataPtr = getStampData("pepMolWt"); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScaleMW(stampDataPtr, stampPictPtr, 50000); markStamp(stampDataPtr, stampPictPtr, molWeight, valStr2, tx, ty); pbStampFree(&stampDataPtr); } else { safef(valStr2, sizeof(valStr2), "N/A"); stampDataPtr = getStampData("pepMolWt"); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScaleMW(stampDataPtr, stampPictPtr, 50000); markStamp0(stampDataPtr, stampPictPtr, molWeight, valStr2, tx, ty); pbStampFree(&stampDataPtr); } } if (!proteinInSupportedGenome) { if (!hIsGsidServer()) xPosition = xPosition + stampWidth + stampWidth/8; goto skip_exon; } /* draw exon count stamp */ if (kgVersion == KG_III) { safef(cond_str, sizeof(cond_str), "qName='%s'", mrnaID); } else { safef(cond_str, sizeof(cond_str), "qName='%s'", proteinID); } answer = sqlGetField(database, kgProtMapTableName, "blockCount", cond_str); if (answer != NULL) { valStr = cloneString(answer); exonCount = (double)atoi(answer); stampDataPtr = getStampData("exonCnt"); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScale(stampDataPtr, stampPictPtr, 5); markStamp(stampDataPtr, stampPictPtr, exonCount, valStr, tx, ty); 
pbStampFree(&stampDataPtr); } skip_exon: if (!hIsGsidServer()) { /* draw AA residual anomolies stamp */ if (answer != NULL) { stampDataPtr = getStampData("pepRes"); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, 3*stampWidth/2, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); for (i=0; i<20; i++) { markResStamp(aaAlphabet[i], stampDataPtr, stampPictPtr, i, aaResFreqDouble[i], tx, ty, avg, stddev); } pbStampFree(&stampDataPtr); } xPosition = 15; yPosition = yPosition + 170; } /* skip swInterPro if it is GSID */ if (!hIsGsidServer()) { /* draw family size stamp */ safef(cond_str, sizeof(cond_str), "accession='%s'", proteinID); answer = sqlGetField(protDbName, "swInterPro", "count(*)", cond_str); if (answer != NULL) { valStr = cloneString(answer); stampDataPtr = getStampData("intPCnt"); setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScale(stampDataPtr, stampPictPtr, 1); markStamp(stampDataPtr, stampPictPtr, (double)(atoi(answer)), valStr, tx, ty); pbStampFree(&stampDataPtr); } else { valStr = cloneString("N/A"); stampDataPtr = getStampData("intPCnt"); setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScale(stampDataPtr, stampPictPtr, 1); markStamp0(stampDataPtr, stampPictPtr, (double)(atoi(answer)), valStr, tx, ty); pbStampFree(&stampDataPtr); } } /* draw hydrophobicity stamp */ chp = protSeq; hydroSum = 0; for (i=0; i<protSeqLen; i++) { hydroSum = hydroSum + aa_hydro[(int)(*chp)]; chp++; } stampDataPtr = getStampData("hydro"); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScaleHydro(stampDataPtr, stampPictPtr, 1.0); safef(valStr2, sizeof(valStr2), "%.1f", hydroSum/(double)len); 
markStamp(stampDataPtr, stampPictPtr, hydroSum/(double)len, valStr2, tx, ty); pbStampFree(&stampDataPtr); /* draw Cystein Count stamp */ chp = protSeq; cCnt = 0; for (i=0; i<len; i++) { if (*chp == 'C') cCnt ++; chp++; } stampDataPtr = getStampData("cCnt"); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScale(stampDataPtr, stampPictPtr, 10); safef(valStr2, sizeof(valStr2), "%d", cCnt); markStamp(stampDataPtr, stampPictPtr, (double)cCnt, valStr2, tx, ty); pbStampFree(&stampDataPtr); /* if it is GSID, draw AA residual anomolies here */ if (hIsGsidServer()) { xPosition = 15; yPosition = yPosition + 170; /* draw AA residual anomolies stamp */ if (answer != NULL) { stampDataPtr = getStampData("pepRes"); setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, 3*stampWidth/2, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); for (i=0; i<20; i++) { markResStamp(aaAlphabet[i], stampDataPtr, stampPictPtr, i, aaResFreqDouble[i], tx, ty, avg, stddev); } pbStampFree(&stampDataPtr); } } /* draw AA residual anomolies stddev stamp */ if (answer != NULL) { exonCount = (double)atof(answer); stampDataPtr = getStampData("pepRes"); if (hIsGsidServer()) { xPosition = xPosition + stampWidth*1.62 + stampWidth/8; } else { xPosition = xPosition + stampWidth + stampWidth/8; } setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, 3*stampWidth/2, stampHeight); stampDataPtr->ymin = -4.0; stampDataPtr->ymax = 4.0; for (i=0; i<20; i++) { markResStdvStamp(stampDataPtr, stampPictPtr, i, aaResFreqDouble[i], tx, ty, avg, stddev); } /* draw background after bars drawn so that "... 
stddev" labels do not get covered by bars */ stampDataPtr = getStampData("pepRes"); setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, 3*stampWidth/2, stampHeight); drawPbStampB(stampDataPtr, stampPictPtr); pbStampFree(&stampDataPtr); } /* The follwing section was used to plot freq distribution for each AA so that we can view than to decide on whether +/- 2 stddev is applicable and what cutoff thresholds to use. Keep it here for possible future reuse. */ /* vertLabel = cloneString("Frequency"); for (i=strlen(vertLabel)-1; i>=0; i--) { vertLabel[i+1] = '\0'; vgTextCentered(g_vg, 3, 45+i*10, 10, 10, MG_BLACK, g_font, vertLabel+i); vgTextCentered(g_vg, 3, 215+i*10, 10, 10, MG_BLACK, g_font, vertLabel+i); } xPosition = xPosition + 80; for (j=0; j<20; j++) { safef(tempStr, sizeof(tempStr), "%c", aaAlphabet[j]); stampDataPtr = getStampData(tempStr); xPosition = xPosition + stampWidth + stampWidth/8; setPbStampPict(stampPictPtr, stampDataPtr, xPosition, yPosition, stampWidth, stampHeight); drawPbStamp(stampDataPtr, stampPictPtr); drawXScale(stampDataPtr, stampPictPtr, 10); safef(valStr2, sizeof(valStr2), "%c", aaAlphabet[j]); markStamp(stampDataPtr, stampPictPtr, 0.0, valStr2, tx, ty); pbStampFree(&stampDataPtr); } */ }
void markResStdvStamp(struct pbStamp *pbStampPtr, struct pbStampPict *stampPictPtr, int iTarget, double yValueIn, double tx[], double ty[], double avg[], double stddev[]) /* mark the AA residual stddev stamp */ { int ix, iy; double txmin, tymin, txmax, tymax; double yValue, yPlotValue; int len; int xx, yy; double pctLow, pctHi; char cond_str[255]; char *answer; char aaChar; len = pbStampPtr->len; txmin = pbStampPtr->xmin; txmax = pbStampPtr->xmax; /* force fit for the stddev stamp plot */ tymin = -4.0; tymax = 4.0; ix = stampPictPtr->xOrig; iy = stampPictPtr->yOrig; aaChar = aaAlphabet[iTarget]; safef(cond_str, sizeof(cond_str), "AA='%c'", aaChar); answer = sqlGetField(database, "pbAnomLimit", "pctLow", cond_str); pctLow = (double)(atof(answer)); answer = sqlGetField(database, "pbAnomLimit", "pctHi", cond_str); pctHi = (double)(atof(answer)); yScale = (double)(120)/8.0; calStampXY(stampPictPtr, (txmax-txmin)/2.0, tymax, &xx, &yy); yValue = (yValueIn - avg[iTarget])/stddev[iTarget]; if (yValue > tymax) { yPlotValue = tymax; } else { if (yValue < tymin) { yPlotValue = tymin; } else { yPlotValue = yValue; } } if (yValueIn > pctHi) { vLine(tx[iTarget]+0.4, 0.0, yPlotValue, 3, abnormalColor); } else { if (yValueIn <= pctLow) { vLine(tx[iTarget]+0.4, 0.0+yPlotValue, -yPlotValue, 3, abnormalColor); } else { /* normal range */ if ((yValueIn - avg[iTarget]) >= 0.0) { vLine(tx[iTarget]+0.4, 0.0, yPlotValue, 2, normalColor); } else { vLine(tx[iTarget]+0.4, 0.0+yPlotValue, -yPlotValue, 2, normalColor); } } } }
int main(int argc, char *argv[]) { char *skippedKgId; char *lastValidKgId; struct sqlConnection *conn2, *conn3; struct sqlResult *sr2; char query2[256]; char **row2; char *proteinID; FILE *o3, *o7; char *name, *chrom, *strand, *txStart, *txEnd, *cdsStart, *cdsEnd, *exonCount, *exonStarts, *exonEnds; char *alignID; char *chp; int i, j; int isDuplicate; char *genomeDBname; char *proteinDataDate; char proteinsDB[40]; char spDB[40]; char *acc; #define MAX_EXON 1000 int exStart[MAX_EXON], exEnd[MAX_EXON]; int exCount; int aaStart[MAX_EXON], aaEnd[MAX_EXON]; char *sp, *ep; int aalen; int cdsS, cdsE; int eS, eE; if (argc != 3) usage(); proteinDataDate = argv[1]; genomeDBname = argv[2]; safef(spDB, sizeof(spDB), "sp%s", proteinDataDate); safef(proteinsDB, sizeof(proteinsDB), "proteins%s", proteinDataDate); o3 = fopen("j.dat", "w"); o7 = fopen("jj.dat", "w"); conn2= hAllocConn(genomeDBname); conn3= hAllocConn(genomeDBname); inf = mustOpen("sorted.lis", "r"); strcpy(oldInfo, ""); skippedKgId = cloneString(""); lastValidKgId = cloneString(""); isDuplicate = 0; oldMrnaStr = cloneString(""); oldAlignStr = cloneString(""); oldProteinStr = cloneString(""); mrnaStr = cloneString(""); proteinStr = cloneString(""); alignStr = cloneString(""); while (fgets(line_in, 10000, inf) != NULL) { strcpy(line, line_in); chp = strstr(line, "\t"); /* chrom */ chp ++; chp = strstr(chp, "\t"); /* cds block start position */ chp ++; chp = strstr(chp, "\t"); /* cds block end position */ *chp = '\0'; chp++; strcpy(newInfo, line); if (sameString(oldInfo, newInfo)) { isDuplicate = 1; } else { /* remember previous record as old only if it is not a duplicate */ if (!isDuplicate) { oldMrnaStr = mrnaStr; oldProteinStr = proteinStr; oldAlignStr = alignStr; } strcpy(oldInfo, newInfo); isDuplicate = 0; } chp = strstr(chp, "\t"); /* priority score */ chp ++; chp = strstr(chp, "\t"); /* mRNA transcription length */ chp ++; chp = strstr(chp, "\t"); /* mRNA date */ chp ++; mrnaStr = chp; chp = strstr(chp, "\t"); 
/* mRNA ID */ *chp = '\0'; chp ++; mrnaStr = cloneString(mrnaStr); proteinStr = chp; chp = strstr(chp, "\t"); /* protein ID */ *chp = '\0'; chp ++; proteinStr = cloneString(proteinStr); alignID = chp; /* get rid of "end-of-line" character at the end of the string */ alignStr = trimSpaces(alignID); if (isDuplicate) { /* only put out records for valid KG entries */ if (!sameString(oldMrnaStr, skippedKgId) || sameString(oldMrnaStr, lastValidKgId)) { fprintf(o7, "%s\t%s\t%s\t%s\n", oldMrnaStr, oldProteinStr, mrnaStr, proteinStr); } } else { safef(query2, sizeof(query2), "select * from %sTemp.knownGene0 where alignID='%s';", genomeDBname, alignID); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { name = row2[0]; chrom = row2[1]; strand = row2[2]; txStart = row2[3]; txEnd = row2[4]; cdsStart = row2[5]; cdsEnd = row2[6]; exonCount = row2[7]; exonStarts = row2[8]; exonEnds = row2[9]; proteinID = row2[10]; alignID = row2[11]; sscanf(exonCount, "%d", &exCount); sp = cloneString(exonStarts); ep = cloneString(exonEnds); sscanf(cdsStart, "%d", &cdsS); sscanf(cdsEnd, "%d", &cdsE); aalen = 0; j=0; for (i=0; i<exCount; i++) { chp = strstr(sp, ","); *chp = '\0'; sscanf(sp, "%d", &(exStart[i])); chp++; sp = chp; chp = strstr(ep, ","); *chp = '\0'; sscanf(ep, "%d", &(exEnd[i])); eS = exStart[i]; eE = exEnd[i]; if (cdsS > eS) { eS = cdsS; } if (cdsE < eE) { eE = cdsE; } if (eS > eE) { eS = 0; eE = 0; } if (eS != eE) { aaStart[j] = aalen; aaEnd[j] = aaStart[j] + (eE- eS +1)/3 -1; aalen = aalen + (eE- eS +1)/3; j++; } chp++; ep = chp; } cdsLen = aalen; safef(cond_str, sizeof(cond_str), "val='%s'", proteinID); acc = sqlGetField(spDB, "displayId", "acc", cond_str); safef(cond_str, sizeof(cond_str), "acc='%s'", acc); aaStr=sqlGetField(spDB, "protein", "val", cond_str); aaLen = strlen(aaStr); if ((cdsLen > 50) || ((cdsLen * 100)/aaLen > 50)) { fprintf(o3,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", name, chrom, strand, txStart, txEnd, cdsStart, 
cdsEnd, exonCount, exonStarts, exonEnds, proteinID, alignID); lastValidKgId = cloneString(name); } else { printf("skipping %s %d \n", name, cdsLen); skippedKgId = cloneString(name); } row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); } } hFreeConn(&conn2); hFreeConn(&conn3); fclose(o3); fclose(o7); mustSystem("cat j.dat|sort|uniq >knownGene.tab"); mustSystem("cat jj.dat|sort|uniq >duplicate.tab"); mustSystem("rm j.dat"); mustSystem("rm jj.dat"); return(0); }
void doSamT02(char *proteinId, char *database)
/* Display the UCSC SAM-T02 Protein Structure Analysis and Prediction section.
 *
 * proteinId - protein being displayed; mapped to an SGD ID with getSgdId()
 * database  - genome database; output is produced only for "sacCer1" and
 *             only when the samSubdir and protHomolog tables both exist
 *
 * Prints links to the alignment logo, secondary structure prediction, close
 * homologs with e-value <= 0.01, the details page and, when at least one
 * homolog was listed, the 3D model files and pictures. */
{
char *itemName = NULL;
char query2[256];
struct sqlResult *sr2;
char **row2;
struct sqlConnection *conn, *conn2;
char condStr[256];
char *chp;
char *samSubDir;
char *samHttpStr0;      /* SAM server */
char *samHttpStr;       /* UCSC GB site */
int  homologCount;
char *homologID;
char *SCOPdomain;
char *chain;
char *bestEValStr = NULL;
float eValue;
char goodSCOPdomain[40];
int  first = 1;
int  haveTables;

/* return if this genome does not have SAM protein analysis results */
/* defensive logic to guard against the situation that the binary program is
   pushed, but the data tables are not */
conn = sqlConnect(database);
haveTables = sqlTableExists(conn, "samSubdir") && sqlTableExists(conn, "protHomolog");
sqlDisconnect(&conn);   /* released on every path, including the early return */
if (!haveTables)
    return;

if (!sameWord(database, "sacCer1"))
    return;

samHttpStr0 = "http://www.soe.ucsc.edu/research/compbio/yeast-protein-predictions";
samHttpStr  = "../goldenPath/sacCer1/sam";

/* SAM analysis of SGD proteins uses SGD ID, not Swiss-Prot AC */
itemName = getSgdId(proteinId, database);
if (itemName == NULL)
    return;

sqlSafefFrag(condStr, sizeof condStr, "proteinId='%s'", itemName);
samSubDir = sqlGetField(database, "samSubdir", "subdir", condStr);
if (samSubDir == NULL)
    return;

hPrintf("<B>UCSC ");
hPrintf("<A HREF=\"http://www.soe.ucsc.edu/research/compbio/SAM_T02/sam-t02-faq.html\"");
hPrintf(" TARGET=_blank>SAM-T02</A>\n");
hPrintf(" Protein Structure Analysis and Prediction on %s", proteinId);
if (!sameWord(proteinId, itemName))
    hPrintf(" (aka %s)", itemName);
hPrintf("</B><BR>\n");

hPrintf(" <B>Multiple Alignment (sequence logo):</B> \n");
hPrintf("<A HREF=\"%s/%s/%s/%s.t2k.w0.5-logo.pdf\"", samHttpStr, samSubDir, itemName, itemName);
hPrintf(" TARGET=_blank>%s</A> (pdf)<BR>\n", itemName);

hPrintf("<B> Secondary Structure Predictions:</B> \n");
hPrintf("<A HREF=\"%s/%s/%s/%s.t2k.dssp-ehl2-logo.pdf\"", samHttpStr, samSubDir, itemName, itemName);
hPrintf(" TARGET=_blank>%s</A> (pdf)<BR>\n", itemName);

hPrintf("<B> Close Homologs:</B> \n");

/* the only pooled connection used here; allocated once and freed below */
conn2 = hAllocConn(database);
sqlSafef(query2, sizeof query2,
      "select homologID,eValue,SCOPdomain,chain from %s.protHomolog where proteinID='%s' and evalue <= 0.01 order by evalue;",
      database, itemName);
sr2 = sqlMustGetResult(conn2, query2);

homologCount = 0;
strcpy(goodSCOPdomain, "dummy");

while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    if ((row2[0] == NULL) || (row2[1] == NULL) || (row2[2] == NULL))
        continue;
    homologID = row2[0];
    if (sscanf(row2[1], "%e", &eValue) != 1)
        continue;               /* unparsable e-value: nothing to compare */

    /* drop the last dot-separated component of the SCOP domain, if any */
    SCOPdomain = row2[2];
    chp = strrchr(SCOPdomain, '.');
    if (chp != NULL)
        *chp = '\0';
    chain = row2[3];

    if (eValue <= 1.0e-10)
        {
        /* a strong hit defines the domain that weaker hits must share */
        snprintf(goodSCOPdomain, sizeof goodSCOPdomain, "%s", SCOPdomain);
        }
    else
        {
        if (strcmp(goodSCOPdomain, SCOPdomain) != 0)
            continue;
        if (eValue > 0.1)
            continue;
        }

    if (first)
        {
        /* rows arrive ordered by e-value, so the first one listed is the best */
        first = 0;
        bestEValStr = strdup(row2[1]);
        }
    else
        {
        hPrintf(", ");
        }

    hPrintf("\n<A HREF=\"http://www.rcsb.org/pdb/cgi/explore.cgi?job=graphics&pdbId=%s", homologID);
    if ((chain != NULL) && (strlen(chain) >= 1))
        {
        hPrintf("\" TARGET=_blank>%s</A>(chain %s)\n", homologID, chain);
        }
    else
        {
        hPrintf("\" TARGET=_blank>%s</A>\n", homologID);
        }
    homologCount++;
    }
sqlFreeResult(&sr2);
hFreeConn(&conn2);

if (homologCount == 0)
    {
    hPrintf("None\n");
    }

hPrintf("<BR> <B>More Details:</B> \n");
hPrintf("<A HREF=\"%s/%s/%s/summary.html\"", samHttpStr0, samSubDir, itemName);
hPrintf(" TARGET=_blank>%s</A><BR>\n", itemName);

if (homologCount > 0)
    {
    hPrintf(" <B>3D Structure Prediction: </B> \n");
    hPrintf("<A HREF=\"%s/%s/%s/%s.t2k.undertaker-align.pdb.gz\"",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf(" TARGET=_blank>%s</A> (PDB format, gzipped)<BR>\n", itemName);

    hPrintf(" <B>3D Pictures of the Best Model");
    hPrintf(" (E Value: %s):</B><BR>\n", bestEValStr);fflush(stdout);

    hPrintf("<TABLE><TR>\n");
    hPrintf("<TD> </TD>");
    hPrintf("<TD><IMG SRC=\"%s/%s/%s/%s.view1_200.jpg\"></TD>\n",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf("<TD><IMG SRC=\"%s/%s/%s/%s.view2_200.jpg\"></TD>\n",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf("<TD><IMG SRC=\"%s/%s/%s/%s.view3_200.jpg\"></TD>\n",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf("</TR>\n");

    hPrintf("<TR>");
    hPrintf("<TD> </TD>");
    hPrintf("<TD ALIGN=CENTER>Front</TD>");
    hPrintf("<TD ALIGN=CENTER>Top</TD>");
    hPrintf("<TD ALIGN=CENTER>Side</TD>");
    hPrintf("</TR>\n");

    hPrintf("<TR>");
    hPrintf("<TD> </TD>");
    hPrintf("<TD ALIGN=CENTER><A HREF=\"%s/%s/%s/%s.view1_500.jpg\">500x500</A></TD>\n",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf("<TD ALIGN=CENTER><A HREF=\"%s/%s/%s/%s.view2_500.jpg\">500x500</A></TD>\n",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf("<TD ALIGN=CENTER><A HREF=\"%s/%s/%s/%s.view3_500.jpg\">500x500</A></TD>\n",
            samHttpStr, samSubDir, itemName, itemName);
    hPrintf("</TR>\n");
    hPrintf("</TABLE>\n");
    }
else
    {
    hPrintf(" <B>3D Structure Prediction: </B> \n");
    hPrintf("No models presented, because none has E-value <= 0.01.<BR>");
    }
hPrintf("<BR>");
}
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; char query2[256]; struct sqlResult *sr2; char **row2; char cond_str[255]; char *proteinDatabaseName; FILE *o1, *o2, *o3; FILE *fh[23]; char temp_str[1000];; char *accession; char *aaSeq; char *chp; int i, j, len; int ihi, ilow; char *answer; char *protDisplayId; int aaResCnt[30]; char aaAlphabet[30]; int aaResFound; float fvalue1, fvalue2; float p1, p2; int icnt, jcnt; char *taxon; char *database; int sortedCnt; if (argc != 4) usage(); strcpy(aaAlphabet, "WCMHYNFIDQKRTVPGEASLXZB"); proteinDatabaseName = argv[1]; taxon = argv[2]; database = argv[3]; o2 = mustOpen("pbResAvgStd.tab", "w"); for (i=0; i<20; i++) { safef(temp_str, sizeof(temp_str), "%c.txt", aaAlphabet[i]); fh[i] = mustOpen(temp_str, "w"); } conn = hAllocConn(hDefaultDb()); conn2 = hAllocConn(hDefaultDb()); safef(query2, sizeof(query2), "select proteinID from %s.knownGene;", database); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); icnt = 0; jcnt = 0; for (j=0; j<MAXRES; j++) { sumJ[j] = 0; } while (row2 != NULL) { protDisplayId = row2[0]; safef(cond_str, sizeof(cond_str), "val='%s'", protDisplayId); accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str); if (accession == NULL) { safef(cond_str, sizeof(cond_str), "acc='%s'", protDisplayId); accession = sqlGetField(proteinDatabaseName, "displayId", "acc", cond_str); if (accession == NULL) { verbose(2, "'%s' not found.\n", protDisplayId); goto skip; } } safef(cond_str, sizeof(cond_str), "accession='%s'", accession); answer = sqlGetField("proteins040115", "spXref2", "biodatabaseID", cond_str); if (answer == NULL) { /* this protein might be a variant splice protein, and then it won't be in spXref2 */ goto skip; } if (answer[0] != '1') { /* printf("%s not in SWISS-PROT\n", protDisplayId);fflush(stdout); */ goto skip; } safef(cond_str, sizeof(cond_str), "acc='%s'", accession); aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str); if (aaSeq 
== NULL) { printf("Can't find peptide sequence for %s, exiting ...\n", protDisplayId); fflush(stdout); exit(1); } len = strlen(aaSeq); if (len < 100) goto skip; lenDouble = (double)len; for (j=0; j<MAXRES; j++) { aaResCnt[j] = 0; } chp = aaSeq; for (i=0; i<len; i++) { aaResFound = 0; for (j=0; j<MAXRES; j++) { if (*chp == aaAlphabet[j]) { aaResFound = 1; aaResCnt[j] ++; } } if (!aaResFound) { fprintf(stderr, "%c %d not a valid AA residue.\n", *chp, *chp); } chp++; } for (j=0; j<MAXRES; j++) { freq[icnt][j] = (double)aaResCnt[j]/lenDouble; sumJ[j] = sumJ[j] + freq[icnt][j]; } for (j=0; j<20; j++) { fprintf(fh[j], "%15.7f\t%s\n", freq[icnt][j], accession); fflush(fh[j]); } icnt++; if (icnt >= MAXN) errAbort("Too many proteins - please set MAXN to be more than %d\n", MAXN); skip: row2 = sqlNextRow(sr2); } recordCnt = icnt; recordCntDouble = (double)recordCnt; for (j=0; j<20; j++) { carefulClose(&(fh[j])); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); for (j=0; j<MAXRES; j++) { avg[j] = sumJ[j]/recordCntDouble; } for (j=0; j<20; j++) { sum = 0.0; for (i=0; i<recordCnt; i++) { sum = sum + (freq[i][j] - avg[j]) * (freq[i][j] - avg[j]); } sigma[j] = sqrt(sum/(double)(recordCnt-1)); fprintf(o2, "%c\t%f\t%f\n", aaAlphabet[j], avg[j], sigma[j]); } carefulClose(&o2); o1 = mustOpen("pbAnomLimit.tab", "w"); for (j=0; j<20; j++) { safef(temp_str, sizeof(temp_str), "cat %c.txt|sort|uniq > %c.srt", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); /* figure out how many unique entries */ safef(temp_str, sizeof(temp_str), "wc %c.srt > %c.tmp", aaAlphabet[j], aaAlphabet[j]); mustSystem(temp_str); safef(temp_str, sizeof(temp_str), "%c.tmp", aaAlphabet[j]); o3 = mustOpen(temp_str, "r"); mustGetLine(o3, temp_str, 1000); chp = temp_str; while (*chp == ' ') chp++; while (*chp != ' ') chp++; *chp = '\0'; sscanf(temp_str, "%d", &sortedCnt); safef(temp_str, sizeof(temp_str), "rm %c.tmp", aaAlphabet[j]); mustSystem(temp_str); /* cal hi and low cutoff threshold */ ilow = 
(int)((float)sortedCnt * 0.025); ihi = (int)((float)sortedCnt * 0.975); safef(temp_str, sizeof(temp_str), "%c.srt", aaAlphabet[j]); o2 = mustOpen(temp_str, "r"); i=0; for (i=0; i<ilow; i++) { mustGetLine(o2, temp_str, 1000); } sscanf(temp_str, "%f", &fvalue1); mustGetLine(o2, temp_str, 1000); sscanf(temp_str, "%f", &fvalue2); p1 = (fvalue1 + fvalue2)/2.0; for (i=ilow+1; i<ihi; i++) { mustGetLine(o2, temp_str, 1000); } sscanf(temp_str, "%f", &fvalue1); mustGetLine(o2, temp_str, 1000); sscanf(temp_str, "%f", &fvalue2); p2 = (fvalue1 + fvalue2)/2.0; carefulClose(&o2); fprintf(o1, "%c\t%f\t%f\n", aaAlphabet[j], p1, p2); fflush(stdout); for (i=0; i<recordCnt; i++) { measure[i] = freq[i][j]; } safef(temp_str, sizeof(temp_str), "pbAaDist%c.tab", aaAlphabet[j]); calDist(measure, recordCnt, 51, 0.0, 0.005, temp_str); } carefulClose(&o1); return(0); }
static void pbCheckCapacity(int used, int capacity, char *what)
/* Abort if an array already holding "used" entries has no room for another. */
{
if (used >= capacity)
    errAbort("Too many %s values - please raise the array size above %d\n", what, capacity);
}

int main(int argc, char *argv[])
/* Calculate distributions of protein properties for one taxon.
 *
 * argv[1] - protein sequence database name (example: sp031112)
 * argv[2] - proteins database name (example: proteins031112)
 * argv[3] - taxon used to select accessions from accToTaxon
 * argv[4] - genome database
 *
 * Writes pepResDist.tab (overall residue frequencies) and, through calDist(),
 * pepMolWtDist.tab, pepPiDist.tab, pepHydroDist.tab, pepCCntDist.tab,
 * pepExonCntDist.tab and pepIPCntDist.tab. */
{
enum { PB_MAX_PROTEINS = 100000 };

struct sqlConnection *conn2;
char query2[256];
struct sqlResult *sr2;
char **row2;
char cond_str[255];
char *proteinDatabaseName;      /* example: sp031112 */
char *protDbName;               /* example: proteins031112 */
FILE *o2;
char *accession;
char *aaSeq;
char *chp;
int i, j, len;
int cCnt;
int blockCount;
char *answer, *answer2;
double hydroSum;
char *protDisplayId;
int aaResCnt[30];
double aaResCntDouble[30];
char aaAlphabet[30] = "WCMHYNFIDQKRTVPGEASLXZB";
int aaResFound;
int totalResCnt;
int molWtCnt;
int pIcnt;
int icnt, jExon, ipcnt = 0;
/* static: together these are several megabytes, too much for the stack */
static double molWt[PB_MAX_PROTEINS];
static double pI[PB_MAX_PROTEINS];
static double avgHydro[PB_MAX_PROTEINS];
static double cCountDouble[PB_MAX_PROTEINS];
static double exonCountDouble[PB_MAX_PROTEINS];
static double interProCountDouble[PB_MAX_PROTEINS];
/* zero-filled so that characters without an entry below contribute 0
 * instead of an indeterminate value */
double aa_hydro[256] = {0};
char *taxon;
char *database;
char *kgId;

if (argc != 5) usage();

/* Ala:  1.800  Arg: -4.500  Asn: -3.500  Asp: -3.500  Cys:  2.500  Gln: -3.500 */
aa_hydro['A'] =  1.800;
aa_hydro['R'] = -4.500;
aa_hydro['N'] = -3.500;
aa_hydro['D'] = -3.500;
aa_hydro['C'] =  2.500;
aa_hydro['Q'] = -3.500;

/* Glu: -3.500  Gly: -0.400  His: -3.200  Ile:  4.500  Leu:  3.800  Lys: -3.900 */
aa_hydro['E'] = -3.500;
aa_hydro['G'] = -0.400;
aa_hydro['H'] = -3.200;
aa_hydro['I'] =  4.500;
aa_hydro['L'] =  3.800;
aa_hydro['K'] = -3.900;

/* Met:  1.900  Phe:  2.800  Pro: -1.600  Ser: -0.800  Thr: -0.700  Trp: -0.900 */
aa_hydro['M'] =  1.900;
aa_hydro['F'] =  2.800;
aa_hydro['P'] = -1.600;
aa_hydro['S'] = -0.800;
aa_hydro['T'] = -0.700;
aa_hydro['W'] = -0.900;

/* Tyr: -1.300  Val:  4.200  Asx: -3.500  Glx: -3.500  Xaa: -0.490 */
/* NOTE(review): the Asx/Glx/Xaa values listed above have never been assigned
 * (B, Z and X count as 0) - confirm whether that is intended. */
aa_hydro['Y'] = -1.300;
aa_hydro['V'] =  4.200;

proteinDatabaseName = argv[1];
protDbName = argv[2];
taxon = argv[3];
database = argv[4];

o2 = mustOpen("pepResDist.tab", "w");
conn2 = hAllocConn(database);

for (j=0; j<23; j++)
    {
    aaResCnt[j] = 0;
    }

icnt = jExon = 0;
pIcnt = 0;
molWtCnt = 0;

sqlSafef(query2, sizeof(query2), "select acc from %s.accToTaxon where taxon=%s;", proteinDatabaseName, taxon);
sr2 = sqlMustGetResult(conn2, query2);

while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    accession = row2[0];

    sqlSafefFrag(cond_str, sizeof(cond_str), "acc='%s'", accession);
    protDisplayId = sqlGetField(proteinDatabaseName, "displayId", "val", cond_str);

    /* is this protein used by a known gene? */
    answer = NULL;
    if (protDisplayId != NULL)
        {
        sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s'", protDisplayId);
        answer = sqlGetField(database, "knownGene", "name", cond_str);
        }

    /* count InterPro domains */
    if (answer != NULL)
        {
        sqlSafefFrag(cond_str, sizeof(cond_str), "accession='%s'", accession);
        answer2 = sqlGetField(protDbName, "swInterPro", "count(*)", cond_str);
        if (answer2 != NULL)
            {
            pbCheckCapacity(ipcnt, PB_MAX_PROTEINS, "InterPro count");
            interProCountDouble[ipcnt] = (double)(atoi(answer2));
            ipcnt++;
            }
        else
            {
            printf("%s is not in InterPro DB.\n", accession);fflush(stdout);
            }
        }

    /* count exons, using coding exons from kgProtMap2 (KG-III) table */
    sqlSafefFrag(cond_str, sizeof(cond_str), "spID='%s'", accession);
    kgId = sqlGetField(database, "kgXref", "kgID", cond_str);
    answer2 = NULL;
    if (kgId != NULL)
        {
        sqlSafefFrag(cond_str, sizeof(cond_str), "qName='%s'", kgId);
        answer2 = sqlGetField(database, "kgProtMap2", "blockCount", cond_str);
        }
    if (answer2 != NULL)
        {
        blockCount = atoi(answer2);
        if (blockCount == 0)
            {
            errAbort("%s %s has 0 block count\n", accession,
                     (protDisplayId != NULL) ? protDisplayId : "(null)");
            }
        pbCheckCapacity(jExon, PB_MAX_PROTEINS, "exon count");
        exonCountDouble[jExon] = (double)blockCount;
        jExon++;
        }

    /* process Mol Wt */
    sqlSafefFrag(cond_str, sizeof(cond_str), "accession='%s'", accession);
    answer2 = sqlGetField(database, "pepMwAa", "molWeight", cond_str);
    if (answer2 != NULL)
        {
        pbCheckCapacity(molWtCnt, PB_MAX_PROTEINS, "molecular weight");
        molWt[molWtCnt] = (double)(atof(answer2));
        molWtCnt++;
        }

    /* process pI */
    sqlSafefFrag(cond_str, sizeof(cond_str), "accession='%s'", accession);
    answer2 = sqlGetField(database, "pepPi", "pI", cond_str);
    if (answer2 != NULL)
        {
        pbCheckCapacity(pIcnt, PB_MAX_PROTEINS, "pI");
        pI[pIcnt] = (double)(atof(answer2));
        pIcnt++;
        }

    sqlSafefFrag(cond_str, sizeof(cond_str), "acc='%s'", accession);
    aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str);
    if (aaSeq == NULL)
        {
        errAbort("%s does not have protein sequence data in %s, aborting ...\n",
                 accession, proteinDatabaseName);
        }

    /* one pass over the sequence: residue tallies, hydrophobicity, cysteines */
    len = strlen(aaSeq);
    cCnt = 0;
    hydroSum = 0;
    for (chp = aaSeq, i=0; i<len; i++, chp++)
        {
        aaResFound = 0;
        for (j=0; j<23; j++)
            {
            if (*chp == aaAlphabet[j])
                {
                aaResFound = 1;
                aaResCnt[j]++;
                }
            }
        if (!aaResFound)
            {
            warn("%c %d not a valid AA residue in %s:\n%s", *chp, *chp, accession, aaSeq);
            }

        /* index as unsigned so a byte above 127 cannot become negative */
        hydroSum = hydroSum + aa_hydro[(unsigned char)(*chp)];

        /* count Cysteines */
        if ((*chp == 'C') || (*chp == 'c'))
            {
            cCnt++;
            }
        }

    pbCheckCapacity(icnt, PB_MAX_PROTEINS, "protein");
    cCountDouble[icnt] = (double)cCnt;
    avgHydro[icnt] = (len > 0) ? hydroSum/(double)len : 0.0;
    icnt++;
    }

totalResCnt = 0;
for (i=0; i<23; i++)
    {
    totalResCnt = totalResCnt + aaResCnt[i];
    }

/* write out residue count distribution */
for (i=0; i<20; i++)
    {
    aaResCntDouble[i] = (totalResCnt > 0)
                      ? ((double)aaResCnt[i])/((double)totalResCnt)
                      : 0.0;
    fprintf(o2, "%d\t%f\n", i+1, (float)aaResCntDouble[i]);
    }
/* trailing zero entry after the 20 residues */
fprintf(o2, "%d\t%f\n", i+1, 0.0);
carefulClose(&o2);

/* calculate and write out various distributions */
calDist(molWt,               molWtCnt, 21, 0.0, 10000.0, "pepMolWtDist.tab");
calDist(pI,                  pIcnt,    61, 3.0, 0.2,     "pepPiDist.tab");
calDist(avgHydro,            icnt,     41, -2.0, 0.1,    "pepHydroDist.tab");
calDist(cCountDouble,        icnt,     51, 0.0, 1.0,     "pepCCntDist.tab");
calDist(exonCountDouble,     jExon,    31, 0.0, 1.0,     "pepExonCntDist.tab");
calDist(interProCountDouble, ipcnt,    16, 0.0, 1.0,     "pepIPCntDist.tab");

sqlFreeResult(&sr2);
hFreeConn(&conn2);
return(0);
}
int main(int argc, char *argv[])
/* Build keggPathway.tab and keggMapDesc.tab for one genome database.
 *
 * argv[1] - database name; the <db>Temp.locus2Ref0, <db>Temp.locus2Acc0 and
 *           <db>Temp.keggList tables and <db>.knownGene are read
 *
 * For every locus in locus2Ref0, each mRNA accession (seqType 'm') in
 * locus2Acc0 that is also a knownGene name is joined to the locus' rows in
 * keggList.  keggPathway.tab gets (kgID, locusID, mapID) and keggMapDesc.tab
 * gets (mapID, description), both sorted and de-duplicated.
 * Returns 0 on success, 1 if a scratch file cannot be created. */
{
struct sqlConnection *conn, *conn2, *conn3;
char query[256], query2[256], query3[256];
struct sqlResult *sr, *sr2, *sr3;
char **row, **row2, **row3;
char *chp;
FILE *o1, *o2;
char *locusID;          /* LocusLink ID */
char *gbAC;             /* GenBank accession.version */
char *locusID2;         /* LocusLink ID */
char *dbName;
char cond_str[200];
char *kgID;
char *mapID;
char *desc;

if (argc != 2) usage();
dbName = argv[1];

conn  = hAllocConn(dbName);
conn2 = hAllocConn(dbName);
conn3 = hAllocConn(dbName);

o1 = fopen("j.dat", "w");
o2 = fopen("jj.dat", "w");
if ((o1 == NULL) || (o2 == NULL))
    {
    fprintf(stderr, "Can't create j.dat or jj.dat in the current directory.\n");
    return(1);
    }

snprintf(query2, sizeof(query2), "select * from %sTemp.locus2Ref0;", dbName);
sr2 = sqlMustGetResult(conn2, query2);
while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    locusID2 = row2[0];

    snprintf(query, sizeof(query),
             "select * from %sTemp.locus2Acc0 where locusID=%s and seqType='m';",
             dbName, locusID2);
    sr = sqlMustGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        locusID = row[0];
        gbAC    = row[1];

        /* knownGene names carry no version suffix */
        chp = strstr(gbAC, ".");
        if (chp != NULL) *chp = '\0';

        snprintf(cond_str, sizeof(cond_str), "name='%s'", gbAC);
        kgID = sqlGetField(dbName, "knownGene", "name", cond_str);
        if (kgID == NULL)
            continue;

        snprintf(query3, sizeof(query3),
                 "select * from %sTemp.keggList where locusID = '%s'", dbName, locusID);
        sr3 = sqlGetResult(conn3, query3);
        /* the loop condition is the only place the cursor advances, so every
         * pathway row is written */
        while ((row3 = sqlNextRow(sr3)) != NULL)
            {
            mapID = row3[1];
            desc  = row3[2];
            fprintf(o1, "%s\t%s\t%s\n", kgID, locusID, mapID);
            fprintf(o2, "%s\t%s\n", mapID, desc);
            }
        sqlFreeResult(&sr3);
        }
    sqlFreeResult(&sr);
    }
sqlFreeResult(&sr2);

fclose(o1);
fclose(o2);
hFreeConn(&conn);
hFreeConn(&conn2);
hFreeConn(&conn3);

mustSystem("cat j.dat|sort|uniq >keggPathway.tab");
mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");
return(0);
}
/* Build kgXref.tab.
 *
 * argv[1] = temporary KG database holding knownGene,
 * argv[2] = date suffix of the proteins database (proteins<date>),
 * argv[3] = read-only genome database (refLink, mrnaRefseq).
 *
 * For every knownGene row one line is written:
 *   kgID, kgID, spID, protein display ID, gene symbol, RefSeq ID,
 *   protein accession, description.
 * Lines are collected in j.dat and de-duplicated with sort|uniq. */
int main(int argc, char *argv[])
{
    struct sqlConnection *conn2;
    struct sqlResult *sr2;
    char **row2;
    char query2[256];
    char cond_str[256];
    char proteinsDb[255];
    char *kgTempDb, *protDbDate, *ro_DB;
    char *emptyStr = "";        /* shared, never written through */
    FILE *o1;

    if (argc != 4)
        usage();
    kgTempDb   = cloneString(argv[1]);
    protDbDate = cloneString(argv[2]);
    ro_DB      = cloneString(argv[3]);

    safef(proteinsDb, sizeof(proteinsDb), "proteins%s", protDbDate);

    conn2 = hAllocConn(ro_DB);
    o1 = mustOpen("j.dat", "w");

    sqlSafef(query2, sizeof query2, "select name, proteinID from %s.knownGene;", kgTempDb);
    sr2 = sqlMustGetResult(conn2, query2);
    while ((row2 = sqlNextRow(sr2)) != NULL) {
        char *kgID        = row2[0];
        char *kgProteinID = row2[1];
        char *refseqID    = emptyStr;
        char *geneSymbol  = emptyStr;
        char *protAcc     = emptyStr;
        char *spID, *protDisplayId, *desc, *answer;

        sqlSafefFrag(cond_str, sizeof cond_str, "displayID='%s'", kgProteinID);
        spID = sqlGetField(proteinsDb, "spXref3", "accession", cond_str);
        if (spID != NULL) {
            protDisplayId = kgProteinID;
        } else {
            /* process variant splice proteins: look up the parent protein */
            char *parSpID;

            spID = kgProteinID;
            sqlSafefFrag(cond_str, sizeof cond_str, "varAcc='%s'", kgProteinID);
            parSpID = sqlGetField(proteinsDb, "splicProt", "parAcc", cond_str);
            if (parSpID == NULL) {
                fprintf(stderr, "%s not found in spXref3 nor in splicProt.\n", kgProteinID);
                exit(1);
            }
            sqlSafefFrag(cond_str, sizeof cond_str, "accession='%s'", parSpID);
            protDisplayId = sqlGetField(proteinsDb, "spXref3", "displayID", cond_str);
            if (protDisplayId == NULL) {
                /* previously a NULL here was passed on to %s formatting */
                fprintf(stderr, "parent %s of %s has no displayID in spXref3.\n",
                        parSpID, kgProteinID);
                exit(1);
            }
        }

        /* use description for the protein as default, replace it with the
         * RefSeq product or HUGO description if available */
        sqlSafefFrag(cond_str, sizeof cond_str, "displayID='%s'", protDisplayId);
        desc = sqlGetField(proteinsDb, "spXref3", "description", cond_str);

        if (strstr(kgID, "NM_") != NULL) {
            /* special processing for RefSeq DNA based genes */
            char *refSeqName;

            sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", kgID);
            refSeqName = sqlGetField(ro_DB, "refLink", "name", cond_str);
            if (refSeqName != NULL) {
                geneSymbol = refSeqName;
                refseqID = kgID;
                sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", kgID);
                desc = sqlGetField(ro_DB, "refLink", "product", cond_str);
                sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc='%s'", refseqID);
                answer = sqlGetField(ro_DB, "refLink", "protAcc", cond_str);
                if (answer != NULL)
                    protAcc = answer;
            }
        } else {
            char *hugoID;

            sqlSafefFrag(cond_str, sizeof cond_str, "displayID = '%s'", protDisplayId);
            hugoID = sqlGetField(proteinsDb, "spXref3", "hugoSymbol", cond_str);
            if (hugoID != NULL && *hugoID != '\0') {
                geneSymbol = hugoID;
                sqlSafefFrag(cond_str, sizeof cond_str, "displayID = '%s'", protDisplayId);
                desc = sqlGetField(proteinsDb, "spXref3", "hugoDesc", cond_str);
                if (desc == NULL) {
                    printf("%s/%s don't have hugo desc ...\n", kgProteinID, protDisplayId);
                    fflush(stdout);
                }
            }

            sqlSafefFrag(cond_str, sizeof cond_str, "mrna = '%s'", kgID);
            answer = sqlGetField(ro_DB, "mrnaRefseq", "refseq", cond_str);
            if (answer != NULL)
                refseqID = answer;

            /* no HUGO symbol: fall back to the symbol of the related RefSeq */
            if (strlen(geneSymbol) == 0 && strlen(refseqID) != 0) {
                sqlSafefFrag(cond_str, sizeof cond_str, "mrnaAcc = '%s'", refseqID);
                answer = sqlGetField(ro_DB, "refLink", "name", cond_str);
                if (answer != NULL)
                    geneSymbol = answer;
            }
        }

        /* fix missing fields */
        if (strlen(geneSymbol) == 0)
            geneSymbol = kgID;
        /* desc is NULL when a lookup found nothing; the original called
         * strlen() on it and crashed. Treat NULL like an empty description. */
        if (desc == NULL || strlen(desc) == 0)
            desc = "N/A";

        fprintf(o1, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
                kgID, kgID, spID, protDisplayId, geneSymbol, refseqID, protAcc, desc);
    }
    sqlFreeResult(&sr2);        /* was never released */

    carefulClose(&o1);
    hFreeConn(&conn2);

    mustSystem("cat j.dat|sort|uniq >kgXref.tab");
    mustSystem("rm j.dat");
    return(0);
}
void processAlign(char *kgTempDb, char *spDb, char *alignID, int cdsCnt, FILE *outf) { struct sqlConnection *conn2, *conn3, *conn4; char query2[256], query3[256]; struct sqlResult *sr2, *sr3; char **row2, **row3; char *score; char *chrom; char *protAcc; char *mrnaID; char *ranking; int protDbId; char condStr[255]; int i; char *chp; char *isCurated; conn2= hAllocConn(kgTempDb); conn3= hAllocConn(kgTempDb); conn4= hAllocConn(kgTempDb); sqlSafef(query2, sizeof(query2), "select * from %s.kgCandidate where alignID='%s'", kgTempDb, alignID); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { mrnaID = row2[0]; chrom = row2[1]; ranking = row2[11]; /* check if it is a composite mrnaID */ /* if yes, select from entries with both protein and mrna specified */ if (alignID[0] == 'U') { chp = strstr(row2[0], "_"); *chp = '\0'; protAcc = row2[0]; chp ++; mrnaID = chp; sqlSafef(query3, sizeof(query3), "select protAcc, score from %s.protMrnaScore where mrnaAcc='%s' and protAcc='%s'", kgTempDb, mrnaID, protAcc); } else { sqlSafef(query3, sizeof(query3), "select protAcc, score from %s.protMrnaScore where mrnaAcc='%s' order by score desc", kgTempDb, mrnaID); } sr3 = sqlMustGetResult(conn3, query3); row3 = sqlNextRow(sr3); while(row3 != NULL) { protAcc = row3[0]; score = row3[1]; chp = strstr(protAcc, "-"); if (chp == NULL) { sqlSafefFrag(condStr, sizeof(condStr), "acc='%s'", protAcc); isCurated = sqlGetField(spDb, "info", "isCurated", condStr); if (sameWord(isCurated, "1")) { protDbId = 1; } else { protDbId = 2; } } else { protDbId = 4; } fprintf(outf, "%s:", chrom); for (i=0; i<cdsCnt; i++) fprintf(outf, "%s", cdsBloc[i]); fprintf(outf, "\t%s\t%d\t%8s\t%s\t%s\t%s\n", ranking, protDbId, score, mrnaID, protAcc, alignID); /* for composite type, process just one record */ if (alignID[0] == 'U') break; row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); hFreeConn(&conn2); hFreeConn(&conn3); hFreeConn(&conn4); }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn3; char query[256], query3[256]; struct sqlResult *sr, *sr3; char **row, **row3; FILE *o1, *o2; char *locusID; /* LocusLink ID */ char *kgTempDbName, *roDbName; char cond_str[200]; char *kgId; char *mapID; char *desc; char *mRNA; optionInit(&argc, argv, options); if (argc != 3) usage(); kgTempDbName = argv[1]; roDbName = argv[2]; conn = hAllocConn(roDbName); conn3= hAllocConn(roDbName); o1 = fopen("j.dat", "w"); o2 = fopen("jj.dat", "w"); table = optionVal("table", "knownGene"); sqlSafef(query, sizeof(query), "select name from %s.%s", roDbName, table); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); while (row != NULL) { kgId = row[0]; sqlSafefFrag(cond_str, sizeof(cond_str), "kgId='%s'", kgId); mRNA = sqlGetField(roDbName, "kgXref", "mRNA", cond_str); sqlSafefFrag(cond_str, sizeof(cond_str), "mrna='%s'", mRNA); locusID = sqlGetField("entrez", "entrezMrna", "geneId", cond_str); /* look for RefSeq if not found in mRNAs */ if (locusID == NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "refseq='%s'", mRNA); locusID = sqlGetField("entrez", "entrezRefseq", "geneId", cond_str); } if (locusID != NULL) { sqlSafef(query3, sizeof(query3), "select * from %s.keggList where locusID = '%s'", kgTempDbName, locusID); sr3 = sqlGetResult(conn3, query3); while ((row3 = sqlNextRow(sr3)) != NULL) { mapID = row3[1]; desc = row3[2]; fprintf(o1, "%s\t%s\t%s\n", kgId, locusID, mapID); fprintf(o2, "%s\t%s\n", mapID, desc); row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); } else { /* printf("%s not found in Entrez.\n", kgId);fflush(stdout);*/ if (differentString(table, "knownGene")) { sqlSafefFrag(cond_str, sizeof(cond_str), "name='%s'", kgId); locusID = sqlGetField(roDbName, table, "name2", cond_str); sqlSafef(query3, sizeof(query3), "select * from %s.keggList where locusID = '%s'", kgTempDbName, kgId); sr3 = sqlGetResult(conn3, query3); while ((row3 = sqlNextRow(sr3)) != NULL) { mapID = row3[1]; desc = row3[2]; 
fprintf(o1, "%s\t%s\t%s\n", kgId, locusID, mapID); fprintf(o2, "%s\t%s\n", mapID, desc); row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); } } row = sqlNextRow(sr); } fclose(o1); fclose(o2); hFreeConn(&conn); mustSystem("cat j.dat|sort|uniq >keggPathway.tab"); mustSystem("cat jj.dat|sort|uniq >keggMapDesc.tab"); mustSystem("rm j.dat"); mustSystem("rm jj.dat"); return(0); }
void doTracks(char *proteinID, char *mrnaID, char *aa, int *yOffp, char *psOutput) /* draw various protein tracks */ { int l; char aaOrigOffsetStr[20]; int hasResFreq; char uniProtDbName[50]; char *protDbDate; char *chrom; char strand; char *kgId, *kgPep, *protPep; char cond_str[255]; char *answer; //int i, ll; //char *chp1, *chp2; g_font = mgSmallFont(); safef(pbScaleStr, sizeof(pbScaleStr), "%d", pbScale); if (psOutput != NULL) { pbScale = atoi(cartOptionalString(cart, "pbt.pbScaleStr")); } if (cgiOptionalString("trackOffset") != NULL) { trackOrigOffset = atoi(cgiOptionalString("trackOffset")); } if (cgiOptionalString("pbScaleStr") != NULL) { pbScale = atoi(cgiOptionalString("pbScaleStr")); } if (cgiOptionalString("pbScale") != NULL) { scaleButtonPushed = TRUE; if (strcmp(cgiOptionalString("pbScale"), "1/6") == 0) pbScale = 1; if (strcmp(cgiOptionalString("pbScale"), "1/2") == 0) pbScale = 3; if (strcmp(cgiOptionalString("pbScale"), "FULL") == 0) pbScale = 6; if (strcmp(cgiOptionalString("pbScale"), "DNA") == 0) pbScale =22; safef(pbScaleStr, sizeof(pbScaleStr), "%d", pbScale); cgiMakeHiddenVar("pbScaleStr", pbScaleStr); } else { scaleButtonPushed = FALSE; } if (psOutput == NULL) { if (cgiVarExists("pbt.left3")) { relativeScroll(-0.95); initialWindow = FALSE; } else if (cgiVarExists("pbt.left2")) { relativeScroll(-0.475); initialWindow = FALSE; } else if (cgiVarExists("pbt.left1")) { relativeScroll(-0.02); initialWindow = FALSE; } else if (cgiVarExists("pbt.right1")) { relativeScroll(0.02); initialWindow = FALSE; } else if (cgiVarExists("pbt.right2")) { relativeScroll(0.475); initialWindow = FALSE; } else if (cgiVarExists("pbt.right3")) { relativeScroll(0.95); initialWindow = FALSE; } } dnaUtilOpen(); l=strlen(aa); /* initialize AA properties */ aaPropertyInit(&hasResFreq); sfCount = getSuperfamilies2(proteinID); if (sfCount == 0) { sfCount = getSuperfamilies(proteinID); } if (mrnaID != NULL) { if (kgVersion == KG_III) { doExonTrack = FALSE; 
sqlSafefFrag(cond_str, sizeof(cond_str), "spId='%s'", proteinID); kgId = sqlGetField(database, "kgXref", "kgId", cond_str); if (kgId != NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "name='%s'", kgId); kgPep = sqlGetField(database, "knownGenePep", "seq", cond_str); //printf("<pre><br>%s", kgPep);fflush(stdout); if (kgPep != NULL) { if (strstr(protDbName, "proteins") != NULL) { protDbDate = strstr(protDbName, "proteins") + strlen("proteins"); safef(uniProtDbName, sizeof(uniProtDbName),"sp%s", protDbDate); sqlSafefFrag(cond_str, sizeof(cond_str), "acc='%s'", proteinID); protPep = sqlGetField(uniProtDbName, "protein", "val", cond_str); //printf("<br>%s\n", protPep);fflush(stdout); if (protPep != NULL) { if (sameWord(kgPep, protPep)) { //printf("<br>MATCH!\n");fflush(stdout); sqlSafefFrag(cond_str, sizeof(cond_str), "qName='%s'", kgId); answer = sqlGetField(database, kgProtMapTableName, "qName", cond_str); if (answer != NULL) { /* NOTE: passing in kgId instead of proteinID because kgProtMap2's qName uses kgId instead of protein display ID */ getExonInfo(kgId, &exCount, &chrom, &strand); assert(exCount > 0); doExonTrack = TRUE; } } /* else { chp1 = kgPep; printf("<br>"); chp2 = protPep; ll = strlen(kgPep); if (strlen(protPep) < ll) ll= strlen(protPep); for (i=0; i<ll; i++) { if (*chp1 != *chp2) { printf("%c", *chp1); } else { printf("."); } chp1++; chp2++; } } //printf("</pre>");fflush(stdout); */ } } } } } else { doExonTrack = TRUE; getExonInfo(proteinID, &exCount, &chrom, &strand); assert(exCount > 0); } /* do the following only if pbTracks called doTracks() */ if (initialWindow && IAmPbTracks) { prevGBOffsetSav = calPrevGB(exCount, chrom, strand, l, yOffp, proteinID, mrnaID); trackOrigOffset = prevGBOffsetSav; if (trackOrigOffset > (protSeqLen*pbScale - 600)) trackOrigOffset = protSeqLen*pbScale - 600; /* prevent negative value */ if (trackOrigOffset < 0) trackOrigOffset = 0; } /* if this if for PDF/Postscript, the trackOrigOffset is already calculated previously, 
use the saved value */ if (psOutput != NULL) { trackOrigOffset = atoi(cartOptionalString(cart, "pbt.trackOffset")); } } /*printf("<br>%d %d<br>%d %d\n", prevGBStartPos, prevGBEndPos, blockGenomeStartPositive[exCount-1], blockGenomeStartPositive[0]); fflush(stdout); */ if (strand == '-') { if ((prevGBStartPos <= blockGenomeStartPositive[exCount-1]) && (prevGBEndPos >= blockGenomeStartPositive[0])) { showPrevGBPos = FALSE; } } else { if ((prevGBStartPos <= blockGenomeStartPositive[0]) && (prevGBEndPos >= blockGenomeStartPositive[exCount-1])) { showPrevGBPos = FALSE; } } if ((cgiOptionalString("aaOrigOffset") != NULL) && scaleButtonPushed) { trackOrigOffset = atoi(cgiOptionalString("aaOrigOffset"))*pbScale; } pixWidth = 160+ protSeqLen*pbScale; if (pixWidth > MAX_PB_PIXWIDTH) { pixWidth = MAX_PB_PIXWIDTH; } if ((protSeqLen*pbScale - trackOrigOffset) < MAX_PB_PIXWIDTH) { pixWidth = protSeqLen*pbScale - trackOrigOffset + 160; } if (pixWidth < 550) pixWidth = 550; insideWidth = pixWidth-gfxBorder; if (proteinInSupportedGenome) { pixHeight = 250; } else { pixHeight = 215; } if (sfCount > 0) pixHeight = pixHeight + 20; /* make room for individual residues display */ if (pbScale >=6) pixHeight = pixHeight + 20; if (pbScale >=18) pixHeight = pixHeight + 30; if (psOutput) { vg = vgOpenPostScript(pixWidth, pixHeight, psOutput); suppressHtml = TRUE; hideControls = TRUE; } else { trashDirFile(&gifTn, "pbt", "pbt", ".png"); vg = vgOpenPng(pixWidth, pixHeight, gifTn.forCgi, FALSE); } /* Put up horizontal scroll controls. */ hWrites("Move "); hButton("pbt.left3", "<<<"); hButton("pbt.left2", " <<"); hButton("pbt.left1", " < "); hButton("pbt.right1", " > "); hButton("pbt.right2", ">> "); hButton("pbt.right3", ">>>"); hPrintf("     "); /* Put up scaling controls. 
*/ hPrintf("Current scale: "); if (pbScale == 1) hPrintf("1/6 "); if (pbScale == 3) hPrintf("1/2 "); if (pbScale == 6) hPrintf("FULL "); if (pbScale == 22) hPrintf("DNA "); hPrintf("    Rescale to "); hPrintf("<INPUT TYPE=SUBMIT NAME=\"pbScale\" VALUE=\"1/6\">\n"); hPrintf("<INPUT TYPE=SUBMIT NAME=\"pbScale\" VALUE=\"1/2\">\n"); hPrintf("<INPUT TYPE=SUBMIT NAME=\"pbScale\" VALUE=\"FULL\">\n"); if (kgVersion == KG_III) { /* for KG III, the protein has to exist in the kgProtMap2 table (which will turn on doExonTrack flag) to provide the genomic position data needed for DNA sequence display */ if ((proteinInSupportedGenome) && (doExonTrack)) hPrintf("<INPUT TYPE=SUBMIT NAME=\"pbScale\" VALUE=\"DNA\">\n"); } else { if (proteinInSupportedGenome) hPrintf("<INPUT TYPE=SUBMIT NAME=\"pbScale\" VALUE=\"DNA\">\n"); } hPrintf("<FONT SIZE=1><BR><BR></FONT>\n"); g_vg = vg; pbRed = vgFindColorIx(g_vg, 0xf9, 0x51, 0x59); pbBlue = vgFindColorIx(g_vg, 0x00, 0x00, 0xd0); bkgColor = vgFindColorIx(vg, 255, 254, 232); vgBox(vg, 0, 0, insideWidth, pixHeight, bkgColor); /* Start up client side map. */ hPrintf("<MAP Name=%s>\n", mapName); vgSetClip(vg, 0, gfxBorder, insideWidth, pixHeight - 2*gfxBorder); /* start drawing indivisual tracks */ doAAScale(l, yOffp, 1); if (pbScale >= 6) doResidues(aa, l, yOffp); if (pbScale >= 18) doDnaTrack(chrom, strand, exCount, l, yOffp); if ((mrnaID != NULL) && showPrevGBPos) { doPrevGB(exCount, chrom, strand, l, yOffp, proteinID, mrnaID); } if (mrnaID != NULL) { if (doExonTrack) doExon(exCount, chrom, l, yOffp, proteinID, mrnaID); } doCharge(aa, l, yOffp); doHydrophobicity(aa, l, yOffp); doCysteines(aa, l, yOffp); if (sfCount > 0) doSuperfamily(ensPepName, sfCount, yOffp); if (hasResFreq) doAnomalies(aa, l, yOffp); doAAScale(l, yOffp, -1); vgClose(&vg); /* Finish map and save out picture and tell html file about it. 
*/ hPrintf("</MAP>\n"); /* put tracks image here */ hPrintf( "\n<IMG SRC=\"%s\" BORDER=1 WIDTH=%d HEIGHT=%d USEMAP=#%s><BR>", gifTn.forCgi, pixWidth, pixHeight, mapName); if (proteinInSupportedGenome) { hPrintf("<A HREF=\"../goldenPath/help/pbTracksHelpFiles/pbTracksHelp.shtml#tracks\" TARGET=_blank>"); } else { if (hIsGsidServer()) { hPrintf("<A HREF=\"../goldenPath/help/pbTracksHelpFiles/pbGsid/pbTracksHelp.shtml#tracks\" TARGET=_blank>"); } else { hPrintf("<A HREF=\"../goldenPath/help/pbTracksHelpFiles/pbTracksHelp.shtml#tracks\" TARGET=_blank>"); } } hPrintf("Explanation of Protein Tracks</A><br>"); safef(trackOffset, sizeof(trackOffset), "%d", trackOrigOffset); cgiMakeHiddenVar("trackOffset", trackOffset); /* remember where the AA base origin is so that it can be passed to next PB page */ aaOrigOffset = trackOrigOffset/pbScale; safef(aaOrigOffsetStr, sizeof(aaOrigOffsetStr), "%d", aaOrigOffset); cgiMakeHiddenVar("aaOrigOffset", aaOrigOffsetStr); /* save the following state variables, to be used by PDF/Postcript processing */ cartSetString(cart,"pbt.pbScaleStr", pbScaleStr); cartSetString(cart,"pbt.trackOffset", trackOffset); cartSaveSession(cart); fflush(stdout); }
static void showProtH1n1(char *item, char *geneSymbol) { char query2[256]; struct sqlResult *sr2; char **row2; struct sqlConnection *conn2 = hAllocConn(database); char *subjId, *dnaSeqId; char *aaSeqId= NULL; char *gene=NULL; char cond_str[256]; char *predFN; char *homologID; char *SCOPdomain; char *chain; char goodSCOPdomain[40]; int first = 1; float eValue; char *chp; int homologCount; int gotPDBFile = 0; sqlSafef(query2, sizeof(query2), "select subjId, dnaSeqId, aaSeqId, gene from gisaidXref where dnaSeqId='%s'", item); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); if (row2 != NULL) { subjId = strdup(row2[0]); dnaSeqId = strdup(row2[1]); aaSeqId = strdup(row2[2]); gene = strdup(row2[3]); } else { errAbort("%s not found.", item); } sqlFreeResult(&sr2); printf("<H3>Protein Structure Analysis and Prediction</H3>"); printf("<B>Comparison to 1918 Flu Virus:</B> "); printf("<A HREF=\"%s/%s/%s/1918_%s.mutate", getH1n1StructUrl(), gene, aaSeqId, aaSeqId); printf("\" TARGET=_blank>%s</A><BR>\n", aaSeqId); printf("<B>Comparison to A H1N1 gene %s concensus:</B> ", gene); printf("<A HREF=\"%s/%s/%s/consensus_%s.mutate", getH1n1StructUrl(), gene, aaSeqId, aaSeqId); printf("\" TARGET=_blank>%s</A><BR>\n", aaSeqId); printf("<BR><B>3D Structure Prediction of %s concensus sequence (with variation of sequence %s highlighted):", geneSymbol, item); printf("<BR>PDB file:</B> "); char pdbUrl[PATH_LEN]; safef(pdbUrl, sizeof(pdbUrl), "%s/%s/decoys/%s.try1-opt3.pdb.gz", getH1n1StructUrl(), item, item); // Modeller stuff char modelPdbUrl[PATH_LEN]; if (getH1n1Model(gene, modelPdbUrl)) { struct tempName imageFile, chimeraScript, chimerax; mkH1n1StructData(gene, NULL, aaSeqId, &imageFile, &chimeraScript); mkChimerax(gene, modelPdbUrl, chimeraScript.forCgi, &chimerax); printf("<A HREF=\"%s\" TARGET=_blank>%s</A>, view with <A HREF=\"%s\">Chimera</A><BR>\n", modelPdbUrl, gene, chimerax.forHtml); printf("<TABLE>\n"); printf("<TR>\n"); printf("<TD ALIGN=\"center\"><img 
src=\"%s\"></TD>", imageFile.forHtml); printf("</TR>\n"); printf("</TABLE>\n"); } return; gotPDBFile = 0; sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s' and evalue <1.0e-5;", item); printf("<TABLE>\n"); printf("<TR><TD ALIGN=\"center\">Front</TD>\n"); printf("<TD ALIGN=\"center\">Top</TD>\n"); printf("<TD ALIGN=\"center\">Side</TD>\n"); printf("</TR>\n"); printf("<TR>\n"); printf("<TD ALIGN=\"center\"><img src=\"%s/%s/%s.undertaker-align.view1_200.jpg\"></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><img src=\"%s/%s/%s.undertaker-align.view2_200.jpg\"></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><img src=\"%s/%s/%s.undertaker-align.view3_200.jpg\"></TD>", getH1n1StructUrl(), item, item); printf("</TR>\n"); printf("<TR>\n"); printf("<TD ALIGN=\"center\"><A HREF=\"%s/%s/%s.undertaker-align.view1_500.jpg\">500x500</A></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><A HREF=\"%s/%s/%s.undertaker-align.view2_500.jpg\">500x500</A></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><A HREF=\"%s/%s/%s.undertaker-align.view3_500.jpg\">500x500</A></TD>", getH1n1StructUrl(), item, item); printf("</TR>\n"); printf("</TABLE>\n"); printf("<BR><B>Detailed results of SAM-T02:</B> "); printf("<A HREF=\"%s/%s/summary.html", getH1n1StructUrl(), item); printf("\" TARGET=_blank>%s</A><BR>\n", item); /* by pass the following additional processing for now, until two necessary tables are built */ hFreeConn(&conn2); return; if (sqlGetField(database, "protHomolog", "proteinID", cond_str) != NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s'", item); predFN = sqlGetField(database, "protPredFile", "predFileName", cond_str); if (predFN != NULL) { printf("<A HREF=\"../SARS/%s/", item); /* printf("%s.t2k.undertaker-align.pdb\">%s</A><BR>\n", item,item); */ printf("%s\">%s</A><BR>\n", predFN,item); gotPDBFile = 1; } } if (!gotPDBFile) { printf("No high confidence level structure 
prediction available for this sequence."); printf("<BR>\n"); } printf("<B>3D Structure of Close Homologs:</B> "); homologCount = 0; strcpy(goodSCOPdomain, "dummy"); conn2= hAllocConn(database); sqlSafef(query2, sizeof(query2), "select homologID,eValue,SCOPdomain,chain from sc1.protHomolog where proteinID='%s' and evalue <= 0.01;", item); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); if (row2 != NULL) { while (row2 != NULL) { homologID = row2[0]; sscanf(row2[1], "%e", &eValue); SCOPdomain = row2[2]; chp = SCOPdomain+strlen(SCOPdomain)-1; while (*chp != '.') chp--; *chp = '\0'; chain = row2[3]; if (eValue <= 1.0e-10) strcpy(goodSCOPdomain, SCOPdomain); else { if (strcmp(goodSCOPdomain,SCOPdomain) != 0) goto skip; else if (eValue > 0.1) goto skip; } if (first) first = 0; else printf(", "); printf("<A HREF=\"http://www.rcsb.org/pdb/cgi/explore.cgi?job=graphics&pdbId=%s", homologID); if (strlen(chain) >= 1) printf("\"TARGET=_blank>%s(chain %s)</A>", homologID, chain); else printf("\"TARGET=_blank>%s</A>", homologID); homologCount++; skip: row2 = sqlNextRow(sr2); } } hFreeConn(&conn2); sqlFreeResult(&sr2); if (homologCount == 0) printf("None<BR>\n"); printf("<BR><B>Details:</B> "); printf("<A HREF=\"../SARS/%s/summary.html", item); printf("\" TARGET=_blank>%s</A><BR>\n", item); htmlHorizontalLine(); }
int getSuperfamilies2(char *proteinID) /* getSuperfamilies2() superceed getSuperfamilies() starting from hg16, it gets Superfamily data of a protein from ensemblXref3, sfAssign, and sfDes from the proteinsXXXXXX database, and placed them in arrays to be used by doSuperfamily().*/ { struct sqlConnection *conn, *conn2, *conn3; char query[MAXNAMELEN], query2[MAXNAMELEN]; struct sqlResult *sr, *sr2; char **row, **row2; char cond_str[255]; char *sfID, *seqID, *sfDesc, *region; int done; int j; char *chp, *chp2; int sfCnt; int int_start, int_end; if (!hTableExists(protDbName, "sfAssign")) return(0); if (!hTableExists(protDbName, "ensemblXref3")) return(0); conn = hAllocConn(database); conn2 = hAllocConn(database); conn3 = hAllocConn(database); sqlSafef(query2, sizeof(query), "select distinct sfID, seqID from %s.ensemblXref3 x, %s.sfAssign a where (swissAcc='%s' or tremblAcc='%s') and seqID=x.protein and protein != '' and evalue <= 0.02", protDbName, protDbName, proteinID, proteinID); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); sfCnt=0; while (row2 != NULL) { sfID = row2[0]; seqID= row2[1]; sqlSafef(query, sizeof(query), "select region from %s.sfAssign where sfID='%s' and seqID='%s' and evalue <=0.02", protDbName, sfID, seqID); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); while (row != NULL) { region = row[0]; for (j=0; j<sfCnt; j++) { if (sfId[j] == atoi(sfID)) goto skip; } sqlSafefFrag(cond_str, sizeof(cond_str), "id=%s;", sfID); sfDesc = sqlGetField(protDbName, "sfDes", "description", cond_str); /* !!! 
refine logic here later to be defensive against illegal syntax */ chp = region; done = 0; while (!done) { chp2 = strstr(chp, "-"); *chp2 = '\0'; chp2++; sscanf(chp, "%d", &int_start); chp = chp2; chp2 = strstr(chp, ","); if (chp2 != NULL) { *chp2 = '\0'; } else { done = 1; } chp2++; sscanf(chp, "%d", &int_end); sfId[sfCnt] = atoi(sfID); sfStart[sfCnt] = int_start; sfEnd[sfCnt] = int_end; strncpy(superfam_name[sfCnt], sfDesc, MAXNAMELEN-1); sfCnt++; chp = chp2; } skip: row = sqlNextRow(sr); } sqlFreeResult(&sr); row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); hFreeConn(&conn3); return(sfCnt); }
void showSAM_h1n1(char *item) { char query2[256]; struct sqlResult *sr2; char **row2; struct sqlConnection *conn2 = hAllocConn(database); char cond_str[256]; char *predFN; char *homologID; char *SCOPdomain; char *chain; char goodSCOPdomain[40]; int first = 1; float eValue; char *chp; int homologCount; int gotPDBFile = 0; printf("<H3>Protein Structure Analysis and Prediction by "); printf("<A HREF=\"http://www.soe.ucsc.edu/research/compbio/SAM_T02/sam-t02-faq.html\""); printf(" TARGET=_blank>SAM-T02</A></H3>\n"); printf("<B>Multiple Alignment:</B> "); printf("<A HREF=\"%s/%s/summary.html#alignment", getH1n1StructUrl(), item); printf("\" TARGET=_blank>%s</A><BR>\n", item); printf("<B>Secondary Structure Predictions:</B> "); printf("<A HREF=\"%s/%s/summary.html#secondary-structure", getH1n1StructUrl(), item); printf("\" TARGET=_blank>%s</A><BR>\n", item); printf("<B>3D Structure Prediction (PDB file):</B> "); char pdbUrl[PATH_LEN]; safef(pdbUrl, sizeof(pdbUrl), "%s/%s/decoys/%s.try1-opt3.pdb.gz", getH1n1StructUrl(), item, item); struct tempName chimerax; mkChimerax(item, pdbUrl, NULL, &chimerax); printf("<A HREF=\"%s\" TARGET=_blank>%s</A>, view with <A HREF=\"%s\">Chimera</A><BR>\n", pdbUrl, item, chimerax.forHtml); gotPDBFile = 0; sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s' and evalue <1.0e-5;", item); printf("<TABLE>\n"); printf("<TR><TD ALIGN=\"center\">Front</TD>\n"); printf("<TD ALIGN=\"center\">Top</TD>\n"); printf("<TD ALIGN=\"center\">Side</TD>\n"); printf("</TR>\n"); printf("<TR>\n"); printf("<TD ALIGN=\"center\"><img src=\"%s/%s/%s.undertaker-align.view1_200.jpg\"></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><img src=\"%s/%s/%s.undertaker-align.view2_200.jpg\"></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><img src=\"%s/%s/%s.undertaker-align.view3_200.jpg\"></TD>", getH1n1StructUrl(), item, item); printf("</TR>\n"); printf("<TR>\n"); printf("<TD ALIGN=\"center\"><A 
HREF=\"%s/%s/%s.undertaker-align.view1_500.jpg\">500x500</A></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><A HREF=\"%s/%s/%s.undertaker-align.view2_500.jpg\">500x500</A></TD>", getH1n1StructUrl(), item, item); printf("<TD ALIGN=\"center\"><A HREF=\"%s/%s/%s.undertaker-align.view3_500.jpg\">500x500</A></TD>", getH1n1StructUrl(), item, item); printf("</TR>\n"); printf("</TABLE>\n"); printf("<BR><B>Detailed results of SAM-T02:</B> "); printf("<A HREF=\"%s/%s/summary.html", getH1n1StructUrl(), item); printf("\" TARGET=_blank>%s</A><BR>\n", item); /* by pass the following additional processing for now, until two necessary tables are built */ hFreeConn(&conn2); return; if (sqlGetField(database, "protHomolog", "proteinID", cond_str) != NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "proteinID='%s'", item); predFN = sqlGetField(database, "protPredFile", "predFileName", cond_str); if (predFN != NULL) { printf("<A HREF=\"../SARS/%s/", item); /* printf("%s.t2k.undertaker-align.pdb\">%s</A><BR>\n", item,item); */ printf("%s\">%s</A><BR>\n", predFN,item); gotPDBFile = 1; } } if (!gotPDBFile) { printf("No high confidence level structure prediction available for this sequence."); printf("<BR>\n"); } printf("<B>3D Structure of Close Homologs:</B> "); homologCount = 0; strcpy(goodSCOPdomain, "dummy"); conn2= hAllocConn(database); sqlSafef(query2, sizeof(query2), "select homologID,eValue,SCOPdomain,chain from sc1.protHomolog where proteinID='%s' and evalue <= 0.01;", item); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); if (row2 != NULL) { while (row2 != NULL) { homologID = row2[0]; sscanf(row2[1], "%e", &eValue); SCOPdomain = row2[2]; chp = SCOPdomain+strlen(SCOPdomain)-1; while (*chp != '.') chp--; *chp = '\0'; chain = row2[3]; if (eValue <= 1.0e-10) strcpy(goodSCOPdomain, SCOPdomain); else { if (strcmp(goodSCOPdomain,SCOPdomain) != 0) goto skip; else if (eValue > 0.1) goto skip; } if (first) first = 0; else printf(", "); printf("<A 
HREF=\"http://www.rcsb.org/pdb/cgi/explore.cgi?job=graphics&pdbId=%s", homologID); if (strlen(chain) >= 1) printf("\"TARGET=_blank>%s(chain %s)</A>", homologID, chain); else printf("\"TARGET=_blank>%s</A>", homologID); homologCount++; skip: row2 = sqlNextRow(sr2); } } hFreeConn(&conn2); sqlFreeResult(&sr2); if (homologCount == 0) printf("None<BR>\n"); printf("<BR><B>Details:</B> "); printf("<A HREF=\"../SARS/%s/summary.html", item); printf("\" TARGET=_blank>%s</A><BR>\n", item); htmlHorizontalLine(); }
void processRefSeq(char *database, char *faFile, char *raFile, char *pslFile, char *loc2refFile, char *pepFile, char *mim2locFile) /* hgRefSeqMrna - Load refSeq mRNA alignments and other info into * refSeqGene table. */ { struct lineFile *lf; struct hash *raHash, *rsiHash = newHash(0); struct hash *loc2mimHash = newHash(0); struct refSeqInfo *rsiList = NULL, *rsi; char *s, *line, *row[5]; int wordCount, dotMod = 0; int noLocCount = 0; int rsiCount = 0; int noProtCount = 0; struct psl *psl; struct sqlConnection *conn = hgStartUpdate(database); struct hash *productHash = loadNameTable(conn, "productName", 16); struct hash *geneHash = loadNameTable(conn, "geneName", 16); char *kgName = "refGene"; FILE *kgTab = hgCreateTabFile(".", kgName); FILE *productTab = hgCreateTabFile(".", "productName"); FILE *geneTab = hgCreateTabFile(".", "geneName"); FILE *refLinkTab = hgCreateTabFile(".", "refLink"); FILE *refPepTab = hgCreateTabFile(".", "refPep"); FILE *refMrnaTab = hgCreateTabFile(".", "refMrna"); struct exon *exonList = NULL, *exon; char *answer; char cond_str[200]; /* Make refLink and other tables table if they don't exist already. */ sqlMaybeMakeTable(conn, "refLink", refLinkTableDef); sqlUpdate(conn, "NOSQLINJ delete from refLink"); sqlMaybeMakeTable(conn, "refGene", refGeneTableDef); sqlUpdate(conn, "NOSQLINJ delete from refGene"); sqlMaybeMakeTable(conn, "refPep", refPepTableDef); sqlUpdate(conn, "NOSQLINJ delete from refPep"); sqlMaybeMakeTable(conn, "refMrna", refMrnaTableDef); sqlUpdate(conn, "NOSQLINJ delete from refMrna"); /* Scan through locus link to omim ID file and put in hash. */ { char *row[2]; printf("Scanning %s\n", mim2locFile); lf = lineFileOpen(mim2locFile, TRUE); while (lineFileRow(lf, row)) { hashAdd(loc2mimHash, row[1], intToPt(atoi(row[0]))); } lineFileClose(&lf); } /* Scan through .ra file and make up start of refSeqInfo * objects in hash and list. 
*/ printf("Scanning %s\n", raFile); lf = lineFileOpen(raFile, TRUE); while ((raHash = hashNextRa(lf)) != NULL) { if (clDots > 0 && ++dotMod == clDots ) { dotMod = 0; dotOut(); } AllocVar(rsi); slAddHead(&rsiList, rsi); if ((s = hashFindVal(raHash, "acc")) == NULL) errAbort("No acc near line %d of %s", lf->lineIx, lf->fileName); rsi->mrnaAcc = cloneString(s); if ((s = hashFindVal(raHash, "siz")) == NULL) errAbort("No siz near line %d of %s", lf->lineIx, lf->fileName); rsi->size = atoi(s); if ((s = hashFindVal(raHash, "gen")) != NULL) rsi->geneName = cloneString(s); //!!!else //!!! warn("No gene name for %s", rsi->mrnaAcc); if ((s = hashFindVal(raHash, "cds")) != NULL) parseCds(s, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd); else rsi->cdsEnd = rsi->size; if ((s = hashFindVal(raHash, "ngi")) != NULL) rsi->ngi = atoi(s); rsi->geneNameId = putInNameTable(geneHash, geneTab, rsi->geneName); s = hashFindVal(raHash, "pro"); if (s != NULL) rsi->productName = cloneString(s); rsi->productNameId = putInNameTable(productHash, productTab, s); hashAdd(rsiHash, rsi->mrnaAcc, rsi); freeHashAndVals(&raHash); } lineFileClose(&lf); if (clDots) printf("\n"); /* Scan through loc2ref filling in some gaps in rsi. */ printf("Scanning %s\n", loc2refFile); lf = lineFileOpen(loc2refFile, TRUE); while (lineFileNext(lf, &line, NULL)) { char *mrnaAcc; if (line[0] == '#') continue; wordCount = chopTabs(line, row); if (wordCount < 5) errAbort("Expecting at least 5 tab-separated words line %d of %s", lf->lineIx, lf->fileName); mrnaAcc = row[1]; mrnaAcc = accWithoutSuffix(mrnaAcc); if (mrnaAcc[2] != '_') warn("%s is and odd name %d of %s", mrnaAcc, lf->lineIx, lf->fileName); if ((rsi = hashFindVal(rsiHash, mrnaAcc)) != NULL) { rsi->locusLinkId = lineFileNeedNum(lf, row, 0); rsi->omimId = ptToInt(hashFindVal(loc2mimHash, row[0])); rsi->proteinAcc = cloneString(accWithoutSuffix(row[4])); } } lineFileClose(&lf); /* Report how many seem to be missing from loc2ref file. * Write out knownInfo file. 
*/ printf("Writing %s\n", "refLink.tab"); for (rsi = rsiList; rsi != NULL; rsi = rsi->next) { ++rsiCount; if (rsi->locusLinkId == 0) ++noLocCount; if (rsi->proteinAcc == NULL) ++noProtCount; fprintf(refLinkTab, "%s\t%s\t%s\t%s\t%u\t%u\t%u\t%u\n", emptyForNull(rsi->geneName), emptyForNull(rsi->productName), emptyForNull(rsi->mrnaAcc), emptyForNull(rsi->proteinAcc), rsi->geneNameId, rsi->productNameId, rsi->locusLinkId, rsi->omimId); } if (noLocCount) printf("Missing locusLinkIds for %d of %d\n", noLocCount, rsiCount); if (noProtCount) printf("Missing protein accessions for %d of %d\n", noProtCount, rsiCount); /* Process alignments and write them out as genes. */ lf = pslFileOpen(pslFile); dotMod = 0; while ((psl = pslNext(lf)) != NULL) { if (hashFindVal(rsiHash, psl->qName) != NULL) { if (clDots > 0 && ++dotMod == clDots ) { dotMod = 0; dotOut(); } sqlSafefFrag(cond_str, sizeof cond_str, "extAC='%s'", psl->qName); answer = sqlGetField(proteinDB, "spXref2", "displayID", cond_str); if (answer == NULL) { fprintf(stderr, "%s NOT FOUND.\n", psl->qName); fflush(stderr); } if (answer != NULL) { struct genePred *gp = NULL; exonList = pslToExonList(psl); fprintf(kgTab, "%s\t%s\t%c\t%d\t%d\t", psl->qName, psl->tName, psl->strand[0], psl->tStart, psl->tEnd); rsi = hashMustFindVal(rsiHash, psl->qName); gp = genePredFromPsl(psl, rsi->cdsStart, rsi->cdsEnd, genePredStdInsertMergeSize); if (!gp) errAbort("Cannot convert psl (%s) to genePred.\n", psl->qName); fprintf(kgTab, "%d\t%d\t", gp->cdsStart, gp->cdsEnd); fprintf(kgTab, "%d\t", slCount(exonList)); fflush(kgTab); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(kgTab, "%d,", exon->start); fprintf(kgTab, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(kgTab, "%d,", exon->end); fprintf(kgTab, "\n"); slFreeList(&exonList); } } else { fprintf(stderr, "%s found in psl, but not in .fa or .ra data files.\n", psl->qName); fflush(stderr); } } if (clDots) printf("\n"); if (!clTest) { 
writeSeqTable(pepFile, refPepTab, FALSE, TRUE); writeSeqTable(faFile, refMrnaTab, FALSE, FALSE); } carefulClose(&kgTab); carefulClose(&productTab); carefulClose(&geneTab); carefulClose(&refLinkTab); carefulClose(&refPepTab); carefulClose(&refMrnaTab); if (!clTest) { printf("Loading database with %s\n", kgName); fflush(stdout); hgLoadTabFile(conn, ".", kgName, NULL); printf("Loading database with %s\n", "productName"); fflush(stdout); hgLoadTabFile(conn, ".", "productName", NULL); printf("Loading database with %s\n", "geneName"); fflush(stdout); hgLoadTabFile(conn, ".", "geneName", NULL); printf("Loading database with %s\n", "refLink"); fflush(stdout); hgLoadTabFile(conn, ".", "refLink", NULL); printf("Loading database with %s\n", "refPep"); fflush(stdout); hgLoadTabFile(conn, ".", "refPep", NULL); printf("Loading database with %s\n", "refMrna"); fflush(stdout); hgLoadTabFile(conn, ".", "refMrna", NULL); } }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2, *conn3; char query2[256], query3[256]; struct sqlResult *sr2, *sr3; char **row2, **row3; char condStr[255]; char *answer; char *kgTempDb; char *outfileName; FILE *outf; int i; char *chp; char *acc2; char *name, *txStart, *txEnd; char *chrom; char *acc, *stat; char *frame, *start, *stop; char *causes; char *genomeDb; char *geneName; char srcType; int alignCnt = 0; char *candTable, *chkTable; int orfStop, cdsGap, cdsSplice, numCdsIntrons; boolean passed; float ranking; if (argc != 6) usage(); kgTempDb = argv[1]; genomeDb = argv[2]; candTable = argv[3]; chkTable = argv[4]; outfileName = argv[5]; outf = mustOpen(outfileName, "w"); conn = hAllocConn(genomeDb); conn2= hAllocConn(genomeDb); conn3= hAllocConn(genomeDb); /* go through each protein */ safef(query2, sizeof(query2), "select * from %s.%s", kgTempDb, candTable); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { name = row2[0]; chrom = row2[1]; txStart = row2[3]; txEnd = row2[4]; /* retrieve gene-check results */ safef(query3, sizeof(query3), "select * from %s.%s where acc='%s' and chrStart=%s and chrEnd = %s", kgTempDb, chkTable, name, txStart, txEnd); sr3 = sqlMustGetResult(conn3, query3); row3 = sqlNextRow(sr3); { passed = FALSE; ranking = 3; acc = row3[0]; stat = row3[5]; frame = row3[6]; start = row3[7]; stop = row3[8]; orfStop = atoi(row3[9]); cdsGap = atoi(row3[10]); cdsSplice = atoi(row3[12]); numCdsIntrons = atoi(row3[18]); causes = row3[21]; ranking = 9; /* all genes passed gene-check with status ok are considered good */ if (sameWord(stat, "ok")) { passed = TRUE; ranking = 1; } else { /* frame, start, orfStop, and stop conditions must be met for KG candidates */ if ((sameWord(frame, "ok")) && (sameWord(start, "ok")) && (orfStop == 0) && (sameWord(stop, "ok")) ) { ranking = 2; /* accept cdsSplice = 0 or (cdsSplice = 1 and numCdsIntrons > 1) */ if ((cdsSplice == 0) || ((numCdsIntrons > 1) && 
(cdsSplice == 1))) { passed = TRUE; } /* if cdsGap > 0, degrade it ranking by 1. If cdsGap is not a multiple of 3, degrade its ranking further */ if (cdsGap > 0) { ranking = ranking + 1; if ((cdsGap - (cdsGap/3)*3) != 0) ranking = ranking + 1; } } } /* give RefSeq entries 0.5 advantage in its ranking */ safef(condStr, sizeof(condStr), "name='%s'", acc); answer = sqlGetField(genomeDb, "refGene", "name", condStr); if (answer != NULL) { ranking = ranking - 0.5; } else { chp = strstr(acc, "_"); if (chp != NULL) { acc2 = chp + 1; } else { acc2 = acc; } safef(condStr, sizeof(condStr), "name='%s'", acc2); /* If it is an MGC gene, give it a 0.3 advantable */ answer = sqlGetField(genomeDb, "mgcGenes", "name", condStr); if (answer != NULL) { ranking = ranking - 0.3; } } /* print out entries, with their rankings, that passed the above criteria */ if (passed) { /*for (i=0; i<10; i++) { fprintf(outf, "%s\t", row2[i]); } */ geneName = strdup(row2[0]); chp = strstr(geneName, "_"); if (chp != NULL) { if (strstr(geneName, "NM_") != NULL) { srcType = 'R'; /* src is RefSeq */ } else { chp++; /* keep the composite name, so that kgGetCds can process correctly */ /* geneName = chp; */ srcType = 'U'; /* src is UCSC prot/mrna alignment */ } } else { srcType = 'G'; /* src is GenBank */ } alignCnt++; fprintf(outf, "%s\t", geneName); for (i= 1; i<10; i++) fprintf(outf, "%s\t", row2[i]); fprintf(outf, "%c%d\t", srcType, alignCnt); fprintf(outf, "%.2f\n", ranking); } row3 = sqlNextRow(sr3); } sqlFreeResult(&sr3); row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); hFreeConn(&conn); hFreeConn(&conn2); hFreeConn(&conn3); fclose(outf); return(0); }
void makeActiveImagePB(char *psOutput, char *psOutput2) /* Make image and image map. */ { char *mapName = "map"; int pixWidth, pixHeight; char *answer; char cond_str[255]; struct sqlConnection *conn; struct sqlConnection *connCentral; char query[256]; struct sqlResult *sr; char **row; int iypos; char *blatGbDb; char *sciName, *commonName; char *spDisplayId; char *oldDisplayId; conn = sqlConnect(UNIPROT_DB_NAME); hPrintf("<br><font size=4>Protein "); hPrintf("<A HREF=\"http://www.uniprot.org/uniprot/%s\" TARGET=_blank><B>%s</B></A>\n", proteinID, proteinID); spDisplayId = spAccToId(conn, spFindAcc(conn, proteinID)); if (strstr(spDisplayId, spFindAcc(conn, proteinID)) == NULL) { hPrintf(" (aka %s", spDisplayId); /* show once if the new and old displayId are the same */ oldDisplayId = oldSpDisplayId(spDisplayId); if (oldDisplayId != NULL) { if (!sameWord(spDisplayId, oldDisplayId)) { hPrintf(" or %s", oldSpDisplayId(spDisplayId)); } } hPrintf(")\n"); } hPrintf(" %s\n", description); hPrintf("</font><br>"); hPrintf("Organism: "); /* get scientific and Genbank common name of this organism */ sciName = NULL; commonName = NULL; sqlSafefFrag(cond_str, sizeof(cond_str),"accession='%s'", proteinID); answer = sqlGetField(PROTEOME_DB_NAME, "spXref3", "division", cond_str); if (answer != NULL) { sqlSafefFrag(cond_str, sizeof(cond_str), "id=%s and nameType='scientific name'", answer); sciName = sqlGetField(PROTEOME_DB_NAME, "taxonNames", "name", cond_str); sqlSafefFrag(cond_str, sizeof(cond_str), "id=%s and nameType='genbank common name'", answer); commonName = sqlGetField(PROTEOME_DB_NAME, "taxonNames", "name", cond_str); } if (sciName != NULL) { hPrintf("%s", sciName); } if (commonName != NULL) { hPrintf(" (%s)", commonName); } hPrintf("<br>"); protSeq = getAA(proteinID); if (protSeq == NULL) { hUserAbort("%s is not a current valid entry in UniProtKB\n", proteinID); } protSeqLen = strlen(protSeq); fflush(stdout); iypos = 15; doTracks(proteinID, mrnaID, protSeq, &iypos, 
psOutput); if (!hTableExists(database, "pbStamp")) goto histDone; pbScale = 3; pixWidth = 765; insideWidth = pixWidth-gfxBorder; pixHeight = 350; if (psOutput2) { vg2 = vgOpenPostScript(pixWidth, pixHeight, psOutput2); } else { trashDirFile(&gifTn2, "pbt", "pbt", ".png"); vg2 = vgOpenPng(pixWidth, pixHeight, gifTn2.forCgi, FALSE); } g_vg = vg2; pbRed = vgFindColorIx(vg2, 0xf9, 0x51, 0x59); pbBlue = vgFindColorIx(g_vg, 0x00, 0x00, 0xd0); normalColor = pbBlue; abnormalColor = pbRed; bkgColor = vgFindColorIx(vg2, 255, 254, 232); vgBox(vg2, 0, 0, insideWidth, pixHeight, bkgColor); /* Start up client side map. */ mapName=cloneString("pbStamps"); hPrintf("\n<MAP Name=%s>\n", mapName); vgSetClip(vg2, 0, gfxBorder, insideWidth, pixHeight - 2*gfxBorder); iypos = 15; /* Draw stamps. */ doStamps(proteinID, mrnaID, protSeq, vg2, &iypos); /* Finish map. */ hPrintf("</MAP>\n"); /* Save out picture and tell html file about it. */ vgClose(&vg2); hPrintf("<P>"); hPrintf("\n<IMG SRC=\"%s\" BORDER=1 WIDTH=%d HEIGHT=%d USEMAP=#%s><BR>", gifTn2.forCgi, pixWidth, pixHeight, mapName); if (proteinInSupportedGenome) { hPrintf("\n<A HREF=\"../goldenPath/help/pbTracksHelpFiles/pbTracksHelp.shtml#histograms\" TARGET=_blank>"); } else { hPrintf("\n<A HREF=\"../goldenPath/help/pbTracksHelpFiles/pbTracksHelp.shtml#histograms\" TARGET=_blank>"); } hPrintf("Explanation of Protein Property Histograms</A><BR>"); hPrintf("<P>"); histDone: hPrintf("<P>"); fflush(stdout); /* See if a UCSC Genome Browser exist for this organism. If so, display BLAT link. 
*/ connCentral = hConnectCentral(); sqlSafef(query, sizeof(query), "select defaultDb.name from dbDb, defaultDb where dbDb.scientificName='%s' and dbDb.name=defaultDb.name", sciName); sr = sqlGetResult(connCentral, query); row = sqlNextRow(sr); if (row != NULL) { blatGbDb = strdup(row[0]); } else { blatGbDb = NULL; } sqlFreeResult(&sr); hDisconnectCentral(&connCentral); if (proteinInSupportedGenome || (blatGbDb != NULL)) { hPrintf("\n<B>UCSC Links:</B><BR>\n "); hPrintf("<UL>\n"); /* Show GB links only if the protein belongs to a supported genome */ if (proteinInSupportedGenome) { doGenomeBrowserLink(proteinID, mrnaID, hgsidStr); doGeneDetailsLink(proteinID, mrnaID, hgsidStr); } /* Show Gene Sorter link only if it is valid for this genome */ if (hgNearOk(database)) { doGeneSorterLink(protDisplayID, mrnaID, hgsidStr); } /* Show BLAT link if we have UCSC Genome Browser for it */ if (blatGbDb != NULL) { doBlatLink(blatGbDb, sciName, commonName, protSeq); } hPrintf("</UL><P>"); } /* This section shows various types of domains */ conn = sqlConnect(UNIPROT_DB_NAME); domainsPrint(conn, proteinID); hPrintf("<P>"); /* Do Pathway section only if the protein belongs to a supported genome */ if (proteinInSupportedGenome); { doPathwayLinks(proteinID, mrnaID); } printFASTA(proteinID, protSeq); }
int main(int argc, char *argv[]) { struct sqlConnection *conn, *conn2; struct sqlResult *sr2; char query2[256]; char **row2; char condStr[255]; char *uniProtDb; char *score; FILE *outf; FILE *dupOutf; char *chrom, *cdsStart, *cdsEnd; char *displayID; char *oldDisplayID; char *chp, *chp1; int i; int isDuplicate; char *kgTempDb; char *infileName, *outfileName, *dupOutfileName; if (argc != 6) usage(); kgTempDb = argv[1]; uniProtDb = argv[2]; infileName = argv[3]; outfileName = argv[4]; dupOutfileName = argv[5]; inf = mustOpen(infileName, "r"); outf = mustOpen(outfileName, "w"); dupOutf = mustOpen(dupOutfileName, "w"); conn = hAllocConn(); conn2= hAllocConn(); strcpy(oldInfo, ""); isDuplicate = 0; oldMrnaStr = cloneString(""); oldAlignStr = cloneString(""); oldProteinStr = cloneString(""); oldDisplayID = cloneString(""); mrnaStr = cloneString(""); proteinStr = cloneString(""); while (fgets(line_in, 500, inf) != NULL) { strcpy(line, line_in); strcpy(line2, line_in); chp = strstr(line, "\t"); *chp = '\0'; mrnaStr = strdup(line); chp ++; chp1 = chp; chp = strstr(chp, "\t"); *chp = '\0'; chrom = strdup(chp1); chp ++; chp1 = chp; chp = strstr(chp, "\t"); *chp = '\0'; cdsStart = strdup(chp1); chp ++; chp1 = chp; chp = strstr(chp, "\t"); *chp = '\0'; cdsEnd = strdup(chp1); chp1 = line2 + (chp - line); *chp1 = '\0'; chp ++; chp1 = chp; chp = strstr(chp, "\t"); *chp = '\0'; score= strdup(chp1); chp ++; chp1 = chp; chp = strstr(chp, "\n"); *chp = '\0'; proteinStr= strdup(chp1); strcpy(newInfo, line2); if (sameString(oldInfo, newInfo)) { isDuplicate = 1; sqlSafefFrag(condStr, sizeof(condStr), "acc='%s'", proteinStr); displayID = sqlGetField(uniProtDb, "displayId", "val", condStr); if (displayID == NULL) { printf("!!! %s not found\n", proteinStr);fflush(stdout); } sqlSafefFrag(condStr, sizeof(condStr), "acc='%s'", oldProteinStr); oldDisplayID = sqlGetField(uniProtDb, "displayId", "val", condStr); if (oldDisplayID == NULL) { printf("!!! 
%s not found\n", oldProteinStr);fflush(stdout); } fprintf(dupOutf, "%s\t%s\t%s\t%s\n", oldMrnaStr, oldDisplayID, mrnaStr, displayID);fflush(stdout); } else { /* remember previous record as old only if it is not a duplicate */ if (!isDuplicate) { oldMrnaStr = mrnaStr; oldProteinStr = proteinStr; } strcpy(oldInfo, newInfo); isDuplicate = 0; sqlSafef(query2, sizeof(query2), "select * from %s.kgCandidate2 where name='%s' and proteinID='%s' and chrom='%s' and cdsStart='%s' and cdsEnd='%s'", kgTempDb, mrnaStr, proteinStr, chrom, cdsStart, cdsEnd); sr2 = sqlMustGetResult(conn2, query2); row2 = sqlNextRow(sr2); while (row2 != NULL) { for (i=0; i<10; i++) fprintf(outf, "%s\t", row2[i]); if (!sameWord(proteinStr, row2[10])) { printf("\n??? %s\t%s\n", proteinStr, row2[10]);fflush(stdout); } sqlSafefFrag(condStr, sizeof(condStr), "acc='%s'", proteinStr); displayID = sqlGetField(uniProtDb, "displayId", "val", condStr); if (displayID == NULL) { printf("!!! %s not found\n", proteinStr);fflush(stdout); } fprintf(outf, "%s\t", displayID); fprintf(outf, "%s\n", row2[11]); row2 = sqlNextRow(sr2); } sqlFreeResult(&sr2); } } fclose(inf); fclose(outf); fclose(dupOutf); return(0); }
int main(int argc, char *argv[]) { FILE *inf; char *mrnaDate; int months; char dirName[PATH_MAX]; struct sqlConnection *conn, *conn3; char query[256]; struct sqlResult *sr; char **row; char *protAcc, *mrnaAcc, *matchStr; char *protSizeStr, *mrnaSizeStr; int protSize, mrnaSize, match; char *protMrnaTableName; char condStr[255]; int score; if (argc != 5) usage(); proteinDataDate = argv[1]; kgTempDb = argv[2]; genomeReadOnly = argv[3]; protMrnaTableName = argv[4]; sprintf(spDB, "sp%s", proteinDataDate); sprintf(proteinsDB, "proteins%s", proteinDataDate); sprintf(gbTempDB, "%sTemp", kgTempDb); inf = fopen("protein.lis", "r"); if ((FILE *) NULL == inf) errAbort("ERROR: Can not open input file: protein.lis"); o3 = fopen("kgBestMrna.out", "w"); if ((FILE *) NULL == o3) errAbort("ERROR: Can not open output file: kgBestMrna.out"); o7 = fopen("best.lis", "w"); if ((FILE *) NULL == o7) errAbort("ERROR: Can not open output file: best.lis"); conn = hAllocConn(genomeReadOnly); conn3= hAllocConn(genomeReadOnly); proteinCount = 0; snprintf(dirName, (size_t) sizeof(dirName), "%s", "./clusterRun" ); sqlSafef(query, sizeof query,"select qName, tName, matches, qSize, tSize from %s.%s", kgTempDb, protMrnaTableName); sr = sqlMustGetResult(conn, query); row = sqlNextRow(sr); while (row != NULL) { protAcc = row[0]; mrnaAcc = row[1]; matchStr = row[2]; protSizeStr = row[3]; mrnaSizeStr = row[4]; sscanf(matchStr, "%d", &match); sscanf(protSizeStr, "%d", &protSize); sscanf(mrnaSizeStr, "%d", &mrnaSize); sscanf(matchStr, "%d", &match); if ((float)match/(float)protSize > 0.3) { sqlSafefFrag(cond_str, sizeof cond_str, "acc='%s'", mrnaAcc); mrnaDate = sqlGetField(genomeReadOnly, "gbCdnaInfo", "moddate", condStr); if (mrnaDate != NULL) { months = cal_months(mrnaDate); score = mrnaSize + months*2 - (protSize - match) *50; printf("%s\t%s\t%d\n", protAcc, mrnaAcc, score);fflush(stdout); } } row = sqlNextRow(sr); } hFreeConn(&conn); hFreeConn(&conn3); fclose(o3); fclose(o7); return(0); }
int main(int argc, char *argv[])
/* Convert contig-relative rows to chromosome coordinates and insert a gap
 * row whenever the contig changes.
 * Arguments: database inFileName outFileName.
 * Each input line carries nine whitespace-separated fields: contig, start,
 * end, num, code, id, oStart, oEnd, strand.  The contig's chr_start is read
 * from database.seq_contig and added to start and end.  Before the first
 * row of a new contig an "N" row is written that runs from the end of the
 * previous row to just before the new row and is labelled with the previous
 * contig's name.
 * Blank lines are skipped.  Returns 1 after printing a message when a line
 * has fewer than nine fields or its contig is not in seq_contig. */
{
struct sqlConnection *conn2;

char condStr[500];
FILE *inf;
FILE *outf;
char line[1000];
char *chrStart;

char *inFileName, *outFileName;
/* Every text buffer is 100 bytes, matching the %99s widths below. */
char contig[100], start[100], end[100];
char num[100], code[100], id[100], oStart[100], oEnd[100], strnd[100];
char oldContig[100] = "";

char *database;
int oldNum  = 0;
int lastNum = 0;
int lastEnd = 0;
int status  = 0;

if (argc != 4)usage();

database    = argv[1];
inFileName  = argv[2];
outFileName = argv[3];

hSetDb(database);

outf = mustOpen(outFileName, "w");
conn2= hAllocConn();

inf  = mustOpen(inFileName, "r");

while (fgets(line, sizeof line, inf) != NULL)
    {
    int fieldCount;
    int chromBase, segStart, segEnd;

    /* Bounded conversions: a long token can no longer overrun a buffer. */
    fieldCount = sscanf(line, "%99s\t%99s\t%99s\t%99s\t%99s\t%99s\t%99s\t%99s\t%99s\n",
			contig, start, end, num, code, id, oStart, oEnd, strnd);
    if (fieldCount <= 0)
	continue;	/* blank line: nothing to convert */
    if (fieldCount != 9)
	{
	fprintf(stderr, "ERROR: expecting 9 fields, got %d: %s", fieldCount, line);
	status = 1;
	break;
	}

    sqlSafefFrag(condStr, sizeof condStr, "ctg_acc='%s'", contig);
    chrStart = sqlGetField(database, "seq_contig", "chr_start", condStr);
    if (chrStart == NULL)
	{
	fprintf(stderr, "ERROR: contig %s not found in %s.seq_contig\n", contig, database);
	status = 1;
	break;
	}

    chromBase = atoi(chrStart);
    segStart  = chromBase + atoi(start) - 1;
    segEnd    = chromBase + atoi(end) - 1;

    if (!sameWord(oldContig, contig))
	{
	if (!sameWord(oldContig, ""))
	    {
	    /* Gap row between the previous row and this one. */
	    int gapStart = lastEnd + 1;
	    int gapEnd   = segStart - 1;

	    lastNum++;
	    fprintf(outf, "%s\t%d\t%d\t", oldContig, gapStart, gapEnd);
	    fprintf(outf, "%d\t%s\t%d\t%s\t%s\n", lastNum, "N",
		    gapEnd - gapStart + 1, "contig", "no");
	    }
	/* Copy into a fixed buffer instead of duplicating on the heap, so
	 * nothing is leaked each time the contig changes. */
	snprintf(oldContig, sizeof oldContig, "%s", contig);
	oldNum = lastNum;
	}

    /* Row numbers restart in each contig, so offset them by the count
     * reached when the contig began. */
    lastNum = atoi(num) + oldNum;
    fprintf(outf, "%s\t%d\t%d\t", contig, segStart, segEnd);
    fprintf(outf, "%d\t%s\t%s\t%s\t%s\t%s\n", lastNum, code, id, oStart, oEnd, strnd);
    lastEnd = segEnd;
    }

hFreeConn(&conn2);
fclose(inf);
fclose(outf);
return(status);
}