void findGenePredOverlap(struct hash *chromHash, char **row, FILE *outFh) /* find and output overlaps with a genePred object */ { struct genePred *gene = genePredLoad(row); struct binKeeper *chromBins = getChromBins(chromHash, gene->chrom, gene->strand); struct geneLoc *geneLocList = NULL; struct geneLoc *geneLoc; int iExon; /* get any with overlaping exons */ for (iExon = 0; iExon < gene->exonCount; iExon++) { int exonStart = gene->exonStarts[iExon]; int exonEnd = gene->exonEnds[iExon]; if (gCdsOnly) { exonStart = max(exonStart, gene->cdsStart); exonEnd = min(exonEnd, gene->cdsEnd); } if (exonStart < exonEnd) findOverlapingExons(&geneLocList, chromBins, exonStart, exonEnd); } for (geneLoc = geneLocList; geneLoc != NULL; geneLoc = geneLoc->next) fprintf(outFh, "%s\t%s\t%s\t%d\t%d\t%s\t%d\t%d\t%d\n", geneLoc->chrom, geneLoc->strand, gene->name, gene->txStart, gene->txEnd, geneLoc->name, geneLoc->start, geneLoc->end, geneLoc->numOverlap); geneLocUnlink(&geneLocList); genePredFree(&gene); }
static void wrapHgGeneLink(struct sqlConnection *conn, char *name, char *label, char *geneTable) /* Wrap label with link to hgGene if possible. */ { char query[256]; struct sqlResult *sr; char **row; int rowOffset = hOffsetPastBin(database, seqName, "sgdGene"); sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", geneTable, name); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row+rowOffset); printf("<A HREF=\"../cgi-bin/hgGene?db=%s", database); printf("&hgg_gene=%s", gp->name); printf("&hgg_chrom=%s", gp->chrom); printf("&hgg_start=%d", gp->txStart); printf("&hgg_end=%d", gp->txEnd); printf("\">"); printf("%s", label); printf("</A>"); } else printf("%s", label); sqlFreeResult(&sr); }
static void chkGenePredRows(struct gbSelect* select, struct sqlConnection* conn,
                            char* table, boolean isRefFlat,
                            struct metaDataTbls* metaDataTbls, unsigned typeFlags)
/* Run chkGenePred() on every row of a genePred or refFlat table.
 * The genePred columns start one column later for refFlat, and one more
 * column later when the table has a "bin" field.  For refFlat, row[0] is
 * passed along as the gene name; otherwise the gene name is NULL. */
{
char sql[512];
char **cols;
char *refFlatName = NULL;
unsigned rowNum;
int skip = 0;   /* leading columns that are not part of the genePred */

if (isRefFlat)
    skip = 1;
if (sqlFieldIndex(conn, table, "bin") >= 0)
    skip += 1;

sqlSafef(sql, sizeof(sql), "SELECT * FROM %s", table);
struct sqlResult *result = sqlGetResult(conn, sql);
for (rowNum = 0; (cols = sqlNextRow(result)) != NULL; rowNum++)
    {
    struct genePred *gp = genePredLoad(cols + skip);
    if (isRefFlat)
        refFlatName = cols[0];
    chkGenePred(gp, refFlatName, rowNum, select->release->genome->database,
                table, metaDataTbls, typeFlags);
    genePredFree(&gp);
    }
sqlFreeResult(&result);
}
void liftGenePred(char *destFile, struct hash *liftHash, int sourceCount, char *sources[]) /* Lift a genePred files. */ { char *row[GENEPRED_NUM_COLS]; struct lineFile* lf; FILE* dest = mustOpen(destFile, "w"); int iSrc; for (iSrc = 0; iSrc < sourceCount; iSrc++) { verbose(1, "Lifting %s\n", sources[iSrc]); lf = lineFileOpen(sources[iSrc], TRUE); while (lineFileChopNextTab(lf, row, ArraySize(row))) { struct genePred* gp = genePredLoad(row); if (liftGenePredObj(liftHash, gp, lf)) genePredTabOut(gp, dest); genePredFree(&gp); } lineFileClose(&lf); if (dots) verbose(1, "\n"); } carefulClose(&dest); }
void freen(char *chrom) /* Test something */ { uglyTime(NULL); struct sqlConnection *conn = sqlConnect("hg19"); uglyTime("connect"); char query[512]; sqlSafef(query, sizeof(query), "select * from knownGene where chrom='%s'", chrom); struct sqlResult *sr = sqlGetResult(conn, query); uglyTime("get result"); char **row; struct rbTree *rt = rangeTreeNew(); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row); int i; int exonCount = gp->exonCount; for (i=0; i<exonCount; ++i) rangeTreeAdd(rt, gp->exonStarts[i], gp->exonEnds[i]); } uglyTime("Add rows"); struct range *list = rangeTreeList(rt); uglyTime("Did list"); uglyf("%d items in chrom %s\n", slCount(list), chrom); }
static void showMrnaFromGenePred(struct sqlConnection *conn, char *geneId, char *geneName) /* Get mRNA sequence for gene from gene prediction. */ { char *table = genomeSetting("knownGene"); struct sqlResult *sr; char **row; char query[256]; boolean hasBin = hIsBinned(sqlGetDatabase(conn), table); hPrintf("<TT><PRE>"); safef(query, sizeof(query), "select * from %s where name='%s'" " and chrom='%s' and txStart=%d and txEnd=%d", table, geneId, curGeneChrom, curGeneStart, curGeneEnd); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct genePred *gene = genePredLoad(row+hasBin); struct bed *bed = bedFromGenePred(gene); struct dnaSeq *seq = hSeqForBed(sqlGetDatabase(conn), bed); hPrintf(">%s (%s predicted mRNA)\n", geneId, geneName); writeSeqWithBreaks(stdout, seq->dna, seq->size, 50); dnaSeqFree(&seq); bedFree(&bed); genePredFree(&gene); } else errAbort("Couldn't find %s at %s:%d-%d", geneId, curGeneChrom, curGeneStart, curGeneEnd); sqlFreeResult(&sr); hPrintf("</TT></PRE>"); }
static struct chromAnn* chromAnnGenePredReaderRead(struct chromAnnReader *car) /* Read the next genePred row and create a chromAnn object row read from a * GenePred file or table. If there is no CDS, and chromAnnCds is specified, * it will return a record with zero-length range.*/ { struct rowReader *rr = car->data; if (!rowReaderNext(rr)) return NULL; rowReaderExpectAtLeast(rr, GENEPRED_NUM_COLS); char **rawCols = (car->opts & chromAnnSaveLines) ? rowReaderCloneColumns(rr) : NULL; struct genePred *gp = genePredLoad(rr->row); struct chromAnn* ca = chromAnnNew(gp->chrom, gp->strand[0], gp->name, rawCols, strVectorWrite, strVectorFree); if (car->opts & chromAnnRange) { if (car->opts & chromAnnCds) { if (gp->cdsStart < gp->cdsEnd) chromAnnBlkNew(ca, gp->cdsStart, gp->cdsEnd); } else chromAnnBlkNew(ca, gp->txStart, gp->txEnd); } else addGenePredBlocks(ca, car->opts, gp); chromAnnFinish(ca); genePredFree(&gp); return ca; }
void readGenes(char *fileName, struct hash **retHash, struct chromGenes **retList) /* Read genes into a hash of chromGenes. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *hash = newHash(8); struct chromGenes *chrom, *chromList = NULL; struct genePred *gp; char *row[10]; int count = 0; while (lineFileRow(lf, row)) { gp = genePredLoad(row); if ((chrom = hashFindVal(hash, gp->chrom)) == NULL) { AllocVar(chrom); hashAddSaveName(hash, gp->chrom, chrom, &chrom->name); slAddHead(&chromList, chrom); } slAddHead(&chrom->geneList, gp); ++count; } printf("Read %d genes in %d chromosomes in %s\n", count, slCount(chromList), fileName); lineFileClose(&lf); slSort(&chromList, chromGenesCmpName); *retHash = hash; *retList = chromList; }
void printBands(char *database, struct refLink *rl, FILE *f) /* Print name of genes and bands it occurs on. */ { struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; struct genePred *gp; char query[512]; int count = 0; struct dyString *bands = newDyString(0); char band[64]; sprintf(query, "select * from refGene where name = '%s'", rl->mrnaAcc); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { ++count; gp = genePredLoad(row); if (hChromBand(database, gp->chrom, (gp->txStart + gp->txEnd)/2, band)) dyStringPrintf(bands, "%s,", band); else dyStringPrintf(bands, "n/a,"); } if (count > 0) fprintf(f, "%s\t%s\t%d\t%s\n", rl->name, rl->mrnaAcc, count, bands->string); dyStringFree(&bands); sqlFreeResult(&sr); hFreeConn(&conn); }
static struct genePred *getCurGenePred(struct sqlConnection *conn) /* Return current gene in genePred. */ { char *track = genomeSetting("knownGene"); char table[HDB_MAX_TABLE_STRING]; boolean hasBin; char query[256]; struct sqlResult *sr; char **row; struct genePred *gp = NULL; if (!hFindSplitTable(sqlGetDatabase(conn), curGeneChrom, track, table, sizeof table, &hasBin)) errAbort("track %s not found", track); bool hasAttrId = sqlColumnExists(conn, table, "alignId"); sqlSafef(query, sizeof(query), "select * from %s where name = '%s' " "and chrom = '%s' and txStart=%d and txEnd=%d" , table, curGeneId, curGeneChrom, curGeneStart, curGeneEnd); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row + hasBin); #define ALIGNIDFIELD 11 // Gencode Id if (hasAttrId) curAlignId = cloneString(row[ALIGNIDFIELD]); else curAlignId = gp->name; } sqlFreeResult(&sr); if (gp == NULL) errAbort("getCurGenePred: Can't find %s", query); return gp; }
struct psl *getParentAligns(struct sqlConnection *conn, struct mappingInfo *mi, char **table) { struct ucscRetroInfo *pg = mi->pg; struct psl *pslList = NULL; char query[512]; if (startsWith("August",mi->geneSet)) { if (hTableExists(database, "augustusXAli")) { *table = cloneString( "augustusXAli"); pslList = loadPslRangeT(*table, mi->seqId, pg->gChrom, pg->gStart, pg->gEnd); } else if (hTableExists(database, "augustusX")) { struct sqlResult *sr; char **row; int targetSize = 0; *table = cloneString( "augustusX"); sqlSafef(query, sizeof(query), "select * from augustusX where chrom = '%s' and txEnd > %d and txStart < %d and name like '%s%%'", pg->gChrom, pg->gStart, pg->gEnd , mi->seqId ); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row+1); sqlSafef(query, sizeof(query), "select size from chromInfo where chrom = '%s' " , gp->chrom); sqlFreeResult(&sr); targetSize = sqlNeedQuickNum(conn, query) ; pslList = pslFromGenePred(gp, targetSize); } } } else if (hTableExists(database, "all_mrna")) { char parent[255]; char *dotPtr ; *table = cloneString( "all_mrna"); safef(parent, sizeof(parent), "%s",pg->name); /* strip off version and unique suffix when looking for parent gene*/ dotPtr = rStringIn(".",parent) ; if (dotPtr != NULL) *dotPtr = '\0'; pslList = loadPslRangeT(*table, mi->gbAcc, pg->gChrom, pg->gStart, pg->gEnd); if (pslList == NULL) { *table = cloneString( "refSeqAli"); pslList = loadPslRangeT(*table, mi->gbAcc, pg->gChrom, pg->gStart, pg->gEnd); } } else printf("no all_mrna table found<br>\n"); return pslList; }
struct genePred *readGenes(char *chrom)
/* Slurp in the genes for one chrom: every row of geneTable on chrom is
 * loaded as a genePred.  The returned list is in query order. */
{
char sql[512];
char **fields;
struct genePred *genes = NULL;
struct sqlConnection *conn = hAllocConn();
struct sqlResult *result;

sqlSafef(query_is_sql_only(sql), sizeof(sql), "select * from %s where chrom='%s' ", geneTable, chrom);
result = sqlGetResult(conn, sql);
for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
    {
    struct genePred *gene = genePredLoad(fields);
    slAddHead(&genes, gene);
    }
sqlFreeResult(&result);
hFreeConn(&conn);

/* rows were pushed on the front, so flip the list back into query order */
slReverse(&genes);
return genes;
}
struct genePred *loadGeneFromTable(struct sqlConnection *conn, char *table,
                                   char *chrom, int chromStart, int chromEnd)
/** Load all of the genes between chromstart and chromEnd.
 * Runs hRangeQuery() on table and loads each returned row as a genePred.
 * The list is returned in query order. */
{
int binOffset = -100;   /* overwritten by hRangeQuery() */
char **fields = NULL;
struct genePred *genes = NULL;
struct sqlResult *result = hRangeQuery(conn, table, chrom, chromStart, chromEnd,
                                       NULL, &binOffset);

for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
    {
    struct genePred *gp = genePredLoad(fields + binOffset);
    slSafeAddHead(&genes, gp);
    }
sqlFreeResult(&result);
slReverse(&genes);
return genes;
}
static void capAliTextOnTrack(struct mafAli *maf,
                              char *db, char *chrom, char *track, boolean onlyCds)
/* Capitalize exons in alignment.
 * Queries track for genePreds overlapping the range of the first maf
 * component.  Each exon, clipped to the CDS when onlyCds is set and then to
 * that range, is passed to findAliRange(); when that succeeds the returned
 * text span is upper-cased in every component that has text. */
{
int binOffset;
struct sqlConnection *conn = sqlConnect(db);
struct mafComp *selfMc = maf->components;
struct mafComp *comp;
int regionStart = selfMc->start;
int regionEnd = regionStart + selfMc->size;
struct sqlResult *result = hRangeQuery(conn, track, chrom, regionStart, regionEnd,
                                       NULL, &binOffset);
char **fields;

for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
    {
    struct genePred *gene = genePredLoad(fields + binOffset);
    int exonIx;
    for (exonIx = 0; exonIx < gene->exonCount; ++exonIx)
        {
        int exonStart = gene->exonStarts[exonIx];
        int exonEnd = gene->exonEnds[exonIx];
        int txtStart, txtEnd;

        if (onlyCds)
            {
            if (exonStart < gene->cdsStart)
                exonStart = gene->cdsStart;
            if (exonEnd > gene->cdsEnd)
                exonEnd = gene->cdsEnd;
            }
        if (exonStart < regionStart)
            exonStart = regionStart;
        if (exonEnd > regionEnd)
            exonEnd = regionEnd;

        /* the outputs start out holding the clipped genomic coordinates */
        txtStart = exonStart;
        txtEnd = exonEnd;
        if (!findAliRange(selfMc->text, maf->textSize,
                          exonStart - regionStart, exonEnd - regionStart,
                          &txtStart, &txtEnd))
            continue;
        for (comp = maf->components; comp != NULL; comp = comp->next)
            {
            if (comp->text)
                toUpperN(comp->text + txtStart, txtEnd - txtStart);
            }
        }
    genePredFree(&gene);
    }
sqlFreeResult(&result);
sqlDisconnect(&conn);
}
void geneStarts(char *chromosome, int start, int end)
/* geneStarts - print start of genes in database.
 * Prints one "name on chrom:txStart-txEnd" line for every genieKnown row in
 * hg3 on chromosome whose txStart lies in [start, end). */
{
struct sqlConnection *conn = sqlConnect("hg3");
struct sqlResult *sr;
char **row;
char query[256];
struct genePred *gp;

/* bounded and escaped: a long or quote-bearing chromosome name can no longer
 * overrun query[] or alter the statement */
sqlSafef(query, sizeof(query),
         "select * from genieKnown where chrom = '%s' and txStart >= %d and txStart < %d",
         chromosome, start, end);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    gp = genePredLoad(row);
    printf("%s on %s:%d-%d\n", gp->name, gp->chrom, gp->txStart, gp->txEnd);
    genePredFree(&gp);
    }
sqlFreeResult(&sr);
sqlDisconnect(&conn);
}
struct genePred *loadGenePred(char *database, char *chrom, char *track, struct binKeeper *bk) /* Load in a gene prediction track to bk. */ { struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; struct genePred *list = NULL, *el; sr = hChromQuery(conn, track, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { el = genePredLoad(row + rowOffset); binKeeperAdd(bk, el->txStart, el->txEnd, el); slAddHead(&list, el); } sqlFreeResult(&sr); hFreeConn(&conn); slReverse(&list); return list; }
void txCdsGoodBed(char *database, char *outBed, char *outCds) /* txCdsGoodBed - Create positive example training set for SVM. This is based on * the refSeq reviewed genes, but we fragment a certain percentage of them so as * not to end up with a SVM that *requires* a complete transcript. */ { struct sqlConnection *conn = sqlConnect(database); char *refTrack = "refGene"; char *statusTable = "refSeqStatus"; if (!sqlTableExists(conn, refTrack)) errAbort("table %s doesn't exist in %s", refTrack, database); if (!sqlTableExists(conn, statusTable)) errAbort("table %s doesn't exist in %s", statusTable, database); FILE *fBed = mustOpen(outBed, "w"); FILE *fCds = mustOpen(outCds, "w"); char *query = "NOSQLINJ select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds " "from refGene r,refSeqStatus s where r.name=s.mrnaAcc and s.status='Reviewed'"; struct sqlResult *sr = sqlGetResult(conn, query); char **row; double randScale = 1.0/RAND_MAX; int id = 0; while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row); int start = gp->txStart, end = gp->txEnd; char *type = "refReviewed"; if (rand()*randScale < frag) { double midRatio = rand()*randScale; if (midRatio > 0.5) gpFragLimits(gp, 0, midRatio, &start, &end); else gpFragLimits(gp, midRatio, 1.0, &start, &end); type = "refFrag"; } gpPartOutAsBed(gp, start, end, fBed, type, ++id, 0); gpPartOutAsCds(gp, start, end, fCds, type, id); } carefulClose(&fBed); }
void addGenePred(struct hash *chromHash, char **row) /* add a genePred's exons to the approriate binkeeper object in hash */ { struct genePred *gene = genePredLoad(row); int iExon; struct binKeeper *chromBins = getChromBins(chromHash, gene->chrom, gene->strand); struct geneLoc *geneLoc = geneLocNew(chromHash->lm, gene->name, gene->chrom, gene->strand, gene->txStart, gene->txEnd); for (iExon = 0; iExon < gene->exonCount; iExon++) { int exonStart = gene->exonStarts[iExon]; int exonEnd = gene->exonEnds[iExon]; if (gCdsOnly) { exonStart = max(exonStart, gene->cdsStart); exonEnd = min(exonEnd, gene->cdsEnd); } if (exonStart < exonEnd) binKeeperAdd(chromBins, exonStart, exonEnd, geneLoc); } genePredFree(&gene); }
struct genePred *getCurGenePred(struct sqlConnection *conn) /* Return current gene in genePred. */ { char *track = genomeSetting("knownGene"); char table[64]; boolean hasBin; char query[256]; struct sqlResult *sr; char **row; struct genePred *gp = NULL; hFindSplitTable(sqlGetDatabase(conn), curGeneChrom, track, table, &hasBin); sqlSafef(query, sizeof(query), "select * from %s where name = '%s' " "and chrom = '%s' and txStart=%d and txEnd=%d" , table, curGeneId, curGeneChrom, curGeneStart, curGeneEnd); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) gp = genePredLoad(row + hasBin); sqlFreeResult(&sr); if (gp == NULL) errAbort("getCurGenePred: Can't find %s", query); return gp; }
void intronSizes(char *database, char *table) /* intronSizes - Output list of intron sizes.. */ { struct dyString *query = newDyString(1024); struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *gp; int rowOffset; struct bed *bedList = NULL, *bed = NULL; hSetDb(database); rowOffset = hOffsetPastBin(NULL, table); conn = hAllocConn(database); sqlDyStringPrintf(query, "select * from %s", table); if (chromName != NULL) dyStringPrintf(query, " where chrom = '%s'", chromName); if (cgiBoolean("withUtr")) { dyStringPrintf(query, " %s txStart != cdsStart", (chromName == NULL ? "where" : "and")); } sr = sqlGetResult(conn, query->string); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row+rowOffset); genePredIntrons(gp, &bedList); slReverse(&bedList); for (bed = bedList ; bed != NULL ; bed=bed->next) bedTabOutN(bed,6, stdout); bedFreeList(&bedList); genePredFree(&gp); } sqlFreeResult(&sr); hFreeConn(&conn); }
void txCdsBadBed(char *database, char *altSpliceBed, char *outBed) /* txCdsBadBed - Create a bed file with regions that don't really have CDS, * but that might look like it.. */ { /* Open up database and make sure all the tables we want are there. */ char *refTrack = "refGene"; char *vegaPseudo = "vegaPseudoGene"; char *retroPseudo = "retroMrnaInfo"; struct sqlConnection *conn = sqlConnect(database); if (!sqlTableExists(conn, refTrack)) errAbort("table %s doesn't exist in %s", refTrack, database); if (!sqlTableExists(conn, vegaPseudo)) errAbort("table %s doesn't exist in %s", vegaPseudo, database); if (!sqlTableExists(conn, retroPseudo)) errAbort("table %s doesn't exist in %s", retroPseudo, database); /* Read in alt file and output larger retained and bleeding introns. */ struct bed *bed, *intronyList = loadRetainedAndBleeding(altSpliceBed); FILE *f = mustOpen(outBed, "w"); for (bed = intronyList; bed != NULL; bed = bed->next) { int size = bed->chromEnd - bed->chromStart; if (size > 400) { fprintf(f, "%s\t%d\t%d\t", bed->chrom, bed->chromStart, bed->chromEnd); fprintf(f, "%s%d\t", bed->name, ++id); fprintf(f, "%d\t%s\t", bed->score, bed->strand); fprintf(f, "0\t0\t0\t1\t"); fprintf(f, "%d,\t%d,\n", bed->chromEnd - bed->chromStart, 0); } } /* Read in refGene, and write out larger 3' UTRs, and occassional antisense copies. */ char query[512]; safef(query, sizeof(query), "select * from %s", refTrack); int rowOffset = 0; if (sqlFieldIndex(conn, refTrack, "bin") == 0) rowOffset = 1; struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row + rowOffset); int start, end; if (gp->strand[0] == '+') { start = gp->cdsEnd; end = gp->txEnd; } else { start = gp->txStart; end = gp->cdsStart; } if (end - start > 400) { gpPartOutAsBed(gp, start, end, f, "utr", ++id, 400); } if (rand()%20 == 0) { gp->strand[0] = (gp->strand[0] == '+' ? 
'-' : '+'); gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "anti", ++id, 0); } } sqlFreeResult(&sr); /* Write out vega pseudo-genes. */ safef(query, sizeof(query), "select * from %s", vegaPseudo); rowOffset = 0; if (sqlFieldIndex(conn, vegaPseudo, "bin") == 0) rowOffset = 1; sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row + rowOffset); gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "vega", ++id, 0); } /* Write out retroGenes. */ safef(query, sizeof(query), "select * from %s where score > 600", retroPseudo); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct bed *bed = bedLoad12(row); char name[128]; safef(name, sizeof(name), "retro_%d_%s", ++id, bed->name); bed->name = name; bedTabOutN(bed, 12, f); } carefulClose(&f); }
void intronEnds(char *database, char *table) /* intronEnds - Gather stats on intron ends.. */ { struct dyString *query = newDyString(1024); struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *gp; int total = 0; int gtag = 0; int gcag = 0; int atac = 0; int ctac = 0; DNA ends[4]; int exonIx, txStart; struct dnaSeq *seq; int rowOffset; char strand; rowOffset = hOffsetPastBin(database, NULL, table); conn = hAllocConn(database); sqlDyStringPrintf(query, "select * from %s", table); if (chromName != NULL) dyStringPrintf(query, " where chrom = '%s'", chromName); if (cgiBoolean("withUtr")) { dyStringPrintf(query, " %s txStart != cdsStart", (chromName == NULL ? "where" : "and")); } sr = sqlGetResult(conn, query->string); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row+rowOffset); strand = gp->strand[0]; txStart = gp->txStart; seq = hDnaFromSeq(database, gp->chrom, txStart, gp->txEnd, dnaLower); for (exonIx=1; exonIx < gp->exonCount; ++exonIx) { ++total; memcpy(ends, seq->dna + gp->exonEnds[exonIx-1] - txStart, 2); memcpy(ends+2, seq->dna + gp->exonStarts[exonIx] - txStart - 2, 2); if (strand == '-') reverseComplement(ends, 4); if (ends[0] == 'g' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'g') ++gtag; if (ends[0] == 'g' && ends[1] == 'c' && ends[2] == 'a' && ends[3] == 'g') ++gcag; if (ends[0] == 'a' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'c') ++atac; if (ends[0] == 'c' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'c') ++ctac; } freeDnaSeq(&seq); genePredFree(&gp); } sqlFreeResult(&sr); hFreeConn(&conn); printf("gt/ag %d (%4.2f)\n", gtag, 100.0*gtag/total); printf("gc/ag %d (%4.2f)\n", gcag, 100.0*gcag/total); printf("at/ac %d (%4.2f)\n", atac, 100.0*atac/total); printf("ct/ac %d (%4.2f)\n", ctac, 100.0*ctac/total); printf("Total %d\n", total); }
void knownToVisiGene(char *database) /* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */ { char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); struct sqlConnection *hConn = sqlConnect(database); struct sqlConnection *iConn = sqlConnect(visiDb); struct sqlResult *sr; char **row; struct hash *geneImageHash = newHash(18); struct hash *locusLinkImageHash = newHash(18); struct hash *refSeqImageHash = newHash(18); struct hash *genbankImageHash = newHash(18); struct hash *probeImageHash = newHash(18); struct hash *knownToLocusLinkHash = newHash(18); struct hash *knownToRefSeqHash = newHash(18); struct hash *knownToGeneHash = newHash(18); struct hash *favorHugoHash = newHash(18); struct hash *knownToProbeHash = newHash(18); struct hash *knownToAllProbeHash = newHash(18); struct genePred *knownList = NULL, *known; struct hash *dupeHash = newHash(17); probesDb = optionVal("probesDb", database); struct sqlConnection *probesConn = sqlConnect(probesDb); vgProbes = sqlTableExists(probesConn,"vgProbes"); vgAllProbes = sqlTableExists(probesConn,"vgAllProbes"); /* Go through and make up hashes of images keyed by various fields. 
*/ sr = sqlGetResult(iConn, NOSQLINJ "select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank" ",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id" " from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap" " where image.imageFile = imageFile.id" " and image.id = imageProbe.image" " and imageProbe.probe = probe.id" " and probe.gene = gene.id" " and image.submissionSet=submissionSet.id" " and vgPrbMap.probe = probe.id"); while ((row = sqlNextRow(sr)) != NULL) { int id = sqlUnsigned(row[0]); float priority = atof(row[1]); int privateUser = sqlSigned(row[7]); char vgPrb_Id[256]; safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]); int geneId = sqlUnsigned(row[9]); if (privateUser == 0) { addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id); addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]); addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]); addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]); addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]); } } verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d" ", genbankImageHash %d probeImageHash %d\n", geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, genbankImageHash->elCount, probeImageHash->elCount); sqlFreeResult(&sr); /* Build up list of known genes. */ sr = sqlGetResult(hConn, NOSQLINJ "select * from knownGene"); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *known = genePredLoad(row); if (!hashLookup(dupeHash, known->name)) { hashAdd(dupeHash, known->name, NULL); slAddHead(&knownList, known); } } slReverse(&knownList); sqlFreeResult(&sr); verbose(2, "Got %d known genes\n", slCount(knownList)); /* Build up hashes from knownGene to other things. 
*/ if (vgProbes) bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash); if (vgAllProbes) bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash); foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE); foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE); foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE); foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE); foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE); verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount, knownToProbeHash->elCount, knownToAllProbeHash->elCount); /* Try and find an image for each gene. */ for (known = knownList; known != NULL; known = known->next) { char *name = known->name; struct prioritizedImage *best = NULL; { best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash); if (!best) best = bestImage(name, knownToRefSeqHash, refSeqImageHash); if (!best) { best = hashFindVal(genbankImageHash, name); } if (!best) best = bestImage(name, knownToGeneHash, geneImageHash); if (vgProbes && !best) best = bestImage(name, knownToProbeHash, probeImageHash); if (vgAllProbes && !best) best = bestImage(name, knownToAllProbeHash, probeImageHash); } if (best) { fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId); } } createTable(hConn, outTable); hgLoadTabFile(hConn, tempDir, outTable, &f); hgRemoveTabFile(tempDir, outTable); }
int main(int argc, char *argv[]) { long enteredMainTime = clock1000(); struct dyString *output = newDyString(10000); setUdcCacheDir(); cgiSpoof(&argc, argv); pushWarnHandler(htmlVaBadRequestAbort); pushAbortHandler(htmlVaBadRequestAbort); char *database = cgiString("db"); char *cmd = cgiString("cmd"); char *jsonp = cgiOptionalString("jsonp"); if (!hDbExists(database)) errAbort("Invalid database '%s'", database); if (!strcmp(cmd, "defaultPos")) { dyStringPrintf(output, "{\"pos\": \"%s\"}", hDefaultPos(database)); } else if (!strcmp(cmd, "metaDb")) { // Return list of values for given metaDb var // e.g. http://genome.ucsc.edu/hgApi?db=hg18&cmd=metaDb&var=cell struct sqlConnection *conn = hAllocConn(database); boolean metaDbExists = sqlTableExists(conn, "metaDb"); if (metaDbExists) { char *var = cgiOptionalString("var"); if (!var) errAbort("Missing var parameter"); boolean fileSearch = (cgiOptionalInt("fileSearch",0) == 1); struct slPair *pairs = mdbValLabelSearch(conn, var, MDB_VAL_STD_TRUNCATION, FALSE, !fileSearch, fileSearch); struct slPair *pair; dyStringPrintf(output, "[\n"); for (pair = pairs; pair != NULL; pair = pair->next) { if (pair != pairs) dyStringPrintf(output, ",\n"); dyStringPrintf(output, "['%s','%s']", javaScriptLiteralEncode(mdbPairLabel(pair)), javaScriptLiteralEncode(mdbPairVal(pair))); } dyStringPrintf(output, "\n]\n"); } else errAbort("Assembly does not support metaDb"); } // TODO: move to lib since hgTracks and hgApi share #define METADATA_VALUE_PREFIX "hgt_mdbVal" else if (startsWith(METADATA_VALUE_PREFIX, cmd)) { // Returns metaDb value control: drop down or free text, with or without help link. // e.g. 
http://genome.ucsc.edu/hgApi?db=hg18&cmd=hgt_mdbVal3&var=cell // TODO: Move guts to lib, so that hgTracks::searchTracks.c and hgApi.c can share struct sqlConnection *conn = hAllocConn(database); boolean metaDbExists = sqlTableExists(conn, "metaDb"); if (metaDbExists) { char *var = cgiOptionalString("var"); if (!var) errAbort("Missing var parameter"); int ix = atoi(cmd+strlen(METADATA_VALUE_PREFIX)); // 1 based index if (ix == 0) // errAbort("Unsupported 'cmd' parameter"); enum cvSearchable searchBy = cvSearchMethod(var); char name[128]; safef(name,sizeof name,"%s%i",METADATA_VALUE_PREFIX,ix); if (searchBy == cvSearchBySingleSelect || searchBy == cvSearchByMultiSelect) { boolean fileSearch = (cgiOptionalInt("fileSearch",0) == 1); struct slPair *pairs = mdbValLabelSearch(conn, var, MDB_VAL_STD_TRUNCATION, FALSE, !fileSearch, fileSearch); if (slCount(pairs) > 0) { char *dropDownHtml = cgiMakeSelectDropList((searchBy == cvSearchByMultiSelect), name, pairs, NULL, ANYLABEL, "mdbVal", "style='min-width: 200px; font-size: .9em;' " "onchange='findTracksMdbValChanged(this);'"); if (dropDownHtml) { dyStringAppend(output,dropDownHtml); freeMem(dropDownHtml); } slPairFreeList(&pairs); } } else if (searchBy == cvSearchByFreeText) { dyStringPrintf(output,"<input type='text' name='%s' value='' class='mdbVal freeText' " "onchange='findTracksMdbValChanged(this);' style='max-width:310px; " "width:310px; font-size:.9em;'>", name); } else if (searchBy == cvSearchByWildList) { dyStringPrintf(output,"<input type='text' name='%s' value='' class='mdbVal wildList' " "title='enter comma separated list of values' " "onchange='findTracksMdbValChanged(this);' style='max-width:310px; " "width:310px; font-size:.9em;'>", name); } else if (searchBy == cvSearchByDateRange || searchBy == cvSearchByIntegerRange) { // TO BE IMPLEMENTED } else errAbort("Metadata variable not searchable"); dyStringPrintf(output,"<span id='helpLink%i'> </span>",ix); } else errAbort("Assembly does not support metaDb"); } 
else if (!strcmp(cmd, "tableMetadata")) { // returns an html table with metadata for a given track char *trackName = cgiOptionalString("track"); boolean showLonglabel = (NULL != cgiOptionalString("showLonglabel")); boolean showShortLabel = (NULL != cgiOptionalString("showShortLabel")); if (trackName != NULL) { // hTrackDbForTrackAndAncestors avoids overhead of getting whole track list! struct trackDb *tdb = hTrackDbForTrackAndAncestors(database, trackName); if (tdb != NULL) { char * html = metadataAsHtmlTable(database,tdb,showLonglabel,showShortLabel); if (html) { dyStringAppend(output,html); freeMem(html); } else dyStringPrintf(output,"No metadata found for track %s.",trackName); } else dyStringPrintf(output,"Track %s not found",trackName); } else dyStringAppend(output,"No track variable found"); } else if (sameString(cmd, "codonToPos") || sameString(cmd, "exonToPos")) { char query[256]; struct sqlResult *sr; char **row; struct genePred *gp; char *name = cgiString("name"); char *table = cgiString("table"); int num = cgiInt("num"); struct sqlConnection *conn = hAllocConn(database); sqlSafef(query, sizeof(query), "select name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds from %s where name = '%s'", table, name); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row); boolean found; int start, end; if (sameString(cmd, "codonToPos")) found = codonToPos(gp, num, &start, &end); else found = exonToPos(gp, num, &start, &end); if (found) dyStringPrintf(output, "{\"pos\": \"%s:%d-%d\"}", gp->chrom, start + 1, end); else dyStringPrintf(output, "{\"error\": \"%d is an invalid %s for this gene\"}", num, sameString(cmd, "codonToPos") ? 
"codon" : "exon"); } else dyStringPrintf(output, "{\"error\": \"Couldn't find item: %s\"}", name); sqlFreeResult(&sr); hFreeConn(&conn); } else { warn("unknown cmd: %s",cmd); errAbort("Unsupported 'cmd' parameter"); } apiOut(dyStringContents(output), jsonp); cgiExitTime("hgApi", enteredMainTime); return 0; }
static void displayMappingInfo(struct sqlConnection *conn, struct mappingInfo *mi) /* display information from a transMap table */ { struct ucscRetroInfo *pg = mi->pg; double wt[12]; /* weights on score function*/ char query[512]; char *name; char alignTbl[128]; char scoreSql[128]; struct psl *psl; float coverFactor = 0; float maxOverlap = 0; if (mi->suffix == NULL) { safef(alignTbl, sizeof(alignTbl), "%s%sAli", mi->tblPre, mi->geneSet); sqlSafef(scoreSql, sizeof(scoreSql), "select max(score) from %s%sInfo", mi->tblPre, mi->geneSet); } else { safef(alignTbl, sizeof(alignTbl), "%s%sAli%s", mi->tblPre, mi->geneSet, mi->suffix); sqlSafef(scoreSql, sizeof(scoreSql), "select max(score) from %s%sInfo%s", mi->tblPre, mi->geneSet, mi->suffix); } printf("<TABLE class=\"transMap\">\n"); printf("<H3>Retrogene Statistics:</H3>\n"); printf("<THEAD>\n"); printf("<TR><TH>Feature<TH>Value </TR>\n"); printf("</THEAD><TBODY>\n"); if (sameString(pg->type, "singleExon")) printf("<TR><TH>Type of Parent<TD>%s</tr>\n",pg->type); else printf("<TR><TH>Expression of Retrogene<TD>%s</TR>\n",pg->type); printf("<TR><TH>Score <TD>%d (range from 0 - %d)</TR>\n", pg->score, sqlQuickNum(conn, scoreSql) ); printf("<TR><TH>Parent Gene Alignment Coverage (Bases Matching Parent) <TD>%d %% (%d bp) </TR>\n", pg->coverage, pg->matches); printf("<TR><TH>Introns Processed Out <TD>%d out of %d (%d exons covered)\n", pg->processedIntrons, (pg->parentSpliceCount/2), pg->exonCover); printf("<TR><TH>Possible Introns or Gaps in Retrogene<TD>%d,%d\n", pg->intronCount, pg->alignGapCount); printf("<TR><TH>Conserved Splice Sites<TD>%d</TR>\n", pg->conservedSpliceSites); printf("<TR><TH>Parent Splice Sites<TD>%d</TR>\n", pg->parentSpliceCount); psl = getAlignments(conn, alignTbl, mi->pg->name); if (psl != NULL) { maxOverlap = (float)pg->maxOverlap/(float)(psl->match+psl->misMatch+psl->repMatch) ; coverFactor = ((float)(psl->qSize-psl->qEnd)/(float)psl->qSize); } else { maxOverlap = 0; } wt[0] = 0; wt[1] = 0.85; wt[2] 
= 0.2; wt[3] = 0.3; wt[4] = 0.8; wt[5] = 1; wt[6] = 1 ; wt[7] = 0.5; wt[8] = 0.5; wt[9] = 1; wt[10] = 1; #ifdef debug char table[512]; struct psl *pslList = getParentAligns(conn, mi, &table); if (psl != NULL) { printf("<TR><TH>Blocks in retro:gap%%/intronsSpliced <TD>\n"); printBlocks(psl, MAXBLOCKGAP, pslList); printf("</td></TR>\n"); } if (pslList != NULL) { printf("<TR><TH>Exons in parent:gap%% <TD>\n"); printBlocks(pslList, MAXBLOCKGAP, NULL); printf("</td></TR>\n"); pslFreeList(&pslList); } #endif printf("<TR><TH>Length of PolyA Tail<TD>%d As out of %d bp </TR><TR><TH>%% A's from Parent PolyA tail (Position)<TD>%5.1f %%\n",pg->polyA,pg->polyAlen, (float)pg->polyA*100/(float)pg->polyAlen); if (pg->polyAstart < 0) printf(" (%d bp before end of retrogene)<br>\n",-(pg->polyAstart)); else printf(" (%d bp past end of retrogene)<br>\n",pg->polyAstart); printf("<tr><th>mRNA Expression Evidence<td>"); if (!sameString(pg->overName, "none")) printf("%s (overlap: %d bp)\n", pg->overName, pg->maxOverlap); else printf("No overlapping"); printf("<TR><TH>BESTORF Score (>50 is good)<TD>%4.0f</td></TR>\n",pg->posConf); #ifdef score printf("<TR><TH>score function<TD>1:xon %d %4.1f conSS %d 2: ax %4.1f 3: pA %4.1f 4: net + %4.1f max (%d, %d) 5: procIntrons %d %4.1f 6:in.cnt %d -%4.1f 7:overlap - %4.1f 8:cov %d*(qe %d- qsz %d)/%d=%4.1f 9:tRep - %4.1f 10:oldintron %d %4.1f </td></TR>\n", pg->exonCover, wt[1]*(log(pg->exonCover+1)/log(2))*200 , pg->conservedSpliceSites, wt[2]*(((log(pg->axtScore>0?pg->axtScore:1)/log(2))*170)-1000), wt[3]*(log(pg->polyAlen+2)*200) , wt[4]*overlapOrtholog*10 , pg->overlapMouse, pg->overlapDog, pg->processedIntrons, wt[5]*(((log(pg->processedIntrons > 0 ? 
pg->processedIntrons : 1))/log(2))*600) , pg->intronCount, wt[6]*pow(pg->intronCount,0.5)*750 , wt[7]*(maxOverlap*300), pg->coverage, pg->qEnd, pg->qSize , pg->qSize, wt[8]*((pg->coverage/100.0)*(1.0-coverFactor)*300.0), wt[9]*(pg->tReps*10), pg->alignGapCount, wt[10]*pg->alignGapCount); printf("<TR><TH>score function<TD>%4.1f+ %4.1f+ %4.1f+ %4.1f+ %4.1f - %4.1f - %4.1f+ %4.1f - %4.1f - %4.1f</td></TR>\n", wt[1]*(log(pg->exonCover+1)/log(2))*200 , wt[2]*(((log(pg->axtScore>0?pg->axtScore:1)/log(2))*170)-1000), wt[3]*(log(pg->polyAlen+2)*200) , wt[4]*overlapOrtholog*10 , wt[5]*(((log(pg->processedIntrons > 0 ? pg->processedIntrons : 1))/log(2))*600) , (float)wt[6]*pow(pg->intronCount,0.5)*750 , (float)wt[7]*(maxOverlap*300), wt[8]*((pg->coverage/100.0)*(1.0-coverFactor)*300.0), wt[9]*(pg->tReps*10), wt[10]*pg->alignGapCount); if (pg->kaku > 0 && pg->kaku < 1000000) printf("<TR><TH>KA/KU mutation rate in non-syn sites vs utr with repect to parent gene<TD>%4.2f</TR>\n", pg->kaku); #endif #ifdef xxx sqlSafef(query, sizeof(query), "select * from refGene where chrom = '%d' and txEnd > %d and txStart %d and name = '%s'", pg->chrom, pg->gStart, pg->gEnd , pg->overName ); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) overlappingGene = genePredLoad(row); if (overlappingGene != NULL) { printf ("CDS exons %d ",genePredcountCdsExons(overlappingGene)); } #endif printf("</tr>\n"); if ( differentString("none",pg->overName) && sqlFieldIndex(conn, "refGene", "exonFrames") != -1) { sqlSafef(query, sizeof(query), "select concat(exonFrames,'(',cdsStart,')') from refGene where name = '%s' and chrom = '%s'" , pg->overName, pg->chrom); if (sqlQuickString(conn, query) != NULL) printf("<TR><TH>Frame of retro %s (start)<TD>%s</TR>\n", pg->overName, sqlQuickString(conn, query)); } name = cloneString(pg->name); chopSuffix(name); sqlSafef(query, sizeof(query), "select concat(exonFrames,'(',cdsStart,')') from rbRetroParent where name like '%s%%' and chrom = '%s'" , name, 
pg->chrom); if (hTableExists(database, "rbRetroParent")) { if ( sqlQuickString(conn, query) != NULL) printf("<TR><TH>Frames of mapped parent %s (start)<TD>%s</TR>\n", name, sqlQuickString(conn, query)); } printf("</TBODY></TABLE>\n"); }
struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf,
                                         struct annoStreamRows *primaryData,
                                         boolean *retRJFilterFailed, struct lm *callerLm)
// Integrate the primary variant with every overlapping genePred row and
// return as many output rows as are needed to describe the changes.
// Output rows are generated with callerLm; intermediate data uses self->lm.
// retRJFilterFailed may be NULL; when non-NULL it is set to TRUE if the
// overlap rule is agoMustOverlap and no output row is produced.
{
struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf;

// Discard the previous call's scratch memory and start a new pool.
lmCleanup(&(self->lm));
self->lm = lmInit(0);

// Widen the primary row by GPRANGE on both sides (clamping the start at 0)
// so the inner integrate also finds upstream/downstream genePreds, then
// put the real coordinates back.
struct annoRow *primaryRow = primaryData->rowList;
int origStart = primaryRow->start;
int origEnd = primaryRow->end;
primaryRow->start = (primaryRow->start <= GPRANGE) ? 0 : primaryRow->start - GPRANGE;
primaryRow->end += GPRANGE;
struct annoRow *gpRows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed,
                                             self->lm);
primaryRow->start = origStart;
primaryRow->end = origEnd;

if (self->variantFromRow == NULL)
    setVariantFromRow(self, primaryData);

// Reload the cached chromosome sequence only when the chromosome changes.
if (self->curChromSeq == NULL
    || differentString(self->curChromSeq->name, primaryRow->chrom))
    {
    dnaSeqFree(&self->curChromSeq);
    struct twoBitFile *tbf = self->grator.streamer.assembly->tbf;
    self->curChromSeq = twoBitReadSeqFragLower(tbf, primaryRow->chrom, 0, 0);
    }

// TODO Performance improvement: rather than building the transcript sequence
// for every variant that intersects a transcript, cache it -- e.g. an slPair
// named by the concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd,
// exonStarts, exonEnds} with the sequence as its val, dropping entries that
// no longer appear in the rows from the internal annoGratorIntegrate call.
// BETTER YET: give gpFx a callback so it fetches CDS sequence only on demand.
char *refAllele = getGenomicSequence(self->curChromSeq->dna, primaryRow->start,
                                     primaryRow->end, self->lm);
struct variant *variant = self->variantFromRow(self, primaryRow, refAllele);

if (gpRows == NULL)
    {
    // No genePreds: the primary variant is intergenic.
    if (self->funcFilter != NULL && self->funcFilter->intergenic)
        return aggvIntergenicRow(self, variant, retRJFilterFailed, callerLm);
    if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
        *retRJFilterFailed = TRUE;
    return NULL;
    }
if (retRJFilterFailed && *retRJFilterFailed)
    return NULL;

int hasFrames = (asColumnFindIx(gSelf->mySource->asObj->columnList, "exonFrames") >= 0);
struct annoRow *outRows = NULL;
struct annoRow *gpRow;
for (gpRow = gpRows; gpRow != NULL; gpRow = gpRow->next)
    {
    char **words = gpRow->data;

    // genePredLoad trashes its input, so keep copies of the exon columns
    // and point the row back at them after loading.
    char *exonStartsCopy = lmCloneString(self->lm, words[8]);
    char *exonEndsCopy = lmCloneString(self->lm, words[9]);
    struct genePred *gp;
    if (hasFrames)
        gp = genePredExtLoad(words, GENEPREDX_NUM_COLS);
    else
        gp = genePredLoad(words);
    words[8] = exonStartsCopy;
    words[9] = exonEndsCopy;

    // Each batch is reversed and prepended; the single reversal after the
    // loop then restores the overall order.
    struct annoRow *newRows = aggvGenRows(self, variant, gp, gpRow, callerLm);
    if (newRows != NULL)
        {
        slReverse(&newRows);
        outRows = slCat(newRows, outRows);
        }
    genePredFree(&gp);
    }
slReverse(&outRows);

// Every row failed the filter and overlap is mandatory: report the failure.
if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
    *retRJFilterFailed = TRUE;
return outRows;
}