void txCdsToGene(char *txBed, char *txFa, char *txCds, char *outGtf, char *outFa) /* txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and * protein sequence. */ { struct hash *txSeqHash = faReadAllIntoHash(txFa, dnaLower); verbose(2, "Read %d transcript sequences from %s\n", txSeqHash->elCount, txFa); struct hash *cdsHash = cdsEvidenceReadAllIntoHash(txCds); verbose(2, "Read %d cdsEvidence from %s\n", cdsHash->elCount, txCds); struct lineFile *lf = lineFileOpen(txBed, TRUE); FILE *fGtf = mustOpen(outGtf, "w"); FILE *fFa = mustOpen(outFa, "w"); char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(2, "processing %s\n", bed->name); struct cdsEvidence *cds = hashFindVal(cdsHash, bed->name); struct dnaSeq *txSeq = hashFindVal(txSeqHash, bed->name); char *cdsSource = NULL; if (txSeq == NULL) errAbort("%s is in %s but not %s", bed->name, txBed, txFa); if (cds != NULL) { outputProtein(cds, txSeq, fFa); if (cds->cdsCount > 1) { struct bed *newBed = breakUpBedAtCdsBreaks(cds, bed); if (fTweaked) fprintf(fTweaked, "%s\n", newBed->name); bedFree(&bed); bed = newBed; } cdsSource = cds->accession; if (sameString(cds->accession, ".")) cdsSource = cds->source; } /* Set bed CDS bounds and optionally output bed. */ cdsEvidenceSetBedThick(cds, bed); if (fBed) bedTabOutN(bed, 12, fBed); /* Parse out bed name, which is in format chrom.geneId.txId.accession */ char *geneName = cloneString(bed->name); char *accession = strrchr(geneName, '.'); assert(accession != NULL); *accession++ = 0; chopSuffix(geneName); /* Output as GTF */ bedToGtf(bed, accession, cdsSource, geneName, fGtf); /* Clean up for next iteration of loop. */ freez(&geneName); bedFree(&bed); } lineFileClose(&lf); carefulClose(&fFa); carefulClose(&fGtf); }
struct bed *loadBedFileWithHeader(char *fileName)
/* Read a bed file into a list of 12-column beds, in file order.  Leading
 * lines are treated as header and skipped until the first line containing
 * more than 10 tab characters; that line is pushed back and becomes the
 * first record. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct bed *loaded = NULL;
char *fields[12];
char *line;
int lineSize;

/* Skip header lines; stop at (and re-queue) the first data-looking line. */
while (lineFileNext(lf, &line, &lineSize))
    {
    if (countChars(line, '\t') <= 10)
        continue;
    lineFileReuse(lf);
    break;
    }

/* Load the remaining lines as records; the list is built in reverse. */
while (lineFileRow(lf, fields))
    {
    struct bed *item = bedLoad12(fields);
    slAddHead(&loaded, item);
    }
lineFileClose(&lf);

/* Restore file order. */
slReverse(&loaded);
return loaded;
}
char *altGraphId(struct sqlConnection *conn, struct genePred *gp) /* Return altGraphId that overlaps most with genePred. */ { int rowOffset; struct sqlResult *sr; char **row; struct bed *bestBed = NULL; int intersect, bestIntersect = 0; char extra[64]; char *ret = NULL; safef(extra, sizeof(extra), "strand='%s'", gp->strand); sr = hRangeQuery(conn, "agxBed", gp->chrom, gp->txStart, gp->txEnd, extra, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct bed *bed = bedLoad12(row+rowOffset); intersect = gpBedBasesShared(gp, bed); if (intersect > bestIntersect) { bestIntersect = intersect; bestBed = bed; } else bedFree(&bed); } if (bestBed != NULL) { ret = cloneString(bestBed->name); bedFree(&bestBed); } return ret; }
void loadGappedBed(struct track *tg) /* Convert bed info in window to linked feature. */ { struct sqlResult *sr; char **row; int rowOffset; struct bed *bed; struct linkedFeatures *lfList = NULL, *lf; struct trackDb *tdb = tg->tdb; int scoreMin = atoi(trackDbSettingClosestToHomeOrDefault(tdb, "scoreMin", "0")); int scoreMax = atoi(trackDbSettingClosestToHomeOrDefault(tdb, "scoreMax", "1000")); boolean useItemRgb = FALSE; useItemRgb = bedItemRgb(tdb); if (tg->isBigBed) { // avoid opening an unneeded db connection for bigBed; required not to use mysql for parallel fetch tracks bigBedAddLinkedFeaturesFrom(tg, chromName, winStart, winEnd, scoreMin, scoreMax, useItemRgb, 12, &lfList); } else { /* Use tg->tdb->track because subtracks inherit composite track's tdb * by default, and the variable is named after the composite track. */ struct sqlConnection *conn = hAllocConn(database); char *scoreFilterClause = getScoreFilterClause(cart, tg->tdb,NULL); if (scoreFilterClause != NULL) { sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd,scoreFilterClause, &rowOffset); freeMem(scoreFilterClause); } else { sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd, NULL, &rowOffset); } while ((row = sqlNextRow(sr)) != NULL) { bed = bedLoad12(row+rowOffset); adjustBedScoreGrayLevel(tdb, bed, scoreMin, scoreMax); lf = lfFromBedExtra(bed, scoreMin, scoreMax); if (useItemRgb) { lf->extra = (void *)USE_ITEM_RGB; /* signal for coloring */ lf->filterColor=bed->itemRgb; } slAddHead(&lfList, lf); bedFree(&bed); } sqlFreeResult(&sr); hFreeConn(&conn); } slReverse(&lfList); if(tg->extraUiData) filterBed(tg, &lfList); slSort(&lfList, linkedFeaturesCmp); tg->items = lfList; }
struct bed *bedLoadAll(char *fileName)
/* Load every line of fileName as a 12-column bed and return the items as a
 * list in file order. */
{
struct bed *loaded = NULL;
char *fields[12];
struct lineFile *lf = lineFileOpen(fileName, TRUE);

for (;;)
    {
    if (!lineFileRow(lf, fields))
        break;
    struct bed *item = bedLoad12(fields);
    slAddHead(&loaded, item);
    }

/* Items were prepended, so flip the list back into file order. */
slReverse(&loaded);
lineFileClose(&lf);
return loaded;
}
struct bed *bedLoadTwoBlocks(struct lineFile *lf) /* Load a bed12, and make sure each item is 1 or 2 blocks. Return list of * the two block ones. */ { struct bed *bedList = NULL, *bed; char *row[12]; int totalCount = 0, pairCount = 0; while (lineFileRow(lf, row)) { bed = bedLoad12(row); if (bed->blockCount != 1 && bed->blockCount != 2) errAbort("Line %d of %s got blockCount of %d, expecting 1 or 2", lf->lineIx, lf->fileName, bed->blockCount); ++totalCount; if (bed->blockCount == 2) { slAddHead(&bedList, bed); ++pairCount; } } slReverse(&bedList); verbose(1, "Got %d items including %d pairs in %s\n", totalCount, pairCount, lf->fileName); return bedList; }
void txCdsBadBed(char *database, char *altSpliceBed, char *outBed) /* txCdsBadBed - Create a bed file with regions that don't really have CDS, * but that might look like it.. */ { /* Open up database and make sure all the tables we want are there. */ char *refTrack = "refGene"; char *vegaPseudo = "vegaPseudoGene"; char *retroPseudo = "retroMrnaInfo"; struct sqlConnection *conn = sqlConnect(database); if (!sqlTableExists(conn, refTrack)) errAbort("table %s doesn't exist in %s", refTrack, database); if (!sqlTableExists(conn, vegaPseudo)) errAbort("table %s doesn't exist in %s", vegaPseudo, database); if (!sqlTableExists(conn, retroPseudo)) errAbort("table %s doesn't exist in %s", retroPseudo, database); /* Read in alt file and output larger retained and bleeding introns. */ struct bed *bed, *intronyList = loadRetainedAndBleeding(altSpliceBed); FILE *f = mustOpen(outBed, "w"); for (bed = intronyList; bed != NULL; bed = bed->next) { int size = bed->chromEnd - bed->chromStart; if (size > 400) { fprintf(f, "%s\t%d\t%d\t", bed->chrom, bed->chromStart, bed->chromEnd); fprintf(f, "%s%d\t", bed->name, ++id); fprintf(f, "%d\t%s\t", bed->score, bed->strand); fprintf(f, "0\t0\t0\t1\t"); fprintf(f, "%d,\t%d,\n", bed->chromEnd - bed->chromStart, 0); } } /* Read in refGene, and write out larger 3' UTRs, and occassional antisense copies. */ char query[512]; safef(query, sizeof(query), "select * from %s", refTrack); int rowOffset = 0; if (sqlFieldIndex(conn, refTrack, "bin") == 0) rowOffset = 1; struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row + rowOffset); int start, end; if (gp->strand[0] == '+') { start = gp->cdsEnd; end = gp->txEnd; } else { start = gp->txStart; end = gp->cdsStart; } if (end - start > 400) { gpPartOutAsBed(gp, start, end, f, "utr", ++id, 400); } if (rand()%20 == 0) { gp->strand[0] = (gp->strand[0] == '+' ? 
'-' : '+'); gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "anti", ++id, 0); } } sqlFreeResult(&sr); /* Write out vega pseudo-genes. */ safef(query, sizeof(query), "select * from %s", vegaPseudo); rowOffset = 0; if (sqlFieldIndex(conn, vegaPseudo, "bin") == 0) rowOffset = 1; sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row + rowOffset); gpPartOutAsBed(gp, gp->txStart, gp->txEnd, f, "vega", ++id, 0); } /* Write out retroGenes. */ safef(query, sizeof(query), "select * from %s where score > 600", retroPseudo); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct bed *bed = bedLoad12(row); char name[128]; safef(name, sizeof(name), "retro_%d_%s", ++id, bed->name); bed->name = name; bedTabOutN(bed, 12, f); } carefulClose(&f); }
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile) /* txInfoAssemble - Assemble information from various sources into txInfo table.. */ { /* Build up hash of evidence keyed by transcript name. */ struct hash *cdsEvHash = hashNew(18); struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile); for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next) hashAddUnique(cdsEvHash, cdsEv->name, cdsEv); verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile); /* Build up hash of bestorf structures keyed by transcript name */ struct hash *predictHash = hashNew(18); struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile); for (predict = predictList; predict != NULL; predict = predict->next) hashAddUnique(predictHash, predict->name, predict); verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile); /* Build up structure for random access of retained introns */ struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6); verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile); struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList); /* Read in exception info. */ struct hash *selenocysteineHash, *altStartHash; genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash); /* Read in polyA sizes */ struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile); verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile); /* Read in psls */ struct hash *pslHash = hashNew(20); struct psl *psl, *pslList = pslLoadAll(pslFile); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(pslHash, psl->qName, psl); verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile); /* Read in accessions that we flipped for better splice sites. 
*/ struct hash *flipHash = hashWordsInFile(flipFile, 0); /* Open primary gene input and output. */ struct lineFile *lf = lineFileOpen(txBedFile, TRUE); FILE *f = mustOpen(outFile, "w"); /* Main loop - process each gene */ char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(3, "Processing %s\n", bed->name); /* Initialize info to zero */ struct txInfo info; ZeroVar(&info); /* Figure out name, sourceAcc, and isRefSeq from bed->name */ info.name = bed->name; info.category = "n/a"; if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL) { info.sourceAcc = cloneString(bed->name); } else { info.sourceAcc = txAccFromTempName(bed->name); } info.isRefSeq = startsWith("NM_", info.sourceAcc); if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL) { /* Fake up some things for antibody frag and CCDS that don't have alignments. */ info.sourceSize = bedTotalBlockSize(bed); info.aliCoverage = 1.0; info.aliIdRatio = 1.0; info. genoMapCount = 1; } else { /* Loop through all psl's associated with our RNA. Figure out * our overlap with each, and pick best one. */ struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc); if (firstPslHel == NULL) errAbort("%s is not in %s", info.sourceAcc, pslFile); int mapCount = 0; struct psl *psl, *bestPsl = NULL; int coverage, bestCoverage = 0; boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL); for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel)) { psl = hel->val; mapCount += 1; coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } /* If we flipped it, try it on the opposite strand too. */ if (isFlipped) { psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } psl->strand[0] = (psl->strand[0] == '+' ? 
'-' : '+'); } } if (bestPsl == NULL) errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile); /* Figure out and save alignment statistics. */ int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0); info.sourceSize = bestPsl->qSize - polyA; info.aliCoverage = (double)bestCoverage / info.sourceSize; info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch); info. genoMapCount = mapCount; } /* Get orf size and start/end complete from cdsEv. */ if (bed->thickStart < bed->thickEnd) { cdsEv = hashFindVal(cdsEvHash, bed->name); if (cdsEv != NULL) { info.orfSize = cdsEv->end - cdsEv->start; info.startComplete = cdsEv->startComplete; info.endComplete = cdsEv->endComplete; } } /* Get score from prediction. */ predict = hashFindVal(predictHash, bed->name); if (predict != NULL) info.cdsScore = predict->score; /* Figure out nonsense-mediated-decay from bed itself. */ info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed); /* Figure out if retained intron from bed and alt-splice keeper hash */ info.retainedIntron = hasRetainedIntron(bed, altSpliceHash); info.strangeSplice = countStrangeSplices(bed, altSpliceHash); info.atacIntrons = countAtacIntrons(bed, altSpliceHash); info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash); /* Look up selenocysteine info. */ info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL); /* Loop through bed looking for small gaps indicative of frame shift/stop */ int i, lastBlock = bed->blockCount-1; int exonCount = 1; for (i=0; i < lastBlock; ++i) { int gapStart = bed->chromStarts[i] + bed->blockSizes[i]; int gapEnd = bed->chromStarts[i+1]; int gapSize = gapEnd - gapStart; switch (gapSize) { case 1: case 2: info.genomicFrameShift = TRUE; break; case 3: info.genomicStop = TRUE; break; default: exonCount += 1; break; } } info.exonCount = exonCount; /* Write info, free bed. 
*/ txInfoTabOut(&info, f); bedFree(&bed); } /* Clean up and go home. */ carefulClose(&f); }
void txCdsPick(char *inBed, char *inTce, char *refToPepTab, char *outTce, char *outPick) /* txCdsPick - Pick best CDS if any for transcript given evidence.. */ { struct hash *pepToRefHash, *refToPepHash; hashRefToPep(refToPepTab, &refToPepHash, &pepToRefHash); struct hash *txCdsInfoHash = loadAndWeighTce(inTce, refToPepHash, pepToRefHash); verbose(2, "Read info on %d transcripts from %s\n", txCdsInfoHash->elCount, inTce); struct lineFile *lf = lineFileOpen(inBed, TRUE); FILE *fTce = mustOpen(outTce, "w"); FILE *fPick = mustOpen(outPick, "w"); char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); struct txCdsInfo *tx = hashFindVal(txCdsInfoHash, bed->name); struct cdsPick pick; ZeroVar(&pick); pick.name = bed->name; pick.refSeq = pick.refProt = pick.swissProt = pick.uniProt = pick.ccds = ""; if (tx != NULL && tx->cdsList->score >= weightedThreshold) { struct cdsEvidence *cds, *bestCds = tx->cdsList; int bestSize = bestCds->end - bestCds->start; int minSize = bestSize*0.50; cdsEvidenceTabOut(bestCds, fTce); pick.start = bestCds->start; pick.end = bestCds->end; pick.source = bestCds->source; pick.score = bestCds->score; pick.startComplete = bestCds->startComplete; pick.endComplete = bestCds->endComplete; for (cds = tx->cdsList; cds != NULL; cds = cds->next) { char *source = cds->source; if (rangeIntersection(bestCds->start, bestCds->end, cds->start, cds->end) >= minSize) { if (startsWith("RefPep", source)) { if (pick.refProt[0] == 0) { pick.refProt = cds->accession; if (pick.refSeq[0] == 0) pick.refSeq = hashMustFindVal(pepToRefHash, cds->accession); } } else if (startsWith("RefSeq", source)) { if (pick.refSeq[0] == 0) pick.refSeq = cds->accession; } else if (sameString("swissProt", source)) { if (pick.swissProt[0] == 0) { pick.swissProt = cds->accession; if (pick.uniProt[0] == 0) pick.uniProt = cds->accession; } } else if (sameString("trembl", source)) { if (pick.uniProt[0] == 0) pick.uniProt = cds->accession; } else if 
(sameString("txCdsPredict", source)) { } else if (sameString("genbankCds", source)) { } else if (sameString("ccds", source)) { if (pick.ccds[0] == 0) pick.ccds = cds->accession; } else errAbort("Unknown source %s", source); } } if (exceptionsOut) transferExceptions(bestCds->accession, bestCds->source, pepToRefHash, bed->name, exceptionsOut); } else { pick.source = "noncoding"; } cdsPickTabOut(&pick, fPick); bedFree(&bed); } carefulClose(&fPick); carefulClose(&fTce); }
static void pubsLoadKeywordYearItems(struct track *tg) /* load items that fulfill keyword and year filter */ { pubsParseClassColors(); struct sqlConnection *conn = hAllocConn(database); char *keywords = cartOptionalStringClosestToHome(cart, tg->tdb, FALSE, "pubsFilterKeywords"); char *yearFilter = cartOptionalStringClosestToHome(cart, tg->tdb, FALSE, "pubsFilterYear"); char *publFilter = cartOptionalStringClosestToHome(cart, tg->tdb, FALSE, "pubsFilterPublisher"); char *articleTable = pubsArticleTable(tg); if(sameOk(yearFilter, "anytime")) yearFilter = NULL; if(sameOk(publFilter, "all")) publFilter = NULL; if(isNotEmpty(keywords)) keywords = makeMysqlMatchStr(keywords); if (isEmpty(yearFilter) && isEmpty(keywords) && isEmpty(publFilter)) { loadGappedBed(tg); } else { // put together an "extra" query to hExtendedRangeQuery that removes articles // without the keywords specified in hgTrackUi char *oldLabel = tg->longLabel; tg->longLabel = catTwoStrings(oldLabel, " (filter activated)"); freeMem(oldLabel); char **row; struct linkedFeatures *lfList = NULL; struct trackDb *tdb = tg->tdb; int scoreMin = atoi(trackDbSettingClosestToHomeOrDefault(tdb, "scoreMin", "0")); int scoreMax = atoi(trackDbSettingClosestToHomeOrDefault(tdb, "scoreMax", "1000")); boolean useItemRgb = bedItemRgb(tdb); char *extra = NULL; struct dyString *extraDy = dyStringNew(0); struct hash *articleIds = searchForKeywords(conn, articleTable, keywords); if (!sameWord(tg->table, "pubsBlat")) // new table schema: filter fields are on main bed table { if (isNotEmpty(yearFilter)) sqlDyStringPrintfWithSep(extraDy, " AND ", " year >= '%s'", yearFilter); if (isNotEmpty(publFilter)) sqlDyStringPrintfWithSep(extraDy, " AND ", " publisher = '%s'", publFilter); } else // old table schema, filter by doing a join on article table { if(isNotEmpty(yearFilter)) sqlDyStringPrintfFrag(extraDy, "name IN (SELECT articleId FROM %s WHERE year>='%s')", articleTable, \ yearFilter); } if (extraDy->stringSize > 0) extra = 
extraDy->string; else extra = NULL; int rowOffset = 0; struct sqlResult *sr = hExtendedRangeQuery(conn, tg->table, chromName, winStart, winEnd, extra, FALSE, NULL, &rowOffset); freeDyString(&extraDy); while ((row = sqlNextRow(sr)) != NULL) { struct bed *bed = bedLoad12(row+rowOffset); if (articleIds==NULL || hashFindVal(articleIds, bed->name)) slAddHead(&lfList, bedMungToLinkedFeatures(&bed, tdb, 12, scoreMin, scoreMax, useItemRgb)); } sqlFreeResult(&sr); slReverse(&lfList); slSort(&lfList, linkedFeaturesCmp); tg->items = lfList; } hFreeConn(&conn); }