void doParDetails(struct trackDb *tdb, char *name) /* show details of a PAR item. */ { // load entire PAR table (t's tiny) and partition struct bed *pars = loadParTable(tdb); if (slCount(pars) & 1) errAbort("par items not paired in %s", tdb->table); struct bed *clickedPar = getClickedPar(name, &pars); struct bed *homPar = getHomologousPar(clickedPar, &pars); slSort(&pars, parCmp); cartWebStart(cart, database, "Pseudoautosomal regions"); webPrintLinkTableStart(); // header webPrintLabelCell(""); webPrintLabelCell("Selected PAR"); webPrintLabelCell("Homologous PAR"); // selected webPrintLinkTableNewRow(); printHomPairRow(clickedPar, homPar); if (pars != NULL) printOtherPars(clickedPar, pars); webPrintLinkTableEnd(); printTrackHtml(tdb); webEnd(); bedFreeList(&pars); bedFree(&clickedPar); bedFree(&homPar); }
void txCdsToGene(char *txBed, char *txFa, char *txCds, char *outGtf, char *outFa) /* txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and * protein sequence. */ { struct hash *txSeqHash = faReadAllIntoHash(txFa, dnaLower); verbose(2, "Read %d transcript sequences from %s\n", txSeqHash->elCount, txFa); struct hash *cdsHash = cdsEvidenceReadAllIntoHash(txCds); verbose(2, "Read %d cdsEvidence from %s\n", cdsHash->elCount, txCds); struct lineFile *lf = lineFileOpen(txBed, TRUE); FILE *fGtf = mustOpen(outGtf, "w"); FILE *fFa = mustOpen(outFa, "w"); char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(2, "processing %s\n", bed->name); struct cdsEvidence *cds = hashFindVal(cdsHash, bed->name); struct dnaSeq *txSeq = hashFindVal(txSeqHash, bed->name); char *cdsSource = NULL; if (txSeq == NULL) errAbort("%s is in %s but not %s", bed->name, txBed, txFa); if (cds != NULL) { outputProtein(cds, txSeq, fFa); if (cds->cdsCount > 1) { struct bed *newBed = breakUpBedAtCdsBreaks(cds, bed); if (fTweaked) fprintf(fTweaked, "%s\n", newBed->name); bedFree(&bed); bed = newBed; } cdsSource = cds->accession; if (sameString(cds->accession, ".")) cdsSource = cds->source; } /* Set bed CDS bounds and optionally output bed. */ cdsEvidenceSetBedThick(cds, bed); if (fBed) bedTabOutN(bed, 12, fBed); /* Parse out bed name, which is in format chrom.geneId.txId.accession */ char *geneName = cloneString(bed->name); char *accession = strrchr(geneName, '.'); assert(accession != NULL); *accession++ = 0; chopSuffix(geneName); /* Output as GTF */ bedToGtf(bed, accession, cdsSource, geneName, fGtf); /* Clean up for next iteration of loop. */ freez(&geneName); bedFree(&bed); } lineFileClose(&lf); carefulClose(&fFa); carefulClose(&fGtf); }
char *altGraphId(struct sqlConnection *conn, struct genePred *gp) /* Return altGraphId that overlaps most with genePred. */ { int rowOffset; struct sqlResult *sr; char **row; struct bed *bestBed = NULL; int intersect, bestIntersect = 0; char extra[64]; char *ret = NULL; safef(extra, sizeof(extra), "strand='%s'", gp->strand); sr = hRangeQuery(conn, "agxBed", gp->chrom, gp->txStart, gp->txEnd, extra, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct bed *bed = bedLoad12(row+rowOffset); intersect = gpBedBasesShared(gp, bed); if (intersect > bestIntersect) { bestIntersect = intersect; bestBed = bed; } else bedFree(&bed); } if (bestBed != NULL) { ret = cloneString(bestBed->name); bedFree(&bestBed); } return ret; }
static void mafFrags(char *database, char *track, char *bedFile, char *mafFile) /* mafFrags - Collect MAFs from regions specified in a 6 column bed file. */ { struct slName *orgList = NULL; struct lineFile *lf = lineFileOpen(bedFile, TRUE); FILE *f = mustOpen(mafFile, "w"); if (optionExists("orgs")) { char *orgFile = optionVal("orgs", NULL); char *buf; readInGulp(orgFile, &buf, NULL); orgList = stringToSlNames(buf); /* Ensure that org list starts with database. */ struct slName *me = slNameFind(orgList, database); if (me == NULL) errAbort("Need to have reference database '%s' in %s", database, orgFile); if (me != orgList) { slRemoveEl(&orgList, me); slAddHead(&orgList, me); } } mafWriteStart(f, "zero"); if (bed12) { char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoadN(row, ArraySize(row)); struct mafAli *maf = mafFromBed12(database, track, bed, orgList); if (meFirst) moveMeToFirst(maf, bed->name); mafWrite(f, maf); mafAliFree(&maf); bedFree(&bed); } } else { char *row[6]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoadN(row, ArraySize(row)); processBed6(database, track, f, bed, orgList); bedFree(&bed); } } mafWriteEnd(f); carefulClose(&f); }
void pslToBed(char *pslFile, char *bedFile, struct hash *cdsHash, bool doPosName)
/* pslToBed -- transform a psl format file to a bed format file.
 *   pslFile   - input psl file
 *   bedFile   - output file, written as 12-column bed
 *   cdsHash   - optional hash of struct cds keyed by query name; when given,
 *               the thick region is set from it, or collapsed to chromStart
 *               when the query has no entry
 *   doPosName - when TRUE the bed name becomes "qName:qStart-qEnd" */
{
struct lineFile *in = pslFileOpen(pslFile);
FILE *out = mustOpen(bedFile, "w");
struct psl *aln;

for (aln = pslNext(in); aln != NULL; aln = pslNext(in))
    {
    struct bed *item = bedFromPsl(aln);

    if (doPosName)
        {
        /* Swap the name for a position-style one built from the query range. */
        char *posName = needMem(512);
        safef(posName, 512, "%s:%d-%d", aln->qName, aln->qStart, aln->qEnd);
        freeMem(item->name);
        item->name = posName;
        }

    if (cdsHash)
        {
        struct cds *cds = hashFindVal(cdsHash, aln->qName);
        if (cds != NULL)
            setThick(aln, item, cds);
        else
            item->thickStart = item->thickEnd = item->chromStart;
        }

    bedTabOutN(item, 12, out);
    bedFree(&item);
    pslFree(&aln);
    }
carefulClose(&out);
lineFileClose(&in);
}
void gffToBed(char *inGff, char *outBed) /* gffToBed - Convert a gff file (gff1 or gff2) to bed. Not tested with gff3 */ { struct gffFile *gff = gffRead(inGff); FILE *f = mustOpen(outBed, "w"); char *exonFeature = bestExonFeature(gff); gffGroupLines(gff); separateGroupsByChromosome(gff); struct gffGroup *group; for (group = gff->groupList; group != NULL; group = group->next) { struct genePred *gp; if (gff->isGtf) gp = genePredFromGroupedGtf(gff, group, group->name, FALSE, FALSE); else gp = genePredFromGroupedGff(gff, group, group->name, exonFeature, FALSE, FALSE); if (gp != NULL) { assert(gp->txStart == gp->exonStarts[0]); struct bed *bed = bedFromGenePred(gp); bedTabOutN(bed, 12, f); bedFree(&bed); } } carefulClose(&f); }
static struct chromAnn* chromAnnBedReaderRead(struct chromAnnReader *car) /* read next BED and convert to a chromAnn */ { struct rowReader *rr = car->data; if (!rowReaderNext(rr)) return NULL; rowReaderExpectAtLeast(rr, 3); char **rawCols = (car->opts & chromAnnSaveLines) ? rowReaderCloneColumns(rr) : NULL; struct bed *bed = bedLoadN(rr->row, rr->numCols); struct chromAnn *ca = chromAnnNew(bed->chrom, bed->strand[0], bed->name, rawCols, strVectorWrite, strVectorFree); if ((bed->blockCount == 0) || (car->opts & chromAnnRange)) { if (car->opts & chromAnnCds) { if (bed->thickStart < bed->thickEnd) chromAnnBlkNew(ca, bed->thickStart, bed->thickEnd); } else chromAnnBlkNew(ca, bed->chromStart, bed->chromEnd); } else addBedBlocks(ca, car->opts, bed); chromAnnFinish(ca); bedFree(&bed); return ca; }
void outputBed(struct cassetteSeq *cseq, FILE *primerBed) /* Output a bed linked features track to see where primers are. */ { struct bed *bed = NULL; struct bed *cbed = cseq->bed; int leftStart=0, rightStart =0; bed = cloneBed(cbed); if(cseq->leftPrimer == NULL || cseq->rightPrimer == NULL) return; leftStart = calcGenomePos(cbed, cseq->leftPrimer, cseq->seq); rightStart = calcGenomePos(cbed, cseq->rightPrimer, cseq->seq); if(sameString(bed->strand, "+")) { bed->chromStart = bed->thickStart = leftStart; bed->chromStarts[0] = 0; bed->blockSizes[0] = strlen(cseq->leftPrimer); bed->chromStarts[1] = rightStart - leftStart; bed->blockSizes[1] = strlen(cseq->leftPrimer); bed->chromEnd = bed->thickEnd = bed->chromStarts[1] + bed->chromStart + bed->blockSizes[1]; } else { bed->chromStart = bed->thickStart = rightStart; bed->chromStarts[0] = 0; bed->blockSizes[0] = strlen(cseq->rightPrimer); bed->chromStarts[1] = leftStart - rightStart; bed->blockSizes[1] = strlen(cseq->rightPrimer); bed->chromEnd = bed->thickEnd = bed->chromStarts[1] + bed->chromStart + bed->blockSizes[1]; } bed->blockCount = 2; checkBedMatchesSeqs(cseq, bed); bedTabOutN(bed, 12, primerBed); bedFree(&bed); }
static void showMrnaFromGenePred(struct sqlConnection *conn, char *geneId, char *geneName) /* Get mRNA sequence for gene from gene prediction. */ { char *table = genomeSetting("knownGene"); struct sqlResult *sr; char **row; char query[256]; boolean hasBin = hIsBinned(sqlGetDatabase(conn), table); hPrintf("<TT><PRE>"); safef(query, sizeof(query), "select * from %s where name='%s'" " and chrom='%s' and txStart=%d and txEnd=%d", table, geneId, curGeneChrom, curGeneStart, curGeneEnd); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct genePred *gene = genePredLoad(row+hasBin); struct bed *bed = bedFromGenePred(gene); struct dnaSeq *seq = hSeqForBed(sqlGetDatabase(conn), bed); hPrintf(">%s (%s predicted mRNA)\n", geneId, geneName); writeSeqWithBreaks(stdout, seq->dna, seq->size, 50); dnaSeqFree(&seq); bedFree(&bed); genePredFree(&gene); } else errAbort("Couldn't find %s at %s:%d-%d", geneId, curGeneChrom, curGeneStart, curGeneEnd); sqlFreeResult(&sr); hPrintf("</TT></PRE>"); }
static struct bed* regionsLoad(char* sectionsBed) /* return a bed3 list of regions for times when -regions is used. */ /* If the filename has a comma then a number, then take just that line */ { struct bed* list = NULL; unsigned ix = 0; if (strchr(sectionsBed, ',')) { char* number_part = chopPrefixAt(sectionsBed, ','); if (number_part) ix = sqlUnsigned(number_part); } list = readAtLeastBed3(sectionsBed); if (list && (ix > 0)) { struct bed* single = slElementFromIx(list, ix - 1); if (single) { struct bed* rem; while ((rem = slPopHead(&list)) != single) bedFree(&rem); rem = single->next; bedFreeList(&rem); single->next = NULL; list = single; } } return list; }
void checkInputOpenFiles(struct inInfo *array, int count) /* Make sure all of the input is there and of right format before going forward. Since * this is going to take a while we want to fail fast. */ { int i; for (i=0; i<count; ++i) { struct inInfo *in = &array[i]; switch (in->type) { case itBigWig: { /* Just open and close, it will abort if any problem. */ in->bbi = bigWigFileOpen(in->fileName); break; } case itPromoterBed: case itUnstrandedBed: case itBlockedBed: { struct lineFile *lf = in->lf = lineFileOpen(in->fileName, TRUE); char *line; lineFileNeedNext(lf, &line, NULL); char *dupe = cloneString(line); char *row[256]; int wordCount = chopLine(dupe, row); struct bed *bed = NULL; switch (in->type) { case itPromoterBed: lineFileExpectAtLeast(lf, 6, wordCount); bed = bedLoadN(row, 6); char strand = bed->strand[0]; if (strand != '+' && strand != '-') errAbort("%s must be stranded, got %s in that field", lf->fileName, row[6]); break; case itUnstrandedBed: lineFileExpectAtLeast(lf, 4, wordCount); bed = bedLoadN(row, 4); break; case itBlockedBed: lineFileExpectAtLeast(lf, 4, wordCount); bed = bedLoadN(row, 12); break; default: internalErr(); break; } bedFree(&bed); freez(&dupe); lineFileReuse(lf); break; } default: internalErr(); break; } } }
void loadGappedBed(struct track *tg) /* Convert bed info in window to linked feature. */ { struct sqlResult *sr; char **row; int rowOffset; struct bed *bed; struct linkedFeatures *lfList = NULL, *lf; struct trackDb *tdb = tg->tdb; int scoreMin = atoi(trackDbSettingClosestToHomeOrDefault(tdb, "scoreMin", "0")); int scoreMax = atoi(trackDbSettingClosestToHomeOrDefault(tdb, "scoreMax", "1000")); boolean useItemRgb = FALSE; useItemRgb = bedItemRgb(tdb); if (tg->isBigBed) { // avoid opening an unneeded db connection for bigBed; required not to use mysql for parallel fetch tracks bigBedAddLinkedFeaturesFrom(tg, chromName, winStart, winEnd, scoreMin, scoreMax, useItemRgb, 12, &lfList); } else { /* Use tg->tdb->track because subtracks inherit composite track's tdb * by default, and the variable is named after the composite track. */ struct sqlConnection *conn = hAllocConn(database); char *scoreFilterClause = getScoreFilterClause(cart, tg->tdb,NULL); if (scoreFilterClause != NULL) { sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd,scoreFilterClause, &rowOffset); freeMem(scoreFilterClause); } else { sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd, NULL, &rowOffset); } while ((row = sqlNextRow(sr)) != NULL) { bed = bedLoad12(row+rowOffset); adjustBedScoreGrayLevel(tdb, bed, scoreMin, scoreMax); lf = lfFromBedExtra(bed, scoreMin, scoreMax); if (useItemRgb) { lf->extra = (void *)USE_ITEM_RGB; /* signal for coloring */ lf->filterColor=bed->itemRgb; } slAddHead(&lfList, lf); bedFree(&bed); } sqlFreeResult(&sr); hFreeConn(&conn); } slReverse(&lfList); if(tg->extraUiData) filterBed(tg, &lfList); slSort(&lfList, linkedFeaturesCmp); tg->items = lfList; }
void slRefAndBedFree(struct slRef **pRef)
/* Free one slRef together with the bed its val points at, and clear the
 * caller's pointer.  Does nothing when *pRef is NULL.  Designed to be
 * called by slRefListAndBedFree. */
{
struct slRef *node = *pRef;
if (node != NULL)
    {
    bedFree((struct bed **)&(node->val));
    freez(pRef);
    }
}
void bedPlusFree(struct bedPlus **pBp)
/* Free a bedPlus: its bed, its rest field, and the struct itself, then
 * clear the caller's pointer.  Does nothing when *pBp is NULL. */
{
struct bedPlus *item = *pBp;
if (item != NULL)
    {
    bedFree((struct bed **)&(item->bed));
    freeMem((char *)item->rest);
    freez(pBp);
    }
}
struct bed *orthoBedFromPsl(struct sqlConnection *conn, char *db, char *orthoDb,
			    char *netTable, struct psl *psl)
/** Produce a bed on the orthologous genome from the original psl.
 * The psl is converted to a temporary bed, mapped with orthoBedFromBed(),
 * and the temporary is freed.  Returns whatever orthoBedFromBed() returns;
 * callers in this file treat that as possibly NULL. */
{
struct bed *bed = NULL, *orthoBed = NULL;
bed = bedFromPsl(psl);
orthoBed = orthoBedFromBed(conn, db, orthoDb, netTable, bed);
bedFree(&bed);
return orthoBed;
}
void cassetteSeqFree(struct cassetteSeq **pCseq)
/* Free a cassetteSeq and everything it owns (seq, bed, name, both primers),
 * then set the caller's pointer to NULL.  Does nothing when *pCseq is
 * already NULL. */
{
struct cassetteSeq *cseq = *pCseq;
if (cseq == NULL)
    return;
dnaSeqFree(&cseq->seq);
bedFree(&cseq->bed);
freez(&cseq->name);
freez(&cseq->rightPrimer);
freez(&cseq->leftPrimer);
/* Free through pCseq so the caller's pointer is cleared.  The old code
 * freed a local copy and then assigned NULL to the parameter itself, which
 * left the caller holding a dangling pointer. */
freez(pCseq);
}
/* Convert one line read from a bed file to a PSL and write it to pslFh.
 * The line is chopped in place on white space into at most 12 columns;
 * fewer than 4 columns is an error.  chromSizes is handed to bedToPsl(). */
void cnvBedRec(char *line, struct hash *chromSizes, FILE *pslFh)
{
char *words[12];
int wordCount = chopByWhite(line, words, ArraySize(words));
if (wordCount < 4)
    errAbort("bed must have at least 4 columns");

struct bed *item = bedLoadN(words, wordCount);
struct psl *converted = bedToPsl(item, chromSizes);
pslTabOut(converted, pslFh);

pslFree(&converted);
bedFree(&item);
}
/* Convert one line read from a bed file to a genePred and write it to gpFh.
 * The line is chopped in place on white space into at most 12 columns;
 * fewer than 4 columns is an error. */
void cnvBedRec(char *line, FILE *gpFh)
{
char *words[12];
int wordCount = chopByWhite(line, words, ArraySize(words));
if (wordCount < 4)
    errAbort("bed must have at least 4 columns");

struct bed *item = bedLoadN(words, wordCount);
struct genePred *converted = bedToGenePred(item);
genePredTabOut(converted, gpFh);

genePredFree(&converted);
bedFree(&item);
}
void doStrand(struct bed *start, struct bed *end, FILE *f)
/* Assuming all beds from start up to (but not including) end are on the
 * same strand, merge all their blocks into a single bed and write it to f
 * as 12 columns.  The merged bed takes chrom, name and strand from start. */
{
struct rbTree *blocks = rangeTreeNew();
struct bed *cur = start;
struct bed *merged;

while (cur != end)
    {
    bedIntoRangeTree(cur, blocks);
    cur = cur->next;
    }

merged = bedFromRangeTree(blocks, start->chrom, start->name, start->strand);
bedTabOutN(merged, 12, f);
bedFree(&merged);
rangeTreeFree(&blocks);
}
struct linkedFeatures *bedMungToLinkedFeatures(struct bed **pBed, struct trackDb *tdb,
	int fieldCount, int scoreMin, int scoreMax, boolean useItemRgb)
/* Convert bed to a linkedFeature, destroying bed in the process.
 * Beds with fewer than 12 fields are first passed through bed8To12().
 * When useItemRgb is set, the feature carries the bed's itemRgb.  On
 * return the bed has been freed through pBed. */
{
struct bed *src = *pBed;
struct linkedFeatures *feature;

if (fieldCount < 12)
    bed8To12(src);
adjustBedScoreGrayLevel(tdb, src, scoreMin, scoreMax);
feature = lfFromBedExtra(src, scoreMin, scoreMax);
if (useItemRgb)
    {
    feature->filterColor=src->itemRgb;
    feature->extra = (void *)USE_ITEM_RGB;       /* signal for coloring */
    }
bedFree(pBed);
return feature;
}
void doPsls(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, char *netTable, char *pslFileName, char *pslTableName, char *outBedName, char *selectedFileName, int *foundCount, int *notFoundCount) /* Map over psls. */ { FILE *bedOut = NULL; FILE *selectedOut = NULL; struct bed *bed = NULL; struct psl *psl=NULL, *pslList = NULL; /* Load psls. */ warn("Loading psls."); if(pslFileName) pslList=pslLoadAll(pslFileName); else pslList=loadPslFromTable(conn, pslTableName, chrom, 0, BIGNUM); /* Convert psls. */ warn("Converting psls."); assert(outBedName); bedOut = mustOpen(outBedName, "w"); if (selectedFileName != NULL) selectedOut = mustOpen(selectedFileName, "w"); for(psl = pslList; psl != NULL; psl = psl->next) { if(differentString(psl->tName, chrom)) continue; occassionalDot(); bed = orthoBedFromPsl(conn, db, orthoDb, netTable, psl); if(bed != NULL && bed->blockCount > 0) { (*foundCount)++; bedTabOutN(bed, 12, bedOut); if (selectedOut != NULL) pslTabOut(psl, selectedOut); } else (*notFoundCount)++; bedFree(&bed); } carefulClose(&selectedOut); carefulClose(&bedOut); }
static void subset_with_sections(struct metaBig* mb, struct bed** p_list)
/* Mainly for chopgenome.  Filter *p_list in place, keeping (in their
 * original order) only the beds for which the range tree built from
 * mb->sections reports both an overlap and an enclosing range.  All other
 * beds are freed. */
{
struct genomeRangeTree *grt = genomeRangeTreeNew();
struct bed *sec;
struct bed *list;
struct bed *newlist = NULL;
struct bed *head;
for (sec = mb->sections; sec != NULL; sec = sec->next)
    genomeRangeTreeAdd(grt, sec->chrom, sec->chromStart, sec->chromEnd);
list = *p_list;
while ((head = slPopHead(&list)) != NULL)
    {
    if (genomeRangeTreeOverlaps(grt, head->chrom, head->chromStart, head->chromEnd)
        && genomeRangeTreeFindEnclosing(grt, head->chrom, head->chromStart, head->chromEnd))
        slAddHead(&newlist, head);
    else
        bedFree(&head);
    }
/* Kept beds were pushed on the head, so restore input order. */
slReverse(&newlist);
*p_list = newlist;
/* The tree is only needed for the filtering above; it used to be leaked. */
genomeRangeTreeFree(&grt);
}
void doBeds(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, char *netTable, char *bedFileName, char *bedTableName, char *outBedName, char *selectedFileName, int *foundCount, int *notFoundCount) /* Map over beds. */ { FILE *bedOut = NULL; FILE *selectedOut = NULL; struct bed *bed=NULL, *bedList = NULL, *orthoBed=NULL; /* Load beds. */ warn("Loading beds."); if(bedFileName) bedList=bedLoadAll(bedFileName); else bedList=loadBedFromTable(conn, bedTableName, chrom, 0, BIGNUM); /* Convert beds. */ warn("Converting beds."); assert(outBedName); bedOut = mustOpen(outBedName, "w"); if (selectedFileName != NULL) selectedOut = mustOpen(selectedFileName, "w"); for(bed = bedList; bed != NULL; bed = bed->next) { if(differentString(bed->chrom, chrom)) continue; occassionalDot(); orthoBed = orthoBedFromBed(conn, db, orthoDb, netTable, bed); if(orthoBed != NULL && orthoBed->blockCount > 0) { (*foundCount)++; bedTabOutN(orthoBed, 12, bedOut); if (selectedOut != NULL) bedTabOutN(bed, 12, selectedOut); } else (*notFoundCount)++; bedFree(&orthoBed); } bedFreeList(&bedList); carefulClose(&selectedOut); carefulClose(&bedOut); }
void bedViewOut(struct altSpliceSite *as, FILE *out)
/* Write a 12-column, two-block bed view of an altSpliceSite to out.
 * The bed runs from as->chromStart to the largest of as->altStarts, with a
 * 1bp block at each end.  The thick region and the score are taken from
 * index 1 of as->altBpStarts, as->altBpEnds and as->spliceTypes. */
{
struct bed *view = NULL;
AllocVar(view);

/* Identity. */
view->chrom = cloneString(as->chrom);
view->name = cloneString(as->agName);
view->score = as->spliceTypes[1];
safef(view->strand, sizeof(view->strand), "%s", as->strand);

/* Extent and thick region. */
view->chromStart = as->chromStart;
view->chromEnd = maxInArray(as->altStarts, as->altCount);
view->thickStart = as->altBpStarts[1];
view->thickEnd = as->altBpEnds[1];

/* Two 1bp blocks: one at the start and one covering the last base. */
AllocArray(view->chromStarts, 2);
AllocArray(view->blockSizes, 2);
view->blockCount = 2;
view->chromStarts[0] = 0;
view->chromStarts[1] = view->chromEnd - view->chromStart -1;
view->blockSizes[0] = view->blockSizes[1] = 1;

bedTabOutN(view, 12, out);
bedFree(&view);
}
static void addFilteredBedsOnRegion(struct bbiFile *bbi, struct region *region, char *table, struct asFilter *filter, struct lm *bedLm, struct bed **pBedList) /* Add relevant beds in reverse order to pBedList */ { struct lm *bbLm = lmInit(0); struct bigBedInterval *ivList = NULL, *iv; ivList = bigBedIntervalQuery(bbi, region->chrom, region->start, region->end, 0, bbLm); char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; for (iv = ivList; iv != NULL; iv = iv->next) { bigBedIntervalToRow(iv, region->chrom, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { struct bed *bed = bedLoadN(row, bbi->definedFieldCount); struct bed *lmBed = lmCloneBed(bed, bedLm); slAddHead(pBedList, lmBed); bedFree(&bed); } } lmCleanup(&bbLm); }
void affyPslAndAtlasToBedNew(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line, *name; int i, wordCount, expCount; char **row; double *data, median; double invMedian, ratio, logRatio; char *affyId; struct hash *hash = newHash(17); struct psl *psl; struct bed *bed; FILE *f = NULL; int dataCount = 0, pslCount = 0, bedCount = 0; int minExpVal = 20; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExp(line+1, expRecOut); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = mustOpen(bedOut, "w"); /* Build up a hash keyed by affyID with an int array of data * for value. Do output in short case. */ AllocArray(row, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } AllocArray(data, expCount); for (i=0; i<expCount; ++i) { data[i] = atof(row[i]); if (data[i] < minExpVal) data[i] = minExpVal; } median = findPositiveMedian(data, expCount, minExpVal); if (median >= 0) { invMedian = 1.0/median; for (i=0; i<expCount; ++i) { double val = data[i]; val = safeLog2(invMedian*val); data[i] = val; } if (shortOut) shortDataOut(f, affyId, expCount, data); else hashAdd(hash, affyId, data); } data = NULL; ++dataCount; } lineFileClose(&lf); warn("%d rows of expression data\n", dataCount); /* Stream through psl file, converting it to bed with expression data. 
*/ if (!shortOut) { lf = pslFileOpen(pslFile); while ((psl = pslNext(lf)) != NULL) { ++pslCount; /* get probe id from sequence name */ name=parseNameFromHgc(psl->qName); data = hashFindVal(hash, name); if (data != NULL) { struct bed *bed = bedFromPsl(psl); bed->expCount = expCount; AllocArray(bed->expIds, expCount); AllocArray(bed->expScores, expCount); for (i=0; i<expCount; ++i) { bed->expScores[i] = data[i]; bed->expIds[i] = i; } bedTabOutN(bed, 15, f); ++bedCount; bedFree(&bed); } pslFree(&psl); } warn("%d records in %s", pslCount, pslFile); warn("%d records written to %s", bedCount, bedOut); } lineFileClose(&lf); carefulClose(&f); }
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile) /* txInfoAssemble - Assemble information from various sources into txInfo table.. */ { /* Build up hash of evidence keyed by transcript name. */ struct hash *cdsEvHash = hashNew(18); struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile); for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next) hashAddUnique(cdsEvHash, cdsEv->name, cdsEv); verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile); /* Build up hash of bestorf structures keyed by transcript name */ struct hash *predictHash = hashNew(18); struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile); for (predict = predictList; predict != NULL; predict = predict->next) hashAddUnique(predictHash, predict->name, predict); verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile); /* Build up structure for random access of retained introns */ struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6); verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile); struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList); /* Read in exception info. */ struct hash *selenocysteineHash, *altStartHash; genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash); /* Read in polyA sizes */ struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile); verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile); /* Read in psls */ struct hash *pslHash = hashNew(20); struct psl *psl, *pslList = pslLoadAll(pslFile); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(pslHash, psl->qName, psl); verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile); /* Read in accessions that we flipped for better splice sites. 
*/ struct hash *flipHash = hashWordsInFile(flipFile, 0); /* Open primary gene input and output. */ struct lineFile *lf = lineFileOpen(txBedFile, TRUE); FILE *f = mustOpen(outFile, "w"); /* Main loop - process each gene */ char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(3, "Processing %s\n", bed->name); /* Initialize info to zero */ struct txInfo info; ZeroVar(&info); /* Figure out name, sourceAcc, and isRefSeq from bed->name */ info.name = bed->name; info.category = "n/a"; if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL) { info.sourceAcc = cloneString(bed->name); } else { info.sourceAcc = txAccFromTempName(bed->name); } info.isRefSeq = startsWith("NM_", info.sourceAcc); if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL) { /* Fake up some things for antibody frag and CCDS that don't have alignments. */ info.sourceSize = bedTotalBlockSize(bed); info.aliCoverage = 1.0; info.aliIdRatio = 1.0; info. genoMapCount = 1; } else { /* Loop through all psl's associated with our RNA. Figure out * our overlap with each, and pick best one. */ struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc); if (firstPslHel == NULL) errAbort("%s is not in %s", info.sourceAcc, pslFile); int mapCount = 0; struct psl *psl, *bestPsl = NULL; int coverage, bestCoverage = 0; boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL); for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel)) { psl = hel->val; mapCount += 1; coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } /* If we flipped it, try it on the opposite strand too. */ if (isFlipped) { psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } psl->strand[0] = (psl->strand[0] == '+' ? 
'-' : '+'); } } if (bestPsl == NULL) errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile); /* Figure out and save alignment statistics. */ int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0); info.sourceSize = bestPsl->qSize - polyA; info.aliCoverage = (double)bestCoverage / info.sourceSize; info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch); info. genoMapCount = mapCount; } /* Get orf size and start/end complete from cdsEv. */ if (bed->thickStart < bed->thickEnd) { cdsEv = hashFindVal(cdsEvHash, bed->name); if (cdsEv != NULL) { info.orfSize = cdsEv->end - cdsEv->start; info.startComplete = cdsEv->startComplete; info.endComplete = cdsEv->endComplete; } } /* Get score from prediction. */ predict = hashFindVal(predictHash, bed->name); if (predict != NULL) info.cdsScore = predict->score; /* Figure out nonsense-mediated-decay from bed itself. */ info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed); /* Figure out if retained intron from bed and alt-splice keeper hash */ info.retainedIntron = hasRetainedIntron(bed, altSpliceHash); info.strangeSplice = countStrangeSplices(bed, altSpliceHash); info.atacIntrons = countAtacIntrons(bed, altSpliceHash); info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash); /* Look up selenocysteine info. */ info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL); /* Loop through bed looking for small gaps indicative of frame shift/stop */ int i, lastBlock = bed->blockCount-1; int exonCount = 1; for (i=0; i < lastBlock; ++i) { int gapStart = bed->chromStarts[i] + bed->blockSizes[i]; int gapEnd = bed->chromStarts[i+1]; int gapSize = gapEnd - gapStart; switch (gapSize) { case 1: case 2: info.genomicFrameShift = TRUE; break; case 3: info.genomicStop = TRUE; break; default: exonCount += 1; break; } } info.exonCount = exonCount; /* Write info, free bed. 
*/ txInfoTabOut(&info, f); bedFree(&bed); } /* Clean up and go home. */ carefulClose(&f); }
struct bed *pathToBed(struct path *path, struct splice *splice, int source, int sink, boolean spoofEnds)
/* Construct a bed for the path. If spoofEnds is TRUE, ensure that there is at least a 1bp exon at splice sites.
 *   path      - path whose vertices are walked pairwise as edges
 *   splice    - supplies chrom (tName), strand, type (stored as the bed
 *               score), vertex positions and vertex types
 *   source    - vertex id that is skipped as an edge start
 *   sink      - vertex id that is left out of the name
 *   spoofEnds - add a 1bp block at an end whose edge is not an exon
 * Returns a newly allocated bed whose chromStarts are relative to
 * chromStart, or NULL when the path produced no blocks.  The name is
 * "<splice name>.<path index>.<vertex>,<vertex>,..." with "-Ex" or "-Inc"
 * appended for path index 0 or 1 of the alt5Prime, alt3Prime, altRetInt and
 * altCassette splice types.
 * NOTE(review): verts[vertIx+1] is read with vertIx == 0 and with
 * vertIx == vCount-2, so this appears to assume path->vCount >= 2 -- confirm. */
{
struct bed *bed = NULL;
int vertIx = 0;
int *verts = path->vertices;
int *vPos = splice->vPositions;
unsigned char *vTypes = splice->vTypes;
int i = 0;
struct dyString *buff = newDyString(256);
AllocVar(bed);
bed->chrom = cloneString(splice->tName);
/* Start with an empty extent; min/max below grow it block by block. */
bed->chromStart = BIGNUM;
bed->chromEnd = 0;
safef(bed->strand, sizeof(bed->strand), "%s", splice->strand);
bed->score = splice->type;
AllocArray(bed->chromStarts, path->vCount);
AllocArray(bed->blockSizes, path->vCount);

/* If necessary tack on a fake exon.  This is the leading end: a 1bp block
 * just before the first vertex when the first edge is not an exon. */
if(spoofEnds && verts[vertIx] != source && verts[vertIx+1] <= splice->vCount &&
   pathEdgeType(vTypes, verts[vertIx], verts[vertIx+1]) != ggExon)
    {
    bed->blockSizes[bed->blockCount] = 1;
    bed->chromStarts[bed->blockCount] = vPos[verts[vertIx]] - 1;
    bed->chromStart = bed->thickStart = min(bed->chromStart, vPos[verts[vertIx]] - 1 );
    bed->chromEnd = bed->thickEnd = max(bed->chromEnd, vPos[verts[vertIx+1]]);
    bed->blockCount++;
    }

/* For each edge that is an exon count up the base pairs.  chromStarts hold
 * absolute positions at this point; they are made relative further down. */
for(vertIx = 0; vertIx < path->vCount - 1; vertIx++)
    {
    if(verts[vertIx] != source && verts[vertIx] <= splice->vCount)
	{
	/* If exon add up the base pairs. */
	if(pathEdgeType(vTypes, verts[vertIx], verts[vertIx+1]) == ggExon)
	    {
	    bed->blockSizes[bed->blockCount] = vPos[verts[vertIx+1]] - vPos[verts[vertIx]];
	    bed->chromStarts[bed->blockCount] = vPos[verts[vertIx]];
	    bed->chromStart = bed->thickStart = min(bed->chromStart, vPos[verts[vertIx]]);
	    bed->chromEnd = bed->thickEnd = max(bed->chromEnd, vPos[verts[vertIx+1]]);
	    bed->blockCount++;
	    }
	}
    }

/* if spoofing ends tack on a 1bp exon as necessary.  This is the trailing
 * end: a 1bp block starting at the last vertex when the last edge is not
 * an exon. */
vertIx = path->vCount - 2;
if(spoofEnds && verts[vertIx] != source && verts[vertIx+1] <= splice->vCount &&
   pathEdgeType(vTypes, verts[vertIx], verts[vertIx+1]) != ggExon)
    {
    bed->blockSizes[bed->blockCount] = 1;
    bed->chromStarts[bed->blockCount] = vPos[verts[vertIx+1]];
    bed->chromStart = bed->thickStart = min(bed->chromStart, vPos[verts[vertIx+1]]);
    bed->chromEnd = bed->thickEnd = max(bed->chromEnd, vPos[verts[vertIx+1]]+1);
    bed->blockCount++;
    }

/* Fix up the name and adjust the chromStarts. */
dyStringPrintf(buff, "%s.%d.", splice->name, slIxFromElement(splice->paths, path));
for(i = 0; i < path->vCount; i++)
    {
    if(path->vertices[i] != sink && path->vertices[i] <= splice->vCount)
	dyStringPrintf(buff, "%d,", path->vertices[i]);
    }
if(splice->type == alt5Prime || splice->type == alt3Prime ||
   splice->type == altRetInt || splice->type == altCassette)
    {
    /* Tag the first two paths of these splice types. */
    int pathIx = slIxFromElement(splice->paths, path);
    if(pathIx == 0)
	dyStringPrintf(buff, "-Ex");
    else if(pathIx == 1)
	dyStringPrintf(buff, "-Inc");
    }
bed->name = cloneString(buff->string);
/* Convert absolute block starts into offsets from chromStart. */
for(i = 0; i < bed->blockCount; i++)
    bed->chromStarts[i] -= bed->chromStart;

/* If we don't have any blocks, quit now. */
if(bed->blockCount == 0)
    bedFree(&bed);
dyStringFree(&buff);
return bed;
}
void txCdsPick(char *inBed, char *inTce, char *refToPepTab, char *outTce, char *outPick) /* txCdsPick - Pick best CDS if any for transcript given evidence.. */ { struct hash *pepToRefHash, *refToPepHash; hashRefToPep(refToPepTab, &refToPepHash, &pepToRefHash); struct hash *txCdsInfoHash = loadAndWeighTce(inTce, refToPepHash, pepToRefHash); verbose(2, "Read info on %d transcripts from %s\n", txCdsInfoHash->elCount, inTce); struct lineFile *lf = lineFileOpen(inBed, TRUE); FILE *fTce = mustOpen(outTce, "w"); FILE *fPick = mustOpen(outPick, "w"); char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); struct txCdsInfo *tx = hashFindVal(txCdsInfoHash, bed->name); struct cdsPick pick; ZeroVar(&pick); pick.name = bed->name; pick.refSeq = pick.refProt = pick.swissProt = pick.uniProt = pick.ccds = ""; if (tx != NULL && tx->cdsList->score >= weightedThreshold) { struct cdsEvidence *cds, *bestCds = tx->cdsList; int bestSize = bestCds->end - bestCds->start; int minSize = bestSize*0.50; cdsEvidenceTabOut(bestCds, fTce); pick.start = bestCds->start; pick.end = bestCds->end; pick.source = bestCds->source; pick.score = bestCds->score; pick.startComplete = bestCds->startComplete; pick.endComplete = bestCds->endComplete; for (cds = tx->cdsList; cds != NULL; cds = cds->next) { char *source = cds->source; if (rangeIntersection(bestCds->start, bestCds->end, cds->start, cds->end) >= minSize) { if (startsWith("RefPep", source)) { if (pick.refProt[0] == 0) { pick.refProt = cds->accession; if (pick.refSeq[0] == 0) pick.refSeq = hashMustFindVal(pepToRefHash, cds->accession); } } else if (startsWith("RefSeq", source)) { if (pick.refSeq[0] == 0) pick.refSeq = cds->accession; } else if (sameString("swissProt", source)) { if (pick.swissProt[0] == 0) { pick.swissProt = cds->accession; if (pick.uniProt[0] == 0) pick.uniProt = cds->accession; } } else if (sameString("trembl", source)) { if (pick.uniProt[0] == 0) pick.uniProt = cds->accession; } else if 
(sameString("txCdsPredict", source)) { } else if (sameString("genbankCds", source)) { } else if (sameString("ccds", source)) { if (pick.ccds[0] == 0) pick.ccds = cds->accession; } else errAbort("Unknown source %s", source); } } if (exceptionsOut) transferExceptions(bestCds->accession, bestCds->source, pepToRefHash, bed->name, exceptionsOut); } else { pick.source = "noncoding"; } cdsPickTabOut(&pick, fPick); bedFree(&bed); } carefulClose(&fPick); carefulClose(&fTce); }
/* bedGraphLoadItems - an ordinary bed load, but we are interested * in only the chrom, start, end, and the graphColumn */ static void bedGraphLoadItems(struct track *tg) { struct sqlConnection *conn; struct sqlResult *sr = (struct sqlResult *) NULL; char **row = (char **)NULL; int rowOffset = 0; struct bedGraphItem *bgList = NULL; int itemsLoaded = 0; int colCount = 0; struct wigCartOptions *wigCart = (struct wigCartOptions *) tg->wigCartData; int graphColumn = 5; char *tableName; if(sameString(tg->table, "affyTranscription")) wigCart->colorTrack = "affyTransfrags"; graphColumn = wigCart->graphColumn; #ifndef GBROWSE if (isCustomTrack(tg->table) && tg->customPt) { struct customTrack *ct = (struct customTrack *) tg->customPt; tableName = ct->dbTableName; conn = hAllocConn(CUSTOM_TRASH); } else #endif /* GBROWSE */ { tableName = tg->table; conn = hAllocConnTrack(database, tg->tdb); } sr = hRangeQuery(conn, tableName, chromName, winStart, winEnd, NULL, &rowOffset); colCount = sqlCountColumns(sr) - rowOffset; /* Must have at least four good columns */ if (colCount < 4) errAbort("bedGraphLoadItems: table %s only has %d data columns, must be at least 4", tableName, colCount); if (colCount < graphColumn) errAbort("bedGraphLoadItems: table %s only has %d data columns, specified graph column %d does not exist", tableName, colCount, graphColumn); /* before loop, determine actual row[graphColumn] index */ graphColumn += (rowOffset - 1); while ((row = sqlNextRow(sr)) != NULL) { struct bedGraphItem *bg; struct bed *bed; ++itemsLoaded; /* load chrom, start, end */ bed = bedLoadN(row+rowOffset, 3); AllocVar(bg); bg->start = bed->chromStart; bg->end = bed->chromEnd; if ((colCount > 4) && ((graphColumn + rowOffset) != 4)) bg->name = cloneString(row[3+rowOffset]); else { char name[128]; safef(name,ArraySize(name),"%s.%d", bed->chrom, itemsLoaded); bg->name = cloneString(name); } bg->dataValue = sqlFloat(row[graphColumn]); /* filled in by DrawItems */ bg->graphUpperLimit = 
wigEncodeStartingUpperLimit; bg->graphLowerLimit = wigEncodeStartingLowerLimit; slAddHead(&bgList, bg); bedFree(&bed); } sqlFreeResult(&sr); hFreeConn(&conn); slReverse(&bgList); tg->items = bgList; } /* bedGraphLoadItems() */