void bulkChr2XRegression(char *spliceFile, char *spliceSelectionFile) /* Top level function to load files and iterate through splices of interest. */ { FILE *tmpFile = NULL; struct genomeBit *gpList = NULL, *gp = NULL; struct bed *bedList = NULL, *bed = NULL, *retList = NULL; warn("Loading beds from %s", spliceFile); bedList = loadBedFileWithHeader(spliceFile); warn("Loading splices of interest from %s", spliceSelectionFile); gpList = loadGpList(spliceSelectionFile); warn("Loaded %d splices, and %d splices of interest.", slCount(bedList), slCount(gpList)); warn("Analyzing splices of interest."); /* Clean out the summary files produced by R script. */ tmpFile = mustOpen("maxScores.html", "w"); fprintf(tmpFile, "<head><body><table>\n"); fprintf(tmpFile, "<tr><th>Position</th><th>MaxDiff Levels</th><th>MaxDiff</th><th>Var Diff</th><th>MaxDiff/Var</th><th>Percent Diff</th><th>Cass Var/Stable Var</th><th>Plot</th></tr>\n"); carefulClose(&tmpFile); tmpFile = mustOpen("allScores.tab", "w"); carefulClose(&tmpFile); tmpFile = mustOpen("cassettes.sample", "w"); carefulClose(&tmpFile); for(gp = gpList; gp != NULL; gp = gp->next) { retList = findBedsFromGp(gp, bedList); if(retList != NULL) { for(bed = retList; bed != NULL; bed = bed->next) { doAnalysisForBed(bed); } bedFreeList(&retList); } else { bedsNotFound++; warn("Couldn't find bed for genome bit %s:%d-%d", gp->chrom, gp->chromStart, gp->chromEnd); } } warn(""); warn("%d genome bits had multiple beds, %d had no bed, %d analyzed", multipleBedForGp, bedsNotFound, bedsAnalyzed); warn("Cleaning up."); tmpFile = mustOpen("maxScores.html", "a"); fprintf(tmpFile, "</table></body></html>\n"); carefulClose(&tmpFile); bedFreeList(&bedList); }
static struct bed* regionsLoad(char* sectionsBed) /* return a bed3 list of regions for times when -regions is used. */ /* If the filename has a comma then a number, then take just that line */ { struct bed* list = NULL; unsigned ix = 0; if (strchr(sectionsBed, ',')) { char* number_part = chopPrefixAt(sectionsBed, ','); if (number_part) ix = sqlUnsigned(number_part); } list = readAtLeastBed3(sectionsBed); if (list && (ix > 0)) { struct bed* single = slElementFromIx(list, ix - 1); if (single) { struct bed* rem; while ((rem = slPopHead(&list)) != single) bedFree(&rem); rem = single->next; bedFreeList(&rem); single->next = NULL; list = single; } } return list; }
void metaBigClose(struct metaBig** pMb) /* close the file and free up everything. */ { struct metaBig* mb = *pMb; hashFree(&mb->chromSizeHash); if (mb->rgList) hashFree(&mb->rgList); if (mb->sections) bedFreeList(&mb->sections); if (mb->originalFileName) freeMem(mb->originalFileName); if (mb->fileName) freeMem(mb->fileName); if (mb->baseFileName) freeMem(mb->baseFileName); if (mb->remoteSiteAndDir) freeMem(mb->remoteSiteAndDir); #ifdef USE_HTSLIB if (mb->idx) hts_idx_destroy(mb->idx); #endif if (mb->type == isaBigBed) bigBedFileClose(&mb->big.bbi); #ifdef USE_HTSLIB else if (mb->type == isaBam) sam_close(mb->big.bam); #endif else bigWigFileClose(&mb->big.bbi); #ifdef USE_HTSLIB if (mb->header) bam_hdr_destroy(mb->header); #endif freez(pMb); }
long metaBigNumItems(struct metaBig* mb, boolean verbose)
/* Return the total number of items in a bigBed or BAM; a bigWig yields 0.
 * For bigBed the count comes straight from bigBedItemCount(). For any other
 * type every chromosome section is fetched and counted, which is basically
 * a pass over the whole file. Glancing at the index would be nicer, but
 * might count items that would be filtered out upon fetching.
 * When verbose is set, the per-chromosome count is printed to stdout. */
{
    long total = 0;
    struct bed* chromList = NULL;
    struct bed* cur;

    if (mb->type == isaBigWig)
        return 0;
    if (mb->type == isaBigBed)
        return (long)bigBedItemCount(mb->big.bbi);

    chromList = sectionsFromChromSizes(mb->chromSizeHash);
    cur = chromList;
    while (cur != NULL)
    {
        /* Each section gets its own local-memory pool, released right after counting. */
        struct lm* pool = lmInit(0);
        struct bed6* items = metaBigBed6Fetch(mb, cur->chrom, cur->chromStart, cur->chromEnd, pool);
        int count = slCount(items);

        if (verbose)
            printf("Number of items in %s of %s: %d\n", cur->chrom, mb->fileName, count);
        total += count;
        lmCleanup(&pool);
        cur = cur->next;
    }
    bedFreeList(&chromList);
    return total;
}
int countCassetteExons(struct altGraphX *agList, float minConfidence, FILE *outfile, FILE *bedOutFile) /* count up the number of cassette exons that have a certain confidence, returns number of edges. If outfile != NULL will output fasta sequences to outfile. */ { struct altGraphX *ag = NULL; int edge =0; int cassetteCount = 0; int i =0; int mod3 = 0; int counter =0; boolean outputted = FALSE; float estPrior = cgiOptionalDouble("estPrior", 10); FILE *log = mustOpen("confidences.log", "w"); FILE *html = mustOpen("confidences.html", "w"); FILE *sizes = mustOpen("sizes.log", "w"); int minSize = cgiOptionalInt("minSize", 0); startHtml(html); for(ag = agList; ag != NULL; ag = ag->next) { outputted = FALSE; for(i=0;i<ag->edgeCount; i++) { if(ag->edgeTypes[i] == ggCassette) { float conf = altGraphCassetteConfForEdge(ag, i, estPrior); struct bed *bed, *bedList = altGraphGetExonCassette(ag, i); char buff[256]; int size = ag->vPositions[ag->edgeEnds[i]] - ag->vPositions[ag->edgeStarts[i]]; boolean filtersOk = FALSE; if(ag->name == NULL) ag->name = cloneString(""); slSort(&bedList, bedCmpMaxScore); for(bed=bedList; bed != NULL; bed = bed->next) { snprintf(buff, sizeof(buff), "%s.%d", ag->name, counter); bed->name = cloneString(buff); fprintf(log, "%f\n", conf); fprintf(sizes, "%d\n%d\n%d\n", bed->blockSizes[0], bed->blockSizes[1], bed->blockSizes[2]); filtersOk = bedPassFilters(bed, ag, i); if(conf >= minConfidence && size >= minSize && filtersOk) { writeCassetteExon(bed, ag, i, &outputted, bedOutFile, outfile, html, conf); cassetteCount++; if((size % 3) == 0) mod3++; } counter++; } bedFreeList(&bedList); } } } endHtml(html); carefulClose(&html); carefulClose(&log); warn("%d cassettes are mod 3", mod3); return cassetteCount; }
void doParDetails(struct trackDb *tdb, char *name) /* show details of a PAR item. */ { // load entire PAR table (t's tiny) and partition struct bed *pars = loadParTable(tdb); if (slCount(pars) & 1) errAbort("par items not paired in %s", tdb->table); struct bed *clickedPar = getClickedPar(name, &pars); struct bed *homPar = getHomologousPar(clickedPar, &pars); slSort(&pars, parCmp); cartWebStart(cart, database, "Pseudoautosomal regions"); webPrintLinkTableStart(); // header webPrintLabelCell(""); webPrintLabelCell("Selected PAR"); webPrintLabelCell("Homologous PAR"); // selected webPrintLinkTableNewRow(); printHomPairRow(clickedPar, homPar); if (pars != NULL) printOtherPars(clickedPar, pars); webPrintLinkTableEnd(); printTrackHtml(tdb); webEnd(); bedFreeList(&pars); bedFree(&clickedPar); bedFree(&homPar); }
void hgPhMouse(char *database, char *track, int fileCount, char *fileNames[]) /* hgPhMouse - Load phMouse track. */ { int i; char *fileName; char *tabName = "phMouse.tab"; FILE *f = mustOpen(tabName, "w"); struct lineFile *lf; char *words[32], *s, c; int wordCount; int oneSize, totalSize = 0; for (i=0; i<fileCount; ++i) { struct bed *bedList = NULL, *bed; fileName = fileNames[i]; lf = lineFileOpen(fileName, TRUE); printf("Reading %s ", fileName); fflush(stdout); while ((wordCount = lineFileChop(lf, words)) > 0) { if (wordCount < 7) errAbort("Expecting at least 7 words line %d of %s", lf->lineIx, fileName); AllocVar(bed); bed->chrom = cloneString(words[0]); bed->chromStart = lineFileNeedNum(lf, words, 1); bed->chromEnd = lineFileNeedNum(lf, words, 2); bed->score = lineFileNeedNum(lf, words, 6); s = strrchr(words[3], '|'); c = s[1]; s[0] = 0; if (c != '+' && c != '-') errAbort("Misformed strandless trace name line %d of %s", lf->lineIx, lf->fileName); bed->name = cloneString(words[3]); bed->strand[0] = c; slAddHead(&bedList, bed); } oneSize = slCount(bedList); printf("%d alignments ", oneSize); totalSize += oneSize; fflush(stdout); slSort(&bedList, bedCmp); printf("sorted "); fflush(stdout); for (bed = bedList; bed != NULL; bed = bed->next) { int bin = hFindBin(bed->chromStart, bed->chromEnd); fprintf(f, "%d\t", bin); bedTabOutN(bed, 6, f); } printf("tabbed out\n"); bedFreeList(&bedList); } carefulClose(&f); printf("Loading %d items into %s.%s\n", totalSize, database, track); loadDatabase(database, track, tabName); remove(tabName); }
static struct bed* subset_beds(char* sectionString, struct bed** pRegions, struct hash* chromHash) /* in the situation where both a regions bed file is given AND the filename specifies subsections, */ /* intersect the two. For simplictity sake, */ { struct bed* fname_ranges = parseSectionString(sectionString, chromHash); struct bed* bed; struct bed* subset = NULL; struct bed* regions = *pRegions; slSort(&fname_ranges, bedCmp); bed = fname_ranges; while (bed != NULL) { /* each iteration of the loop should be a separate chrom */ struct bed* region; struct rbTree* tree = rangeTreeNew(); while ((bed != NULL) && (bed->next != NULL) && (sameString(bed->chrom, bed->next->chrom))) { rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); bed = bed->next; } rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); /* now we're at a point that we're dealing only with one chromosome. */ for (region = regions; region != NULL; region = region->next) { if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd) && rangeTreeFindEnclosing(tree, region->chromStart, region->chromEnd)) { struct bed* clone = cloneBed(region); slAddHead(&subset, clone); } else if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd)) errAbort("range specified in file overlaps but is not contained by range specified on command-line"); } rangeTreeFree(&tree); bed = bed->next; } if (subset == NULL) { errAbort("no ranges specified in file were contained in ranges specified on command-line"); } slReverse(&subset); bedFreeList(&fname_ranges); bedFreeList(pRegions); return subset; }
void wigsax_bed4(FILE *out, struct metaBig *mb, struct bed *region, int alpha, int window, double mean, double std, boolean wig_out)
/* Output the bed4 style when it's being run over an interval.
 * Loads the per-base data for region, builds one bed per entry returned by
 * make_initial_bed_list() and stores the SAX symbol for each position in the
 * bed's name. With wig_out set, the data value is appended as an extra
 * tab-separated column. */
{
    struct bed *allBeds = NULL;
    struct bed *cur;
    struct perBaseWig *wigs = perBaseWigLoadContinue(mb, region->chrom, region->chromStart, region->chromEnd);
    struct perBaseWig *wig;
    struct slDouble *values = NULL;
    struct slDouble *nextValue;
    /* Only one alphabet size is used for now; the range form is kept so
     * multiple alphabets could be put back some day. */
    int firstAlpha = alpha;
    int lastAlpha = alpha;
    char lastSep = (wig_out) ? '\t' : '\n';

    for (wig = wigs; wig != NULL; wig = wig->next)
    {
        struct bed *perBase = make_initial_bed_list(wig, lastAlpha - firstAlpha + 2);
        int len = wig->chromEnd - wig->chromStart;
        int a, pos;

        for (a = firstAlpha; a <= lastAlpha; a++)
        {
            char *sax = sax_from_array_force_window(wig->data, len, a, window, mean, std);

            /* Symbol for alphabet `a` goes into name slot a - firstAlpha. */
            cur = perBase;
            for (pos = 0; (pos < len) && (cur != NULL); pos++)
            {
                cur->name[a - firstAlpha] = sax[pos];
                cur = cur->next;
            }
            freeMem(sax);
        }
        if (wig_out)
        {
            for (pos = 0; pos < len; pos++)
            {
                struct slDouble *one = newSlDouble(wig->data[pos]);
                slAddHead(&values, one);
            }
        }
        /* Move the beds onto the (reversed) output list one at a time. */
        while ((cur = slPopHead(&perBase)) != NULL)
            slAddHead(&allBeds, cur);
    }
    slReverse(&allBeds);
    slReverse(&values);
    perBaseWigFreeList(&wigs);

    nextValue = values;
    for (cur = allBeds; cur != NULL; cur = cur->next)
    {
        bedOutputN(cur, 4, out, '\t', lastSep);
        if (!wig_out)
            continue;
        if (nextValue == NULL)
            errAbort("data inconsistency. programmer error\n");
        fprintf(out, "%0.4f\n", nextValue->val);
        nextValue = nextValue->next;
    }
    bedFreeList(&allBeds);
    slFreeList(&values);
}
void perBaseWigFree(struct perBaseWig** pRegion)
/* Free-up a perBaseWig: its subsections list, name, chrom and data array,
 * then the struct itself (*pRegion is set to NULL by freez).
 * A NULL pRegion or NULL *pRegion is a no-op. */
{
    struct perBaseWig* pbw;

    /* Check the handle before dereferencing it; the original read *pRegion
     * first, which made the !pRegion test useless. */
    if (pRegion == NULL || *pRegion == NULL)
        return;
    pbw = *pRegion;

    if (pbw->subsections)
        bedFreeList(&pbw->subsections);
    if (pbw->name)
        freeMem(pbw->name);
    freeMem(pbw->chrom);
    freez(&pbw->data);
    freez(pRegion);
}
void findBeds(struct cutter *cutters, struct dnaSeq *seqs, char *outputFile)
/* Write every bed produced by matchEnzymes() for each sequence to outputFile. */
{
    FILE *out = mustOpen(outputFile, "w");
    struct dnaSeq *cur = seqs;

    while (cur != NULL)
    {
        struct bed *hits = matchEnzymes(cutters, cur, 0);

        if (hits != NULL)
        {
            spitBedList(hits, out);
            bedFreeList(&hits);
        }
        cur = cur->next;
    }
    carefulClose(&out);
}
void findCounts(struct cutter *cutters, struct dnaSeq *seqs, char *outputFile)
/* Go through each sequence, adding the enzymes matched in it to a hash of
 * counts, then write that hash to outputFile. */
{
    struct hash *counts = initCutterCountHash(cutters);
    struct dnaSeq *cur = seqs;

    while (cur != NULL)
    {
        struct bed *hits = matchEnzymes(cutters, cur, 0);

        if (hits != NULL)
        {
            addCountsToHash(counts, hits);
            bedFreeList(&hits);
        }
        cur = cur->next;
    }
    /* NOTE(review): the hash is not freed in this function; confirm whether
     * writeHashToFile() takes ownership of it. */
    writeHashToFile(counts, outputFile);
}
void affyPslAndAtlasToBedOld(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut) /** Main function that does all the work for old-style*/ { struct hash *bedHash = NULL; struct affyAtlas *aaList=NULL, *aa=NULL; struct expRecord *erList=NULL, *er=NULL; struct bed *bedList=NULL, *bed=NULL; int expCount = 0; FILE *erOut = NULL, *bOut=NULL; warn("loading atlas file"); aaList = affyAtlasLoadAll(atlasFile); expCount = countExperiments(aaList); warn("creating list of beds from alignments"); bedList = createBedsFromPsls(pslFile, expCount); warn("creating hash from list of beds"); bedHash = createBedHash(bedList); warn("appending experiments to beds in hash"); appendExperiments(bedHash, aaList, &erList); warn("Running sanity Checks"); checkAllBeds(&bedList, expCount); warn("%d beds were missing experiments." , missingExpsCount); warn("%d beds had no experiments.", noExpCount); warn("Calculating average intensities"); convertIntensitiesToRatios(bedList); calculateAverages(bedList); warn("writing expRecords out"); erOut = mustOpen(expRecOut, "w"); for(er = erList; er != NULL; er = er->next) expRecordTabOut(er, erOut); carefulClose(&erOut); warn("writing beds out"); bOut = mustOpen(bedOut, "w"); for(bed = bedList; bed != NULL; bed = bed->next) bedTabOutN(bed, 15, bOut); carefulClose(&bOut); warn("cleaning up.."); freeHash(&bedHash); bedFreeList(&bedList); warn("Done."); }
void doBeds(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, char *netTable, char *bedFileName, char *bedTableName, char *outBedName, char *selectedFileName, int *foundCount, int *notFoundCount) /* Map over beds. */ { FILE *bedOut = NULL; FILE *selectedOut = NULL; struct bed *bed=NULL, *bedList = NULL, *orthoBed=NULL; /* Load beds. */ warn("Loading beds."); if(bedFileName) bedList=bedLoadAll(bedFileName); else bedList=loadBedFromTable(conn, bedTableName, chrom, 0, BIGNUM); /* Convert beds. */ warn("Converting beds."); assert(outBedName); bedOut = mustOpen(outBedName, "w"); if (selectedFileName != NULL) selectedOut = mustOpen(selectedFileName, "w"); for(bed = bedList; bed != NULL; bed = bed->next) { if(differentString(bed->chrom, chrom)) continue; occassionalDot(); orthoBed = orthoBedFromBed(conn, db, orthoDb, netTable, bed); if(orthoBed != NULL && orthoBed->blockCount > 0) { (*foundCount)++; bedTabOutN(orthoBed, 12, bedOut); if (selectedOut != NULL) bedTabOutN(bed, 12, selectedOut); } else (*notFoundCount)++; bedFree(&orthoBed); } bedFreeList(&bedList); carefulClose(&selectedOut); carefulClose(&bedOut); }
int main(int argc, char *argv[])
/* The program: match every query (argv[1]) against every target (argv[2])
 * with oligoMatch() and write the combined hits as bed6 to argv[3]. */
{
    struct bed *allHits = NULL;
    struct dnaSeq *targetList = NULL, *queryList = NULL;
    struct dnaSeq *t, *q;

    if (argc != 4)
        usage();
    targetList = dnaLoadAll(argv[2]);
    queryList = dnaLoadAll(argv[1]);

    t = targetList;
    while (t != NULL)
    {
        for (q = queryList; q != NULL; q = q->next)
        {
            struct bed *hits = oligoMatch(t, q);
            allHits = slCat(allHits, hits);
        }
        t = t->next;
    }
    outputBed6(allHits, argv[3]);

    bedFreeList(&allHits);
    dnaSeqFreeList(&targetList);
    dnaSeqFreeList(&queryList);
    return 0;
}
struct genePred *convertBedsToGps(char *bedFile)
/* Load beds (first 6 fields) from a file and convert to bare bones
 * genePredictions: chrom, name and strand are copied, and both tx and cds
 * bounds are set to the bed's chromStart/chromEnd.
 * Returns the list in file order, or NULL when the file holds no beds. */
{
    struct genePred *gpList = NULL, *gp = NULL;
    struct bed *bedList = NULL, *bed = NULL;

    bedList = bedLoadNAll(bedFile, 6);
    /* Guard the dereference: an empty file yields a NULL list, which the
     * original crashed on.
     * NOTE(review): if bed.strand is an in-struct char array this comparison
     * can never be true; a check on strand[0] may have been intended. */
    if (bedList != NULL && bedList->strand == NULL)
        errAbort("Beds must have strand information.");

    for (bed = bedList; bed != NULL; bed = bed->next)
    {
        AllocVar(gp);
        gp->chrom = cloneString(bed->chrom);
        gp->txStart = gp->cdsStart = bed->chromStart;
        gp->txEnd = gp->cdsEnd = bed->chromEnd;
        gp->name = cloneString(bed->name);
        safef(gp->strand, sizeof(gp->strand), "%s", bed->strand);
        slAddHead(&gpList, gp);
    }
    bedFreeList(&bedList);
    slReverse(&gpList);
    return gpList;
}
int hgSeqItemsInRange(char *db, char *table, char *chrom, int chromStart, int chromEnd, char *sqlConstraints)
/* Print out dna sequence of all items (that match sqlConstraints, if
 * nonNULL) in the given range in table. Return number of items.
 * Aborts via webAbort() when no table info is found for the root table name. */
{
    char tableRoot[256];
    char chromFromName[32];
    struct hTableInfo *info;
    struct bed *items;
    int count;

    hParseTableName(db, table, tableRoot, chromFromName);
    info = hFindTableInfo(db, chrom, tableRoot);
    if (info == NULL)
        webAbort("Error", "Could not find table info for table %s (%s)", tableRoot, table);

    items = hGetBedRange(db, table, chrom, chromStart, chromEnd, sqlConstraints);
    count = hgSeqBed(db, info, items);
    bedFreeList(&items);
    return count;
}
void pickCassettePcrPrimers(char *db, char *bedFileName, char *primerFaName, char *primerBedName) /* pickCassettePcrPrimers - Takes a bedFile with three exons and for each bed calls primer3 to pick primers that will detect the inclusion or exclusion of the exon.. */ { struct bed *bed=NULL, *bedList = NULL; FILE *primerFa = NULL; FILE *primerBed = NULL; struct cassetteSeq *cseq = NULL; int targetExon = optionInt("targetExon", 1); hSetDb(db); bed = bedList = bedLoadAll(bedFileName); primerFa = mustOpen(primerFaName, "w"); primerBed = mustOpen(primerBedName, "w"); for(bed=bedList; bed != NULL; bed = bed->next) { cseq = cassetteSeqFromBed(bed, targetExon); callPrimer3(cseq, primerFa, primerBed); cassetteSeqFree(&cseq); } bedFreeList(&bedList); carefulClose(&primerFa); carefulClose(&primerBed); }
void createIntronBeds(char *agxFile, char *bedFile) /* Make intron beds for evaluation. */ { struct altGraphX *ag=NULL, *agList = NULL; struct bed *bed=NULL, *bedList=NULL; FILE *bedOut = NULL; int count; warn("Rading AltGraphX list."); agList = altGraphXLoadAll(agxFile); warn("Converting to intron beds."); bedOut = mustOpen(bedFile, "w"); for(ag = agList; ag != NULL; ag = ag->next) { occassionalDot(); bedList = bedIntronsFromAgx(ag); for(bed=bedList; bed != NULL; bed=bed->next) { bedTabOutN(bed, 12, bedOut); } bedFreeList(&bedList); } altGraphXFreeList(&agList); }
void calculateBinomialP(char* regdomFn, char* antigapFn, int totalRegions, int hitRegions)
/* Calculate binomial p-value of enrichment based on regulatory domains and
 * regions hit, and print it to stdout in %e format.
 *   regdomFn     - initialized regulatory-domain file.
 *   antigapFn    - bed file of non-gap regions.
 *   totalRegions - total number of regions tested.
 *   hitRegions   - number of regions that hit the regulatory domains.
 * The annotation weight is annotated non-gap bases / total non-gap bases. */
{
    struct regdom* regdoms = readInitializedRegdomFile(regdomFn);

    // This will hold the union of all regulatory domains for quick search
    struct genomeRangeTree *ranges = getRangeTreeOfRegdoms(regdoms);

    // NOTE: Each of these regions must be non-overlapping.
    struct bed* antigaps = bedLoadAll(antigapFn);

    long totalNonGapBases = getTotalNonGapBases(antigaps);
    long annotatedNonGapBases = getAnnotatedNonGapBases(ranges, antigaps);

    double annotationWeight = (double)annotatedNonGapBases/(double)totalNonGapBases;

    double binomP = getBinomPval(totalRegions, hitRegions, annotationWeight);

    printf("%e\n", binomP);

    /* The source text had "&reg" mangled into a (R) sign here; restored. */
    regdomFreeList(&regdoms);
    bedFreeList(&antigaps);
    genomeRangeTreeFree(&ranges);
}
void intronSizes(char *database, char *table) /* intronSizes - Output list of intron sizes.. */ { struct dyString *query = newDyString(1024); struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *gp; int rowOffset; struct bed *bedList = NULL, *bed = NULL; hSetDb(database); rowOffset = hOffsetPastBin(NULL, table); conn = hAllocConn(database); sqlDyStringPrintf(query, "select * from %s", table); if (chromName != NULL) dyStringPrintf(query, " where chrom = '%s'", chromName); if (cgiBoolean("withUtr")) { dyStringPrintf(query, " %s txStart != cdsStart", (chromName == NULL ? "where" : "and")); } sr = sqlGetResult(conn, query->string); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row+rowOffset); genePredIntrons(gp, &bedList); slReverse(&bedList); for (bed = bedList ; bed != NULL ; bed=bed->next) bedTabOutN(bed,6, stdout); bedFreeList(&bedList); genePredFree(&gp); } sqlFreeResult(&sr); hFreeConn(&conn); }
struct genePred *gpFromBedFile(char *file)
/* Load entries from a bed file, convert them to genePreds and return them
 * in file order. */
{
    struct bed *beds = bedLoadAll(file);
    struct bed *cur = NULL;
    struct genePred *result = NULL;

    for (cur = beds; cur != NULL; cur = cur->next)
    {
        struct genePred *pred = bedToGenePred(cur);

        /* pslxFileOpen gaks if strand is not + or -. bedToGenePred returns
         * the bed strand, which might be empty (for #fields < 6) or ".".
         * If so, fake out the strand to + in order to get readable PSL. */
        if (!sameString(pred->strand, "+") && !sameString(pred->strand, "-"))
        {
            pred->strand[0] = '+';
            pred->strand[1] = '\0';
        }
        slAddHead(&result, pred);
    }
    slReverse(&result);
    bedFreeList(&beds);
    return result;
}
/* Load the bounding and placed bed files (plus the neighbor file when the
 * file-level `neighbor` is set), print gap statistics before and after the
 * initial placement of the placed items, then, when the file-level `trials`
 * is positive, run that many random placement trials and print the
 * accumulated statistics.
 *   bounding - bed file of bounding elements.
 *   placed   - bed file of items to place.
 * Also reads the file-level seed, bedOutFile, zeroBedOutFile,
 * shoulderBedOutFile and distOut.
 * NOTE(review): statsList is not freed anywhere in this function. */
static void randomPlacement(char *bounding, char *placed)
{
    struct bed *boundingElements = bedLoadAll(bounding);
    struct bed *placeItems = bedLoadAll(placed);
    struct bed *nearestNeighbors = NULL;
    int boundingCount = slCount(boundingElements);
    int placedCount = slCount(placeItems);
    int neighborCount = 0;
    struct chrGapList *boundingGaps = NULL;
    struct chrGapList *duplicateGapList = NULL;
    struct chrGapList *neighborGaps = NULL;
    struct statistic *statsList = NULL;
    struct statistic *statEl = NULL;

    if (neighbor)
    {
        nearestNeighbors = bedLoadAll(neighbor);
        slSort(&nearestNeighbors, bedCmp); /* order by chrom,chromStart */
        neighborCount = slCount(nearestNeighbors);
        verbose(2, "neighbor element count: %d\n", neighborCount);
        neighborGaps = createGaps(nearestNeighbors);
    }

    slSort(&boundingElements, bedCmp); /* order by chrom,chromStart */
    slSort(&placeItems, bedCmp); /* order by chrom,chromStart */
    verbose(2, "bounding element count: %d\n", boundingCount);
    verbose(2, "placed item count: %d\n", placedCount);
    boundingGaps = createGaps(boundingElements);

    if (TRUE) /* display initial placement stats only */
    {
        char *neighborName = NULL;
        /* Stats are taken against the neighbor gaps when a neighbor file
         * was given, otherwise against the bounding gaps; work on a clone
         * so the originals stay available for the trials below. */
        if (neighbor)
        {
            neighborName = cloneString(neighbor);
            duplicateGapList = cloneGapList(neighborGaps);
        }
        else
        {
            neighborName = cloneString(bounding);
            duplicateGapList = cloneGapList(boundingGaps);
        }
        verbose(2,"stats before initial placement: =================\n");
        statEl = gapStats(duplicateGapList, (char *)NULL, (char *)NULL, (char *)NULL);
        printf("statistics on gaps before any placements:\n\t(%s)\n", neighborName);
        statsPrint(statEl);
        slAddHead(&statsList,statEl);
        initialPlacement(duplicateGapList,placeItems);
        verbose(2,"stats after initial placement: =================\n");
        statEl = gapStats(duplicateGapList, zeroBedOutFile, shoulderBedOutFile, distOut);
        printf("statistics after initial placement of placed items:\n\t(%s)\n", placed);
        statsPrint(statEl);
        slAddHead(&statsList,statEl);
        freeChrList(&duplicateGapList, FALSE);
        slReverse(&statsList);
        freeMem(neighborName);
    }

    if (trials > 0)
    {
        int trial;

        srand48((long int)seed); /* for default seed=0, same set of randoms */

        slSort(&placeItems, bedCmpSize); /* order by size of elements */
        slReverse(&placeItems); /* largest ones first */
        measurePlaced(placeItems); /* show placed item characteristics */
        for (trial = 0; trial < trials; ++trial)
        {
            struct bed *randomPlacedBedList;
            duplicateGapList = cloneGapList(boundingGaps);
            randomPlacedBedList = randomTrial(duplicateGapList,placeItems);
            if (neighbor)
            {
                struct chrGapList *duplicateNeighborList;
                slSort(&randomPlacedBedList,bedCmp);/*order by chrom,chromStart*/
                duplicateNeighborList = cloneGapList(neighborGaps);
                initialPlacement(duplicateNeighborList,randomPlacedBedList);
                statEl = gapStats(duplicateNeighborList, (char *)NULL, (char *)NULL, (char *)NULL);
                freeChrList(&duplicateNeighborList, FALSE);
            }
            else
                statEl = gapStats(duplicateGapList, (char *)NULL, (char *)NULL, (char *)NULL);
            slAddHead(&statsList,statEl);
            /* this gap list has temporary bed elements that were
             * created by the randomTrial(), they need to be freed as
             * the list is released, hence the TRUE signal.
             * It isn't a true freeBedList operation because the chrom
             * names are left intact in the original copy of the bed
             * list. (The names were being shared.) */
            /* Only the final trial's placement is written to bedOutFile. */
            if ((trial == (trials - 1)) && (bedOutFile != NULL))
            {
                bedListOutput(duplicateGapList, bedOutFile);
            }
            freeChrList(&duplicateGapList, TRUE);
        }
        /* NOTE(review): statsList was already reversed once after the
         * initial-placement stats; confirm the resulting print order is
         * the intended one. */
        slReverse(&statsList);
        statsPrint(statsList);
    }
    if (neighbor)
    {
        bedFreeList(&nearestNeighbors);
        freeChrList(&neighborGaps, FALSE);
    }
    bedFreeList(&boundingElements);
    bedFreeList(&placeItems);
    freeChrList(&boundingGaps, FALSE);
}
void doExpRatio(struct trackDb *tdb, char *item, struct customTrack *ct)
/* Generic expression ratio details using microarrayGroups.ra file */
/* and not the expRecord tables.
 *   tdb  - track settings; "expScale" and "expStep" are required,
 *          "expColor" defaults to "redGreen".
 *   item - experiment name; when NULL the cgi variable "i2" is used.
 *   ct   - custom track, or NULL for a regular database track.
 * Prints HTML to stdout. Uses the file-level cart, database, seqName,
 * winStart and winEnd, and sets isCancerGenomicsTrack for tracks in the
 * "cancerGenomics" group. */
{
    char *expScale = trackDbRequiredSetting(tdb, "expScale");
    char *expStep = trackDbRequiredSetting(tdb, "expStep");
    double maxScore = atof(expScale);
    double stepSize = atof(expStep);
    struct bed *bedList;
    char *itemName = cgiUsualString("i2","none");
    char *expName = (item == NULL) ? itemName : item;
    char *tdbSetting = trackDbSettingOrDefault(tdb, "expColor", "redGreen");
    char *colorVal = NULL;
    enum expColorType colorScheme;
    char colorVarName[256];
    /* The cart variable "<track>.color" overrides the trackDb color setting. */
    safef(colorVarName, sizeof(colorVarName), "%s.color", tdb->track);
    colorVal = cartUsualString(cart, colorVarName, tdbSetting);
    colorScheme = getExpColorType(colorVal);
    if (sameWord(tdb->grp, "cancerGenomics"))
    {
        /* set global flag */
        isCancerGenomicsTrack = TRUE;
    }
    /* Three sources for the beds: database track, database-backed custom
     * track, or the custom track's in-memory bed list. Only the first two
     * print the generic header here. */
    if (!ct)
    {
        genericHeader(tdb, itemName);
        bedList = loadMsBed(tdb, tdb->table, seqName, winStart, winEnd);
    }
    else if (ct->dbTrack)
    {
        genericHeader(tdb, itemName);
        printCustomUrl(tdb, itemName, TRUE);
        bedList = ctLoadMultScoresBedDb(ct, seqName, winStart, winEnd);
    }
    else
        bedList = bedFilterListInRange(ct->bedList, NULL, seqName, winStart, winEnd);
    if (bedList == NULL)
        printf("<b>No Expression Data in this Range.</b>\n");
    else if (expName && sameString(expName, "zoomInMore"))
        printf("<b>Too much data to display in detail in this range.</b>\n");
    else
    {
        struct microarrayGroups *groupings = NULL;
        struct maGrouping *combineGroup;
        struct hash *erHash = newHash(6);
        int i;
        if (!ct)
        {
            groupings = maGetTrackGroupings(database, tdb);
            combineGroup = maCombineGroupingFromCart(groupings, cart, tdb->track);
        }
        else
            combineGroup = maGetGroupingFromCt(ct);
        maBedClumpGivenGrouping(bedList, combineGroup);
        for (i = 0; i < combineGroup->numGroups; i++)
        {
            /* make stupid exprecord hash.perhaps eventually this won't be needed */
            char id[16];
            struct expRecord *er = basicExpRecord(combineGroup->names[i], i, 2);
            /* Records are keyed by the decimal group index. */
            safef(id, sizeof(id), "%d", i);
            hashAdd(erHash, id, er);
        }
        puts("<h2></h2><p>\n");
        msBedPrintTable(bedList, erHash, itemName, expName, -1*maxScore, maxScore, stepSize, 2, msBedDefaultPrintHeader, msBedExpressionPrintRow, printExprssnColorKey, getColorForExprBed, colorScheme);
        hashTraverseEls(erHash, erHashElFree);
        hashFree(&erHash);
        microarrayGroupsFree(&groupings);
    }
    puts("<h2></h2><p>\n");
    /* NOTE(review): in the non-db custom-track case bedList comes from
     * bedFilterListInRange(ct->bedList, ...); confirm it returns copies,
     * since the list is freed here. */
    bedFreeList(&bedList);
}
void hgExperiment(char *database, char *table, char *expFile, char *posFile, char *dataFile) /* Main function */ { struct lineFile *lf; int *data = NULL; int *scores; FILE *f = NULL; char expTable[32]; char *words[3]; int wordCt; struct bed *bedList, *bed; int expCount; struct hash *expHash, *dataHash; struct hashEl *hel; /* Open experiment file and use it to create experiment table. Use optional fields if present, otherwise defaults */ safef(expTable, ArraySize(expTable), "%sExps", table); expHash = makeExpsTable(database, expTable, expFile, &expCount); /* Read in positions file */ bedList = bedLoadAll(posFile); slSort(&bedList, bedCmp); /* Read data file into a hash of arrays of data values, keyed by name */ dataHash = newHash(0); lf = lineFileOpen(dataFile, TRUE); while ((wordCt = lineFileChopNext(lf, words, ArraySize(words)))) { /* format: <region-name> <experiment-name> <data-value> */ char *name, *exp; int expId; int value; if (wordCt != 3) errAbort("Expecting 3 words in data file, got %d line %d of %s", wordCt, lf->lineIx, lf->fileName); name = words[0]; hel = hashLookup(dataHash, name); if (!hel) { AllocArray(data, expCount); hel = hashAdd(dataHash, name, data); } data = (int *)hel->val; exp = words[1]; expId = hashIntVal(expHash, exp); if (expId < 0 || expId > expCount-1) errAbort("Invalid experiment ID %d for %s, line %d of %s", expId, exp, lf->lineIx, lf->fileName); //value = atoi(words[2]); value = round(atof(words[2])); if (data[expId] != 0) errAbort("Extra experiment data value %d for %s %s, line %d of %s", value, name, exp, lf->lineIx, lf->fileName); data[expId] = value; } lineFileClose(&lf); /* Fill in BED15 fields - add experiment values, and setup block (only 1)*/ for (bed = bedList; bed != NULL; bed = bed->next) { int i; bed->thickStart = bed->chromStart; bed->thickEnd = bed->chromEnd; bed->blockCount = 1; AllocArray(bed->blockSizes, 1); bed->blockSizes[0] = bed->chromEnd - bed->chromStart; AllocArray(bed->chromStarts, 1); bed->chromStarts[0] = 
0; bed->expCount = expCount; AllocArray(bed->expIds, expCount); for (i = 0; i < expCount; i++) bed->expIds[i] = i; AllocArray(bed->expScores, expCount); scores = hashMustFindVal(dataHash, bed->name); for (i = 0; i < expCount; i++) bed->expScores[i] = scores[i]; /* set score for bed to the average of the scores in all experiments */ calculateAverage(bed); } /* from affyPslAndAtlsoToBed ? convertIntensitiesToRatios(bedList); */ /* Write BED data file */ f = hgCreateTabFile(tabDir, table); for (bed = bedList; bed != NULL; bed = bed->next) bedTabOutN(bed, 15, f); /* Cleanup */ carefulClose(&f); freeHash(&expHash); freeHash(&dataHash); bedFreeList(&bedList); }
void bwtool_split(struct hash *options, char *regions, char *size_s, char *bigfile, char *tmp_dir, char *outputfile) /* bwtool_split - main for the splitting program */ { struct metaBig *mb = metaBigOpenWithTmpDir(bigfile, tmp_dir, regions); FILE *output = mustOpen(outputfile, "w"); struct bed *section; struct bed *splitList = NULL; int size = 0; unsigned min_gap = sqlUnsigned((char *)hashOptionalVal(options, "min_gap", "1")); unsigned chunk_size = sqlUnsigned(size_s); char chrom[256] = ""; int start = -1, end = 0; boolean over_size = FALSE; int ix = 1; int gap = 0; for (section = mb->sections; section != NULL; section = section->next) { struct perBaseWig *pbwList = perBaseWigLoadContinue(mb, section->chrom, section->chromStart, section->chromEnd); struct perBaseWig *pbw; for (pbw = pbwList; pbw != NULL; pbw = pbw->next) { int length = pbw->chromEnd - pbw->chromStart; if (end > 0) gap = pbw->chromStart - end; if (!sameString(chrom, pbw->chrom)) { if (!sameString(chrom, "")) slAddHead(&splitList, newBed(chrom, start, end)); strcpy(chrom, pbw->chrom); start = pbw->chromStart; end = pbw->chromEnd; if (size + length > chunk_size) size = length; else size += length; } else { if ((size + length + gap > chunk_size) && (gap >= min_gap)) { slAddHead(&splitList, newBed(chrom, start, end)); start = pbw->chromStart; end = pbw->chromEnd; size = length; } else { size += length + gap; end = pbw->chromEnd; } } } perBaseWigFreeList(&pbwList); } slAddHead(&splitList, newBed(chrom, start, end)); slReverse(&splitList); for (section = splitList; section != NULL; section = section->next) { fprintf(output, "%s\t%d\t%d\n", section->chrom, section->chromStart, section->chromEnd); } carefulClose(&output); metaBigClose(&mb); bedFreeList(&splitList); }
void bwtool_find_max(struct hash *options, char *favorites, char *regions, double fill, char *bigfile, char *tmp_dir, char *outputfile) /* find max points in a range */ { boolean med_base = (hashFindVal(options, "median-base") != NULL) ? TRUE : FALSE; boolean with_max = (hashFindVal(options, "with-max") != NULL) ? TRUE : FALSE; struct metaBig *mb = metaBigOpen_check(bigfile, tmp_dir, NULL); FILE *out = mustOpen(outputfile, "w"); struct bed6 *sections6 = readBed6Soft(regions); struct bed *sections = bed12FromBed6(§ions6); struct bed *section; for (section = sections; section != NULL; section = section->next) { struct perBaseWig *pbwList = perBaseWigLoadContinue(mb, section->chrom, section->chromStart, section->chromEnd); struct perBaseWig *pbw; struct slInt *ii; int i, size; double max = -DBL_MAX; struct slInt *list = NULL; for (pbw = pbwList; pbw != NULL; pbw = pbw->next) { int pbw_off = pbw->chromStart - section->chromStart; for (i = 0; i < pbw->len; i++) { if (pbw->data[i] > max) { slFreeList(&list); struct slInt *new_int = slIntNew(i + pbw_off); slAddHead(&list, new_int); max = pbw->data[i]; } else if (pbw->data[i] == max) { struct slInt *new_int = slIntNew(i + pbw_off); slAddHead(&list, new_int); } } } slReverse(&list); if (list) { size = slCount(list); if (med_base) { section->blockCount = 1; AllocArray(section->blockSizes, sizeof(int)); AllocArray(section->chromStarts, sizeof(int)); section->blockSizes[0] = 1; section->chromStarts[0] = median_base_calc(&list); } else { section->blockCount = size; AllocArray(section->blockSizes, sizeof(int) * size); AllocArray(section->chromStarts, sizeof(int) * size); for (i = 0, ii = list; (i < size) && (ii != NULL); i++, ii = ii->next) { section->blockSizes[i] = 1; section->chromStarts[i] = ii->val; } } if (!with_max) bedTabOutN(section, 12, out); else { bedOutputN(section, 12, out, '\t', '\t'); fprintf(out, "%f\n", max); } slFreeList(&list); } perBaseWigFree(&pbwList); } metaBigClose(&mb); bedFreeList(§ions); 
carefulClose(&out); }
void outputBedsFromPsls(struct hash *pslHash,char *bedOutName, char *expRecordOutName, char *affyFileName, char *expFileName) /** For each set of entries in affyFile find matching psl and create a bed. */ { struct bed *bed = NULL, *b=NULL; struct psl *pslList = NULL, *psl = NULL; struct hash *expHash = NULL; int numExps = 0; int expCount = 0; int i =0; char *probeSet = NULL; char *row[4]; char key[128]; struct slName *expNames = NULL, *name = NULL; FILE *bedOut = NULL; FILE *expRecordOut = NULL; char *toDiffFileName = optionVal("toDiffFile", NULL); FILE *toDiffOut = NULL; struct lineFile *lf = NULL; fillInExpHash(expFileName, &expHash, &expNames, &expCount); lf = lineFileOpen(affyFileName, TRUE); bedOut = mustOpen(bedOutName, "w"); if(toDiffFileName != NULL) toDiffOut = mustOpen(toDiffFileName, "w"); /* Loop through either adding experiments to beds or if new probeset create bed from psl and start over. */ while(lineFileChopNextTab(lf, row, sizeof(row))) { /* Do we have to make a new bed? */ if(probeSet == NULL || differentWord(probeSet, row[0])) { occassionalDot(); numExps = 0; /* If we have probeset print out the current beds. */ if(probeSet != NULL) { for(b = bed; b != NULL; b = b->next) { int avgCount = 0; for(i = 0; i < b->expCount; i++) if(b->expScores[i] != -10000) avgCount++; if(avgCount != 0 && b->score > 0) b->score = log(b->score / avgCount) * 100; else b->score = 0; bedTabOutN(b, 15, bedOut); if(toDiffOut != NULL) outputToDiffRecord(b, expNames, toDiffOut); } } bedFreeList(&bed); /* Lookup key in pslHash to find list of psl. */ safef(key, sizeof(key), "%s", row[0]); pslList = hashFindVal(pslHash, key); /* Can have multiple psls. */ for(psl = pslList; psl != NULL; psl = psl->next) { b = bedFromPsl(psl); AllocArray(b->expIds, expCount ); AllocArray(b->expScores, expCount); b->expCount = expCount; initBedScores(b, expCount); slAddHead(&bed, b); } } if(bed != NULL) { /* Allocate larger arrays if necessary. 
*/ if(numExps > expCount) { errAbort("Supposed to be %d experiments but probeset %s has at least %d", expCount, bed->name, numExps); } for(b = bed; b != NULL; b = b->next) { int exp = hashIntVal(expHash, row[1]); if(differentWord(row[3], "NaN")) b->expScores[exp] = atof(row[3]); if(differentWord(row[2], "NaN")) b->score += atof(row[2]); } numExps++; } freez(&probeSet); probeSet = cloneString(row[0]); } expRecordOut = mustOpen(expRecordOutName, "w"); i = 0; for(name = expNames; name != NULL; name = name->next) { subChar(name->name, ',', '_'); subChar(name->name, ' ', '_'); fprintf(expRecordOut, "%d\t%s\tuclaExp\tuclaExp\tuclaExp\tuclaExp\t1\t%s,\n", i++, name->name, name->name); } hashFree(&expHash); slFreeList(&expNames); carefulClose(&expRecordOut); carefulClose(&bedOut); lineFileClose(&lf); }
int checkTableCoords(char *db) /* Check several invariants (see comments in check*() above), * summarize errors, return nonzero if there are errors. */ { struct sqlConnection *conn = hAllocConn(db); struct slName *tableList = NULL, *curTable = NULL; struct slName *allChroms = NULL; boolean gotError = FALSE; allChroms = hAllChromNames(db); if (theTable == NULL) tableList = getTableNames(conn); else if (sqlTableExists(conn, theTable)) tableList = newSlName(theTable); else errAbort("Error: specified table \"%s\" does not exist in database %s.", theTable, db); for (curTable = tableList; curTable != NULL; curTable = curTable->next) { struct hTableInfo *hti = NULL; struct slName *chromList = NULL, *chromPtr = NULL; char *table = curTable->name; char tableChrom[32], trackName[128], tableChromPrefix[33]; hParseTableName(db, table, trackName, tableChrom); hti = hFindTableInfo(db, tableChrom, trackName); if (hti != NULL && hti->isPos) { /* watch out for presence of both split and non-split tables; * hti for non-split will be replaced with hti of split. */ if (splitAndNonSplitExist(conn, table, tableChrom)) continue; safef(tableChromPrefix, sizeof(tableChromPrefix), "%s_", tableChrom); if (hti->isSplit) chromList = newSlName(tableChrom); else chromList = allChroms; /* invariant: chrom must be described in chromInfo. */ /* items with bad chrom will be invisible to hGetBedRange(), so * catch them here by SQL query. 
*/ /* The SQL query is too huge for scaffold-based db's, check count: */ if (hChromCount(db) <= MAX_SEQS_SUPPORTED) { if (isNotEmpty(hti->chromField)) { struct dyString *bigQuery = newDyString(1024); dyStringClear(bigQuery); sqlDyStringPrintf(bigQuery, "select count(*) from %s where ", table); for (chromPtr=chromList; chromPtr != NULL; chromPtr=chromPtr->next) { sqlDyStringPrintf(bigQuery, "%s != '%s' ", hti->chromField, chromPtr->name); if (chromPtr->next != NULL) dyStringAppend(bigQuery, "AND "); } gotError |= reportErrors(BAD_CHROM, table, sqlQuickNum(conn, bigQuery->string)); dyStringFree(&bigQuery); } for (chromPtr=chromList; chromPtr != NULL; chromPtr=chromPtr->next) { char *chrom = chromPtr->name; struct bed *bedList = hGetBedRange(db, table, chrom, 0, 0, NULL); if (hti->isSplit && isNotEmpty(hti->chromField)) gotError |= checkSplitTableOnlyChrom(bedList, table, hti, tableChrom); gotError |= checkStartEnd(bedList, table, hti, testChromSize(chrom)); if (hti->hasCDS) gotError |= checkCDSStartEnd(bedList, table, hti); if (hti->hasBlocks && !ignoreBlocks) gotError |= checkBlocks(bedList, table, hti); bedFreeList(&bedList); } } } } return gotError; }