unsigned sqlEnumParse(char *valStr, char **values, struct hash **valHashPtr)
/* Translate the text of an enumerated column value into its integer value.
 * The symbol hash is built from values the first time through and cached
 * in *valHashPtr so later calls can reuse it. */
{
struct hash *symHash = *valHashPtr;
if (symHash == NULL)
    {
    symHash = buildSymHash(values, TRUE);
    *valHashPtr = symHash;
    }
return hashIntVal(symHash, valStr);
}
void axtSwapFile(char *source, char *targetSizes, char *querySizes, char *dest) /* axtSwapFile - Swap source and query in an axt file. */ { struct hash *tHash = loadIntHash(targetSizes); struct hash *qHash = loadIntHash(querySizes); struct lineFile *lf = lineFileOpen(source, TRUE); FILE *f = mustOpen(dest, "w"); struct axt *axt; while ((axt = axtRead(lf)) != NULL) { axtSwap(axt, hashIntVal(tHash, axt->tName), hashIntVal(qHash, axt->qName)); axtWrite(axt, f); axtFree(&axt); } }
void printBiggestGap(char *database, struct sqlConnection *conn, struct slName *chromList, struct hash *chromHash, char *track) /* Look up track in database, figure out which type it is, call * appropriate biggest gap finder, and then print result. */ { struct trackDb *tdb = hTrackInfo(conn, track); struct hTableInfo *hti = hFindTableInfo(database, chromList->name, tdb->table); char *typeWord = cloneFirstWord(tdb->type); boolean isBig = FALSE, isBigBed = FALSE; struct bbiFile *bbi = NULL; if (sameString(typeWord, "bigBed")) { isBig = TRUE; isBigBed = TRUE; bbi = bigBedFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) ); } else if (sameString(typeWord, "bigWig")) { isBig = TRUE; bbi = bigWigFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) ); } char *biggestChrom = NULL; int biggestSize = 0, biggestStart = 0, biggestEnd = 0; struct slName *chrom; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { if (!allParts && strchr(chrom->name, '_')) // Generally skip weird chroms continue; if (female && sameString(chrom->name, "chrY")) continue; int chromSize = hashIntVal(chromHash, chrom->name); struct rbTree *rt = rangeTreeNew(); int start = 0, end = 0, size = 0; if (isBig) bigCoverageIntoTree(tdb, bbi, chrom->name, chromSize, rt, isBigBed); else tableCoverageIntoTree(hti, tdb, conn, chrom->name, chromSize, rt); if (rt->n > 0) // Want to keep completely uncovered chromosome uncovered addGaps(conn, chrom->name, rt); biggestGapFromRangeTree(rt, chromSize, &start, &end, &size); if (size > biggestSize) { biggestSize = size; biggestStart = start; biggestEnd = end; biggestChrom = chrom->name; } rangeTreeFree(&rt); } printf("%s\t%s:%d-%d\t", track, biggestChrom, biggestStart+1, biggestEnd); if (noComma) printf("%d", biggestSize); else printLongWithCommas(stdout, biggestSize); putchar('\n'); freez(&typeWord); bbiFileClose(&bbi); }
/* Clamp a range to the chromosome: a negative *start is raised to 0 and an
 * *end past the size stored for chrom in mb->chromSizeHash is lowered to
 * that size. */
static void chromOob(struct metaBig* mb, char* chrom, int* start, int* end)
{
    const int chromSize = hashIntVal(mb->chromSizeHash, chrom);

    if (*start < 0)
        {
        *start = 0;
        }
    if (chromSize < *end)
        {
        *end = chromSize;
        }
}
void outputAxtAsMaf(FILE *f, struct axt *axt, struct hash *tSizeHash, char *tPrefix, struct hash *qSizeHash, char *qPrefix) /* Write out an axt in maf format. */ { struct mafAli temp; char *oldQ = axt->qName; char *oldT = axt->tName; char tName[256], qName[256]; snprintf(tName, sizeof(tName), "%s%s", tPrefix, axt->tName); axt->tName = tName; snprintf(qName, sizeof(qName), "%s%s", qPrefix, axt->qName); axt->qName = qName; mafFromAxtTemp(axt, hashIntVal(tSizeHash, oldT), hashIntVal(qSizeHash, oldQ), &temp); axt->qName = oldQ; axt->tName = oldT; mafWriteGood(f, &temp); }
void possiblyOutputATag(struct dnaSeq *seq, int pos, int start, char strand, struct snp **snpsUsedArray, int numSnps, struct hash *freqHash, struct hash *libTotHash, FILE *output) /* If this sequence permutation appears in the frequency hash, output it. */ { struct slPair *list; list = hashFindVal(freqHash, seq->dna + 4); if (list) /* Output in cgapSage bed format. */ { struct slPair *cur; int chromStart = pos + start; int chromEnd = chromStart + TAG_SIZE; int thickStart = (strand == '+') ? chromStart : chromEnd - 4; int thickEnd = thickStart + 4; int numLibs = slCount(list); int i; int snpsUsed = 0; /* chrom, chromStart, chromEnd */ fprintf(output, "%s\t%d\t%d\t", seq->name, chromStart, chromEnd); /* name */ fprintf(output, "%s\t", seq->dna + 4); /* score, strand, thickStart, thickEnd */ fprintf(output, "1000\t%c\t%d\t%d\t", strand, thickStart, thickEnd); /* numLibs */ fprintf(output, "%d\t", numLibs); /* libIds */ for (cur = list; cur != NULL; cur = cur->next) fprintf(output, "%s,", cur->name); fprintf(output, "\t"); /* freqs */ for (cur = list; cur != NULL; cur = cur->next) fprintf(output, "%d,", ptToInt(cur->val)); fprintf(output, "\t"); /* TPMs */ for (cur = list; cur != NULL; cur = cur->next) { double totalTags = (double)hashIntVal(libTotHash, cur->name); int freq = ptToInt(cur->val); double tpm = (double)freq * (1000000 / totalTags); fprintf(output, "%.4f,", tpm); } fprintf(output, "\t"); for (i = 0; i < numSnps; i++) if (snpsUsedArray[i] != NULL) snpsUsed++; /* numSNps */ fprintf(output, "%d\t", snpsUsed); /* snps */ for (i = 0; i < numSnps; i++) if (snpsUsedArray[i] != NULL) fprintf(output, "%s,", snpsUsedArray[i]->name); fprintf(output, "\n"); } }
static struct chromSize *getChromsFromSpecs(char *db, struct slName *specs)
/* Build a chromSize list with one entry per name in specs, taking each size
 * from the chromosome size hash of db.  Entries are added at the head of
 * the list as they are created. */
{
struct hash *sizeHash = hChromSizeHash(db);
struct chromSize *chromList = NULL;
struct slName *spec = specs;

while (spec != NULL)
    {
    int size = hashIntVal(sizeHash, spec->name);
    struct chromSize *chrom = chromSizeNew(spec->name, size);
    slSafeAddHead(&chromList, chrom);
    spec = spec->next;
    }

hashFree(&sizeHash);
return chromList;
}
static void gapSanityCheck(struct agpGap *gapList) { int prevEnd = 0; int prevStart = 0; char *prevChr = NULL; char *prevType = NULL; struct agpGap *gap; for (gap = gapList; gap; gap = gap->next) { int chrSize = hashIntVal(cInfoHash, gap->chrom); if (gap->chromStart < 0) verbose(1, "WARNING: gap chromStart < 0 at %s:%d-%d\n", gap->chrom, gap->chromStart, gap->chromEnd); if (gap->chromEnd > chrSize) verbose(1, "WARNING: gap chromEnd > chromSize(%d) " "at %s:%d-%d\n", chrSize, gap->chrom, gap->chromStart, gap->chromEnd); if (gap->chromEnd == chrSize && differentString(gap->type, "telomere")) verbose(1, "WARNING: gap at end of chromosome not telomere " "at %s:%d-%d, type: %s\n", gap->chrom, gap->chromStart, gap->chromEnd, gap->type); if (gap->chromStart >= gap->chromEnd) verbose(1, "WARNING: gap chromStart >= chromEnd at %s:%d-%d\n", gap->chrom, gap->chromStart, gap->chromEnd); if (prevEnd > 0) { if (sameWord(prevChr, gap->chrom) && (prevEnd >= gap->chromStart)) verbose(1,"WARNING: overlapping gap at " "%s:%d-%d(%s) and %s:%d-%d(%s)\n", gap->chrom, prevStart, prevEnd, prevType, gap->chrom, gap->chromStart, gap->chromEnd, gap->type); } else { prevStart = gap->chromStart; prevEnd = gap->chromEnd; prevType = gap->type; } if (isNotEmpty(prevChr)) { if (differentWord(prevChr, gap->chrom)) { freeMem(prevChr); prevChr = cloneString(gap->chrom); } } else prevChr = cloneString(gap->chrom); prevStart = gap->chromStart; prevEnd = gap->chromEnd; } }
static struct psl *bedToPsl(struct bed *bed, struct hash *chromSizes) /* Convert a single bed to a PSL. */ { int qSize = bedTotalBlockSize(bed); struct psl *psl; if (keepQuery) psl = pslNew(bed->chrom, hashIntVal(chromSizes, bed->chrom), bed->chromStart, bed->chromEnd, bed->chrom, hashIntVal(chromSizes, bed->chrom), bed->chromStart, bed->chromEnd, ((bed->strand[0] == '\0') ? "+" : bed->strand), (bed->blockCount == 0) ? 1 : bed->blockCount, 0); else psl = pslNew(bed->name, qSize, 0, qSize, bed->chrom, hashIntVal(chromSizes, bed->chrom), bed->chromStart, bed->chromEnd, ((bed->strand[0] == '\0') ? "+" : bed->strand), (bed->blockCount == 0) ? 1 : bed->blockCount, 0); psl->match = psl->qSize; if (bed->blockCount == 0) bedToPsl4(bed, psl); else bedToPsl12(bed, psl); return psl; }
void pslListFromGenePred(char *chromSizesFile, struct genePred *gpList, FILE *out)
/* Convert every genePred in gpList to a psl and write it tab-separated to
 * out.  Chromosome sizes are loaded from chromSizesFile and looked up by
 * gp->chrom. */
{
struct hash *chromSizes = getChromSizes(chromSizesFile);
struct genePred *gp;

for (gp = gpList; gp != NULL; gp = gp->next)
    {
    int size = hashIntVal(chromSizes, gp->chrom);
    struct psl *psl = pslFromGenePred(gp, size);
    pslTabOut(psl, out);
    /* The psl is only needed until it has been written; free it so the
     * loop does not accumulate one psl per gene.
     * NOTE(review): assumes pslFromGenePred returns a freshly allocated
     * psl owned by the caller - confirm. */
    pslFree(&psl);
    }
hashFree(&chromSizes);
}
void writeMousePartsAsMaf(FILE *f, struct hash *mouseHash, char *ratMouseDir, char *mouseChrom, int mouseStart, int mouseEnd, int mouseChromSize, struct hash *rSizeHash, struct hash *dupeHash) /* Write out mouse/rat alignments that intersect given region of mouse. * This gets a little involved because we need to do random access on * the mouse/rat alignment files, which are too big to fit into memory. * On disk we have a mouse/rat alignment file for each mouse chromosome, * and an index of it. When we first access a mouse chromosome we load * the index for that chromosome into memory, and open the alignment file. * We then do a seek and read to load a particular alignment. */ { struct mouseChromCache *mcc = NULL; struct binElement *list = NULL, *el; char aliName[512]; /* Get cache for this mouse chromosome */ mcc = hashFindVal(mouseHash, mouseChrom); if (mcc == NULL) { mcc = newMouseChromCache(mouseChrom, mouseChromSize, ratMouseDir); hashAdd(mouseHash, mouseChrom, mcc); } if (mcc->lf == NULL) return; /* Get list of positions and process one axt into a maf for each */ list = binKeeperFindSorted(mcc->bk, mouseStart, mouseEnd); for (el = list; el != NULL; el = el->next) { struct axt *axt; struct mafAli temp; long long *pPos, pos; pPos = el->val; pos = *pPos; sprintf(aliName, "%s.%lld", mouseChrom, pos); if (!hashLookup(dupeHash, aliName)) { int rChromSize; hashAdd(dupeHash, aliName, NULL); lineFileSeek(mcc->lf, pos, SEEK_SET); axt = axtRead(mcc->lf); rChromSize = hashIntVal(rSizeHash, axt->qName); prefixAxt(axt, rPrefix, mPrefix); mafFromAxtTemp(axt, mouseChromSize, rChromSize, &temp); mafWriteGood(f, &temp); axtFree(&axt); } } slFreeList(&list); }
int altGraphXItemHeight(struct track *tg, void *item)
/* Return how high an item is.  In dense mode, or when the track has no
 * customPt hash, it is a single lineHeight.  In full mode the number of rows
 * is looked up in the customPt hash under the item's index in tg->items and
 * multiplied by lineHeight.  In every other mode it is heightPer. */
{
if (tg->limitedVis == tvDense || tg->customPt == NULL)
    return tg->lineHeight;

if (tg->limitedVis != tvFull)
    return tg->heightPer;

char key[128];
struct hash *rowHash = (struct hash*)tg->customPt;
safef(key, sizeof(key), "%d", slIxFromElement(tg->items, item));
int rows = hashIntVal(rowHash, key);
return rows * tg->lineHeight;
}
unsigned sqlSetParse(char *valStr, char **values, struct hash **valHashPtr)
/* Parse a set column value: a comma separated list of symbols whose integer
 * values are OR-ed together and returned.  The symbol hash is built from
 * values on first use and cached in *valHashPtr.  Empty items (leading,
 * trailing or doubled commas) are skipped, and a NULL or empty valStr
 * yields 0.  The commas in valStr are overwritten with string terminators
 * while parsing. */
{
if (*valHashPtr == NULL)
    *valHashPtr = buildSymHash(values, FALSE);

/* Split on commas by hand rather than with strtok(), which keeps hidden
 * static state and would clobber a tokenization the caller has in progress. */
unsigned value = 0;
char *token = valStr;
while (token != NULL)
    {
    char *comma = strchr(token, ',');
    if (comma != NULL)
        *comma = '\0';
    if (token[0] != '\0')
        value |= hashIntVal(*valHashPtr, token);
    token = (comma != NULL) ? comma + 1 : NULL;
    }
return value;
}
static void liftSide(char *desc, struct hash *seqSizes, struct psl *psl, char *name, char strand, unsigned *seqSize, int *start, int *end, unsigned *starts) /* life one side of the alignment */ { int regStart, regEnd, i; if (parseName(desc, name, ®Start, ®End)) { *seqSize = hashIntVal(seqSizes, name); if (*end > *seqSize) errAbort("subrange %s:%d-%d extends past sequence end %ud", name, regStart, regEnd, *seqSize); *start += regStart; *end += regStart; if (strand == '-') reverseIntRange(®Start, ®End, *seqSize); for (i = 0; i < psl->blockCount; i++) starts[i] += regStart; } }
void bwgMakeChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash, int *retChromCount, struct bbiChromInfo **retChromArray, int *retMaxChromNameSize) /* Fill in chromId field in sectionList. Return array of chromosome name/ids. * The chromSizeHash is keyed by name, and has int values. */ { /* Build up list of unique chromosome names. */ struct bwgSection *section; char *chromName = ""; int chromCount = 0; int maxChromNameSize = 0; struct slRef *uniq, *uniqList = NULL; for (section = sectionList; section != NULL; section = section->next) { if (!sameString(section->chrom, chromName)) { chromName = section->chrom; refAdd(&uniqList, chromName); ++chromCount; int len = strlen(chromName); if (len > maxChromNameSize) maxChromNameSize = len; } section->chromId = chromCount-1; } slReverse(&uniqList); /* Allocate and fill in results array. */ struct bbiChromInfo *chromArray; AllocArray(chromArray, chromCount); int i; for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next) { chromArray[i].name = uniq->val; chromArray[i].id = i; chromArray[i].size = hashIntVal(chromSizeHash, uniq->val); } /* Clean up, set return values and go home. */ slFreeList(&uniqList); *retChromCount = chromCount; *retChromArray = chromArray; *retMaxChromNameSize = maxChromNameSize; }
double scoreLiftOverChain(struct liftOverChain *chain, char *fromOrg, char *fromDb, char *toOrg, char *toDb, char *cartOrg, char *cartDb, struct hash *dbRank ) /* Score the chain in terms of best match for cart settings */ { double score = 0; char *chainFromOrg = hArchiveOrganism(chain->fromDb); char *chainToOrg = hArchiveOrganism(chain->toDb); int fromRank = hashIntValDefault(dbRank, chain->fromDb, 0); /* values up to approx. #assemblies */ int toRank = hashIntValDefault(dbRank, chain->toDb, 0); int maxRank = hashIntVal(dbRank, "maxRank"); if (sameOk(fromOrg,chainFromOrg) && sameOk(fromDb,chain->fromDb) && sameOk(toOrg,chainToOrg) && sameOk(toDb,chain->toDb)) score += 10000000; if (sameOk(fromOrg,chainFromOrg)) score += 2000000; if (sameOk(fromDb,chain->fromDb)) score += 1000000; if (sameOk(toOrg,chainToOrg)) score += 200000; if (sameOk(toDb,chain->toDb)) score += 100000; if (sameOk(cartDb,chain->fromDb)) score += 20000; if (sameOk(cartDb,chain->toDb)) score += 10000; if (sameOk(cartOrg,chainFromOrg)) score += 2000; if (sameOk(cartOrg,chainToOrg)) score += 1000; score += 10*(maxRank-fromRank); score += (maxRank - toRank); return score; }
static struct bed6* bigBedBed6Fetch(struct metaBig* mb, char* chrom, unsigned start, unsigned end, struct lm* lm) /* the main fetcher */ { struct metaBigBed6Helper helper; struct bigBedInterval* ints = bigBedIntervalQuery(mb->big.bbi, chrom, (bits32)start, (bits32)end, 0, lm); struct bigBedInterval* ival; helper.lm = lm; helper.chrom = chrom; helper.chromSize = hashIntVal(mb->chromSizeHash, chrom); helper.dot = "."; helper.bedList = NULL; helper.mb = mb; for (ival = ints; ival != NULL; ival = ival->next) { struct bed6* newbed = bed6FromBigBedInterval(ival, helper); if (newbed) slAddHead(&helper.bedList, newbed); } slReverse(&helper.bedList); return helper.bedList; }
struct binKeeper *readRepeats2(char *chrom, char *rmskFileName, struct hash *tSizeHash) /* read all repeats for a chromosome of size size, returns results in binKeeper structure for fast query*/ { boolean rmskRet; struct lineFile *rmskF = NULL; struct rmskOut2 *rmsk; struct binKeeper *bk; int size; size = hashIntVal(tSizeHash, chrom); bk = binKeeperNew(0, size); assert(size > 1); rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet); while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL) { binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk); } lineFileClose(&rmskF); return bk; }
static struct hash *combineCgapSages(struct cgapSage *tag, struct hash *libHash, struct hash *libTotHash) /* Go through the each lib for a tag and combine it's score using a hash for */ /* repeated tissues. */ { struct hash *tpmHash = newHash(5); int i; for (i = 0; i < tag->numLibs; i++) { char libId[16]; char *libName; int libTotTags = 0; struct cgapSageTpmHashEl *tpm; safef(libId, sizeof(libId), "%d", tag->libIds[i]); libName = hashMustFindVal(libHash, libId); tpm = hashFindVal(tpmHash, libName); libTotTags = hashIntVal(libTotHash, libId); if (keepThisLib(libName, libId)) { if (tpm) { tpm->count++; tpm->freqTotal += tag->freqs[i]; tpm->total += tag->tagTpms[i]; tpm->libTotals += libTotTags; } else { AllocVar(tpm); tpm->count = 1; tpm->total = tag->tagTpms[i]; tpm->freqTotal = tag->freqs[i]; tpm->libTotals = libTotTags; hashAdd(tpmHash, libName, tpm); } } } return tpmHash; }
struct cgapSage *cloneBedAddStuff(struct hash *freqHash, struct hash *totTagsHash, struct bed *oneBed) /* Do a shallow copy of the bed into the cgapSage struct. Later the original */ /* bed list should be freed with slFreeList instead of bedFreeList. */ { struct cgapSage *newCgap; struct slPair *list = hashFindVal(freqHash, oneBed->name); struct slPair *cur; int ix = 0; AllocVar(newCgap); newCgap->chrom = oneBed->chrom; newCgap->chromStart = oneBed->chromStart; newCgap->chromEnd = oneBed->chromEnd; newCgap->name = oneBed->name; newCgap->score = oneBed->score; newCgap->strand[0] = oneBed->strand[0]; newCgap->thickStart = oneBed->thickStart; newCgap->thickEnd = oneBed->thickEnd; newCgap->numLibs = slCount(list); if (newCgap->numLibs > 0) { AllocArray(newCgap->libIds, newCgap->numLibs); AllocArray(newCgap->freqs, newCgap->numLibs); AllocArray(newCgap->tagTpms, newCgap->numLibs); for (cur = list; cur != NULL; cur = cur->next) { double totalTags = (double)hashIntVal(totTagsHash, cur->name); newCgap->libIds[ix] = sqlUnsigned(cur->name); newCgap->freqs[ix] = (unsigned)ptToInt(cur->val); newCgap->tagTpms[ix] = (double)newCgap->freqs[ix] * (1000000 / totalTags); ix++; } } else if (optionExists("noEmpty")) { cgapSageFree(&newCgap); } return newCgap; }
struct mafAli *axtLoadAsMafInRegion(struct sqlConnection *conn, char *table, char *chrom, int start, int end, char *tPrefix, char *qPrefix, int tSize, struct hash *qSizeHash) /* Return list of alignments in region from axt external file as a maf. */ { char **row; unsigned int extFileId = 0; struct lineFile *lf = NULL; struct mafAli *maf, *mafList = NULL; struct axt *axt; int rowOffset; struct sqlResult *sr = hRangeQuery(conn, table, chrom, start, end, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct scoredRef ref; scoredRefStaticLoad(row + rowOffset, &ref); if (ref.extFile != extFileId) { char *path = hExtFileName(sqlGetDatabase(conn),"extFile", ref.extFile); lf = lineFileOpen(path, TRUE); extFileId = ref.extFile; } lineFileSeek(lf, ref.offset, SEEK_SET); axt = axtRead(lf); if (axt == NULL) internalErr(); maf = mafFromAxt(axt, tSize, tPrefix, hashIntVal(qSizeHash, axt->qName), qPrefix); axtFree(&axt); slAddHead(&mafList, maf); } sqlFreeResult(&sr); lineFileClose(&lf); slReverse(&mafList); return mafList; }
void bwgMakeAllChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash, int *retChromCount, struct bbiChromInfo **retChromArray, int *retMaxChromNameSize) /* Fill in chromId field in sectionList. Return array of chromosome name/ids. * The chromSizeHash is keyed by name, and has int values. */ { /* Build up list of unique chromosome names. */ int maxChromNameSize = 0; /* Get list of values */ int chromCount = chromSizeHash->elCount; char ** chromName, ** chromNames; AllocArray(chromNames, chromCount); chromName = chromNames; struct hashEl* el; struct hashCookie cookie = hashFirst(chromSizeHash); for (el = hashNext(&cookie); el; el = hashNext(&cookie)) { *chromName = el->name; if (strlen(el->name) > maxChromNameSize) maxChromNameSize = strlen(el->name); chromName++; } qsort(chromNames, chromCount, sizeof(char *), bwgStrcmp); /* Allocate and fill in results array. */ struct bbiChromInfo *chromArray; AllocArray(chromArray, chromCount); int i; for (i = 0; i < chromCount; ++i) { chromArray[i].name = chromNames[i]; chromArray[i].id = i; chromArray[i].size = hashIntVal(chromSizeHash, chromNames[i]); } // Assign IDs to sections: struct bwgSection *section; char *name = ""; bits32 chromId = 0; for (section = sectionList; section != NULL; section = section->next) { if (!sameString(section->chrom, name)) { for (i = 0; i < chromCount; ++i) { if (sameString(section->chrom, chromArray[i].name)) { section->chromId = i; break; } } if (i == chromCount) errAbort("Could not find %s in list of chromosomes\n", section->chrom); chromId = section->chromId; name = section->chrom; } else section->chromId = chromId; } /* Clean up, set return values and go home. */ *retChromCount = chromCount; *retChromArray = chromArray; *retMaxChromNameSize = maxChromNameSize; }
static void parseBedGraphSection(struct lineFile *lf, boolean clipDontDie,
	struct hash *chromSizeHash, struct lm *lm, int itemsPerSlot,
	struct bwgSection **pSectionList)
/* Parse out bedGraph section until we get to something that is not in
 * bedGraph format.  Items are collected per chromosome, sorted, checked for
 * overlap and then cut into sections of at most itemsPerSlot items, which
 * are added to the head of *pSectionList.  When chromSizeHash is NULL no
 * chromosome size limit applies.  An item ending past its chromosome is
 * warned about and dropped if clipDontDie is set; otherwise it is fatal. */
{
/* Set up hash and list to store chromosomes. */
struct hash *chromHash = hashNew(0);
struct bedGraphChrom *chrom, *chromList = NULL;

/* Collect lines in items on appropriate chromosomes. */
struct bwgBedGraphItem *item;
char *line;
while (lineFileNextReal(lf, &line))
    {
    /* Check for end of section. */
    if (stepTypeLine(line))
        {
	lineFileReuse(lf);
	break;
	}

    /* Parse out our line and make sure it has exactly 4 columns. */
    char *words[5];
    int wordCount = chopLine(line, words);
    lineFileExpectWords(lf, 4, wordCount);

    /* Get chromosome. */
    char *chromName = words[0];
    chrom = hashFindVal(chromHash, chromName);
    if (chrom == NULL)
        {
	lmAllocVar(chromHash->lm, chrom);
	hashAddSaveName(chromHash, chromName, chrom, &chrom->name);
	chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM);
	slAddHead(&chromList, chrom);
	}

    /* Convert to item and add to chromosome list. */
    lmAllocVar(lm, item);
    item->start = lineFileNeedNum(lf, words, 1);
    item->end = lineFileNeedNum(lf, words, 2);
    item->val = lineFileNeedDouble(lf, words, 3);

    /* Do sanity checking on coordinates. */
    if (item->start > item->end)
        errAbort("bedGraph error: start (%u) after end (%u) line %d of %s.",
		item->start, item->end, lf->lineIx, lf->fileName);
    if (item->end > chrom->size)
	{
        warn("bedGraph error line %d of %s: chromosome %s has size %u but item ends at %u",
	        lf->lineIx, lf->fileName, chrom->name, chrom->size, item->end);
	if (!clipDontDie)
	    noWarnAbort();
	}
    else
	{
	slAddHead(&chrom->itemList, item);
	}
    }
slSort(&chromList, bedGraphChromCmpName);

/* Loop through each chromosome and output the item list, broken into sections
 * for that chrom. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* A chromosome can end up with no items at all when every one of them
     * was dropped above; there is nothing to check or output for it. */
    if (chrom->itemList == NULL)
        continue;

    slSort(&chrom->itemList, bwgBedGraphItemCmp);

    /* Check to make sure no overlap between items. */
    struct bwgBedGraphItem *prevItem = chrom->itemList, *nextItem;
    for (nextItem = prevItem->next; nextItem != NULL; nextItem = nextItem->next)
        {
	if (prevItem->end > nextItem->start)
	    errAbort("Overlap between %s %d %d and %s %d %d.\nPlease remove overlaps and try again",
	        chrom->name, prevItem->start, prevItem->end, chrom->name, nextItem->start, nextItem->end);
	prevItem = nextItem;
	}

    /* Break up into sections of no more than items-per-slot size. */
    struct bwgBedGraphItem *startItem, *endItem, *nextStartItem = chrom->itemList;
    for (startItem = chrom->itemList; startItem != NULL; startItem = nextStartItem)
	{
	/* Find end item of this section, and start item for next section.
	 * Terminate list at end item. */
	int sectionSize = 0;
	int i;
	endItem = startItem;
	for (i=0; i<itemsPerSlot; ++i)
	    {
	    if (nextStartItem == NULL)
		break;
	    endItem = nextStartItem;
	    nextStartItem = nextStartItem->next;
	    ++sectionSize;
	    }
	endItem->next = NULL;

	/* Fill in section and add it to section list. */
	struct bwgSection *section;
	lmAllocVar(lm, section);
	section->chrom = cloneString(chrom->name);
	section->start = startItem->start;
	section->end = endItem->end;
	section->type = bwgTypeBedGraph;
	section->items.bedGraphList = startItem;
	section->itemCount = sectionSize;
	slAddHead(pSectionList, section);
	}
    }

/* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in
 * hash's memory. */
hashFree(&chromHash);
chromList = NULL;
}
static void parseSteppedSection(struct lineFile *lf, boolean clipDontDie,
	struct hash *chromSizeHash, char *initialLine,
	struct lm *lm, int itemsPerSlot, struct bwgSection **pSectionList)
/* Parse a variableStep or fixedStep section and add it to the list, breaking
 * it up as need be.  initialLine is the declaration line: the step type
 * followed by var=val settings (chrom, span, step, start).  The settings
 * are validated here and the data lines are left to the type-specific
 * parser. */
{
/* The first word has to be one of the two step types. */
char *typeWord = nextWord(&initialLine);
boolean isVariable = sameString(typeWord, "variableStep");
if (!isVariable && !sameString(typeWord, "fixedStep"))
    errAbort("Unknown type %s\n", typeWord);
enum bwgSectionType type = isVariable ? bwgTypeVariableStep : bwgTypeFixedStep;

/* Settings we hope to find on the rest of the line; zero/NULL = not given. */
char *chrom = NULL;
int span = 0;
bits32 step = 0;
bits32 start = 0;

/* Walk the var=val pairs. */
char *pair;
for (pair = nextWord(&initialLine); pair != NULL; pair = nextWord(&initialLine))
    {
    char *halves[2];
    if (chopByChar(pair, '=', halves, 2) != 2)
        errAbort("strange var=val pair line %d of %s", lf->lineIx, lf->fileName);
    char *var = halves[0];
    char *val = halves[1];

    if (sameString(var, "chrom"))
        {
        chrom = cloneString(val);
        }
    else if (sameString(var, "span"))
        {
        span = parseUnsignedVal(lf, var, val);
        }
    else if (sameString(var, "step"))
        {
        step = parseUnsignedVal(lf, var, val);
        }
    else if (sameString(var, "start"))
        {
        start = parseUnsignedVal(lf, var, val);
        }
    else
        {
        errAbort("Unknown setting %s=%s line %d of %s", var, val, lf->lineIx, lf->fileName);
        }
    }

/* Check that we have all that are required and no more, and call the
 * type-specific routine to parse the rest of the section. */
if (chrom == NULL)
    errAbort("Missing chrom= setting line %d of %s\n", lf->lineIx, lf->fileName);

bits32 chromSize = (chromSizeHash ? hashIntVal(chromSizeHash, chrom) : BIGNUM);
if (start > chromSize)
    {
    warn("line %d of %s: chromosome %s has %u bases, but item starts at %u",
        lf->lineIx, lf->fileName, chrom, chromSize, start);
    if (!clipDontDie)
        noWarnAbort();
    }

if (type != bwgTypeFixedStep)
    {
    /* variableStep: start and step do not belong here; span defaults to 1. */
    if (start != 0)
        errAbort("Extra start= setting line %d of %s\n", lf->lineIx, lf->fileName);
    if (step != 0)
        errAbort("Extra step= setting line %d of %s\n", lf->lineIx, lf->fileName);
    if (span == 0)
        span = 1;
    parseVariableStepSection(lf, clipDontDie, lm, itemsPerSlot,
        chrom, chromSize, span, pSectionList);
    }
else
    {
    /* fixedStep: start and step are required; span defaults to step.
     * The start is passed on reduced by one. */
    if (start == 0)
        errAbort("Missing start= setting line %d of %s\n", lf->lineIx, lf->fileName);
    if (step == 0)
        errAbort("Missing step= setting line %d of %s\n", lf->lineIx, lf->fileName);
    if (span == 0)
        span = step;
    parseFixedStepSection(lf, clipDontDie, lm, itemsPerSlot,
        chrom, chromSize, span, start-1, step, pSectionList);
    }
}
void bigBedTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) /* Print out selected fields from Big Bed. If fields is NULL, then print out all fields. */ { if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); struct slName *bb, *bbList = bigBedGetFields(table, conn); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) hashAddInt(fieldHash, bb->name, i); // If bigBed has name column, look up pasted/uploaded identifiers if any: struct hash *idHash = NULL; if (slCount(bbList) >= 4) idHash = identifierHash(db, table); /* Create an array of column indexes corresponding to the selected field list. */ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i<fieldCount; ++i) { columnArray[i] = hashIntVal(fieldHash, fieldArray[i]); } /* Output row of labels */ fprintf(f, "#%s", fieldArray[0]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", fieldArray[i]); fprintf(f, "\n"); /* Open up bigBed file. 
*/ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpen(fileName); struct asObject *as = bigBedAsOrDefault(bbi); struct asFilter *filter = NULL; if (anyFilter()) { filter = asFilterFromCart(cart, db, table, as); if (filter) { fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); } } /* Loop through outputting each region */ struct region *region, *regionList = getRegions(); for (region = regionList; region != NULL; region = region->next) { struct lm *lm = lmInit(0); struct bigBedInterval *iv, *ivList = bigBedIntervalQuery(bbi, region->chrom, region->start, region->end, 0, lm); char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; for (iv = ivList; iv != NULL; iv = iv->next) { bigBedIntervalToRow(iv, region->chrom, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { if ((idHash != NULL) && (hashLookup(idHash, row[3]) == NULL)) continue; int i; fprintf(f, "%s", row[columnArray[0]]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", row[columnArray[i]]); fprintf(f, "\n"); } } lmCleanup(&lm); } /* Clean up and exit. */ bbiFileClose(&bbi); hashFree(&fieldHash); freeMem(fieldArray); freeMem(columnArray); }
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. 
*/ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
void bamTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) /* Print out selected fields from BAM. If fields is NULL, then print out all fields. */ { struct hTableInfo *hti = NULL; hti = getHti(db, table, conn); struct hash *idHash = NULL; char *idField = getIdField(db, curTrack, table, hti); int idFieldNum = 0; /* if we know what field to use for the identifiers, get the hash of names */ if (idField != NULL) idHash = identifierHash(db, table); if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); struct slName *bb, *bbList = bamGetFields(); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) { /* if we know the field for identifiers, save it away */ if ((idField != NULL) && sameString(idField, bb->name)) idFieldNum = i; hashAddInt(fieldHash, bb->name, i); } /* Create an array of column indexes corresponding to the selected field list. 
*/ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i<fieldCount; ++i) { columnArray[i] = hashIntVal(fieldHash, fieldArray[i]); } /* Output row of labels */ fprintf(f, "#%s", fieldArray[0]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", fieldArray[i]); fprintf(f, "\n"); struct asObject *as = bamAsObj(); struct asFilter *filter = NULL; if (anyFilter()) { filter = asFilterFromCart(cart, db, table, as); if (filter) { fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); } } /* Loop through outputting each region */ struct region *region, *regionList = getRegions(); int maxOut = bigFileMaxOutput(); for (region = regionList; region != NULL && (maxOut > 0); region = region->next) { struct lm *lm = lmInit(0); char *fileName = bamFileName(table, conn, region->chrom); struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom, region->start, region->end, lm); char *row[SAMALIGNMENT_NUM_COLS]; char numBuf[BAM_NUM_BUF_SIZE]; for (sam = samList; sam != NULL && (maxOut > 0); sam = sam->next) { samAlignmentToRow(sam, numBuf, row); if (asFilterOnRow(filter, row)) { /* if we're looking for identifiers, check if this matches */ if ((idHash != NULL)&&(hashLookup(idHash, row[idFieldNum]) == NULL)) continue; int i; fprintf(f, "%s", row[columnArray[0]]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", row[columnArray[i]]); fprintf(f, "\n"); maxOut --; } } freeMem(fileName); lmCleanup(&lm); } if (maxOut == 0) warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); /* Clean up and exit. */ hashFree(&fieldHash); freeMem(fieldArray); freeMem(columnArray); }
static void gapToLift(char *db, char *outFile) /* gapToLift - create lift file from gap table(s). */ { FILE *out = mustOpen(outFile, "w"); struct sqlConnection *conn = sqlConnect(db); struct chromInfo *cInfoList = loadChromInfo(conn); struct agpGap *gapList = loadAllGaps(conn, db, cInfoList); struct agpGap *gap; int start = 0; int end = 0; char *prevChr = NULL; int liftCount = 0; int chrSize = 0; static struct hash *chrDone = NULL; chrDone = newHash(0); if (isNotEmpty(bedFileName)) { bedFile = mustOpen(bedFileName, "w"); verbose(2,"#\tbed output requested to %s\n", bedFileName); } for (gap = gapList; gap; gap = gap->next) { verbose(3,"#\t%s\t%d\t%d\t%s\n", gap->chrom, gap->chromStart, gap->chromEnd, gap->bridge); if (prevChr && sameWord(prevChr, gap->chrom)) { /* continuing same segment, check for gap break, * or gap at end of chrom */ if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize)) { end = gap->chromStart; liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize); start = gap->chromEnd; end = start; } else end = gap->chromEnd; } else /* new chrom encountered */ { /* output last segment of previous chrom when necessary */ if (prevChr && differentWord(prevChr, gap->chrom)) { if (end < chrSize) liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize); } liftCount = 0; chrSize = hashIntVal(cInfoHash, gap->chrom); hashAddInt(chrDone, gap->chrom, 1); if (gap->chromStart > 0) { /* starting first segment at position 0 */ start = 0; end = gap->chromStart; /* does the first gap break it ? Or gap goes to end of chrom. 
*/ if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize)) { liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize); start = gap->chromEnd; end = start; } } else /* first gap is actually the beginning of the chrom */ { /* thus, first segment starts after this first gap */ start = gap->chromEnd; end = start; } } prevChr = gap->chrom; /* remember prev chrom to detect next chrom */ } /* potentially a last one */ if (end < chrSize) liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize); /* check that all chroms have been used */ struct hashCookie cookie = hashFirst(cInfoHash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) { if (NULL == hashLookup(chrDone, hel->name)) { chrSize = hashIntVal(cInfoHash, hel->name); verbose(2, "#\tno gaps on chrom: %s, size: %d\n", hel->name, chrSize); liftCount = liftOutLine(out, hel->name, 0, chrSize, 0, chrSize); } } carefulClose(&out); sqlDisconnect(&conn); }
static struct bed* sectionToBed(char* sectionString, struct hash* chromSizes)
/* Deconstruct a section specification into a newly allocated bed with
 * chrom, chromStart and chromEnd filled in.  Accepted forms:
 *   "chr"            - whole chromosome, 0 to the size found in chromSizes
 *   "chr:start-end"  - explicit range; an omitted start means 0 and an
 *                      omitted end means the chromosome size
 *   "chr:start+size" - start plus length; an omitted start means 0
 * sectionString is temporarily modified while the start is parsed but is
 * restored before returning.  Calls errAbort when nothing follows the ':',
 * when neither '-' nor '+' is present after the ':', when start >= end, or
 * when end exceeds the chromosome size. */
{
struct bed* sec = NULL;
char* colon = NULL;

AllocVar(sec);
colon = strchr(sectionString, ':');
if (colon == NULL)
    {
    /* only a chrom */
    sec->chrom = cloneString(sectionString);
    sec->chromStart = 0;
    sec->chromEnd = (unsigned)hashIntVal(chromSizes, sec->chrom);
    }
else
    {
    unsigned chromSize = 0;
    char* minusOrPlus = NULL;
    char* start = colon + 1;
    char* end = NULL;

    /* start can never be NULL here (it is colon + 1); what needs to be
     * detected is an empty remainder after the ':'. */
    if (*start == '\0')
        errAbort("malformed range (nothing after ':')");
    sec->chrom = cloneStringZ(sectionString, colon - sectionString);
    chromSize = (unsigned)hashIntVal(chromSizes, sec->chrom);

    minusOrPlus = strchr(start, '-');
    if (minusOrPlus != NULL)
        {
        /* "start-end" form */
        if (minusOrPlus == start)
            sec->chromStart = 0;
        else
            {
            /* Terminate at the separator just long enough to parse start. */
            *minusOrPlus = '\0';
            sec->chromStart = sqlUnsigned(start);
            *minusOrPlus = '-';
            }
        end = minusOrPlus + 1;
        if (*end != '\0')
            sec->chromEnd = sqlUnsigned(end);
        else
            sec->chromEnd = chromSize;
        }
    else
        {
        /* check for '+' : "start+size" form */
        minusOrPlus = strchr(start, '+');
        if (minusOrPlus == NULL)
            errAbort("malformed range %s", sectionString);
        if (minusOrPlus == start)
            sec->chromStart = 0;
        else
            {
            *minusOrPlus = '\0';
            sec->chromStart = sqlUnsigned(start);
            *minusOrPlus = '+';
            }
        end = minusOrPlus + 1;
        /* If the unsigned sum wraps it ends up below chromStart and is
         * rejected by the start >= end check that follows. */
        sec->chromEnd = sqlUnsigned(end) + sec->chromStart;
        }

    if (sec->chromStart >= sec->chromEnd)
        errAbort("bad range specified: start >= end with %s", sectionString);
    if (sec->chromEnd > chromSize)
        errAbort("bad range specified: end > %s size (%u) for %s",
            sec->chrom, chromSize, sectionString);
    }
return sec;
}
static struct linkedFeatures *cgapSageToLinkedFeatures(struct cgapSage *tag, struct hash *libHash, struct hash *libTotHash, enum trackVisibility vis) /* Convert a single CGAP tag to a list of linkedFeatures. */ { struct linkedFeatures *libList = NULL; struct linkedFeatures *skel = skeletonLf(tag); int i; if (vis == tvDense) /* Just use the skeleton one. */ { int tagTotal = 0; int freqTotal = 0; int libsUsed = 0; for (i = 0; i < tag->numLibs; i++) { char libId[16]; char *libName; safef(libId, sizeof(libId), "%d", tag->libIds[i]); libName = hashMustFindVal(libHash, libId); if (keepThisLib(libName, libId)) { int libTotal = hashIntVal(libTotHash, libId); tagTotal += libTotal; freqTotal += tag->freqs[i]; libsUsed++; } } if (libsUsed > 0) { skel->name = cloneString("whatever"); skel->score = (float)((double)freqTotal * (1000000/tagTotal)); skel->grayIx = grayIxForCgap(skel->score); addSimpleFeature(skel); libList = skel; } } else if (vis == tvPack) { /* If it's pack mode, average tissues into one linkedFeature. 
*/ struct hash *tpmHash = combineCgapSages(tag, libHash, libTotHash); struct hashEl *tpmList = hashElListHash(tpmHash); struct hashEl *tpmEl; slSort(&tpmList, slNameCmp); for (tpmEl = tpmList; tpmEl != NULL; tpmEl = tpmEl->next) { struct linkedFeatures *tiss = CloneVar(skel); struct cgapSageTpmHashEl *tpm = (struct cgapSageTpmHashEl *)tpmEl->val; char link[256]; char *encTissName = NULL; double score = 0; int len = strlen(tpmEl->name) + 32; tiss->name = needMem(len); safef(tiss->name, len, "%s (%d)", tpmEl->name, tpm->count); encTissName = cgiEncode(tpmEl->name); safef(link, sizeof(link), "i=%s&tiss=%s", tag->name, encTissName); score = (double)tpm->freqTotal*(1000000/(double)tpm->libTotals); tiss->score = (float)score; tiss->grayIx = grayIxForCgap(score); tiss->extra = cloneString(link); freeMem(encTissName); addSimpleFeature(tiss); slAddHead(&libList, tiss); } hashElFreeList(&tpmList); freeHashAndVals(&tpmHash); } else /* full mode */ { for (i = 0; i < tag->numLibs; i++) { char libId[16]; char *libName; char link[256]; struct linkedFeatures *lf; safef(libId, sizeof(libId), "%d", tag->libIds[i]); libName = hashMustFindVal(libHash, libId); if (keepThisLib(libName, libId)) { lf = CloneVar(skel); lf->name = cloneString(libName); safef(link, sizeof(link), "i=%s&lib=%s", tag->name, libId); lf->score = (float)tag->tagTpms[i]; lf->grayIx = grayIxForCgap(tag->tagTpms[i]); lf->extra = cloneString(link); addSimpleFeature(lf); slAddHead(&libList, lf); } } } slSort(&libList, cgapLinkedFeaturesCmp); slReverse(&libList); return libList; }