void checkInputOpenFiles(struct inInfo *array, int count) /* Make sure all of the input is there and of right format before going forward. Since * this is going to take a while we want to fail fast. */ { int i; for (i=0; i<count; ++i) { struct inInfo *in = &array[i]; switch (in->type) { case itBigWig: { /* Just open and close, it will abort if any problem. */ in->bbi = bigWigFileOpen(in->fileName); break; } case itPromoterBed: case itUnstrandedBed: case itBlockedBed: { struct lineFile *lf = in->lf = lineFileOpen(in->fileName, TRUE); char *line; lineFileNeedNext(lf, &line, NULL); char *dupe = cloneString(line); char *row[256]; int wordCount = chopLine(dupe, row); struct bed *bed = NULL; switch (in->type) { case itPromoterBed: lineFileExpectAtLeast(lf, 6, wordCount); bed = bedLoadN(row, 6); char strand = bed->strand[0]; if (strand != '+' && strand != '-') errAbort("%s must be stranded, got %s in that field", lf->fileName, row[6]); break; case itUnstrandedBed: lineFileExpectAtLeast(lf, 4, wordCount); bed = bedLoadN(row, 4); break; case itBlockedBed: lineFileExpectAtLeast(lf, 4, wordCount); bed = bedLoadN(row, 12); break; default: internalErr(); break; } bedFree(&bed); freez(&dupe); lineFileReuse(lf); break; } default: internalErr(); break; } } }
void peakClusterMakerAddFromSource(struct peakClusterMaker *maker, struct peakSource *source) /* Read through data source and add items to it to rangeTrees in maker */ { struct hash *chromHash = maker->chromHash; struct lineFile *lf = lineFileOpen(source->dataSource, TRUE); struct lm *lm = chromHash->lm; /* Local memory pool - share with hash */ char *row[source->minColCount]; struct peakItem *item; char *line; while (lineFileNextReal(lf, &line)) { char *asciiLine = lmCloneString(lm, line); int wordCount = chopByWhite(line, row, source->minColCount); lineFileExpectAtLeast(lf, source->minColCount, wordCount); char *chrom = row[source->chromColIx]; struct hashEl *hel = hashLookup(chromHash, chrom); if (hel == NULL) { struct rbTree *tree = rangeTreeNewDetailed(lm, maker->stack); hel = hashAdd(chromHash, chrom, tree); } struct rbTree *tree = hel->val; lmAllocVar(lm, item); item->chrom = hel->name; item->chromStart = sqlUnsigned(row[source->startColIx]); item->chromEnd = sqlUnsigned(row[source->endColIx]); item->score = sqlDouble(row[source->scoreColIx]) * source->normFactor; if (item->score > 1000) item->score = 1000; item->source = source; item->asciiLine = asciiLine; rangeTreeAddValList(tree, item->chromStart, item->chromEnd, item); } lineFileClose(&lf); }
double calcNormScoreFactor(char *fileName, int scoreCol)
/* Figure out what to multiply things by to get a nice browser score (0-1000).
 * Reads field scoreCol (zero based) of every line in fileName and returns 1000
 * divided by a "high end" value: the mean plus the value calcStdFromSums returns,
 * limited to the largest score seen.  Aborts if the file has no data lines or if
 * the high end is not positive, since no usable factor exists then. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[scoreCol+1];
double sum = 0, sumSquares = 0;
int n = 0;
double maxVal = 0;
int fieldCount;
while ((fieldCount = lineFileChop(lf, row)) != 0)
    {
    lineFileExpectAtLeast(lf, scoreCol+1, fieldCount);
    double x = sqlDouble(row[scoreCol]);
    if (n == 0 || x > maxVal)
        maxVal = x;
    sum += x;
    sumSquares += x*x;
    n += 1;
    }
lineFileClose(&lf);

/* Without this check the mean below would be 0/0. */
if (n == 0)
    errAbort("No data in %s, can't calculate score normalization factor", fileName);

double std = calcStdFromSums(sum, sumSquares, n);
double mean = sum/n;
double highEnd = mean + std;
if (highEnd > maxVal)
    highEnd = maxVal;

/* Written as a negated test so that a not-a-number result is rejected too. */
if (!(highEnd > 0))
    errAbort("Scores in column %d of %s are too low to normalize (high end %g)",
        scoreCol+1, fileName, highEnd);
return 1000.0/highEnd;
}
boolean mgcStatusTblCopyRow(struct lineFile *inLf, FILE *outFh) /* read a copy one row of a status table tab file without * fully parsing. Expand if optional fields are missing */ { char *line; int numCols, i; char *row[MGCSTATUS_NUM_COLS]; if (!lineFileNextReal(inLf, &line)) return FALSE; numCols = chopTabs(line, row); numCols = min(numCols, MGCSTATUS_NUM_COLS); lineFileExpectAtLeast(inLf, MGCSTATUS_MIN_NUM_COLS, numCols); for (i = 0; i < numCols; i++) { if (i > 0) fputc('\t', outFh); fputs(row[i], outFh); } /* pad */ for (; i < MGCSTATUS_NUM_COLS; i++) fputc('\t', outFh); fputc('\n', outFh); return TRUE; }
void colTransform(char *column, char *input, char *addFactor, char *mulFactor, char *output)
/* colTransform - Add and/or multiply column by constant.
 *   column    - one-based index of the column to change, as a string
 *   input     - file to read
 *   addFactor - number to add, as a string
 *   mulFactor - number to multiply by, as a string
 *   output    - file to write
 * Each input line is written out tab separated with the chosen column replaced by
 * x*mul+add.  Aborts if column is less than 1 or a line is too short to have it. */
{
unsigned colNum = sqlUnsigned(column);
if (colNum < 1)
    errAbort("Column must be 1 or more, got %s", column);
int col = colNum - 1;       /* zero-based index into words */
double add = sqlDouble(addFactor);
double mul = sqlDouble(mulFactor);
struct lineFile *lf = lineFileOpen(input, TRUE);
FILE *f = mustOpen(output, "w");
char *words[512];
int wordCount;
while ((wordCount = lineFileChop(lf, words)) > 0)
    {
    /* Index col is only valid when the line has at least col+1 words. */
    lineFileExpectAtLeast(lf, col+1, wordCount);
    double x = lineFileNeedDouble(lf, words, col);
    int i;
    for (i=0; i<wordCount; ++i)
        {
        if (i != 0)
            fputc('\t', f);
        if (i == col)
            fprintf(f, "%g", x*mul+add);
        else
            fputs(words[i], f);
        }
    fputc('\n', f);
    }
carefulClose(&f);
lineFileClose(&lf);
}
static void agpToFa(char *agpFile, char *agpSeq, char *faOut, char *seqDir) /* agpToFa - Convert a .agp file to a .fa file. */ { struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *line, *words[16]; int lineSize, wordCount; int lastPos = 0; struct agpFrag *agpList = NULL, *agp; FILE *f = mustOpen(faOut, "w"); char *prevChrom = NULL; verbose(2,"#\tprocessing AGP file: %s\n", agpFile); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == 0 || line[0] == '#' || line[0] == '\n') continue; wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Bad line %d of %s: need at least 5 words, got %d\n", lf->lineIx, lf->fileName, wordCount); if (! (sameWord("all", agpSeq) || sameWord(words[0], agpSeq))) continue; if (prevChrom != NULL && !sameString(prevChrom, words[0])) { agpToFaOne(&agpList, agpFile, prevChrom, seqDir, lastPos, f); lastPos = 0; } if (words[4][0] != 'N' && words[4][0] != 'U') { lineFileExpectAtLeast(lf, 9, wordCount); agp = agpFragLoad(words); /* file is 1-based but agpFragLoad() now assumes 0-based: */ agp->chromStart -= 1; agp->fragStart -= 1; if (agp->chromStart != lastPos) errAbort("Start doesn't match previous end line %d of %s\n", lf->lineIx, lf->fileName); if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agp->chrom, agp->frag, lf->lineIx, lf->fileName); slAddHead(&agpList, agp); lastPos = agp->chromEnd; } else { lastPos = lineFileNeedNum(lf, words, 2); } if (prevChrom == NULL || !sameString(prevChrom, words[0])) { freeMem(prevChrom); prevChrom = cloneString(words[0]); } } agpToFaOne(&agpList, agpFile, prevChrom, seqDir, lastPos, f); }
static struct genePred *fileNext(struct genePredReader* gpr)
/* Read the next record from gpr->lf.  When gpr->chrom is set, rows whose second
 * field differs from it are skipped.  Returns NULL when the file is exhausted. */
{
char *row[GENEPREDX_NUM_COLS];

for (;;)
    {
    int numFields = lineFileChopNextTab(gpr->lf, row, GENEPREDX_NUM_COLS);
    if (numFields <= 0)
        break;
    lineFileExpectAtLeast(gpr->lf, GENEPRED_NUM_COLS, numFields);

    /* Skip rows for other chromosomes when a filter is in place. */
    if (gpr->chrom != NULL && !sameString(row[1], gpr->chrom))
        continue;
    return genePredExtLoad(row, numFields);
    }
return NULL;
}
struct mgcStatusTbl *mgcStatusTblLoad(char *mgcStatusTab, unsigned opts)
/* Build a mgcStatusTbl object from the rows of a tab file.  Every real line must
 * have at least MGCSTATUS_MIN_NUM_COLS tab-separated columns. */
{
char *row[MGCSTATUS_NUM_COLS];
char *line;
struct mgcStatusTbl *table = mgcStatusTblNew(opts);
struct lineFile *lf = lineFileOpen(mgcStatusTab, TRUE);

for (;;)
    {
    int colCount;
    if (!lineFileNextReal(lf, &line))
        break;
    colCount = chopTabs(line, row);
    lineFileExpectAtLeast(lf, MGCSTATUS_MIN_NUM_COLS, colCount);
    loadRow(table, lf, row, colCount);
    }

lineFileClose(&lf);
return table;
}
struct chainNet *chainNetRead(struct lineFile *lf) /* Read next net from file. Return NULL at end of file.*/ { char *line, *words[3]; struct chainNet *net; int wordCount; if (!lineFileNextReal(lf, &line)) return NULL; if (!startsWith("net ", line)) errAbort("Expecting 'net' first word of line %d of %s", lf->lineIx, lf->fileName); AllocVar(net); wordCount = chopLine(line, words); lineFileExpectAtLeast(lf, 3, wordCount); net->name = cloneString(words[1]); net->size = lineFileNeedNum(lf, words, 2); net->nameHash = hashNew(6); net->fillList = cnFillRead(net, lf); return net; }
static void agpSangerUnfinished(char *agpFile, char *contigFasta, char *agpOut) /* Fix agp to match unfinished contigs in fasta */ { struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *line, *words[16]; int lineSize, wordCount; unsigned lastPos = 0; struct agpFrag *agp; struct agpGap *gap; FILE *f; char *lastObj = NULL; f = mustOpen(agpOut, "w"); char *newChrom = NULL; struct hash *hash = hashFasta(contigFasta); verbose(2,"#\tprocessing AGP file: %s\n", agpFile); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == 0 || line[0] == '#' || line[0] == '\n') continue; //verbose(2,"#\tline: %d\n", lf->lineIx); wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Bad line %d of %s: need at least 5 words, got %d\n", lf->lineIx, lf->fileName, wordCount); if (!lastObj || !sameString(words[0],lastObj)) { freez(&newChrom); newChrom = cloneString(words[0]); lastPos = 0; } if (words[4][0] != 'N') { lineFileExpectAtLeast(lf, 9, wordCount); agp = agpFragLoad(words); /* agp is 1-based but agp loaders do not adjust for 0-based: */ agp->chromStart -= 1; agp->fragStart -= 1; if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agp->chrom, agp->frag, lf->lineIx, lf->fileName); char *root = cloneString(agp->frag); chopSuffixAt(root, '.'); struct hashEl *e, *elist = hashLookup(hash, root); for (e = elist; e; e = hashLookupNext(e)) { struct unfinishedContig *u = e->val; if ((u->fragStart <= agp->fragStart) && (u->fragEnd >= agp->fragEnd)) { agp->frag = cloneString(u->frag); agp->fragEnd -= u->fragStart; agp->fragStart -= u->fragStart; } } freeMem(root); } else { lineFileExpectAtLeast(lf, 8, wordCount); gap = agpGapLoad(words); /* to be consistent with agpFrag */ gap->chromStart -= 1; agp = (struct agpFrag*)gap; } if (agp->chromStart != lastPos) errAbort("Start doesn't match previous end line %d of %s\n" "agp->chromStart: %u\n" "agp->chromEnd: %u\n" "lastPos: %u\n" ,lf->lineIx, lf->fileName 
,agp->chromStart ,agp->chromEnd ,lastPos ); lastPos = agp->chromEnd; freez(&lastObj); lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */ if (words[4][0] != 'N') { /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */ agpFragOutput(agp, f, '\t', '\n'); agpFragFree(&agp); } else { /* restore back to 1-based for agp * because agpGapOutput doesn't compensate */ gap->chromStart += 1; agpGapOutput(gap, f, '\t', '\n'); agpGapFree(&gap); } } carefulClose(&f); }
void motifFinder(char *database, char *name, int fileCount, char *files[]) /* motifFinder - find largest scoring motif in bed items. */ { struct sqlConnection *conn = sqlConnect(database); int fileNum; char where[256]; struct chromInfo *ci = createChromInfoList(NULL, database); sqlSafefFrag(where, sizeof(where), "name = '%s'", name); struct dnaMotif *motif = dnaMotifLoadWhere(conn, motifTable, where); if(markovTable != NULL) dnaMotifMakeLog2(motif); if(motif == NULL) errAbort("couldn't find motif '%s'", name); for (fileNum = 0; fileNum < fileCount; fileNum++) { char *words[64], *line; char **row; struct lineFile *lf = lineFileOpen(files[fileNum], TRUE); while (lineFileNextReal(lf, &line)) { int dnaLength, i, j, rowOffset, length, wordCount = chopTabs(line, words); unsigned chromSize; boolean markovFound = FALSE; double mark0[5]; double mark2[5][5][5]; struct dnaSeq *seq = NULL; char *dupe = NULL; if (0 == wordCount) continue; lineFileExpectAtLeast(lf, 3, wordCount); dupe = cloneString(line); char *chrom = words[0]; int chromStart = lineFileNeedNum(lf, words, 1); if(markovTable != NULL) chromStart = max(2, chromStart); unsigned chromEnd = lineFileNeedNum(lf, words, 2); if (chromEnd < 1) errAbort("ERROR: line %d:'%s'\nchromEnd is less than 1\n", lf->lineIx, dupe); if (chromStart > chromEnd) errAbort("ERROR: line %d:'%s'\nchromStart after chromEnd (%d > %d)\n", lf->lineIx, dupe, chromStart, chromEnd); length = chromEnd - chromStart; chromSize = getChromSize(ci, chrom); if(markovTable == NULL) { dnaLength = length; seq = hDnaFromSeq(database, chrom, chromStart, chromEnd, dnaUpper); if(uniformBackground) { int i; mark0[0] = 1; for(i = 1; i <= 4; i++) mark0[i] = 0.25; } else { dnaMark0(seq, mark0, NULL); } } else { dnaLength = length + 4; if(chromStart - 2 + dnaLength > chromSize) // can't do analysis for potential peak hanging off the end of the chrom continue; seq = hDnaFromSeq(database, chrom, chromStart - 2, chromEnd + 2, dnaUpper); struct sqlResult *sr = 
hRangeQuery(conn, markovTable, chrom, chromStart, chromStart + 1, NULL, &rowOffset); if((row = sqlNextRow(sr)) != NULL) { dnaMark2Deserialize(row[rowOffset + 3], mark2); dnaMarkMakeLog2(mark2); markovFound = TRUE; } else errAbort("markov table '%s' is missing; non-markov analysis is current not supported", markovTable); sqlFreeResult(&sr); } struct bed6FloatScore *hits = NULL; for (i = 0; i < 2; i++) { double mark0Copy[5]; char strand = i == 0 ? '+' : '-'; for (j = 0; j <= 4; j++) mark0Copy[j] = mark0[j]; if(strand == '-') { // reverse markov table too! double tmp; reverseComplement(seq->dna, dnaLength); tmp = mark0Copy[1]; mark0Copy[1] = mark0Copy[3]; mark0Copy[3] = tmp; tmp = mark0Copy[2]; mark0Copy[2] = mark0Copy[4]; mark0Copy[4] = tmp; } for (j = 0; j < length - motif->columnCount + 1; j++) // tricky b/c if(markovFound) then seq includes the two bytes on either side of actual sequence. { double score; if(markovFound) score = dnaMotifBitScoreWithMarkovBg(motif, seq->dna + j, mark2); else score = dnaMotifBitScoreWithMark0Bg(motif, seq->dna + j, mark0Copy); if(score >= minScoreCutoff) { int start; if(strand == '-') start = (chromEnd - j) - motif->columnCount; else start = chromStart + j; struct bed6FloatScore *hit = NULL; // Watch out for overlapping hits (on either strand; yes, I've seen that happen); // we report only the highest scoring hit in this case. // O(n^2) where n == number of motifs in a peak, but I expect n to be almost always very small. if(!originalCoordinates) { for (hit = hits; hit != NULL; hit = hit->next) { if(hit->chromEnd > start && hit->chromStart <= (start + motif->columnCount)) { verbose(3, "found overlapping hits: %d-%d overlaps with %d-%d\n", start, start + motif->columnCount, hit->chromStart, hit->chromEnd); break; } } } if(hit == NULL || hit->score < score) { if(hit == NULL) { AllocVar(hit); slAddHead(&hits, hit); hit->chrom = cloneString(chrom); } hit->chromStart = originalCoordinates ? 
chromStart : start; hit->chromEnd = originalCoordinates ? chromEnd : start + motif->columnCount; hit->score = score; hit->strand[0] = strand; } } verbose(3, "j: %d; score: %.2f\n", j, score); } } slSort(&hits, bed6FloatCmpDesc); int count; float currentPrior = prior; for(count = 1; hits != NULL; count++, hits = hits->next) { if(topOnly && count > topOnly) break; // Use a progressively weaker prior for hits with lower scores verbose(3, "count: %d; score: %.2f; prior: %.2f; log2(prior / (1 - prior)): %.2f\n", count, hits->score, currentPrior, log2(currentPrior / (1 - currentPrior))); if(hits->score >= minScoreCutoff - log2(currentPrior / (1 - currentPrior))) { printf("%s\t%d\t%d\t%s\t%.2f\t%c\n", chrom, originalCoordinates ? chromStart : hits->chromStart, originalCoordinates ? chromEnd : hits->chromStart + motif->columnCount, name, hits->score, hits->strand[0]); currentPrior = count == 1 ? priorBackoff : currentPrior * priorBackoff; if(count > 2) verbose(3, "hit for count: %d at %s:%d-%d\n", count, chrom, hits->chromStart, hits->chromStart + motif->columnCount); } else break; } freeDnaSeq(&seq); freeMem(dupe); } lineFileClose(&lf); } sqlDisconnect(&conn); }
struct cnFill *cnFillFromLine(struct hash *nameHash, struct lineFile *lf, char *line) /* Create cnFill structure from line. This will chop up * line as a side effect. */ { static char *words[64]; int i, wordCount; enum {basicFields = 7}; struct cnFill *fill; wordCount = chopLine(line, words); lineFileExpectAtLeast(lf, basicFields, wordCount); fill = cnFillNew(); fill->tStart = lineFileNeedNum(lf, words, 1); fill->tSize = lineFileNeedNum(lf, words, 2); fill->qName = hashStoreName(nameHash, words[3]); fill->qStrand = words[4][0]; fill->qStart = lineFileNeedNum(lf, words, 5); fill->qSize = lineFileNeedNum(lf, words, 6); for (i=basicFields; i<wordCount; i += 2) { char *name = words[i]; if (sameString(name, "score")) fill->score = atof(words[i+1]); else if (sameString(name, "type")) fill->type = hashStoreName(nameHash, words[i+1]); else { /* Cope with integer values. */ int iVal = lineFileNeedNum(lf, words, i+1); if (sameString(name, "id")) fill->chainId = iVal; else if (sameString(name, "ali")) fill->ali = iVal; else if (sameString(name, "tN")) fill->tN = iVal; else if (sameString(name, "qN")) fill->qN = iVal; else if (sameString(name, "tR")) fill->tR = iVal; else if (sameString(name, "qR")) fill->qR = iVal; else if (sameString(name, "tNewR")) fill->tNewR = iVal; else if (sameString(name, "qNewR")) fill->qNewR = iVal; else if (sameString(name, "tOldR")) fill->tOldR = iVal; else if (sameString(name, "qOldR")) fill->qOldR = iVal; else if (sameString(name, "tTrf")) fill->tTrf = iVal; else if (sameString(name, "qTrf")) fill->qTrf = iVal; else if (sameString(name, "qOver")) fill->qOver = iVal; else if (sameString(name, "qFar")) fill->qFar = iVal; else if (sameString(name, "qDup")) fill->qDup = iVal; } } return fill; }
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash,
        struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Make one pass through a bed file, building a list with one bbiChromUsage per
 * chromosome in the order seen, and gathering statistics: the smallest distance
 * between consecutive starts on a chromosome, the average item size and the item
 * count.  When eim is non-NULL the maximum field sizes are tracked there as well.
 * Aborts on unsorted input, unknown chromosomes or coordinates out of range. */
{
int fieldsNeeded = 3;
if (eim != NULL)
    fieldsNeeded = bbExIndexMakerMaxIndexField(eim) + 1;
char *row[fieldsNeeded];
struct hash *seenChroms = hashNew(0);
struct bbiChromUsage *cur = NULL, *usageList = NULL;
int prevStart = -1;         /* -1 means no earlier item on this chromosome */
bits32 nextId = 0;
bits64 baseTotal = 0, itemTotal = 0;
int smallestDiff = BIGNUM;
int rowSize;

lineFileRemoveInitialCustomTrackLines(lf);

while ((rowSize = lineFileChopNext(lf, row, fieldsNeeded)) != 0)
    {
    lineFileExpectAtLeast(lf, fieldsNeeded, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (eim != NULL)
        bbExIndexMakerUpdateMaxFieldSize(eim, row);
    if (start > end)
        errAbort("end (%d) before start (%d) line %d of %s",
            end, start, lf->lineIx, lf->fileName);
    itemTotal += 1;
    baseTotal += (end - start);

    /* Start a new usage record whenever the chromosome changes. */
    if (cur == NULL || differentString(cur->name, chrom))
        {
        /* Coming back to a chromosome already finished means unsorted input. */
        if (hashLookup(seenChroms, chrom))
            errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                lf->fileName, lf->lineIx);
        hashAdd(seenChroms, chrom, NULL);

        struct hashEl *sizeEl = hashLookup(chromSizesHash, chrom);
        if (sizeEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);

        AllocVar(cur);
        cur->name = cloneString(chrom);
        cur->id = nextId++;
        cur->size = ptToInt(sizeEl->val);
        slAddHead(&usageList, cur);
        prevStart = -1;
        }

    if (end > cur->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
            end, cur->name, cur->size, lf->lineIx, lf->fileName);
    cur->itemCount += 1;

    if (prevStart >= 0)
        {
        int diff = start - prevStart;
        /* smallestDiff is never negative, so a negative diff always lands here. */
        if (diff < smallestDiff)
            {
            if (diff < 0)
                errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                    lf->fileName, lf->lineIx);
            smallestDiff = diff;
            }
        }
    prevStart = start;
    }

/* Records were added at the head; put them back in file order. */
slReverse(&usageList);

*retMinDiff = smallestDiff;
*retAveSize = (itemTotal > 0) ? (double)baseTotal/itemTotal : 0;
*retBedCount = itemTotal;
freeHash(&seenChroms);
return usageList;
}
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions, char *geneLevels, char *pairsIn, char *outDir) /* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell * lines. */ { /* Load up cell descriptions into cell array */ struct expRecord *cell, *cellList = expRecordLoadAll(cellDescriptions); int cellCount = slCount(cellList); struct expRecord **cellArray; AllocArray(cellArray, cellCount); int i; for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next) cellArray[i] = cell; verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions); /* Load up enhBed into a hash keyed by name */ struct bed *enh, *enhList; int fieldCount; bedLoadAllReturnFieldCount(enhBed, &enhList, &fieldCount); if (fieldCount != 15) errAbort("Expecting bed 15 format in %s", enhBed); struct hash *enhHash = hashNew(16); for (enh = enhList; enh != NULL; enh = enh->next) { if (enh->expCount != cellCount) errAbort("Inconsistent input: %d cells in %s, but %d in %s\n", cellCount, cellDescriptions, enh->expCount, enhBed); hashAddUnique(enhHash, enh->name, enh); } verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed); /* Get a hash with key of gene name and value an array of expression values. */ struct hash *geneHash = hashGeneLevels(geneLevels, cellCount); verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels); /* Open inPairs.bed, just to make sure it's there before we do any output. */ struct lineFile *lf = lineFileOpen(pairsIn, TRUE); /* Remove trailing slash from output dir if any */ if (lastChar(outDir) == '/') { int len = strlen(outDir); outDir[len-1] = 0; } /* Make output directory and open all output files. 
*/ makeDirsOnPath(outDir); FILE *outFiles[cellCount]; for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next) { char path[PATH_LEN]; safef(path, sizeof(path), "%s/%s.bed", outDir, cell->description); outFiles[i] = mustOpen(path, "w"); } /* Stream through input file and copy to appropriate outputs. */ char *words[bedKnownFields*2]; // Make a little bigger than any known bed int wordCount, wordsRequired = 0; char *separator = "->"; int separatorSize = strlen(separator); int pairCount = 0; while ((wordCount = lineFileChop(lf, words)) != 0) { /* Make sure all lines have same # of fields, and at least 4. */ if (wordsRequired == 0) { wordsRequired = wordCount; lineFileExpectAtLeast(lf, 4, wordCount); } else lineFileExpectWords(lf, wordsRequired, wordCount); ++pairCount; /* Parse out name field. */ char *name = words[3]; char *sepPos = stringIn(separator, name); if (sepPos == NULL) errAbort("Expecting %s in %s line %d of %s", separator, name, lf->lineIx, lf->fileName); char *enhName = cloneStringZ(name, sepPos-name); char *geneName = sepPos + separatorSize; /* Look up enhancer and gene. */ enh = hashMustFindVal(enhHash, enhName); double *geneLevels = hashMustFindVal(geneHash, geneName); freez(&enhName); /* Output ones over minimum levels. */ for (i=0; i < cellCount; ++i) { double enhLevel = enh->expScores[i]; double geneLevel = geneLevels[i]; if (enhLevel >= minAct && geneLevel >= minExp) { int j; FILE *f = outFiles[i]; fprintf(f, "%s", words[0]); for (j=1; j<wordCount; ++j) fprintf(f, "\t%s", words[j]); fprintf(f, "\n"); } } } verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn); /* Clean up. */ lineFileClose(&lf); for (i=0; i<cellCount; ++i) carefulClose(&outFiles[i]); }
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table, struct hash *geneToModuleHash, struct hash *moduleAndMotifHash, struct hash *motifHash, struct hash *positionsHash, char *regionTable) /* Load file which is a big matrix with genes for rows and motifs for * columns. There is a semicolon-separated list of numbers in the matrix * where a gene has the motif, and an empty (tab separated) field * where there is no motif. The numbers are relative to the * region associated with the gene in the positionsHash. * Only load bits of this where motif actually occurs in module associated * with gene. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; FILE *f = hgCreateTabFile(tmpDir, table); char *motifNames[32*1024], *row[32*1024]; int motifCount, rowSize, i; char *gene, *module; int geneCount = 0, total = 0; struct dyString *dy = dyStringNew(512); struct genomePos *motifPosList = NULL, *motifPosForGene; struct genomePos *regionPosList = NULL, *regionPos; /* Read first line, which is labels. */ if (!lineFileNextReal(lf, &line)) errAbort("Empty file %s", fileName); subChar(line, ' ', '_'); motifCount = chopLine(line, motifNames); if (motifCount >= ArraySize(motifNames)) errAbort("Too many motifs line 1 of %s", fileName); lineFileExpectAtLeast(lf, 2, motifCount); motifNames[0] = NULL; for (i=1; i<motifCount; ++i) { char name[64]; motifNames[i] = cloneString(fixMotifName(motifNames[i],name,sizeof(name))); if (!hashLookup(motifHash, motifNames[i])) errAbort("Motif %s is in %s but not modules_motifs.gxm", motifNames[i], fileName); } /* Read subsequent lines. 
*/ while ((rowSize = lineFileChopTab(lf, row)) != 0) { lineFileExpectWords(lf, motifCount, rowSize); gene = row[0]; module = hashFindVal(geneToModuleHash, gene); if (module == NULL) { warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", gene, lf->lineIx, lf->fileName); continue; } regionPos = NULL; for (i=1; i<rowSize; ++i) { if (row[i][0] != 0) { if (hashLookup2(moduleAndMotifHash, module, motifNames[i])) { regionPos = hashFindVal(positionsHash, gene); if (regionPos == NULL) { warn("WARNING: %s in %s but not gene_positions.tab", gene, fileName); i = rowSize; continue; } motifPosForGene = convertMotifPos(row[i], regionPos, hashMustFindVal(motifHash, motifNames[i]), lf); motifPosList = slCat(motifPosForGene, motifPosList); ++total; } } } if (regionPos != NULL) { slAddHead(®ionPosList, regionPos); } ++geneCount; } lineFileClose(&lf); /* Output sorted table of all motif hits. */ { struct genomePos *pos; slSort(&motifPosList, genomePosCmp); for (pos = motifPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->motif); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\t", pos->strand); fprintf(f, "%s\n", pos->name); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " gene varchar(255) not null,\n" " #Indices\n" " INDEX(gene(12)),\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d genes, %d motifs, %d motifs in genes\n", geneCount, motifCount-1, total); hgLoadTabFile(conn, tmpDir, table, &f); // hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); 
slFreeList(&motifPosList); } /* Now output sorted table of upstream regions. */ { FILE *f = hgCreateTabFile(tmpDir, regionTable); struct genomePos *pos; dyStringClear(dy); dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " #Indices\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", regionTable); sqlRemakeTable(conn, regionTable, dy->string); slSort(®ionPosList, genomePosCmp); for (pos = regionPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->name); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\n", pos->strand); } hgLoadTabFile(conn, tmpDir, regionTable, &f); // hgRemoveTabFile(tmpDir, regionTable); } }
struct hash *agpLoadAll(char *agpFile) /* load AGP entries into a hash of AGP lists, one per chromosome */ { struct hash *agpHash = newHash(0); struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *words[9]; int lastPos = 0; int wordCount; struct agpFrag *agpFrag; struct agpGap *agpGap; char *chrom; struct agp *agp; struct hashEl *hel; while ((wordCount = lineFileChopNext(lf, words, ArraySize(words))) != 0) { lineFileExpectAtLeast(lf, 8, wordCount); chrom = words[0]; if (!hashFindVal(agpHash, chrom)) lastPos = 1; AllocVar(agp); if (words[4][0] != 'N' && words[4][0] != 'U') { /* not a gap */ lineFileExpectWords(lf, 9, wordCount); agpFrag = agpFragLoad(words); if (agpFrag->chromStart != lastPos) errAbort( "Frag start (%d, %d) doesn't match previous end line %d of %s\n", agpFrag->chromStart, lastPos, lf->lineIx, lf->fileName); if (agpFrag->chromEnd - agpFrag->chromStart != agpFrag->fragEnd - agpFrag->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agpFrag->chrom, agpFrag->frag, lf->lineIx, lf->fileName); lastPos = agpFrag->chromEnd + 1; agp->entry = agpFrag; agp->isFrag = TRUE; } else { /* gap */ lineFileExpectWords(lf, 8, wordCount); agpGap = agpGapLoad(words); if (agpGap->chromStart != lastPos) errAbort("Gap start (%d, %d) doesn't match previous end line %d of %s\n", agpGap->chromStart, lastPos, lf->lineIx, lf->fileName); lastPos = agpGap->chromEnd + 1; agp->entry = agpGap; agp->isFrag = FALSE; } if ((hel = hashLookup(agpHash, chrom)) == NULL) hashAdd(agpHash, chrom, agp); else slAddHead(&(hel->val), agp); } #ifndef DEBUG { struct hashCookie cookie; struct hashEl *hel; cookie = hashFirst(agpHash); while ((hel = hashNext(&cookie)) != NULL) { struct agp *agpList; agpList = (struct agp *)hel->val; /* for (agp = agpList; agp != NULL; agp = agp->next) printf("isFrag: %d\n", agp->isFrag); */ } } #endif /* reverse AGP lists */ //hashTraverseVals(agpHash, slReverse); #ifndef DEBUG { struct hashCookie cookie; struct hashEl *hel; cookie = 
hashFirst(agpHash); while ((hel = hashNext(&cookie)) != NULL) { struct agp *agpList; slReverse(&hel->val); agpList = hel->val; /* agpList = (struct agp *)hel->val; slReverse(&agpList); hashRemove(agpHash, hel->name); hashAdd(agpHash, hel->name, agpList); */ /* for (agp = agpList; agp != NULL; agp = agp->next) printf("isFrag: %d\n", agp->isFrag); */ } } #endif return agpHash; }
static void agpMergeChromScaf(char *agpFile, char *agpOut, boolean filtering)
/* Create a combined agp file from the chrom.agp and scaffold.agp,
 * merging in only scaffolds from scaffold.agp
 * that are not already in chroms.
 *   agpFile   - AGP file to read
 *   agpOut    - AGP file to write; appended to when filtering, otherwise created
 *   filtering - FALSE: copy every line and remember each fragment name (with
 *               everything from the first '.' chopped off) in a hash that lasts
 *               between calls.  TRUE: copy only lines whose object name is not
 *               in that hash.
 * NOTE(review): only type 'N' is treated as a gap here; 'U' lines go down the
 * fragment path - confirm whether that is intended. */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp;
struct agpGap *gap = NULL;
FILE *f;
char *lastObj = NULL;
f = mustOpen(agpOut, filtering ? "a" : "w");
static struct hash *hash = NULL;    /* kept from one call to the next */
boolean skipping = FALSE;

if (!hash)
    hash = hashNew(0);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
            lf->lineIx, lf->fileName, wordCount);

    /* Positions start over with each new object. */
    if (!lastObj || !sameString(words[0],lastObj))
        lastPos = 0;

    skipping = FALSE;
    if (filtering)
        {
        if (hashLookup(hash, words[0]))
            skipping = TRUE;
        }

    if (words[4][0] != 'N')
        {
        lineFileExpectAtLeast(lf, 9, wordCount);
        agp = agpFragLoad(words);
        /* agp is 1-based but agp loaders do not adjust for 0-based: */
        agp->chromStart -= 1;
        agp->fragStart  -= 1;
        if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n",
                agp->chrom, agp->frag, lf->lineIx, lf->fileName);
        if (!filtering)
            {
            char *root = cloneString(agp->frag);
            chopSuffixAt(root, '.');
            hashStore(hash, root);
            freeMem(root);
            }
        }
    else
        {
        lineFileExpectAtLeast(lf, 8, wordCount);
        gap = agpGapLoad(words);
        /* to be consistent with agpFrag */
        gap->chromStart -= 1;
        /* Only chromStart and chromEnd are read through this alias below. */
        agp = (struct agpFrag*)gap;
        }

    if (agp->chromStart != lastPos)
        errAbort("Start doesn't match previous end line %d of %s\n"
            "agp->chromStart: %u\n"
            "agp->chromEnd: %u\n"
            "lastPos: %u\n"
            ,lf->lineIx, lf->fileName
            ,agp->chromStart
            ,agp->chromEnd
            ,lastPos
            );
    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */

    if (words[4][0] != 'N')
        {
        /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
        if (!skipping)
            agpFragOutput(agp, f, '\t', '\n');
        agpFragFree(&agp);
        }
    else
        {
        /* restore back to 1-based for agp
         * because agpGapOutput doesn't compensate */
        gap->chromStart += 1;
        if (!skipping)
            agpGapOutput(gap, f, '\t', '\n');
        agpGapFree(&gap);
        }
    }

/* Release what this function opened or allocated; the hash is kept on purpose. */
freez(&lastObj);
carefulClose(&f);
lineFileClose(&lf);
}