boolean lineFileNextRowTab(struct lineFile *lf, char *words[], int wordCount)
/* Return next non-blank line that doesn't start with '#' chopped into words
 * at tabs. Returns FALSE at EOF. Aborts on error. */
{
int gotWords = lineFileChopNextTab(lf, words, wordCount);
if (gotWords != 0)
    {
    /* A short row is an error; lineFileExpectWords aborts with file/line info. */
    if (gotWords < wordCount)
        lineFileExpectWords(lf, wordCount, gotWords);
    return TRUE;
    }
return FALSE;
}
boolean lineFileNextCharRow(struct lineFile *lf, char sep, char *words[], int wordCount)
/* Return next non-blank line that doesn't start with '#' chopped into words
 * delimited by sep. Returns FALSE at EOF. Aborts on error. */
{
int gotWords = lineFileChopCharNext(lf, sep, words, wordCount);
if (gotWords != 0)
    {
    /* A short row is an error; lineFileExpectWords aborts with file/line info. */
    if (gotWords < wordCount)
        lineFileExpectWords(lf, wordCount, gotWords);
    return TRUE;
    }
return FALSE;
}
struct vcfRecord *vcfNextRecord(struct vcfFile *vcff)
/* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file.
 * Note: this does not store record in vcff->records! */
{
char *row[VCF_MAX_COLUMNS];
int gotCols = lineFileChop(vcff->lf, row);
if (gotCols <= 0)
    return NULL;
/* 8 fixed VCF columns; with genotypes there is also FORMAT plus one per sample. */
int needCols = (vcff->genotypeCount > 0) ? 9 + vcff->genotypeCount : 8;
lineFileExpectWords(vcff->lf, needCols, gotCols);
return vcfRecordFromRow(vcff, row);
}
void parseA(struct lineFile *lf, struct block **retBlockList, int *retScore)
/* Parse an alignment stanza into a block list.
 * Reads lines up to the closing '}' (or EOF).  'l' lines become blocks,
 * an 's' line supplies the score; aborts if no score line was seen.
 * Returns the block list (after removeFrayedEnds) via retBlockList and
 * the score via retScore. */
{
struct block *block, *blockList = NULL;
char *line, *words[6], typeChar;
int wordCount;
int score = -666;	/* sentinel; replaced when an 's' line is seen */
boolean gotScore = FALSE;

while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] == '#')		/* comment line */
        continue;
    if (line[0] == '}')		/* end of stanza */
        break;
    wordCount = chopLine(line, words);
    if (wordCount == 0)		/* blank line */
        continue;
    typeChar = words[0][0];
    if (typeChar == 'l')
        {
        /* 'l' line: one gapless block laid out as
         * l tStart qStart tEnd qEnd percentId (starts are 1-based). */
        lineFileExpectWords(lf, 6, wordCount);
        AllocVar(block);
        block->tStart = lineFileNeedNum(lf, words, 1) - 1;	/* 1-based to 0-based */
        block->tEnd = lineFileNeedNum(lf, words, 3);
        block->qStart = lineFileNeedNum(lf, words, 2) - 1;	/* 1-based to 0-based */
        block->qEnd = lineFileNeedNum(lf, words, 4);
        /* A gapless block must cover equal spans on both sequences. */
        if (block->qEnd - block->qStart != block->tEnd - block->tStart)
            errAbort("Block size mismatch line %d of %s", lf->lineIx, lf->fileName);
        block->percentId = lineFileNeedNum(lf, words, 5);
        slAddHead(&blockList, block);
        }
    else if (typeChar == 's')
        {
        /* 's' line: overall stanza score. */
        gotScore = TRUE;
        score = lineFileNeedNum(lf, words, 1);
        }
    }
if (!gotScore)
    {
    errAbort("'a' stanza missing score line %d of %s", lf->lineIx, lf->fileName);
    }
slReverse(&blockList);	/* slAddHead built the list backwards */
blockList = removeFrayedEnds(blockList);
*retBlockList = blockList;
*retScore = score;
}
void dnaseHg38AddTreatments(char *inTab, char *outTab)
/* dnaseHg38AddTreatments - Add treatments to dnase hg38 metadata.
 * Copies inTab to outTab, appending a treatment column and a display label
 * column looked up from the hgFixed.encodeExp table. */
{
struct sqlConnection *conn = sqlConnect("hgFixed");
struct lineFile *lf = lineFileOpen(inTab, TRUE);
FILE *f = mustOpen(outTab, "w");
char *line;
while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] == '#')
        /* Header line: pass through with the two new column labels appended. */
        fprintf(f, "%s\ttreatment\tlabel\n", line);
    else
        {
        char *inRow[5];	/* one extra slot so a 5th column trips the check below */
        int wordCount = chopByWhite(line, inRow, ArraySize(inRow));
        lineFileExpectWords(lf, 4, wordCount);
        char *acc = inRow[0];		/* ENCODE experiment accession */
        char *biosample = inRow[1];
        char query[512];
        sqlSafef(query, sizeof(query), "select expVars from encodeExp where accession = '%s'", acc);
        char varBuf[1024];
        char *treatment = "n/a";	/* defaults when no expVars found */
        char *label = biosample;
        char labelBuf[256];
        char *vars = sqlQuickQuery(conn, query, varBuf, sizeof(varBuf));
        if (!isEmpty(vars))
            {
            /* NOTE(review): assumes a non-empty expVars string always begins with
             * "treatment=" -- no prefix check is done; TODO confirm against the
             * encodeExp table contents. */
            treatment = vars + strlen("treatment=");
            if (sameString(treatment, "4OHTAM_20nM_72hr"))
                /* NOTE(review): label says "40HTAM" (digit zero) while the treatment
                 * is "4OHTAM" (letter O) -- possibly a typo; confirm before changing
                 * since the label may be established in downstream display. */
                safef(labelBuf, sizeof(labelBuf), "%s 40HTAM", biosample);
            else if (sameString(treatment, "diffProtA_14d"))
                safef(labelBuf, sizeof(labelBuf), "%s diff 14d", biosample);
            else if (sameString(treatment, "diffProtA_5d"))
                safef(labelBuf, sizeof(labelBuf), "%s diff 5d", biosample);
            else if (sameString(treatment, "DIFF_4d"))
                safef(labelBuf, sizeof(labelBuf), "%s diff 4d", biosample);
            else if (sameString(treatment, "Estradiol_100nM_1hr"))
                safef(labelBuf, sizeof(labelBuf), "%s estradi 1h", biosample);
            else if (sameString(treatment, "Estradiol_ctrl_0hr"))
                safef(labelBuf, sizeof(labelBuf), "%s estradi 0h", biosample);
            else
                errAbort("Unknown treatment %s", treatment);
            label = labelBuf;
            }
        fprintf(f, "%s\t%s\t%s\t%s\t%s\t%s\n",
            inRow[0], inRow[1], inRow[2], inRow[3], treatment, label);
        }
    }
carefulClose(&f);
}
struct hash *hashTwoColumnFile(char *fileName)
/* Given a two column file (key, value) return a hash. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *hash = hashNew(16);
char *row[3];	/* one extra slot so a third column triggers the abort below */
int got;
while ((got = lineFileChop(lf, row)) != 0)
    {
    lineFileExpectWords(lf, 2, got);
    /* Value is cloned into the hash's own local memory pool. */
    hashAdd(hash, row[0], lmCloneString(hash->lm, row[1]));
    }
lineFileClose(&lf);
return hash;
}
struct rgi *readRgi(char *inName) { struct rgi *rgiList = NULL, *rgi; struct lineFile *lf = lineFileOpen(inName, TRUE); int wordCount; char *words[8]; while ((wordCount = lineFileChop(lf, words)) != 0) { lineFileExpectWords(lf, 4, wordCount); rgi = rgiLoad(words); slAddHead(&rgiList, rgi); uglyf("%s %s: min %d, max %d\n", rgi->a, rgi->b, rgi->minDistance, rgi->maxDistance); } lineFileClose(&lf); slReverse(&rgiList); return rgiList; }
static void vcfParseData(struct vcfFile *vcff, int maxRecords)
/* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
 * at the beginning of a data row, parse and store all data rows from lineFile.
 * Stops after maxRecords rows when maxRecords >= 0.  Closes vcff->lf when done. */
{
if (vcff == NULL)
    return;
int recCount = 0, expected = 8;	/* the 8 fixed VCF columns */
if (vcff->genotypeCount > 0)
    expected = 9 + vcff->genotypeCount;	/* plus FORMAT and one column per sample */
char *words[VCF_MAX_COLUMNS];
int wordCount;
while ((wordCount = lineFileChop(vcff->lf, words)) > 0)
    {
    if (maxRecords >= 0 && recCount >= maxRecords)
        break;
    lineFileExpectWords(vcff->lf, expected, wordCount);
    struct vcfRecord *record;
    AllocVar(record);
    record->file = vcff;
    record->chrom = vcfFilePooledStr(vcff, words[0]);
    record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;	/* VCF POS is 1-based */
    // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn.
    record->chromEnd = record->chromStart+1;
    record->name = vcfFilePooledStr(vcff, words[2]);
    parseRefAndAlt(vcff, record, words[3], words[4]);
    record->qual = vcfFilePooledStr(vcff, words[5]);
    parseFilterColumn(vcff, record, words[6]);
    parseInfoColumn(vcff, record, words[7]);
    if (vcff->genotypeCount > 0)
        {
        record->format = vcfFilePooledStr(vcff, words[8]);
        record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
                vcff->genotypeCount * sizeof(char *));
        int i;
        // Don't bother actually parsing all these until & unless we need the info:
        for (i = 0; i < vcff->genotypeCount; i++)
            record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
        }
    slAddHead(&(vcff->records), record);
    recCount++;
    }
slReverse(&(vcff->records));	/* slAddHead built the list backwards */
lineFileClose(&(vcff->lf));	/* all rows consumed; done with the file */
}
void loadOneBed(struct lineFile *lf, int bedSize, struct bedStub **pList) /* Load one bed file. Make sure all lines have bedSize fields. * Put results in *pList. */ { char *words[64], *line, *dupe; int wordCount; struct bedStub *bed; verbose(1, "Reading %s\n", lf->fileName); while (lineFileNextReal(lf, &line)) { if (hasBin) nextWord(&line); dupe = cloneString(line); if (strictTab) wordCount = chopTabs(line, words); else wordCount = chopLine(line, words); /* ignore empty lines */ if (0 == wordCount) continue; lineFileExpectWords(lf, bedSize, wordCount); AllocVar(bed); bed->chrom = cloneString(words[0]); bed->chromStart = lineFileNeedNum(lf, words, 1); bed->chromEnd = lineFileNeedNum(lf, words, 2); if (! noStrict) { if (bed->chromEnd < 1) errAbort("ERROR: line %d:'%s'\nchromEnd is less than 1\n", lf->lineIx, dupe); if (bed->chromStart == bed->chromEnd && !allowStartEqualEnd) errAbort("ERROR: line %d:'%s'\nchromStart == chromEnd (%d) (zero-length item)\n" "Use -allowStartEqualEnd if that is legit (e.g. for insertion point).\n", lf->lineIx, dupe, bed->chromStart); if (bed->chromStart > bed->chromEnd) errAbort("ERROR: line %d:'%s'\nchromStart after chromEnd (%d > %d)\n", lf->lineIx, dupe, bed->chromStart, bed->chromEnd); } bed->line = dupe; slAddHead(pList, bed); } }
void flagMhcClones(char *mhcFile, char *gsDir) /* flagMhcClones - Look for clones Stephan wants in MHC.. */ { struct lineFile *lf = lineFileOpen(mhcFile, TRUE); char *line, *words[16]; int lineSize, wordCount, i; char clonePath[512]; char *clone, *cloneVer; static char *phases[3] = {"fin", "draft", "predraft",}; boolean found; while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '#') continue; wordCount = chopLine(line, words); if (wordCount == 0) continue; lineFileExpectWords(lf, 7, wordCount); clone = words[0]; cloneVer = words[1]; found = FALSE; for (i = 0; i < 3; ++i) { char *phase = phases[i]; sprintf(clonePath, "%s/%s/fa/%s.fa", gsDir, phase, clone); if (fileExists(clonePath)) { struct dnaSeq *seq = faReadDna(clonePath); char *e = strchr(seq->name, '_'); if (e != NULL) *e = 0; if (!sameString(seq->name, cloneVer)) printf("%s\t(wrong version %s)\n", cloneVer, seq->name); else if (i != 0) printf("%s\t(not finished)\n", cloneVer); found = TRUE; } } if (!found) printf("%s\t(not found)\n", cloneVer); } }
static struct psl *fileNext(struct pslReader* pr)
/* read the next record from a file */
{
char *row[PSLX_NUM_COLS];
int needCols = pr->isPslx ? PSLX_NUM_COLS : PSL_NUM_COLS;
int gotCols;
while ((gotCols = lineFileChopNextTab(pr->lf, row, PSLX_NUM_COLS)) > 0)
    {
    lineFileExpectWords(pr->lf, needCols, gotCols);
    /* Skip rows that don't match the chromosome filter, if one is set. */
    if (pr->chrom != NULL && !sameString(row[13], pr->chrom))
        continue;
    return pr->isPslx ? pslxLoad(row) : pslLoad(row);
    }
return NULL;
}
struct tomRough *loadAllRough(char *fileName)
/* Load up all bands from database. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct tomRough *list = NULL;
char *line;
int lineSize;
while (lineFileNext(lf, &line, &lineSize))
    {
    char *cols[16];
    int colCount = chopCommas(line, cols);
    lineFileExpectWords(lf, 5, colCount);
    struct tomRough *rough = tomRoughLoad(cols);
    slAddHead(&list, rough);
    }
slReverse(&list);	/* restore file order */
lineFileClose(&lf);
printf("Loaded %d rough lines\n", slCount(list));
return list;
}
void readBaseProbs(struct lineFile *lf, char **words, char *firstWord,
	float **pArray, int colCount)
/* Allocate and read base probabilities.  The line must begin with firstWord
 * followed by colCount values; aborts otherwise. */
{
char *line;
lineFileNeedNext(lf, &line, NULL);
int got = chopByWhite(line, words, colCount+1);
lineFileExpectWords(lf, colCount+1, got);
if (!sameString(words[0], firstWord))
    errAbort("Expecting %s, got %s line %d of %s", firstWord, words[0],
        lf->lineIx, lf->fileName);
float *vals;
AllocArray(vals, colCount);
int ix;
for (ix = 0; ix < colCount; ++ix)
    vals[ix] = atof(words[ix+1]);	/* words[0] was the label */
*pArray = vals;
}
struct clone *readTrans(char *fileName)
/* Read info in trans file.
 * Each line is: fragName ffaName ffaFile(start..end).  Consecutive fragments
 * with the same clone name are grouped under one clone; returns the clone
 * list in file order. */
{
char cloneName[128], lastCloneName[128];
struct clone *cloneList = NULL, *clone = NULL;
struct frag *frag;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[8], *parts[4], *subParts[3];
int wordCount, partCount, subCount;

strcpy(lastCloneName, "");
while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    lineFileExpectWords(lf, 3, wordCount);
    /* Third field looks like name(start..end): split on "(:)" then on ".". */
    partCount = chopString(words[2], "(:)", parts, ArraySize(parts));
    if (partCount != 2)
        errAbort("Badly formatted third field line %d of %s", lf->lineIx, lf->fileName);
    subCount = chopString(parts[1], ".", subParts, ArraySize(subParts));
    if (subCount != 2)
        errAbort("Badly formatted third field line %d of %s (expecting start..end)",
            lf->lineIx, lf->fileName);
    /* NOTE(review): assumes fragToCloneName always produces fewer than 128
     * chars -- the strcpy calls below would overflow otherwise; TODO confirm. */
    fragToCloneName(words[0], cloneName);
    if (!sameString(cloneName, lastCloneName))
        {
        /* Fragment belongs to a new clone: start a new clone record. */
        AllocVar(clone);
        clone->name = cloneString(cloneName);
        slAddHead(&cloneList, clone);
        }
    AllocVar(frag);
    frag->name = cloneString(words[0]);
    frag->ffaName = cloneString(words[1]);
    frag->start = lineFileNeedNum(lf, subParts, 0) - 1;	/* 1-based to 0-based */
    frag->end = lineFileNeedNum(lf, subParts, 1);
    slAddTail(&clone->fragList, frag);
    strcpy(lastCloneName, cloneName);
    }
lineFileClose(&lf);
slReverse(&cloneList);	/* slAddHead built the list backwards */
return cloneList;
}
struct agpFrag *readAgpFile(char *agpName)
/* Read agps from file, skipping gap ('N') lines.  Returns list in file order. */
{
struct lineFile *lf = lineFileOpen(agpName, TRUE);
int wordCount;
char *words[16];
struct agpFrag *list = NULL, *el;
while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* fix: words[4] (the component-type column) was read without checking
     * wordCount first -- an uninitialized-pointer read on short lines. */
    lineFileExpectAtLeast(lf, 5, wordCount);
    if (words[4][0] != 'N')	/* 'N' marks a gap line; skip those */
        {
        lineFileExpectWords(lf, 9, wordCount);
        el = agpFragLoad(words);
        slAddHead(&list, el);
        }
    }
lineFileClose(&lf);
slReverse(&list);	/* slAddHead built the list backwards */
return list;
}
struct hash *hashGeneLevels(char *fileName, int cellCount)
/* Get a hash with key of gene name and value an array of expression values. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *hash = hashNew(16);
int fieldCount = cellCount+1;	/* gene name plus one value per cell */
char *row[fieldCount+1];
int got;
while ((got = lineFileChop(lf, row)) != 0)
    {
    lineFileExpectWords(lf, fieldCount, got);
    double *levels;
    AllocArray(levels, cellCount);
    int cellIx;
    for (cellIx = 0; cellIx < cellCount; ++cellIx)
        levels[cellIx] = sqlDouble(row[cellIx+1]);
    hashAdd(hash, row[0], levels);
    }
lineFileClose(&lf);
return hash;
}
void readGold(char *fileName, struct clonePos **retList, struct hash **retHash) /* Read .agp/gold formatted file */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *words[12]; struct clonePos *cpList = NULL, *cp; struct hash *hash = newHash(0); int wordCount; while ((wordCount = lineFileChop(lf, words)) != 0) { char *type = words[4]; char *clone = words[5]; int fragStart, fragEnd; double fragSize; if (type[0] == 'N') continue; lineFileExpectWords(lf, 9, wordCount); chopSuffix(clone); fragStart = lineFileNeedNum(lf, words, 1)-1; fragEnd = lineFileNeedNum(lf, words, 2); fragSize = fragEnd - fragStart; if ((cp = hashFindVal(hash, clone)) == NULL) { AllocVar(cp); hashAddSaveName(hash, clone, cp, &cp->name); slAddHead(&cpList, cp); } cp->weightedPos += fragSize * fragStart; cp->totSize += fragSize; } lineFileClose(&lf); slReverse(&cpList); for (cp = cpList; cp != NULL; cp = cp->next) cp->pos = cp->weightedPos/cp->totSize; *retList = cpList; *retHash = hash; }
void addTpfToTabFile(char *chromName, char *tabFile, FILE *f)
/* Add one tpf FILE to tab-separated file */
{
struct lineFile *lf = lineFileOpen(tabFile, TRUE);
char *cols[3];
int colCount;
int ix = 0;
while ((colCount = lineFileChop(lf, cols)) != 0)
    {
    if (colCount < 3)
        {
        /* Short rows are only legal for GAP lines; supply the missing column. */
        if (colCount < 2 || !sameWord("GAP", cols[0]))
            lineFileExpectWords(lf, 3, colCount);
        cols[2] = "?";
        }
    fprintf(f, "%s\t%s\t%s\t%s\t%d\n", chromName, cols[0], cols[1], cols[2], ix++);
    }
lineFileClose(&lf);
}
boolean readMotif(struct lineFile *lf, struct motif *m) /* Read five lines of motif info. */ { char *line; char *words[maxMotifSize+1]; int wordCount; int i,j; int colCount = 0; /* Get first line and parse it. */ ZeroVar(m); if (!lineFileNext(lf, &line, NULL)) return FALSE; wordCount = chopLine(line, words); if (wordCount < 6 || !sameString(words[1], "@")) errAbort("Bad line %d of %s", lf->lineIx, lf->fileName); m->score = atof(words[0]); m->pos = atof(words[2]); m->posSd = atof(words[4]); strncpy(m->consensus, words[5], sizeof(m->consensus)); /* Get next lines with columns. */ for (i=0; i<4; ++i) { if (!lineFileNext(lf, &line, NULL)) errAbort("Unexpected end of file in %s", lf->fileName); wordCount = chopLine(line, words); if (i == 0) m->size = colCount = wordCount - 1; else lineFileExpectWords(lf, colCount+1, wordCount); for (j=0; j<colCount; ++j) m->profile[i][j] = atof(words[j+1]); } return TRUE; }
void readProbeList(char *fileName, struct probe **retList, struct hash **retHash) /* Read in sequence list from file. (Set aliSize field to zero). */ { struct hash *hash = newHash(0); struct hashEl *hel; struct probe *list = NULL, *el; char *words[4]; int wordCount; struct lineFile *lf = lineFileOpen(fileName, TRUE); while ((wordCount = lineFileChop(lf, words)) > 0) { lineFileExpectWords(lf, 2, wordCount); AllocVar(el); hel = hashAdd(hash, words[0], el); el->name = hel->name; el->size = atoi(words[1]); slAddHead(&list, el); } slReverse(&list); lineFileClose(&lf); *retList = list; *retHash = hash; }
void cloneSpan(char *fileName)
/* cloneSpan - List clones and the amount the span by looking at .gl file.
 * Accumulates per-clone min start / max end / total fragment bases in a hash,
 * then prints overall base count, span, and density for the file. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int wordCount, lineSize;
char *words[16], *line;
struct hash *hash = newHash(0);
struct hashEl *hel;
char *cloneName;
int start, end;
struct clone *cloneList = NULL, *clone;
int totalSpan = 0, totalBases = 0;

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == '#')		/* comment line */
        continue;
    wordCount = chopLine(line, words);
    if (wordCount == 0)		/* blank line */
        continue;
    /* Only aborts when fewer than 3 words; extra columns are tolerated. */
    if (wordCount < 3)
        lineFileExpectWords(lf, 3, wordCount);
    cloneName = words[0];
    chopSuffix(cloneName);	/* strip version suffix so fragments group by clone */
    start = sqlUnsigned(words[1]);
    end = sqlUnsigned(words[2]);
    clone = hashFindVal(hash, cloneName);
    if (clone == NULL)
        {
        /* First fragment of this clone. */
        AllocVar(clone);
        hel = hashAdd(hash, cloneName, clone);
        clone->name = hel->name;	/* share the hash's copy of the name */
        clone->start = start;
        clone->end = end;
        slAddHead(&cloneList, clone);
        }
    else
        {
        /* Extend the clone's span to cover this fragment. */
        if (clone->start > start)
            clone->start = start;
        if (clone->end < end)
            clone->end = end;
        }
    clone->baseCount += end-start;
    }
lineFileClose(&lf);
slReverse(&cloneList);

/* Total up spans and bases over all clones. */
for (clone = cloneList; clone != NULL; clone = clone->next)
    {
    int span = clone->end - clone->start;
#ifdef SOMETIMES
    printf("clone %s, bases %d, spans %d, density %4.2f%%\n",
        clone->name, clone->baseCount, span,
        100.0 * (double)clone->baseCount/(double)span);
#endif
    totalSpan += span;
    totalBases += clone->baseCount;
    }
printf("%s bases %d, spans %d, density %4.2f%%\n",
    fileName, totalBases, totalSpan,
    100.0 * (double)totalBases/(double)totalSpan);
}
struct genScanFeature *parseGenscanLine(struct lineFile *lf, char *line)
/* Parse a single line.
 * Short feature types ("PlyA", "Prom" -- spellings as emitted by the Genscan
 * program itself) have 7 columns; exon types (Init/Intr/Term/Sngl) have 13
 * with frame/phase/scoring fields.  Aborts on anything malformed. */
{
char *words[16], *parts[3];
int wordCount, partCount;
char *type;
struct genScanFeature *gsf;
boolean isLong = FALSE;
int size;

wordCount = chopLine(line, words);
if (wordCount < 2)
    errAbort("Expecting at least 2 words line %d of %s", lf->lineIx, lf->fileName);
type = words[1];
if (sameString(type, "PlyA") || sameString(type, "Prom"))
    {
    lineFileExpectWords(lf, 7, wordCount);
    }
else if (sameString(type, "Init") || sameString(type, "Intr")
    || sameString(type, "Term") || sameString(type, "Sngl"))
    {
    lineFileExpectWords(lf, 13, wordCount);
    isLong = TRUE;
    }
else
    {
    errAbort("Unrecognized type %s line %d of %s", type, lf->lineIx, lf->fileName);
    }
AllocVar(gsf);
gsf->name = cloneString(words[0]);
/* Field 1 is geneId.featId, e.g. "2.03" (or "S.nn"). */
partCount = chopString(words[0], ".", parts, ArraySize(parts));
if (partCount != 2 || (parts[0][0] != 'S' && !isdigit(parts[0][0])) || !isdigit(parts[1][0]))
    errAbort("Expecting N.NN field 1 line %d of %s", lf->lineIx, lf->fileName);
gsf->geneId = atoi(parts[0]);
gsf->featId = atoi(parts[1]);
gsf->type = cloneString(type);
gsf->strand = words[2][0];
/* Genscan swaps begin/end on the minus strand; normalize to start < end,
 * converting the 1-based begin to a 0-based start. */
if (gsf->strand == '-')
    {
    gsf->start = lineFileNeedNum(lf, words, 4) - 1;
    gsf->end = lineFileNeedNum(lf, words, 3);
    }
else
    {
    gsf->start = lineFileNeedNum(lf, words, 3) - 1;
    gsf->end = lineFileNeedNum(lf, words, 4);
    }
size = lineFileNeedNum(lf, words, 5);
if (size != gsf->end - gsf->start)
    errAbort("Len doesn't match Begin to End line %d of %s", lf->lineIx, lf->fileName);
if (isLong)
    {
    /* Exon lines carry frame, phase, and the various Genscan scores. */
    gsf->frame = lineFileNeedNum(lf, words, 6);
    gsf->phase = lineFileNeedNum(lf, words, 7);
    gsf->iac = lineFileNeedNum(lf, words, 8);
    gsf->dot = lineFileNeedNum(lf, words, 9);
    gsf->codRg = lineFileNeedNum(lf, words, 10);
    gsf->p = atof(words[11]);
    gsf->tScore = atof(words[12]);
    }
else
    gsf->tScore = atof(words[6]);	/* short types only have a score column */
return gsf;
}
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions,
	char *geneLevels, char *pairsIn, char *outDir)
/* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell
 * lines.  Writes one output bed per cell into outDir, keeping only pairs where
 * both the enhancer activity and the gene expression clear the minAct/minExp
 * thresholds (globals) in that cell. */
{
/* Load up cell descriptions into cell array */
struct expRecord *cell, *cellList = expRecordLoadAll(cellDescriptions);
int cellCount = slCount(cellList);
struct expRecord **cellArray;
AllocArray(cellArray, cellCount);
int i;
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    cellArray[i] = cell;
verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions);

/* Load up enhBed into a hash keyed by name */
struct bed *enh, *enhList;
int fieldCount;
bedLoadAllReturnFieldCount(enhBed, &enhList, &fieldCount);
if (fieldCount != 15)
    errAbort("Expecting bed 15 format in %s", enhBed);
struct hash *enhHash = hashNew(16);
for (enh = enhList; enh != NULL; enh = enh->next)
    {
    /* Every enhancer must carry one expression score per cell. */
    if (enh->expCount != cellCount)
        errAbort("Inconsistent input: %d cells in %s, but %d in %s\n",
            cellCount, cellDescriptions, enh->expCount, enhBed);
    hashAddUnique(enhHash, enh->name, enh);
    }
verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed);

/* Get a hash with key of gene name and value an array of expression values. */
struct hash *geneHash = hashGeneLevels(geneLevels, cellCount);
verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels);

/* Open inPairs.bed, just to make sure it's there before we do any output. */
struct lineFile *lf = lineFileOpen(pairsIn, TRUE);

/* Remove trailing slash from output dir if any */
if (lastChar(outDir) == '/')
    {
    int len = strlen(outDir);
    outDir[len-1] = 0;
    }

/* Make output directory and open all output files. */
makeDirsOnPath(outDir);
FILE *outFiles[cellCount];
for (i=0, cell = cellList; i < cellCount; ++i, cell = cell->next)
    {
    char path[PATH_LEN];
    safef(path, sizeof(path), "%s/%s.bed", outDir, cell->description);
    outFiles[i] = mustOpen(path, "w");
    }

/* Stream through input file and copy to appropriate outputs. */
char *words[bedKnownFields*2];	// Make a little bigger than any known bed
int wordCount, wordsRequired = 0;
char *separator = "->";
int separatorSize = strlen(separator);
int pairCount = 0;
while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* Make sure all lines have same # of fields, and at least 4. */
    if (wordsRequired == 0)
        {
        wordsRequired = wordCount;	/* first line fixes the column count */
        lineFileExpectAtLeast(lf, 4, wordCount);
        }
    else
        lineFileExpectWords(lf, wordsRequired, wordCount);
    ++pairCount;

    /* Parse out name field, formatted as "enhName->geneName". */
    char *name = words[3];
    char *sepPos = stringIn(separator, name);
    if (sepPos == NULL)
        errAbort("Expecting %s in %s line %d of %s", separator, name,
            lf->lineIx, lf->fileName);
    char *enhName = cloneStringZ(name, sepPos-name);
    char *geneName = sepPos + separatorSize;

    /* Look up enhancer and gene. */
    enh = hashMustFindVal(enhHash, enhName);
    /* NOTE(review): this local shadows the char *geneLevels file-name parameter. */
    double *geneLevels = hashMustFindVal(geneHash, geneName);
    freez(&enhName);

    /* Output ones over minimum levels. */
    for (i=0; i < cellCount; ++i)
        {
        double enhLevel = enh->expScores[i];
        double geneLevel = geneLevels[i];
        if (enhLevel >= minAct && geneLevel >= minExp)
            {
            /* Echo the whole input line to this cell's output file. */
            int j;
            FILE *f = outFiles[i];
            fprintf(f, "%s", words[0]);
            for (j=1; j<wordCount; ++j)
                fprintf(f, "\t%s", words[j]);
            fprintf(f, "\n");
            }
        }
    }
verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn);

/* Clean up. */
lineFileClose(&lf);
for (i=0; i<cellCount; ++i)
    carefulClose(&outFiles[i]);
}
static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
	int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, boolean doCompress,
	FILE *f, int resTryCount, int resScales[], int resSizes[],
	struct bbExIndexMaker *eim, int bedCount, bits16 fieldCount, bits32 *retMaxBlockSize)
/* Read through lf, writing it in f. Save starting points of blocks (every itemsPerSlot)
 * to boundsArray.
 * Items are buffered in a dyString and flushed as one (optionally compressed)
 * block at end of chromosome, end of input, or every itemsPerSlot items.
 * Also tallies per-resolution zoom counts into resSizes and, when eim is
 * non-NULL, records name/offset info for the extra index.
 * NOTE(review): the bedCount parameter is not used in this function. */
{
int maxBlockSize = 0;
struct bbiChromUsage *usage = usageList;
char *line, *row[fieldCount+1];
int lastField = fieldCount-1;
int itemIx = 0, sectionIx = 0;
bits64 blockStartOffset = 0;
int startPos = 0, endPos = 0;
bits32 chromId = 0;
struct dyString *stream = dyStringNew(0);

/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;

boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
struct bed *bed;
AllocVar(bed);

/* Help keep track of which beds are in current chunk so as to write out
 * namedChunks to eim if need be. */
long sectionStartIx = 0, sectionEndIx = 0;

for (;;)
    {
    /* Get next line of input if any. */
    if (lineFileNextReal(lf, &line))
        {
        /* Chop up line and make sure the word count is right. */
        int wordCount;
        if (tabSep)	/* global flag: tab- vs whitespace-separated input */
            wordCount = chopTabs(line, row);
        else
            wordCount = chopLine(line, row);
        lineFileExpectWords(lf, fieldCount, wordCount);
        loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE);
        chrom = bed->chrom;
        start = bed->chromStart;
        end = bed->chromEnd;
        sameChrom = sameString(chrom, usage->name);
        }
    else	/* No next line */
        {
        atEnd = TRUE;
        }

    /* Check conditions that would end block and save block info and advance to next if need be. */
    if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
        {
        /* Save stream to file, compressing if need be. */
        if (stream->stringSize > maxBlockSize)
            maxBlockSize = stream->stringSize;
        if (doCompress)
            {
            size_t maxCompSize = zCompBufSize(stream->stringSize);
            // keep around an area of scratch memory
            /* NOTE(review): compBufSize is int compared against size_t
             * maxCompSize -- signed/unsigned mismatch; benign for sane
             * block sizes but worth confirming. */
            static int compBufSize = 0;
            static char *compBuf = NULL;
            // check to see if buffer needed for compression is big enough
            if (compBufSize < maxCompSize)
                {
                // free up the old not-big-enough piece
                freez(&compBuf); // freez knows bout NULL
                // get new scratch area
                compBufSize = maxCompSize;
                compBuf = needLargeMem(compBufSize);
                }
            int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
            mustWrite(f, compBuf, compSize);
            }
        else
            mustWrite(f, stream->string, stream->stringSize);
        dyStringClear(stream);

        /* Save block offset and size for all named chunks in this section. */
        if (eim != NULL)
            {
            bits64 blockEndOffset = ftell(f);
            bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
                sectionStartIx, sectionEndIx);
            sectionStartIx = sectionEndIx;
            }

        /* Save info on existing block. */
        struct bbiBoundsArray *b = &bounds[sectionIx];
        b->offset = blockStartOffset;
        b->range.chromIx = chromId;
        b->range.start = startPos;
        b->range.end = endPos;
        ++sectionIx;
        itemIx = 0;

        if (atEnd)
            break;
        }

    /* Advance to next chromosome if need be and get chromosome id. */
    if (!sameChrom)
        {
        usage = usage->next;
        assert(usage != NULL);
        assert(sameString(chrom, usage->name));	/* input must match usageList order */
        for (resTry = 0; resTry < resTryCount; ++resTry)
            resEnds[resTry] = 0;	/* zoom windows restart per chromosome */
        }
    chromId = usage->id;

    /* At start of block we save a lot of info. */
    if (itemIx == 0)
        {
        blockStartOffset = ftell(f);
        startPos = start;
        endPos = end;
        }
    /* Otherwise just update end. */
    /* NOTE(review): this is a bare block, not an else -- it also runs when
     * itemIx == 0, which is harmless since endPos was just set to end. */
        {
        if (endPos < end)
            endPos = end;
        /* No need to update startPos since list is sorted. */
        }

    /* Save name into namedOffset if need be. */
    if (eim != NULL)
        {
        bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx);
        sectionEndIx += 1;
        }

    /* Write out data. */
    dyStringWriteOne(stream, chromId);
    dyStringWriteOne(stream, start);
    dyStringWriteOne(stream, end);
    if (fieldCount > 3)
        {
        int i;
        /* Write 3rd through next to last field and a tab separator. */
        for (i=3; i<lastField; ++i)
            {
            char *s = row[i];
            dyStringAppend(stream, s);
            dyStringAppendC(stream, '\t');
            }
        /* Write last field and terminal zero */
        char *s = row[lastField];
        dyStringAppend(stream, s);
        }
    dyStringAppendC(stream, 0);	/* items are NUL-terminated in the stream */

    itemIx += 1;

    /* Do zoom counting. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        bits32 resEnd = resEnds[resTry];
        if (start >= resEnd)
            {
            /* Item starts past current window: open a new window at start. */
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = start + resScales[resTry];
            }
        while (end > resEnd)
            {
            /* Item spills past window: count additional windows it covers. */
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = resEnd + resScales[resTry];
            }
        }
    }
assert(sectionIx == sectionCount);
freez(&bed);
*retMaxBlockSize = maxBlockSize;
}
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table, struct hash *geneToModuleHash, struct hash *moduleAndMotifHash, struct hash *motifHash, struct hash *positionsHash, char *regionTable) /* Load file which is a big matrix with genes for rows and motifs for * columns. There is a semicolon-separated list of numbers in the matrix * where a gene has the motif, and an empty (tab separated) field * where there is no motif. The numbers are relative to the * region associated with the gene in the positionsHash. * Only load bits of this where motif actually occurs in module associated * with gene. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; FILE *f = hgCreateTabFile(tmpDir, table); char *motifNames[32*1024], *row[32*1024]; int motifCount, rowSize, i; char *gene, *module; int geneCount = 0, total = 0; struct dyString *dy = dyStringNew(512); struct genomePos *motifPosList = NULL, *motifPosForGene; struct genomePos *regionPosList = NULL, *regionPos; /* Read first line, which is labels. */ if (!lineFileNextReal(lf, &line)) errAbort("Empty file %s", fileName); subChar(line, ' ', '_'); motifCount = chopLine(line, motifNames); if (motifCount >= ArraySize(motifNames)) errAbort("Too many motifs line 1 of %s", fileName); lineFileExpectAtLeast(lf, 2, motifCount); motifNames[0] = NULL; for (i=1; i<motifCount; ++i) { char name[64]; motifNames[i] = cloneString(fixMotifName(motifNames[i],name,sizeof(name))); if (!hashLookup(motifHash, motifNames[i])) errAbort("Motif %s is in %s but not modules_motifs.gxm", motifNames[i], fileName); } /* Read subsequent lines. 
*/ while ((rowSize = lineFileChopTab(lf, row)) != 0) { lineFileExpectWords(lf, motifCount, rowSize); gene = row[0]; module = hashFindVal(geneToModuleHash, gene); if (module == NULL) { warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", gene, lf->lineIx, lf->fileName); continue; } regionPos = NULL; for (i=1; i<rowSize; ++i) { if (row[i][0] != 0) { if (hashLookup2(moduleAndMotifHash, module, motifNames[i])) { regionPos = hashFindVal(positionsHash, gene); if (regionPos == NULL) { warn("WARNING: %s in %s but not gene_positions.tab", gene, fileName); i = rowSize; continue; } motifPosForGene = convertMotifPos(row[i], regionPos, hashMustFindVal(motifHash, motifNames[i]), lf); motifPosList = slCat(motifPosForGene, motifPosList); ++total; } } } if (regionPos != NULL) { slAddHead(®ionPosList, regionPos); } ++geneCount; } lineFileClose(&lf); /* Output sorted table of all motif hits. */ { struct genomePos *pos; slSort(&motifPosList, genomePosCmp); for (pos = motifPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->motif); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\t", pos->strand); fprintf(f, "%s\n", pos->name); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " gene varchar(255) not null,\n" " #Indices\n" " INDEX(gene(12)),\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d genes, %d motifs, %d motifs in genes\n", geneCount, motifCount-1, total); hgLoadTabFile(conn, tmpDir, table, &f); // hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); 
slFreeList(&motifPosList); } /* Now output sorted table of upstream regions. */ { FILE *f = hgCreateTabFile(tmpDir, regionTable); struct genomePos *pos; dyStringClear(dy); dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " #Indices\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", regionTable); sqlRemakeTable(conn, regionTable, dy->string); slSort(®ionPosList, genomePosCmp); for (pos = regionPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->name); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\n", pos->strand); } hgLoadTabFile(conn, tmpDir, regionTable, &f); // hgRemoveTabFile(tmpDir, regionTable); } }
void dumpHapmapPhaseIIISummary()
/* Read .bed files, accumulate info, and aggregate into hapmapPhaseIIISummary file.
 * Two input passes: (1) one hapmapSnps<POP>.bed (12 columns) per Phase III
 * population, building one struct summary per distinct chrom:name key;
 * (2) one hapmapAlleles<SPECIES>.bed (13 columns) per ortho species, which must
 * hit an already-known SNP or we abort.  Finally writes one summary row per SNP
 * to hapmapPhaseIIISummary.bed. */
{
int i;
char inFile[256];
struct lineFile *lf = NULL;
int wordCount;
char *words[13];	/* sized for the widest input: ortho rows have 13 columns */
char key[128];
struct summary *sum, *sumList = NULL;
struct hash *hash = hashNew(24);	/* "chrom:name" -> struct summary * */

/* Pass 1: per-population SNP files.  First sighting of a key creates the
 * summary; later sightings fold that population's data into it. */
for (i = 0; i < HAP_PHASEIII_POPCOUNT; i++)
    {
    struct hapmapSnps hs;
    safef(inFile, sizeof(inFile), "hapmapSnps%s.bed", hapmapPhaseIIIPops[i]);
    lf = lineFileOpen(inFile, TRUE);
    while ((wordCount = lineFileChopTab(lf, words)) > 0)
	{
	lineFileExpectWords(lf, 12, wordCount);
	hapmapSnpsStaticLoad(words, &hs);
	// Key by chrom as well as name because the pseudoautosomal regions (PAR)
	// of chrX and chrY have independent (but identical) SNP items.
	safef(key, sizeof(key), "%s:%s", hs.chrom, hs.name);
	sum = hashFindVal(hash, key);
	if (sum == NULL)
	    {
	    sum = summaryNew(&hs, i);
	    hashAdd(hash, key, sum);
	    slAddHead(&sumList, sum);
	    }
	else
	    addSnpToSum(sum, &hs, i);
	}
    lineFileClose(&lf);
    }

/* Pass 2: ortho-species allele files; every ortho SNP must match pass 1. */
for (i = 0; i < HAP_ORTHO_COUNT; i++)
    {
    struct hapmapAllelesOrtho ho;
    safef(inFile, sizeof(inFile), "hapmapAlleles%s.bed", hapmapOrthoSpecies[i]);
    lf = lineFileOpen(inFile, TRUE);
    while ((wordCount = lineFileChopTab(lf, words)) > 0)
	{
	lineFileExpectWords(lf, 13, wordCount);
	hapmapAllelesOrthoStaticLoad(words, &ho);
	safef(key, sizeof(key), "%s:%s", ho.chrom, ho.name);
	sum = hashFindVal(hash, key);
	if (sum == NULL)
	    errAbort("Ortho SNP '%s' doesn't match any HapMap SNPs!", ho.name);
	addOrthoToSum(sum, &ho, i);
	}
    lineFileClose(&lf);
    }
/* slAddHead built sumList in reverse input order; restore it. */
slReverse(&sumList);
// That leaves it mostly sorted, but not all!  Leave final sorting up to hgLoadBed.

FILE *f = mustOpen("hapmapPhaseIIISummary.bed", "w");
for (sum = sumList; sum != NULL; sum = sum->next)
    {
    struct hapmapPhaseIIISummary *fs = sum->finalSum;
    // Convert fs->score (heterozygosity * 1000) from total into average:
    fs->score = (int)((float)fs->score / fs->popCount + 0.5);
    // Determine whether the overall{Major,Minor}Alleles are indeed the same
    // as the first population encountered:
    char firstPopMajorAl = fs->overallMajorAllele;
    char firstPopMinorAl = fs->overallMinorAllele;
    int firstPopYea = 0, firstPopNay = 0;
    for (i = 0; i < HAP_PHASEIII_POPCOUNT; i++)
	{
	if (fs->foundInPop[i])
	    {
	    if (sum->popMajorAlleles[i] == firstPopMajorAl)
		firstPopYea++;
	    else
		firstPopNay++;
	    }
	}
    /* If most populations disagree with the first population's call, swap
     * major and minor so the "overall" alleles reflect the majority. */
    if (firstPopNay > firstPopYea)
	{
	fs->overallMajorAllele = firstPopMinorAl;
	fs->overallMinorAllele = firstPopMajorAl;
	}
    hapmapPhaseIIISummaryTabOut(fs, f);
    }
carefulClose(&f);
// All done -- no need to waste time freeing hash and sumList.
}
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf,
	int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
	int resTryCount, int resScales[], int resSizes[],
	boolean doCompress, bits32 *retMaxSectionSize)
/* Read through lf, chunking it into sections that get written to f. Save info
 * about sections in bounds.
 * A section is flushed when it reaches itemsPerSlot items, when the chromosome
 * changes, or at end of input.  Also accumulates per-resolution zoom counts in
 * resSizes[] and reports the largest (uncompressed) section size through
 * *retMaxSectionSize.
 * NOTE(review): assumes at least one data row of input and that usageList
 * matches the file's chromosome order; a totally empty input would read
 * items[itemIx-1] with itemIx==0 — presumably callers guarantee non-empty
 * input (TODO confirm at call sites). */
{
int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;	/* current chromosome's usage record */
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
struct sectionItem items[itemsPerSlot];		/* buffered items for current section */
struct sectionItem *lastB = NULL;		/* previous item, for sort/overlap checks */
bits32 resEnds[resTryCount];			/* per-resolution current bucket end */
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
struct dyString *stream = dyStringNew(0);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* Figure out whether need to output section. */
    boolean sameChrom = FALSE;
    if (rowSize > 0)
	sameChrom = sameString(row[0], usage->name);
    if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom)
	{
	/* Figure out section position. */
	bits32 chromId = usage->id;
	bits32 sectionStart = items[0].start;
	bits32 sectionEnd = items[itemIx-1].end;

	/* Save section info for indexing.  Offset is taken before the write
	 * below so the index points at the section header. */
	assert(sectionIx < sectionCount);
	struct bbiBoundsArray *section = &bounds[sectionIx++];
	section->offset = ftell(f);
	section->range.chromIx = chromId;
	section->range.start = sectionStart;
	section->range.end = sectionEnd;

	/* Output section header to stream. */
	dyStringClear(stream);
	UBYTE type = bwgTypeBedGraph;
	bits16 itemCount = itemIx;
	dyStringWriteOne(stream, chromId);	// chromId
	dyStringWriteOne(stream, sectionStart);	// start
	dyStringWriteOne(stream, sectionEnd);	// end
	dyStringWriteOne(stream, reserved32);	// itemStep
	dyStringWriteOne(stream, reserved32);	// itemSpan
	dyStringWriteOne(stream, type);		// type
	dyStringWriteOne(stream, reserved8);	// reserved
	dyStringWriteOne(stream, itemCount);	// itemCount

	/* Output each item in section to stream. */
	int i;
	for (i=0; i<itemIx; ++i)
	    {
	    struct sectionItem *item = &items[i];
	    dyStringWriteOne(stream, item->start);
	    dyStringWriteOne(stream, item->end);
	    dyStringWriteOne(stream, item->val);
	    }

	/* Save stream to file, compressing if need be.  Max size is tracked on
	 * the uncompressed stream, before compression. */
	if (stream->stringSize > maxSectionSize)
	    maxSectionSize = stream->stringSize;
	if (doCompress)
	    {
	    size_t maxCompSize = zCompBufSize(stream->stringSize);
	    char compBuf[maxCompSize];
	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
	    mustWrite(f, compBuf, compSize);
	    }
	else
	    mustWrite(f, stream->string, stream->stringSize);

	/* If at end of input we are done. */
	if (rowSize == 0)
	    break;

	/* Set up for next section. */
	itemIx = 0;

	if (!sameChrom)
	    {
	    /* Advance to the next expected chromosome and reset per-chrom state. */
	    usage = usage->next;
	    assert(usage != NULL);
	    if (!sameString(row[0], usage->name))
		errAbort("read %s, expecting %s on line %d in file %s\n",
		    row[0], usage->name, lf->lineIx, lf->fileName);
	    /* NOTE(review): this assert can never fire — the errAbort above
	     * already covered the mismatch case. */
	    assert(sameString(row[0], usage->name));
	    lastB = NULL;
	    for (resTry = 0; resTry < resTryCount; ++resTry)
		resEnds[resTry] = 0;
	    }
	}

    /* Parse out input. */
    lineFileExpectWords(lf, 4, rowSize);
    bits32 start = lineFileNeedNum(lf, row, 1);
    bits32 end = lineFileNeedNum(lf, row, 2);
    float val = lineFileNeedDouble(lf, row, 3);

    /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */
    if (start > end)
	errAbort("Start (%u) after end (%u) line %d of %s",
	    start, end, lf->lineIx, lf->fileName);
    if (lastB != NULL)
	{
	if (lastB->start > start)
	    errAbort("BedGraph not sorted on start line %d of %s",
		lf->lineIx, lf->fileName);
	if (lastB->end > start)
	    errAbort("Overlapping regions in bedGraph line %d of %s",
		lf->lineIx, lf->fileName);
	}

    /* Do zoom counting: bump the bucket count for each resolution level the
     * item's range touches, advancing that level's bucket end as we go. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
	{
	bits32 resEnd = resEnds[resTry];
	if (start >= resEnd)
	    {
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = start + resScales[resTry];
	    }
	while (end > resEnd)
	    {
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = resEnd + resScales[resTry];
	    }
	}

    /* Save values in output array. */
    struct sectionItem *b = &items[itemIx];
    b->start = start;
    b->end = end;
    b->val = val;
    lastB = b;
    itemIx += 1;
    }
assert(sectionIx == sectionCount);
*retMaxSectionSize = maxSectionSize;
}
void outputUniqueOnSharedKey(char *inTab, struct asObject *as, struct asColumn *keyCol, struct slPair *fieldList, char *outTab, char *outErr) /* Scan through tab-separated file inTab and output fields in fieldList to * outTab. Make sure there is only one row for each value of sharedKey field. * If there would be multiple different rows in output with sharedKey, * complain about it in outErr. */ { /* Open input and output. */ struct lineFile *lf = lineFileOpen(inTab, TRUE); FILE *f = mustOpen(outTab, "w"); FILE *fErr = mustOpen(outErr, "w"); /* Set up array for input fields with more than we expect for better error reporting. */ int oldFieldCount = slCount(as->columnList); int newFieldCount = slCount(fieldList); int allocFields = oldFieldCount+10; char *words[allocFields]; /* Set up array for output fields that says where to find them in input. */ int *oldIx = makeNewToOldArray(as, fieldList); /* Figure out index of key field. */ int keyIx = slIxFromElement(as->columnList, keyCol); /* Go through each line of input, outputting selected columns. */ struct hash *uniqHash = hashNew(18); struct hash *errHash = hashNew(0); struct dyString *dy = dyStringNew(1024); int fieldCount; while ((fieldCount = lineFileChopNextTab(lf, words, allocFields)) > 0) { lineFileExpectWords(lf, oldFieldCount, fieldCount); /* Collect possible output into dy. */ dyStringClear(dy); dyStringPrintf(dy, "%s", words[oldIx[0]]); int i; for (i=1; i<newFieldCount; ++i) dyStringPrintf(dy, "\t%s", words[oldIx[i]]); dyStringPrintf(dy, "\n"); /* Check that this line is either unique for this key, or the same as previous lines * for the key. */ char *key = words[keyIx]; char *oldVal = hashFindVal(uniqHash, key); if (oldVal != NULL) { if (!sameString(oldVal, dy->string)) { /* Error reporting is a little complex. We want to output all lines associated * with key, including the first one, but we only want to do first line once. 
*/ if (!hashLookup(errHash, key)) { hashAdd(errHash, key, NULL); fputs(oldVal, fErr); } fputs(dy->string, fErr); } } else { hashAdd(uniqHash, key, cloneString(dy->string)); fputs(dy->string, f); } } /* Report error summary */ if (errHash->elCount > 0) { warn("Warning: %d shared keys have multiple values in table 2. See %s.\n" "Only first row for each key put in %s" , errHash->elCount, outErr, outTab); if (!mergeOk) noWarnAbort(); } /* Clean up and go home. */ freez(&oldIx); carefulClose(&fErr); carefulClose(&f); lineFileClose(&lf); }
static void parseBedGraphSection(struct lineFile *lf, boolean clipDontDie,
	struct hash *chromSizeHash, struct lm *lm, int itemsPerSlot, struct bwgSection **pSectionList)
/* Parse out bedGraph section until we get to something that is not in bedGraph format.
 * Collects 4-column lines grouped by chromosome, checks coordinates (optionally
 * clipping out-of-range items instead of aborting when clipDontDie is set),
 * then emits bwgSections of at most itemsPerSlot items onto *pSectionList. */
{
/* Set up hash and list to store chromosomes. */
struct hash *chromHash = hashNew(0);
struct bedGraphChrom *chrom, *chromList = NULL;

/* Collect lines in items on appropriate chromosomes. */
struct bwgBedGraphItem *item;
char *line;
while (lineFileNextReal(lf, &line))
    {
    /* Check for end of section. */
    if (stepTypeLine(line))
	{
	lineFileReuse(lf);
	break;
	}

    /* Parse out our line and make sure it has exactly 4 columns. */
    char *words[5];
    int wordCount = chopLine(line, words);
    lineFileExpectWords(lf, 4, wordCount);

    /* Get chromosome. */
    char *chromName = words[0];
    chrom = hashFindVal(chromHash, chromName);
    if (chrom == NULL)
	{
	lmAllocVar(chromHash->lm, chrom);
	hashAddSaveName(chromHash, chromName, chrom, &chrom->name);
	chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM);
	slAddHead(&chromList, chrom);
	}

    /* Convert to item and add to chromosome list. */
    lmAllocVar(lm, item);
    item->start = lineFileNeedNum(lf, words, 1);
    item->end = lineFileNeedNum(lf, words, 2);
    item->val = lineFileNeedDouble(lf, words, 3);

    /* Do sanity checking on coordinates. */
    if (item->start > item->end)
	errAbort("bedGraph error: start (%u) after end line (%u) %d of %s.",
	    item->start, item->end, lf->lineIx, lf->fileName);
    if (item->end > chrom->size)
	{
	warn("bedGraph error line %d of %s: chromosome %s has size %u but item ends at %u",
	    lf->lineIx, lf->fileName, chrom->name, chrom->size, item->end);
	if (!clipDontDie)
	    noWarnAbort();
	}
    else
	{
	slAddHead(&chrom->itemList, item);
	}
    }
slSort(&chromList, bedGraphChromCmpName);

/* Loop through each chromosome and output the item list, broken into sections
 * for that chrom. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Fixed bug: when clipDontDie is set and every item on this chromosome was
     * clipped away above, itemList is NULL and the overlap check below would
     * dereference NULL.  Nothing to output for such a chromosome. */
    if (chrom->itemList == NULL)
	continue;
    slSort(&chrom->itemList, bwgBedGraphItemCmp);

    /* Check to make sure no overlap between items. */
    struct bwgBedGraphItem *item = chrom->itemList, *nextItem;
    for (nextItem = item->next; nextItem != NULL; nextItem = nextItem->next)
	{
	if (item->end > nextItem->start)
	    errAbort("Overlap between %s %d %d and %s %d %d.\nPlease remove overlaps and try again",
	        chrom->name, item->start, item->end, chrom->name, nextItem->start, nextItem->end);
	item = nextItem;
	}

    /* Break up into sections of no more than items-per-slot size. */
    struct bwgBedGraphItem *startItem, *endItem, *nextStartItem = chrom->itemList;
    for (startItem = chrom->itemList; startItem != NULL; startItem = nextStartItem)
	{
	/* Find end item of this section, and start item for next section.
	 * Terminate list at end item. */
	int sectionSize = 0;
	int i;
	endItem = startItem;
	for (i=0; i<itemsPerSlot; ++i)
	    {
	    if (nextStartItem == NULL)
		break;
	    endItem = nextStartItem;
	    nextStartItem = nextStartItem->next;
	    ++sectionSize;
	    }
	endItem->next = NULL;

	/* Fill in section and add it to section list. */
	struct bwgSection *section;
	lmAllocVar(lm, section);
	/* NOTE(review): cloneString allocates outside lm — presumably freed by
	 * the section consumer; confirm ownership before changing. */
	section->chrom = cloneString(chrom->name);
	section->start = startItem->start;
	section->end = endItem->end;
	section->type = bwgTypeBedGraph;
	section->items.bedGraphList = startItem;
	section->itemCount = sectionSize;
	slAddHead(pSectionList, section);
	}
    }

/* Free up hash, no longer needed.  Free's chromList as a side effect since chromList is in
 * hash's memory. */
hashFree(&chromHash);
chromList = NULL;
}
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash,
	int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics.
 * Returns a list of per-chromosome usage records (in file order).  Requires the
 * bed to be sorted by chrom then start; aborts on unsorted input, unknown
 * chromosomes, or coordinates past the chromosome size.
 * retMinDiff gets the smallest start-to-start gap seen (BIGNUM if fewer than
 * two items on any chrom); retAveSize the mean item length (0 for empty input);
 * retBedCount the number of data rows. */
{
char *row[3];
struct hash *uniqHash = hashNew(0);	/* chrom names already finished, to detect re-visits */
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;			/* previous start on current chrom, -1 = none yet */
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
    if (rowSize == 0)
	break;
    lineFileExpectWords(lf, 3, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
	{
        errAbort("end (%d) before start (%d) line %d of %s",
	    end, start, lf->lineIx, lf->fileName);
	}
    ++bedCount;
    totalBases += (end - start);
    if (usage == NULL || differentString(usage->name, chrom))
	{
	/* New chromosome: seeing a chrom twice non-contiguously means unsorted input. */
	if (hashLookup(uniqHash, chrom))
	    {
	    errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
	    	lf->fileName, lf->lineIx);
	    }
	hashAdd(uniqHash, chrom, NULL);
	struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
	if (chromHashEl == NULL)
	    errAbort("%s is not found in chromosome sizes file", chrom);
	int chromSize = ptToInt(chromHashEl->val);
	AllocVar(usage);
	usage->name = cloneString(chrom);
	usage->id = id++;
	usage->size = chromSize;
	slAddHead(&usageList, usage);
	lastStart = -1;
	}
    if (end > usage->size)
	errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
	    end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;
    if (lastStart >= 0)
	{
	int diff = start - lastStart;
	if (diff < minDiff)
	    {
	    if (diff < 0)
		errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
		    lf->fileName, lf->lineIx);
	    minDiff = diff;
	    }
	}
    lastStart = start;
    }
slReverse(&usageList);
*retMinDiff = minDiff;
/* Fixed: guard against divide-by-zero (NaN average) when the bed file has no data rows. */
*retAveSize = (bedCount > 0 ? (double)totalBases/bedCount : 0.0);
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}