void readEpcr(struct lineFile *ef) /* Read in and record epcr records */ { int wordCount; char *words[8]; char *pos[4]; struct epcr *epcr; struct sts *sts; while (lineFileChopNext(ef, words, 4)) { if (words[3]) { AllocVar(epcr); epcr->next = NULL; epcr->contig = cloneString(words[0]); epcr->bases = cloneString(words[1]); epcr->dbstsId = cloneString(words[2]); epcr->ucscId = cloneString(words[3]); wordCount = chopByChar(words[1], '.', pos, ArraySize(pos)); if (wordCount != 3) errAbort("Not parsing epcr as expeceted\n"); epcr->start = sqlUnsigned(pos[0]); epcr->end = sqlUnsigned(pos[2]); sts = hashMustFindVal(stsHash, epcr->dbstsId); if (!epcrInList(sts->epcr, epcr)) { slAddHead(&sts->epcr, epcr); sts->epcrCount++; } } } }
struct genePred *getPredsFromBeds(char *file, char *table, char *db) { struct sqlConnection *conn = hAllocConn(db); struct lineFile *lf = lineFileOpen(file, TRUE); char *words[5000]; int wordsRead; struct genePred *list = NULL; while( (wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *)) )) { if (wordsRead != 4) errAbort("file '%s' must be bed4. Line %d has %d fields", file, lf->lineIx, wordsRead); char where[10 * 1024]; sqlSafefFrag(where, sizeof where, "name = '%s' and chrom='%s' and txStart = %d and txEnd = %d", words[3], words[0], sqlUnsigned(words[1]), sqlUnsigned(words[2])); //printf("table %s where %s\n",table,where); struct genePredReader *reader = genePredReaderQuery( conn, table, where); struct genePred *pred; while ((pred = genePredReaderNext(reader)) != NULL) slAddHead(&list, pred); genePredReaderFree(&reader); } hFreeConn(&conn); if (list != NULL) slReverse(&list); return list; }
struct aaMap *readAAMap(char *fileName) /* Read in a amino acid counts from a file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *words[1024]; int wordsRead; int ii; struct aaMap *am; am = allocAAMap(); for(ii=0; ii < 256; ii++) am->empty[ii] = TRUE; while( (wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *))) > 0) { if (wordsRead != 2) errAbort("expect two words on line %d of %s\n", lf->lineIx, fileName); if (strlen(words[0]) > 1) errAbort("expect first word to be one char on line %d of %s\n", lf->lineIx, fileName); int aa = *words[0]; am->map[aa] = atof(words[1]); am->empty[aa] = FALSE; } lineFileClose(&lf); return am; }
void readUnistsAliases(struct lineFile *uaf) /* Read in UniSTS alias names */ { char *words[4], *name = NULL, *id = NULL; int nameCount, i = 0, j, k, l = 0; struct alias *aliases = NULL, *testAlias = NULL; struct aliasId *aliasId = NULL; boolean idAdded = FALSE; /* store marker names keyed by ID and marker UniSTS IDs indexed by name. */ /* there can be many names per ID and many IDs per name */ aliasHash = newHash(16); aliasByNameHash = newHash(18); while (lineFileChopNext(uaf, words, 2) ) { id = cloneString(words[0]); /* allocate memory for alias structure */ AllocVar(aliases); nameCount = chopByChar(words[1], ';', aliases->names, ArraySize(aliases->names)); for (i = 0; i < nameCount; i++) { /* add alias to hash keyed by alias, value is UniSTS ID. */ /* alias can have multiple IDs */ idAdded = FALSE; name = cloneString(aliases->names[i]); /* get existing aliasId struct from hash for this name if exists */ /* else allocate memory for an aliasId struct */ /* and initialize array of ids */ if ((aliasId = hashFindVal(aliasByNameHash, name)) == NULL) { AllocVar(aliasId); for (j = 0; j < MAXIDS; j++) { aliasId->ids[j] = NULL; } } for (k = 0; (k < MAXIDS) && !idAdded; k++) { /* find next empty slot in array to add UniSTS ID */ if ((aliasId->ids[k] == NULL) && (!idAdded)) { aliasId->ids[k] = id; idAdded = TRUE; } } addHashElUnique(aliasByNameHash, name, aliasId); } /* add entry of "0" to signify the end of the array */ aliases->names[i] = "0"; /* Add this entry to a hash keyed by UniSTS id, hash key must be a string */ addHashElUnique(aliasHash, id, aliases); } }
boolean lineFileNextRow(struct lineFile *lf, char *words[], int wordCount)
/* Chop the next line lineFileChopNext() yields into words[] (described in
 * the original as the next non-blank line not starting with '#').
 * Returns FALSE when nothing is left, TRUE otherwise.  A line with fewer
 * than wordCount words is handed to lineFileExpectWords() to report. */
{
int got = lineFileChopNext(lf, words, wordCount);

if (got != 0)
    {
    if (got < wordCount)
        lineFileExpectWords(lf, wordCount, got);
    return TRUE;
    }
return FALSE;
}
struct bedNamedScore *bedNamedScoreLoadNext(struct lineFile *lf)
/* Takes in an open lineFile and reads out the next bedNamedScore line.
 * Expects six fields: chrom, chromStart, chromEnd, name, score, strand.
 * Returns NULL when no more lines are available.  chrom and name are
 * cloned; only the first character of the strand field is kept. */
{
char *row[6];
int rowSize = lineFileChopNext(lf, row, ArraySize(row));
if (rowSize == 0)
    return NULL;

/* Every field through row[5] is dereferenced below, so insist that the
 * line really supplied all six before touching them. */
lineFileExpectWords(lf, ArraySize(row), rowSize);

struct bedNamedScore *bg;
AllocVar(bg);
bg->chrom = cloneString(row[0]);
bg->chromStart = sqlUnsigned(row[1]);
bg->chromEnd = sqlUnsigned(row[2]);
bg->name = cloneString(row[3]);
bg->score = sqlFloat(row[4]);
bg->strand = row[5][0];
return bg;
}
struct hash *makeExpsTable(char *database, char *expTable, char *expFile, int *expCount) /* Open experiment file and use it to create experiment table. Use optional fields if present, otherwise defaults. Return a hash of expId's, keyed by name */ { struct lineFile *lf = lineFileOpen(expFile, TRUE); FILE *f = hgCreateTabFile(tabDir, expTable); int expId = 0; char *words[6]; int wordCt; struct hash *expHash = newHash(0); while ((wordCt = lineFileChopNext(lf, words, ArraySize(words)))) { char *name = words[0]; hashAddInt(expHash, name, expId); fprintf(f, "%d\t%s\t", expId++, name); fprintf(f, "%s\t", wordCt > 1 ? words[1] : name); fprintf(f, "%s\t", wordCt > 2 ? words[2] : expUrl); fprintf(f, "%s\t", wordCt > 3 ? words[3] : expRef); fprintf(f, "%s\t", wordCt > 4 ? words[4] : expCredit); fprintf(f, "0\n"); /* extras */ } if (expId <= 0) errAbort("No experiments in %s", lf->fileName); verbose(2, "%d experiments\n", expId); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); expRecordCreateTable(conn, expTable); hgLoadTabFile(conn, tabDir, expTable, &f); sqlDisconnect(&conn); } lineFileClose(&lf); if (expCount) *expCount = expId; return expHash; }
struct region *getUserRegions(char *fileName)
/* Get user defined regions from fileName.
 * Each line holds chrom, start, end and an optional fourth field used as
 * the region name (NULL when absent).  Regions are returned in file order.
 * Lines with fewer than three fields are reported through
 * lineFileExpectAtLeast(). */
{
struct region *list = NULL, *region;
struct lineFile *lf;
char *words[4];
int wordCount;

lf = lineFileOpen(fileName, TRUE);    /* TRUE == replace CR with 0 */
while (0 != (wordCount = lineFileChopNext(lf, words, ArraySize(words))))
    {
    /* words[1] and words[2] are read unconditionally below. */
    lineFileExpectAtLeast(lf, 3, wordCount);

    AllocVar(region);
    region->chrom = cloneString(words[0]);
    region->start = atoi(words[1]);
    region->end = atoi(words[2]);
    if (wordCount > 3)
        region->name = cloneString(words[3]);
    else
        region->name = NULL;
    slAddHead(&list, region);
    }
/* The file used to be left open; everything needed has been cloned. */
lineFileClose(&lf);
slReverse(&list);
return list;
}
static struct sqlDeleter* buildReloadDeleter(char *reloadList, unsigned srcDb, char *tmpDir) /* read reload list, building a deleter for the specified source DB */ { struct sqlDeleter* deleter = NULL; struct lineFile *lf = gzLineFileOpen(reloadList); int cnt = 0; char *row[1]; while (lineFileChopNext(lf, row, ArraySize(row))) { char *acc = trimSpaces(row[0]); if (gbGuessSrcDb(acc) == srcDb) { if (deleter == NULL) deleter = sqlDeleterNew(tmpDir, (gbVerbose >= 4)); sqlDeleterAddAcc(deleter, acc); cnt++; gbVerbMsg(5, "%s delete for reloading", acc); } } gzLineFileClose(&lf); gbVerbMsg(1, "delete %d entries for reloading", cnt); return deleter; }
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf,
        int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
        int resTryCount, int resScales[], int resSizes[],
        boolean doCompress, bits32 *retMaxSectionSize)
/* Read through lf, chunking it into sections that get written to f.  Save info
 * about sections in bounds.
 *
 * Input rows are four-field bedGraph lines (chrom, start, end, value) that
 * must be sorted and non-overlapping within a chromosome, with chromosomes
 * appearing in the same order as usageList.  A section is flushed when it
 * holds itemsPerSlot items, when the chromosome changes, or at end of
 * input.  For each zoom level resTry, resSizes[resTry] is incremented once
 * per window of width resScales[resTry] touched by the data.
 * *retMaxSectionSize receives the largest uncompressed section size. */
{
int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
struct sectionItem items[itemsPerSlot];
struct sectionItem *lastB = NULL;
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
struct dyString *stream = dyStringNew(0);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* Figure out whether need to output section. */
    boolean sameChrom = FALSE;
    if (rowSize > 0)
        sameChrom = sameString(row[0], usage->name);
    if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom)
        {
        /* Only flush when something is buffered.  With nothing buffered
         * (e.g. input with no data rows) the old code read items[-1] and
         * usage->id and wrote a meaningless section. */
        if (itemIx > 0)
            {
            /* Figure out section position. */
            bits32 chromId = usage->id;
            bits32 sectionStart = items[0].start;
            bits32 sectionEnd = items[itemIx-1].end;

            /* Save section info for indexing. */
            assert(sectionIx < sectionCount);
            struct bbiBoundsArray *section = &bounds[sectionIx++];
            section->offset = ftell(f);
            section->range.chromIx = chromId;
            section->range.start = sectionStart;
            section->range.end = sectionEnd;

            /* Output section header to stream. */
            dyStringClear(stream);
            UBYTE type = bwgTypeBedGraph;
            bits16 itemCount = itemIx;
            dyStringWriteOne(stream, chromId);          /* chromId */
            dyStringWriteOne(stream, sectionStart);     /* start */
            dyStringWriteOne(stream, sectionEnd);       /* end */
            dyStringWriteOne(stream, reserved32);       /* itemStep */
            dyStringWriteOne(stream, reserved32);       /* itemSpan */
            dyStringWriteOne(stream, type);             /* type */
            dyStringWriteOne(stream, reserved8);        /* reserved */
            dyStringWriteOne(stream, itemCount);        /* itemCount */

            /* Output each item in section to stream. */
            int i;
            for (i = 0; i < itemIx; ++i)
                {
                struct sectionItem *item = &items[i];
                dyStringWriteOne(stream, item->start);
                dyStringWriteOne(stream, item->end);
                dyStringWriteOne(stream, item->val);
                }

            /* Save stream to file, compressing if need be. */
            if (stream->stringSize > maxSectionSize)
                maxSectionSize = stream->stringSize;
            if (doCompress)
                {
                size_t maxCompSize = zCompBufSize(stream->stringSize);
                char compBuf[maxCompSize];
                int compSize = zCompress(stream->string, stream->stringSize,
                                         compBuf, maxCompSize);
                mustWrite(f, compBuf, compSize);
                }
            else
                mustWrite(f, stream->string, stream->stringSize);
            }

        /* If at end of input we are done. */
        if (rowSize == 0)
            break;

        /* Set up for next section. */
        itemIx = 0;

        if (!sameChrom)
            {
            /* The row must belong to the next chromosome in usageList. */
            usage = usage->next;
            assert(usage != NULL);
            if (!sameString(row[0], usage->name))
                errAbort("read %s, expecting %s on line %d in file %s\n",
                    row[0], usage->name, lf->lineIx, lf->fileName);
            assert(sameString(row[0], usage->name));
            lastB = NULL;
            for (resTry = 0; resTry < resTryCount; ++resTry)
                resEnds[resTry] = 0;
            }
        }

    /* Parse out input. */
    lineFileExpectWords(lf, 4, rowSize);
    bits32 start = lineFileNeedNum(lf, row, 1);
    bits32 end = lineFileNeedNum(lf, row, 2);
    float val = lineFileNeedDouble(lf, row, 3);

    /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */
    if (start > end)
        errAbort("Start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName);
    if (lastB != NULL)
        {
        if (lastB->start > start)
            errAbort("BedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName);
        if (lastB->end > start)
            errAbort("Overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName);
        }

    /* Do zoom counting. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        bits32 resEnd = resEnds[resTry];
        if (start >= resEnd)
            {
            /* Item starts past the current window: open a new one at start. */
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = start + resScales[resTry];
            }
        while (end > resEnd)
            {
            /* Item runs beyond the window: count each further window it spans. */
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = resEnd + resScales[resTry];
            }
        }

    /* Save values in output array. */
    struct sectionItem *b = &items[itemIx];
    b->start = start;
    b->end = end;
    b->val = val;
    lastB = b;
    itemIx += 1;
    }
assert(sectionIx == sectionCount);

*retMaxSectionSize = maxSectionSize;
}
static struct bed *parseRegionInput(char *inputString) /* scan the user region definition, turn into a bed list */ { int itemCount = 0; struct bed *bedList = NULL; struct bed *bedEl; int wordCount; char *words[5]; struct lineFile *lf; lf = lineFileOnString("userData", TRUE, inputString); while (0 != (wordCount = lineFileChopNext(lf, words, ArraySize(words)))) { char *chromName = NULL; int chromStart = 0; int chromEnd = 0; char *regionName = NULL; /* might be something of the form: chrom:start-end optionalRegionName */ if (((1 == wordCount) || (2 == wordCount)) && hgParseChromRange(NULL, words[0], &chromName, &chromStart, &chromEnd)) { if (2 == wordCount) regionName = cloneString(words[1]); } else if (!((3 == wordCount) || (4 == wordCount))) { int i; struct dyString *errMessage = dyStringNew(0); for (i = 0; i < wordCount; ++i) dyStringPrintf(errMessage, "%s ", words[i]); errAbort("line %d: '%s'<BR>\n" "illegal bed size, expected 3 or 4 fields, found %d\n", lf->lineIx, dyStringCannibalize(&errMessage), wordCount); } else { chromName = hgOfficialChromName(database, words[0]); chromStart = sqlSigned(words[1]); chromEnd = sqlSigned(words[2]); if (wordCount > 3) regionName = cloneString(words[3]); } ++itemCount; if (itemCount > 1000) { warn("limit 1000 region definitions reached at line %d<BR>\n", lf->lineIx); break; } AllocVar(bedEl); bedEl->chrom = chromName; if (NULL == bedEl->chrom) errAbort("at line %d, chrom name '%s' %s %s not recognized in this assembly %d", lf->lineIx, words[0], words[1], words[2], wordCount); bedEl->chromStart = chromStart; bedEl->chromEnd = chromEnd; if (illegalCoordinate(bedEl->chrom, bedEl->chromStart, bedEl->chromEnd)) errAbort("illegal input at line %d: %s %d %d", lf->lineIx, bedEl->chrom, bedEl->chromStart, bedEl->chromEnd); if (wordCount > 3) bedEl->name = regionName; else bedEl->name = NULL; /* if we wanted to give artifical names to each item */ #ifdef NOT { char name[128]; safef(name, ArraySize(name), "item_%04d", itemCount); 
bedEl->name = cloneString(name); } #endif slAddHead(&bedList, bedEl); } lineFileClose(&lf); // slSort(&bedList, bedCmp); /* this would do chrom,chromStart order */ slReverse(&bedList); /* with no sort, it is in order as user entered */ return (bedList); }
void parseFasta(struct lineFile *lf, int numSpecies, struct slName *list,
        struct hash *seqHash, alignFunc afunc, columnFunc cfunc, void *closure)
/* Parse an AA fasta, calling the alignment and column functions where appropriate.
 *
 * The input alternates header lines and sequence lines.  A header looks like
 *     >proName_species_exonNum_exonCount  exonSize  startFrame  endFrame  position  [geneName]
 * (the three trailing '_'-separated parts of the first word are peeled off
 * from the right).  The sequence line must be one word of exactly exonSize
 * characters; it is copied into the buffer that seqHash holds for the
 * species.  Whenever the protein name, exon number or exon size changes,
 * the alignment gathered so far is passed to analyzeAlign() and the
 * per-species buffers are reset; the final alignment is analyzed at end of
 * input. */
{
char *words[5000];
int wordsRead;
boolean expectGreat = TRUE;
char *seqBuffer = NULL;
struct alignDetail detail;
struct slName *name;

memset(&detail, 0, sizeof(detail));
detail.numSpecies = numSpecies;
detail.seqBuffers = needMem(numSpecies * sizeof(struct seqBuffer));

/* Point each per-species slot at the storage kept in seqHash. */
int ii = 0;
for (name = list; name; name = name->next)
    {
    struct hashContent *hc = hashMustFindVal(seqHash, name->name);

    detail.seqBuffers[ii].position = hc->position;
    detail.seqBuffers[ii].buffer = hc->seqBuffer;
    detail.seqBuffers[ii].species = name->name;
    detail.seqBuffers[ii].lastTwo[0] = '-';
    detail.seqBuffers[ii].lastTwo[1] = '-';
    ii++;
    }

while ((wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *))) != 0)
    {
    if (expectGreat)
        {
        if (*words[0] != '>')
            errAbort("expect '>' as first char on line %d",lf->lineIx);
        /* words[1]..words[4] are all read below; make sure they exist. */
        if (wordsRead < 5)
            errAbort("expected at least 5 words on header line %d, got %d",
                lf->lineIx, wordsRead);

        char *pName = words[0];

        /* Peel "_exonCount" off the end. */
        char *exonCountStr = strrchr(pName, '_');
        if (exonCountStr == NULL)
            errAbort("expected to find underbar on line %d",lf->lineIx);
        int newExonCount = atoi(exonCountStr + 1);
        if (newExonCount == 0)
            errAbort("bad exon count on line %d",lf->lineIx);
        *exonCountStr = 0;

        /* Peel "_exonNum". */
        char *exonNumStr = strrchr(pName, '_');
        if (exonNumStr == NULL)
            errAbort("expected to find underbar on line %d",lf->lineIx);
        int newExonNum = atoi(exonNumStr + 1);
        *exonNumStr = 0;

        /* Peel "_species".  The NULL test has to come before the pointer
         * is written through and advanced; previously it came after and
         * could never fire. */
        char *species = strrchr(pName, '_');
        if (species == NULL)
            errAbort("expected to find species underbar on line %d",lf->lineIx);
        *species++ = 0;

        int newExonSize = atoi(words[1]);
        pName++;    /* step over the leading '>' */

        if (newExonSize > SEQBUFFER_SIZE)
            errAbort("exon is too big (%d) for constant SEQBUFFER (%d)",
                newExonSize, SEQBUFFER_SIZE);
        if (newExonSize <= 0)
            errAbort("expected size > 0 (2nd arg) on line %d",lf->lineIx);

        if ((detail.proName == NULL) || !sameString(pName, detail.proName)
                || (detail.exonNum != newExonNum)
                || (detail.exonSize != newExonSize))
            {
            /* Start of a new alignment block: flush the previous one. */
            if (detail.proName != NULL)
                analyzeAlign(&detail, afunc, cfunc, closure);
            else
                detail.exonSize = SEQBUFFER_SIZE;    /* initialize whole buffer */

            /* Remember the last two residues of the block just finished. */
            struct seqBuffer *sb = detail.seqBuffers;
            struct seqBuffer *lastSb = &detail.seqBuffers[detail.numSpecies];
            for (; sb < lastSb; sb++)
                {
                if (detail.exonSize == 1)
                    {
                    sb->lastTwo[0] = '-';
                    sb->lastTwo[1] = sb->buffer[detail.exonSize-1];
                    }
                else
                    memcpy(sb->lastTwo, &sb->buffer[detail.exonSize-2], 2);
                }

            clearSpecies(seqHash, detail.exonSize);
            freez(&detail.proName);
            detail.proName = cloneString(pName);
            detail.exonNum = newExonNum;
            detail.exonCount = newExonCount;
            detail.exonSize = newExonSize;
            detail.startFrame = atoi(words[2]);
            detail.endFrame = atoi(words[3]);
            freez(&detail.geneName);
            if (wordsRead == 6)
                detail.geneName = cloneString(words[5]);
            }

        struct hashContent *hc = hashMustFindVal(seqHash, species);
        seqBuffer = hc->seqBuffer;

        /* NOTE(review): if hc->position is exactly MAX_POSITION_SIZE bytes,
         * a string of that length overflows by the terminator - confirm
         * the buffer's declared size. */
        if (strlen(words[4]) > MAX_POSITION_SIZE)
            errAbort("overflowed position buffer have %d need %d",
                MAX_POSITION_SIZE, (int)strlen(words[4]));
        strcpy(hc->position, words[4]);
        expectGreat = FALSE;
        }
    else
        {
        expectGreat = TRUE;
        if (wordsRead != 1)
            errAbort("expect only one word with sequence on line %d\n",lf->lineIx);
        if (strlen(words[0]) != detail.exonSize)
            errAbort("expect exonSize to be %d on line %d\n",detail.exonSize,lf->lineIx);
        memcpy(seqBuffer, words[0], detail.exonSize);
        }
    }

if (detail.proName != NULL)
    analyzeAlign(&detail, afunc, cfunc, closure);
}
void bedSplitOnChrom(char *inFile, char *outDir) /* bedSplitOnChrom - Split bed into a directory with one file per chromosome. */ { /* Create output directory. */ makeDir(outDir); /* Open file and figure out how many fields there are. */ struct lineFile *lf = lineFileOpen(inFile, TRUE); char *row[100]; int numFields = lineFileChopNext(lf, row, ArraySize(row)); char lastChrom[2048]; lastChrom[0] = 0; if (numFields == 0) return; /* Empty file, nothing to do. */ if (numFields >= ArraySize(row)) errAbort("Too many fields (%d) in bed file %s. Max is %d", numFields, lf->fileName, (int)(ArraySize(row)-1)); if (numFields < 3 || !isdigit(row[1][0]) || !isdigit(row[2][0])) errAbort("%s does not appear to be a bed file.", lf->fileName); /* Output as needed, creating a hash of open files. */ char path[PATH_LEN]; struct hash *fileHash = hashNew(8); char buffer[4096]; FILE *f = NULL; for (;;) { /* Look up file in hash, creating a new file if need be. */ char *chrom = row[0]; if (doStrand) { char *ptr = buffer; for(;*chrom; chrom++, ptr++) *ptr = *chrom; *ptr++ = row[5][0]; *ptr++ = 0; chrom = buffer; } if (differentString(chrom, lastChrom)) { f = hashFindVal(fileHash, chrom); strcpy(lastChrom, chrom); verbose(2, "new chrom %s f %p\n", lastChrom,f); } if (f == NULL) { if (fileHash->elCount >= maxChromCount) errAbort("%s is the %dth chromosome, which is too many. " "Use maxChromCount option if need be.", chrom, fileHash->elCount+1); safef(path, sizeof(path), "%s/%s.bed", outDir, chrom); f = mustOpen(path, doAppend ? "a" : "w"); verbose(2, "opened %s f %p\n",path, f); hashAdd(fileHash, chrom, f); } /* Output line of bed file, starting with the three fields that are always there. 
*/ fprintf(f, "%s\t%s\t%s", row[0], row[1], row[2]); int i; for (i=3; i<numFields; ++i) fprintf(f, "\t%s", row[i]); fputc('\n', f); if (ferror(f)) { safef(path, sizeof(path), "%s/%s.bed", outDir, chrom); errnoAbort("Couldn't write to %s.", path); } /* Fetch next line, breaking loop if it's not there, * and maybe insuring that it has the usual number of fields. */ int fieldsInLine = lineFileChopNext(lf, row, ArraySize(row)); if (fieldsInLine == 0) break; if (nfCheck && (fieldsInLine != numFields)) errAbort("First line in %s had %d fields, but line %d has %d fields.", lf->fileName, numFields, lf->lineIx, fieldsInLine); } /* Careful close all output files to make sure last bytes really written. */ struct hashEl *hel; for (hel = hashElListHash(fileHash); hel != NULL; hel = hel->next) { FILE *f = hel->val; carefulClose(&f); } }
void initGapAid(char *gapFileName) /* Initialize gap aid structure for faster gap * computations. */ { int i, tableSize, startLong = -1; char *sizeDesc[2]; char *words[128]; if (gapFileName != NULL) { struct lineFile *lf = lineFileOpen(gapFileName, TRUE); int count; lineFileNextRowTab(lf, sizeDesc, 2); tableSize = atoi(sizeDesc[1]); AllocArray(gapInitPos,tableSize); AllocArray(gapInitQGap,tableSize); AllocArray(gapInitTGap,tableSize); AllocArray(gapInitBothGap,tableSize); while (count = lineFileChopNext(lf, words, tableSize+1)) { if (sameString(words[0],"smallSize")) { aid.smallSize = atoi(words[1]); } if (sameString(words[0],"position")) { for (i=0 ; i<count-1 ; i++) gapInitPos[i] = atoi(words[i+1]); } if (sameString(words[0],"qGap")) { for (i=0 ; i<count-1 ; i++) gapInitQGap[i] = atoi(words[i+1]); } if (sameString(words[0],"tGap")) { for (i=0 ; i<count-1 ; i++) gapInitTGap[i] = atoi(words[i+1]); } if (sameString(words[0],"bothGap")) { for (i=0 ; i<count-1 ; i++) gapInitBothGap[i] = atoi(words[i+1]); } } if (aid.smallSize == 0) errAbort("missing smallSize parameter in %s\n",gapFileName); lineFileClose(&lf); } else { /* if no gap file, then setup default values */ /* Set up to handle small values */ aid.smallSize = 111; tableSize = 11; AllocArray(gapInitPos,tableSize); AllocArray(gapInitQGap,tableSize); AllocArray(gapInitTGap,tableSize); AllocArray(gapInitBothGap,tableSize); for (i = 0 ; i < tableSize ; i++) { gapInitPos[i] = gapInitPosDefault[i]; gapInitTGap[i] = gapInitTGapDefault[i]; gapInitQGap[i] = gapInitQGapDefault[i]; gapInitBothGap[i] = gapInitBothGapDefault[i]; } } AllocArray(aid.qSmall, aid.smallSize); AllocArray(aid.tSmall, aid.smallSize); AllocArray(aid.bSmall, aid.smallSize); for (i=1; i<aid.smallSize; ++i) { aid.qSmall[i] = interpolate(i, gapInitPos, gapInitQGap, tableSize); aid.tSmall[i] = interpolate(i, gapInitPos, gapInitTGap, tableSize); aid.bSmall[i] = interpolate(i, gapInitPos, gapInitBothGap, tableSize); } /* Set up to handle intermediate 
values. */ for (i=0; i<tableSize; ++i) { if (aid.smallSize == gapInitPos[i]) { startLong = i; break; } } if (startLong < 0) errAbort("No position %d in initGapAid()\n", aid.smallSize); aid.longCount = tableSize - startLong; aid.qPosCount = tableSize - startLong; aid.tPosCount = tableSize - startLong; aid.bPosCount = tableSize - startLong; aid.longPos = cloneMem(gapInitPos + startLong, aid.longCount * sizeof(int)); aid.qLong = cloneMem(gapInitQGap + startLong, aid.qPosCount * sizeof(double)); aid.tLong = cloneMem(gapInitTGap + startLong, aid.tPosCount * sizeof(double)); aid.bLong = cloneMem(gapInitBothGap + startLong, aid.bPosCount * sizeof(double)); /* Set up to handle huge values. */ aid.qLastPos = aid.longPos[aid.qPosCount-1]; aid.tLastPos = aid.longPos[aid.tPosCount-1]; aid.bLastPos = aid.longPos[aid.bPosCount-1]; aid.qLastPosVal = aid.qLong[aid.qPosCount-1]; aid.tLastPosVal = aid.tLong[aid.tPosCount-1]; aid.bLastPosVal = aid.bLong[aid.bPosCount-1]; aid.qLastSlope = calcSlope(aid.qLastPosVal, aid.qLong[aid.qPosCount-2], aid.qLastPos, aid.longPos[aid.qPosCount-2]); aid.tLastSlope = calcSlope(aid.tLastPosVal, aid.tLong[aid.tPosCount-2], aid.tLastPos, aid.longPos[aid.tPosCount-2]); aid.bLastSlope = calcSlope(aid.bLastPosVal, aid.bLong[aid.bPosCount-2], aid.bLastPos, aid.longPos[aid.bPosCount-2]); // uglyf("qLastPos %d, qlastPosVal %f, qLastSlope %f\n", aid.qLastPos, aid.qLastPosVal, aid.qLastSlope); // uglyf("tLastPos %d, tlastPosVal %f, tLastSlope %f\n", aid.tLastPos, aid.tLastPosVal, aid.tLastSlope); // uglyf("bLastPos %d, blastPosVal %f, bLastSlope %f\n", aid.bLastPos, aid.bLastPosVal, aid.bLastSlope); }
void readDbstsNames(struct lineFile *daf) /* Read in dbSTS names and create new stsInfo record, if necessary */ { struct sts *s; struct stsInfo2 *si; struct primer *p; char *words[4], *names[64], name[64], *org; int dbstsId, nameCount, i; while (lineFileChopNext(daf, words, 2)) { /* Make sure this is a human marker */ org = hashFindVal(orgHash, words[0]); if (hashLookup(orgHash, words[0]) && !sameString(org, "H**o sapiens\0") && !sameString(org, "\0")) continue; dbstsId = sqlUnsigned(words[0]); /* Find the primers for this dbSTS id */ if (hashLookup(primerHash, words[0])) p = hashMustFindVal(primerHash, words[0]); /* Determine if this id is already being used */ if (hashLookup(ucscIdHash, words[0])) { s = hashMustFindVal(ucscIdHash, words[0]); } else { s = NULL; } /* If the id has not been assigned, see any of the names are being used */ if (s == NULL) { nameCount = chopByChar(words[1], ';', names, ArraySize(names)); for (i = 0; i < nameCount; i++) { touppers(names[i]); /* See if this name associated with a ucsc record already */ if (hashLookup(nameHash, names[i])) { s = hashMustFindVal(nameHash, names[i]); /* See if this record needs an dbSTS id */ if ((s->si->dbSTSid == 0) || (s->si->dbSTSid >= MAX_STS_ID) || (sameString(s->si->leftPrimer, "\0"))) { s->si->dbSTSid = dbstsId; /* If no primer info recorded, add it if possible */ if (((!s->mapped) || (sameString(s->si->leftPrimer, "\0"))) && (hashLookup(primerHash, words[0]))) { p = hashMustFindVal(primerHash, words[0]); s->si->leftPrimer = cloneString(p->left); s->si->rightPrimer = cloneString(p->right); s->si->distance = cloneString(p->dist); } i = nameCount; } else { addElementInt(dbstsId, &s->si->otherDbSTS, &s->si->otherDbstsCount); } } } } if (s != NULL) { /* Determine if all of the names are recorded */ if (s->si->dbSTSid == dbstsId) s->dbstsIdExists = TRUE; nameCount = chopByChar(words[1], ';', names, ArraySize(names)); for (i = 0; i < nameCount; i++) { touppers(names[i]); if (!hashLookup(nameHash, 
names[i])) { subChar(names[i],',',':'); addName(s, names[i]); hashAdd(nameHash, names[i], s); } } } else { /* If valid primers exist, then add record */ if (hashLookup(primerHash, words[0])) p = hashMustFindVal(primerHash, words[0]); else p = NULL; if (p != NULL) { nameCount = chopByChar(words[1], ';', names, ArraySize(names)); AllocVar(s); AllocVar(si); si->next = NULL; s->si = si; s->mapped = FALSE; s->dbstsIdExists = TRUE; s->fa = NULL; si->next = NULL; si->identNo = nextUcscId; nextUcscId++; touppers(names[0]); si->name = cloneString(names[0]); si->gbCount = 0; si->genbank = NULL; si->gdbCount = 0; si->gdb = NULL; si->nameCount = 0; si->otherNames = NULL; if (checkGb(names[0]) || checkGdb(names[0])) addName(s, names[0]); hashAdd(nameHash, names[0], s); for (i = 1; i < nameCount; i++) { subChar(names[i], ',', ':'); touppers(names[i]); addName(s, names[i]); hashAdd(nameHash, names[i], s); } si->dbSTSid = dbstsId; si->otherDbstsCount = 0; si->otherDbSTS = NULL; si->leftPrimer = cloneString(p->left); si->rightPrimer = cloneString(p->right); si->distance = cloneString(p->dist); si->organism = cloneString("H**o sapiens"); si->sequence = 0; si->otherUCSCcount = 0; si->otherUCSC = NULL; si->mergeUCSCcount = 0; si->mergeUCSC = NULL; si->genethonName = cloneString(""); si->genethonChr = cloneString(""); si->marshfieldName = cloneString(""); si->marshfieldChr = cloneString(""); si->wiyacName = cloneString(""); si->wiyacChr = cloneString(""); si->wirhName = cloneString(""); si->wirhChr = cloneString(""); si->gm99gb4Name = cloneString(""); si->gm99gb4Chr = cloneString(""); si->gm99g3Name = cloneString(""); si->gm99g3Chr = cloneString(""); si->tngName = cloneString(""); si->tngChr = cloneString(""); si->decodeName = cloneString(""); si->decodeChr = cloneString(""); slAddHead(&sList, s); hashAdd(ucscIdHash, words[0], s); safef(name, ArraySize(name), "%d", s->si->identNo); hashAdd(stsHash, name, s); hashAddInt(dbStsIdHash, name, dbstsId); p->ucscId = s->si->identNo; } } } }
struct hash *agpLoadAll(char *agpFile) /* load AGP entries into a hash of AGP lists, one per chromosome */ { struct hash *agpHash = newHash(0); struct lineFile *lf = lineFileOpen(agpFile, TRUE); char *words[9]; int lastPos = 0; int wordCount; struct agpFrag *agpFrag; struct agpGap *agpGap; char *chrom; struct agp *agp; struct hashEl *hel; while ((wordCount = lineFileChopNext(lf, words, ArraySize(words))) != 0) { lineFileExpectAtLeast(lf, 8, wordCount); chrom = words[0]; if (!hashFindVal(agpHash, chrom)) lastPos = 1; AllocVar(agp); if (words[4][0] != 'N' && words[4][0] != 'U') { /* not a gap */ lineFileExpectWords(lf, 9, wordCount); agpFrag = agpFragLoad(words); if (agpFrag->chromStart != lastPos) errAbort( "Frag start (%d, %d) doesn't match previous end line %d of %s\n", agpFrag->chromStart, lastPos, lf->lineIx, lf->fileName); if (agpFrag->chromEnd - agpFrag->chromStart != agpFrag->fragEnd - agpFrag->fragStart) errAbort("Sizes don't match in %s and %s line %d of %s\n", agpFrag->chrom, agpFrag->frag, lf->lineIx, lf->fileName); lastPos = agpFrag->chromEnd + 1; agp->entry = agpFrag; agp->isFrag = TRUE; } else { /* gap */ lineFileExpectWords(lf, 8, wordCount); agpGap = agpGapLoad(words); if (agpGap->chromStart != lastPos) errAbort("Gap start (%d, %d) doesn't match previous end line %d of %s\n", agpGap->chromStart, lastPos, lf->lineIx, lf->fileName); lastPos = agpGap->chromEnd + 1; agp->entry = agpGap; agp->isFrag = FALSE; } if ((hel = hashLookup(agpHash, chrom)) == NULL) hashAdd(agpHash, chrom, agp); else slAddHead(&(hel->val), agp); } #ifndef DEBUG { struct hashCookie cookie; struct hashEl *hel; cookie = hashFirst(agpHash); while ((hel = hashNext(&cookie)) != NULL) { struct agp *agpList; agpList = (struct agp *)hel->val; /* for (agp = agpList; agp != NULL; agp = agp->next) printf("isFrag: %d\n", agp->isFrag); */ } } #endif /* reverse AGP lists */ //hashTraverseVals(agpHash, slReverse); #ifndef DEBUG { struct hashCookie cookie; struct hashEl *hel; cookie = 
hashFirst(agpHash); while ((hel = hashNext(&cookie)) != NULL) { struct agp *agpList; slReverse(&hel->val); agpList = hel->val; /* agpList = (struct agp *)hel->val; slReverse(&agpList); hashRemove(agpHash, hel->name); hashAdd(agpHash, hel->name, agpList); */ /* for (agp = agpList; agp != NULL; agp = agp->next) printf("isFrag: %d\n", agp->isFrag); */ } } #endif return agpHash; }
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash,
        struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
 * collect max field sizes there too.
 *
 * Returns one bbiChromUsage per chromosome, in input order, with ids
 * assigned 0, 1, 2, ... and itemCount set to the number of items seen.
 *   *retMinDiff  - smallest difference between the starts of consecutive
 *                  items on the same chromosome (BIGNUM if there is none)
 *   *retAveSize  - total bases covered divided by item count (0 if empty)
 *   *retBedCount - number of items
 * Aborts if the file is not sorted by chromosome and start, if a
 * chromosome is missing from chromSizesHash, if an item ends past the
 * chromosome size, or if end precedes start. */
{
int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
/* chrom, start and end (row[0..2]) are always read, so the row needs at
 * least three slots even when the highest indexed field is lower. */
if (maxRowSize < 3)
    maxRowSize = 3;
char *row[maxRowSize];
struct hash *uniqHash = hashNew(0);
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, maxRowSize);
    if (rowSize == 0)
        break;
    lineFileExpectAtLeast(lf, maxRowSize, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (eim != NULL)
        bbExIndexMakerUpdateMaxFieldSize(eim, row);
    if (start > end)
        {
        errAbort("end (%d) before start (%d) line %d of %s",
            end, start, lf->lineIx, lf->fileName);
        }
    ++bedCount;
    totalBases += (end - start);

    if (usage == NULL || differentString(usage->name, chrom))
        {
        /* New chromosome; seeing one again means the file is not grouped
         * by chromosome. */
        if (hashLookup(uniqHash, chrom))
            {
            errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                lf->fileName, lf->lineIx);
            }
        hashAdd(uniqHash, chrom, NULL);
        struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
        if (chromHashEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);
        int chromSize = ptToInt(chromHashEl->val);
        AllocVar(usage);
        usage->name = cloneString(chrom);
        usage->id = id++;
        usage->size = chromSize;
        slAddHead(&usageList, usage);
        lastStart = -1;
        }
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
            end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;

    if (lastStart >= 0)
        {
        int diff = start - lastStart;
        if (diff < minDiff)
            {
            /* A negative difference means starts went backwards. */
            if (diff < 0)
                errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                    lf->fileName, lf->lineIx);
            minDiff = diff;
            }
        }
    lastStart = start;
    }
slReverse(&usageList);

double aveSize = 0;
if (bedCount > 0)
    aveSize = (double)totalBases/bedCount;
*retMinDiff = minDiff;
*retAveSize = aveSize;
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}
struct bbiSummary *bedGraphWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory.
 *
 * usageList             - chromosomes in the order they appear in lf.
 * fieldCount            - not used by this function.
 * lf                    - bedGraph input; rows are read as chrom, start, end, value.
 * initialReduction      - width in bases of each first-level summary.
 * initialReductionCount - number of first-level summaries expected; written to f and
 *                         used to size the bounds array (asserted at the end).
 * zoomIncrement         - multiplier giving the second-level summary width.
 * retDataStart          - receives the file position where the summary data begins.
 * retIndexStart         - receives the file position where the index begins.
 * totalSum              - filled in with whole-file valid count, min, max, sum and
 *                         sum of squares (left untouched if the input has no rows).
 *
 * Returns the list of twice-reduced summaries.
 *
 * NOTE(review): rows are assumed to have at least four words, and every chromosome in
 * lf is assumed to be present, in order, in usageList; neither is checked here. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiSummary oneSummary, *sum = NULL;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

boolean firstRow = TRUE;

struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* At end of file output the last section, if there is one, and stop.  The break
     * must not depend on sum: with empty input sum is still NULL and row[] holds
     * nothing valid to parse. */
    if (rowSize == 0)
	{
	if (sum != NULL)
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		    &boundsPt, boundsEnd, lm, stream);
	break;
	}

    /* Parse out row. */
    char *chrom = row[0];
    bits32 start = sqlUnsigned(row[1]);
    bits32 end = sqlUnsigned(row[2]);
    float val = sqlFloat(row[3]);

    /* Update total summary stuff. */
    bits32 size = end-start;
    if (firstRow)
	{
	totalSum->validCount = size;
	totalSum->minVal = totalSum->maxVal = val;
	totalSum->sumData = val*size;
	totalSum->sumSquares = val*val*size;
	firstRow = FALSE;
	}
    else
	{
	totalSum->validCount += size;
	if (val < totalSum->minVal) totalSum->minVal = val;
	if (val > totalSum->maxVal) totalSum->maxVal = val;
	totalSum->sumData += val*size;
	totalSum->sumSquares += val*val*size;
	}

    /* If new chromosome output existing block. */
    if (differentString(chrom, usage->name))
	{
	usage = usage->next;
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	sum = NULL;
	}
    /* If start past existing block then output it. */
    else if (sum != NULL && sum->end <= start)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	sum = NULL;
	}

    /* If don't have a summary we're working on now, make one. */
    if (sum == NULL)
	{
	oneSummary.chromId = usage->id;
	oneSummary.start = start;
	oneSummary.end = start + initialReduction;
	if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	oneSummary.minVal = oneSummary.maxVal = val;
	oneSummary.sumData = oneSummary.sumSquares = 0.0;
	oneSummary.validCount = 0;
	sum = &oneSummary;
	}

    /* Deal with case where might have to split an item between multiple summaries.  This
     * loop handles all but the final affected summary in that case. */
    while (end > sum->end)
	{
	verbose(3, "Splitting start %d, end %d, sum->start %d, sum->end %d\n", start, end, sum->start, sum->end);
	/* Fold in bits that overlap with existing summary and output. */
	bits32 overlap = rangeIntersection(start, end, sum->start, sum->end);
	sum->validCount += overlap;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * overlap;
	sum->sumSquares += val*val * overlap;
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	size -= overlap;

	/* Move summary to next part. */
	sum->start = start = sum->end;
	sum->end = start + initialReduction;
	if (sum->end > usage->size) sum->end = usage->size;
	sum->minVal = sum->maxVal = val;
	sum->sumData = sum->sumSquares = 0.0;
	sum->validCount = 0;
	}

    /* Add to summary. */
    sum->validCount += size;
    if (sum->minVal > val) sum->minVal = val;
    if (sum->maxVal < val) sum->maxVal = val;
    sum->sumData += val * size;
    sum->sumSquares += val*val * size;
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index.  Keep the offset 64 bits wide so it is not truncated
 * once the file grows past what an int can hold. */
bits64 indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
void hgExperiment(char *database, char *table, char *expFile, char *posFile, char *dataFile) /* Main function */ { struct lineFile *lf; int *data = NULL; int *scores; FILE *f = NULL; char expTable[32]; char *words[3]; int wordCt; struct bed *bedList, *bed; int expCount; struct hash *expHash, *dataHash; struct hashEl *hel; /* Open experiment file and use it to create experiment table. Use optional fields if present, otherwise defaults */ safef(expTable, ArraySize(expTable), "%sExps", table); expHash = makeExpsTable(database, expTable, expFile, &expCount); /* Read in positions file */ bedList = bedLoadAll(posFile); slSort(&bedList, bedCmp); /* Read data file into a hash of arrays of data values, keyed by name */ dataHash = newHash(0); lf = lineFileOpen(dataFile, TRUE); while ((wordCt = lineFileChopNext(lf, words, ArraySize(words)))) { /* format: <region-name> <experiment-name> <data-value> */ char *name, *exp; int expId; int value; if (wordCt != 3) errAbort("Expecting 3 words in data file, got %d line %d of %s", wordCt, lf->lineIx, lf->fileName); name = words[0]; hel = hashLookup(dataHash, name); if (!hel) { AllocArray(data, expCount); hel = hashAdd(dataHash, name, data); } data = (int *)hel->val; exp = words[1]; expId = hashIntVal(expHash, exp); if (expId < 0 || expId > expCount-1) errAbort("Invalid experiment ID %d for %s, line %d of %s", expId, exp, lf->lineIx, lf->fileName); //value = atoi(words[2]); value = round(atof(words[2])); if (data[expId] != 0) errAbort("Extra experiment data value %d for %s %s, line %d of %s", value, name, exp, lf->lineIx, lf->fileName); data[expId] = value; } lineFileClose(&lf); /* Fill in BED15 fields - add experiment values, and setup block (only 1)*/ for (bed = bedList; bed != NULL; bed = bed->next) { int i; bed->thickStart = bed->chromStart; bed->thickEnd = bed->chromEnd; bed->blockCount = 1; AllocArray(bed->blockSizes, 1); bed->blockSizes[0] = bed->chromEnd - bed->chromStart; AllocArray(bed->chromStarts, 1); bed->chromStarts[0] = 
0; bed->expCount = expCount; AllocArray(bed->expIds, expCount); for (i = 0; i < expCount; i++) bed->expIds[i] = i; AllocArray(bed->expScores, expCount); scores = hashMustFindVal(dataHash, bed->name); for (i = 0; i < expCount; i++) bed->expScores[i] = scores[i]; /* set score for bed to the average of the scores in all experiments */ calculateAverage(bed); } /* from affyPslAndAtlsoToBed ? convertIntensitiesToRatios(bedList); */ /* Write BED data file */ f = hgCreateTabFile(tabDir, table); for (bed = bedList; bed != NULL; bed = bed->next) bedTabOutN(bed, 15, f); /* Cleanup */ carefulClose(&f); freeHash(&expHash); freeHash(&dataHash); bedFreeList(&bedList); }