struct contig *contigsFromAgp(char *fileName, struct hash *hash) /* Build up a list of contigs looking at agp file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[3]; struct contig *contigList = NULL, *contig; while (lineFileChop(lf, row)) { char *name = row[0]; int s = lineFileNeedNum(lf, row, 1) - 1; int e = lineFileNeedNum(lf, row, 2); int size = e - s; if (size < 0) errAbort("Start before end line %d of %s", lf->lineIx, lf->fileName); if ((contig = hashFindVal(hash, name)) == NULL) { AllocVar(contig); hashAddSaveName(hash, name, contig, &contig->name); slAddHead(&contigList, contig); } if (s != contig->size) errAbort("Start doesn't match previous end line %d of %s", lf->lineIx, lf->fileName); contig->size = e; } lineFileClose(&lf); slReverse(&contigList); return contigList; }
double calcNormScoreFactor(char *fileName, int scoreCol)
/* Figure out what to multiply things by to get a nice browser score (0-1000).
 * Reads column scoreCol (zero based) of every line in fileName as a double.
 * The returned factor is 1000 divided by (mean + one standard deviation),
 * with that high end capped at the largest value seen.
 * Aborts if scoreCol is negative, if a line is too short, or if the file
 * holds no data lines (previously an empty file divided by zero). */
{
if (scoreCol < 0)
    errAbort("Score column index %d is negative", scoreCol);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[scoreCol+1];
double sum = 0, sumSquares = 0;
int n = 0;
double minVal=0, maxVal=0;
int fieldCount;

while ((fieldCount = lineFileChop(lf, row)) != 0)
    {
    lineFileExpectAtLeast(lf, scoreCol+1, fieldCount);
    double x = sqlDouble(row[scoreCol]);
    if (n == 0)
        minVal = maxVal = x;
    if (x < minVal) minVal = x;
    if (x > maxVal) maxVal = x;
    sum += x;
    sumSquares += x*x;
    n += 1;
    }
lineFileClose(&lf);
if (n == 0)
    errAbort("No data in %s, can't calculate score normalization", fileName);

double std = calcStdFromSums(sum, sumSquares, n);
double mean = sum/n;
double highEnd = mean + std;
if (highEnd > maxVal)
    highEnd = maxVal;
/* NOTE(review): a high end of zero (e.g. all scores zero) still yields an
 * infinite factor here; left as is because the desired result is unclear. */
return 1000.0/highEnd;
}
struct liftSpec *readLifts(char *fileName) /* Read in lift file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; char *words[16]; struct liftSpec *list = NULL, *el; while ((wordCount = lineFileChop(lf, words)) != 0) { char *offs; lineFileExpectWords(lf, 5, wordCount); offs = words[0]; if (!isdigit(offs[0]) && !(offs[0] == '-' && isdigit(offs[1]))) errAbort("Expecting number in first field line %d of %s", lf->lineIx, lf->fileName); if (!isdigit(words[4][0])) errAbort("Expecting number in fifth field line %d of %s", lf->lineIx, lf->fileName); AllocVar(el); el->oldName = cloneString(words[1]); el->offset = atoi(offs); el->newName = cloneString(words[3]); el->size = atoi(words[4]); slAddHead(&list, el); } slReverse(&list); lineFileClose(&lf); printf("Got %d lifts in %s\n", slCount(list), fileName); if (list == NULL) errAbort("Empty liftSpec file %s", fileName); return list; }
void hgPhMouse(char *database, char *track, int fileCount, char *fileNames[]) /* hgPhMouse - Load phMouse track. */ { int i; char *fileName; char *tabName = "phMouse.tab"; FILE *f = mustOpen(tabName, "w"); struct lineFile *lf; char *words[32], *s, c; int wordCount; int oneSize, totalSize = 0; for (i=0; i<fileCount; ++i) { struct bed *bedList = NULL, *bed; fileName = fileNames[i]; lf = lineFileOpen(fileName, TRUE); printf("Reading %s ", fileName); fflush(stdout); while ((wordCount = lineFileChop(lf, words)) > 0) { if (wordCount < 7) errAbort("Expecting at least 7 words line %d of %s", lf->lineIx, fileName); AllocVar(bed); bed->chrom = cloneString(words[0]); bed->chromStart = lineFileNeedNum(lf, words, 1); bed->chromEnd = lineFileNeedNum(lf, words, 2); bed->score = lineFileNeedNum(lf, words, 6); s = strrchr(words[3], '|'); c = s[1]; s[0] = 0; if (c != '+' && c != '-') errAbort("Misformed strandless trace name line %d of %s", lf->lineIx, lf->fileName); bed->name = cloneString(words[3]); bed->strand[0] = c; slAddHead(&bedList, bed); } oneSize = slCount(bedList); printf("%d alignments ", oneSize); totalSize += oneSize; fflush(stdout); slSort(&bedList, bedCmp); printf("sorted "); fflush(stdout); for (bed = bedList; bed != NULL; bed = bed->next) { int bin = hFindBin(bed->chromStart, bed->chromEnd); fprintf(f, "%d\t", bin); bedTabOutN(bed, 6, f); } printf("tabbed out\n"); bedFreeList(&bedList); } carefulClose(&f); printf("Loading %d items into %s.%s\n", totalSize, database, track); loadDatabase(database, track, tabName); remove(tabName); }
void clumpEst3(char *inName, char *outName)
/* clumpEst3 - Clump together 3' ESTs.
 * Reads est3 records from inName and writes clumped records to outName.
 * A new clump starts when there is no current clump, the strand changes,
 * the record starts more than 2000 bases past the clump end, or the
 * chromosome changes.  Otherwise the record is absorbed into the clump
 * (end extended, estCount summed) if it starts less than 100 bases past the
 * clump end or has an estCount over 1; if neither holds it is dropped. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct est3 *cum = NULL, *cur;
FILE *f = mustOpen(outName, "w");
char *words[8];
int wordCount;

while ((wordCount = lineFileChop(lf, words)) > 0)
    {
    cur = est3Load(words);
    if (cum == NULL
        || cur->strand[0] != cum->strand[0]
        || cur->chromStart - cum->chromEnd > 2000
        || !sameString(cum->chrom, cur->chrom) )
        {
        /* NOTE(review): cum is NULL here for the first record; this relies
         * on writeEst3 and est3Free tolerating NULL - confirm. */
        writeEst3(f, cum);
        est3Free(&cum);
        cum = cur;
        }
    else
        {
        if (cur->chromStart - cum->chromEnd < 100 || cur->estCount > 1)
            {
            cum->chromEnd = cur->chromEnd;
            cum->estCount += cur->estCount;
            }
        est3Free(&cur);
        }
    }
/* Flush the final clump, then release everything this function opened. */
writeEst3(f, cum);
est3Free(&cum);
lineFileClose(&lf);
carefulClose(&f);
}
struct groupSizeInfo *readSizes(char *fileName, struct hash *gsiHash) /* Read in file of format: * groupName guessedMin guessedMax * and save in hash and as list. */ { struct groupSizeInfo *gsiList = NULL, *gsi; struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; char *words[8]; struct hashEl *hel; while ((wordCount = lineFileChop(lf, words)) != 0) { lineFileExpectWords(lf, 3, wordCount); AllocVar(gsi); hel = hashAddUnique(gsiHash, words[0], gsi); gsi->name = hel->name; gsi->guessedMin = atoi(words[1]); gsi->guessedMax = atoi(words[2]); slAddHead(&gsiList, gsi); } lineFileClose(&lf); slReverse(&gsiList); return gsiList; }
void readLoc2ref(char *fileName, struct hash **retPgiHash, struct hash **retLocHash) /* Read loc2ref file. Create hashes of rsInfo's indexed by pgi and locusLinkId. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *words[4]; int wordCount; struct rsInfo *rs; struct hash *locHash = newHash(0); struct hash *pgiHash = newHash(0); int lineCount = 0, count = 0; while ((wordCount = lineFileChop(lf, words)) > 0) { ++lineCount; if (wordCount == 4) { if (hashLookup(locHash, words[0]) != NULL && startsWith("XM_", words[1])) continue; AllocVar(rs); hashAddSaveName(locHash, words[0], rs, &rs->locusLinkId); rs->mrnaAcc = cloneString(words[1]); hashAddSaveName(pgiHash, words[2], rs, &rs->pgi); } } lineFileClose(&lf); *retLocHash = locHash; *retPgiHash = pgiHash; printf("Added %d locusLink ids from %s in %d lines\n", count, fileName, lineCount); }
void colTransform(char *column, char *input, char *addFactor, char *mulFactor, char *output)
/* colTransform - Add and/or multiply column by constant.
 * column is a one-based column number given as text; addFactor and mulFactor
 * are numbers given as text.  Each input line is written to output with tabs
 * between fields, and the chosen column replaced by value*mul+add printed
 * with %g.
 * Aborts if the column number is outside 1..ArraySize(words), if a line has
 * too few columns, or if the chosen column is not a number. */
{
char *words[512];
int wordCount;
int col = sqlUnsigned(column) - 1;
double add = sqlDouble(addFactor);
double mul = sqlDouble(mulFactor);

/* Reject column 0 (index -1) and columns beyond what words[] can hold
 * before any file is opened or created. */
if (col < 0 || col >= (int)ArraySize(words))
    errAbort("Column %s out of range, must be between 1 and %d",
        column, (int)ArraySize(words));

struct lineFile *lf = lineFileOpen(input, TRUE);
FILE *f = mustOpen(output, "w");
while ((wordCount = lineFileChop(lf, words)) > 0)
    {
    /* col is a zero-based index, so col+1 words are needed to read it. */
    lineFileExpectAtLeast(lf, col+1, wordCount);
    double x = lineFileNeedDouble(lf, words, col);
    int i;
    for (i=0; i<wordCount; ++i)
        {
        if (i != 0)
            fputc('\t', f);
        if (i == col)
            fprintf(f, "%g", x*mul+add);
        else
            fputs(words[i], f);
        }
    fputc('\n', f);
    }
lineFileClose(&lf);
carefulClose(&f);
}
void readBad(struct hash *badHash, char *fileName, int cloneWord)
/* Store the accessions of clones to avoid in badHash.
 * cloneWord is the zero-based index of the word holding the accession.  The
 * word has its suffix removed with chopSuffix and must then pass
 * checkAccFormat.  Aborts on lines with too few words or on an accession
 * that fails the check.  Prints the number of clones read. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *fields[8];
int fieldCount;
int needed = cloneWord + 1;
int total = 0;

for (;;)
    {
    char *accession;

    fieldCount = lineFileChop(lf, fields);
    if (fieldCount == 0)
        break;
    if (fieldCount < needed)
        {
        errAbort("Expecting at least %d words line %d of %s",
            needed, lf->lineIx, lf->fileName);
        }
    accession = fields[cloneWord];
    chopSuffix(accession);
    if (!checkAccFormat(accession))
        {
        errAbort("Badly formatted accession line %d of %s",
            lf->lineIx, lf->fileName);
        }
    hashStore(badHash, accession);
    total += 1;
    }
lineFileClose(&lf);
printf("Got %d clones to avoid from %s\n", total, fileName);
}
struct chromInfo *readChromSizes(char *fileName) /* Create list of chromInfos based on a two column file <chrom><size> */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[2]; struct chromInfo *list = NULL, *el; bits64 maxTotal = (1LL << 32) - 1; bits64 total = 0; int chromCount = 0; struct hash *uniqHash = hashNew(16); while (lineFileChop(lf, row)) { char *name = row[0]; if (hashLookup(uniqHash, name)) errAbort("Duplicate chromosome or contig name %s line %d of %s", name, lf->lineIx, lf->fileName); hashAdd(uniqHash, name, NULL); AllocVar(el); el->name = cloneString(name); el->size = lineFileNeedNum(lf, row, 1); el->genomeOffset = total; total += el->size; if (total > maxTotal) errAbort("Too many bases line %d of %s. Max is %lld, total so far is %lld", lf->lineIx, lf->fileName, maxTotal, total); slAddHead(&list, el); ++chromCount; } hashFree(&uniqHash); lineFileClose(&lf); slReverse(&list); verbose(1, "Read %d chroms totalling %lld bases in %s\n", chromCount, total, fileName); return list; }
void ave(char *fileName)
/* ave - Compute average and basic stats.
 * Collects the values found in column col (a one-based setting defined
 * outside this function) of fileName, sorts them and hands them to
 * showStats.  A word counts as a value if it starts with '-' or a digit;
 * other words, and lines too short to have the column, are skipped.
 * Aborts if col cannot be held in the word array or no values are found. */
{
int count = 0;
size_t alloc = 1024;
double *array;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[128], *word;
int wordCount;
int wordIx = col-1;

/* Guard the index once up front; words[] only holds ArraySize(words). */
if (wordIx < 0 || wordIx >= (int)ArraySize(words))
    errAbort("Column %d is out of range, must be between 1 and %d",
        col, (int)ArraySize(words));

AllocArray(array, alloc);
while ((wordCount = lineFileChop(lf, words)) > 0)
    {
    /* Lines without the requested column carry no data for us. */
    if (wordIx >= wordCount)
        continue;
    word = words[wordIx];
    if (word[0] == '-' || isdigit(word[0]))
        {
        if ((size_t)count >= alloc)
            {
            alloc <<= 1;
            ExpandArray(array, count, alloc);
            }
        array[count++] = atof(word);
        }
    }
lineFileClose(&lf);
if (count == 0)
    errAbort("No numerical data column %d of %s", col, fileName);
qsort(array, count, sizeof(array[0]), cmpDouble);
showStats(array, count);
freeMem(array);
}
struct finf *finfReadNext(struct lineFile *lf) /* Read in next finf from file, or NULL at EOF. */ { char ucscName[32]; char *parts[4], *words[16]; int partCount, wordCount; struct finf *finf; wordCount = lineFileChop(lf, words); if (wordCount <= 0) return NULL; lineFileExpectWords(lf, 7, wordCount); AllocVar(finf); gsToUcsc(words[0], ucscName); finf->name = cloneString(ucscName); finf->start = atoi(words[2])-1; finf->end = atoi(words[3]); if (words[5][0] != '?') { partCount = chopString(words[5], ",/", parts, ArraySize(parts)); if (partCount != 3) errAbort("Misformed field 6 line %d of %s\n", lf->lineIx, lf->fileName); finf->chainId = atoi(parts[0]); finf->linkIx = atoi(parts[1]); finf->linkCount = atoi(parts[2]); } strncpy(finf->endInfo, words[6], sizeof(finf->endInfo)); return finf; }
void bedIntersect(char *aFile, char *bFile, char *outFile) /* bedIntersect - Intersect two bed files. */ { struct lineFile *lf = lineFileOpen(aFile, TRUE); struct hash *bHash = readBed(bFile); FILE *f = mustOpen(outFile, "w"); char *row[40]; int wordCount; while ((wordCount = (strictTab ? lineFileChopTab(lf, row) : lineFileChop(lf, row))) != 0) { char *chrom = row[0]; int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); if (start > end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); if (start == end && !allowStartEqualEnd) lineFileAbort(lf, "start==end (if this is legit, use -allowStartEqualEnd)"); struct binKeeper *bk = hashFindVal(bHash, chrom); if (bk != NULL) { struct binElement *hitList = NULL, *hit; if (allowStartEqualEnd && start == end) hitList = binKeeperFind(bk, start-1, end+1); else hitList = binKeeperFind(bk, start, end); if (aHitAny) { for (hit = hitList; hit != NULL; hit = hit->next) { float cov = getCov(start, end, hit->val); if (cov >= minCoverage) { outputBed(f, row, wordCount, start, end, hit->val); break; } else { struct bed5 *b = hit->val; verbose(1, "filter out %s %d %d %d %d overlap %d %d %d %.3f\n", chrom, start, end, b->start, b->end, positiveRangeIntersection(start, end, b->start, b->end), end-start, b->end-b->start, cov); } } } else { for (hit = hitList; hit != NULL; hit = hit->next) { if (getCov(start, end, hit->val) >= minCoverage) outputBed(f, row, wordCount, start, end, hit->val); } } slFreeList(&hitList); } } }
void setupHugeGaps(char *insertFile) /* Setup things to lookup gaps. */ { struct lineFile *lf; char *words[8]; int wordCount; struct chromGaps *chromList = NULL, *cg; struct hugeGap *gap; char *chrom; char query[512]; struct sqlResult *sr; char **row; struct ctgPos ctgPos; int start, size; struct hashEl *hel; struct sqlConnection *conn = sqlConnect("hg4"); hugeHash = newHash(6); lf = lineFileOpen(insertFile, TRUE); while ((wordCount = lineFileChop(lf, words)) != 0) { chrom = words[0]; if (sameString(words[2], "-")) continue; if ((cg = hashFindVal(hugeHash, chrom)) == NULL) { AllocVar(cg); slAddHead(&chromList, cg); hel = hashAdd(hugeHash, chrom, cg); cg->chrom = hel->name; } size = atoi(words[3]); sqlSafef(query, sizeof query, "select * from ctgPos where contig = '%s'", words[2]); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) == NULL) errAbort("Couldn't find %s from %s in database", words[2], lf->fileName); ctgPosStaticLoad(row, &ctgPos); if (!sameString(chrom, ctgPos.chrom)) errAbort("%s is in %s in database and %s in %s", ctgPos.contig, ctgPos.chrom, chrom, lf->fileName); start = ctgPos.chromStart; uglyf("%s %s (%d size %d) %s \n", chrom, words[1], start, size, words[2]); sqlFreeResult(&sr); AllocVar(gap); slAddHead(&cg->gapList, gap); gap->offset = start; gap->size = size; } lineFileClose(&lf); sqlDisconnect(&conn); for (cg = chromList; cg != NULL; cg = cg->next) { slSort(&cg->gapList, cmpHugeGap); } }
void gapFileToTable(struct sqlConnection *conn, char *gapFileName, char *gapTableName) /* Build a single gap table from a single gap file. */ { struct lineFile *lf = lineFileOpen(gapFileName, TRUE); char tabFileName[256]; FILE *tabFile = NULL; char *words[16]; int wordCount; safef(tabFileName, sizeof(tabFileName), "%s.tab", gapTableName); tabFile = mustOpen(tabFileName, "w"); while ((wordCount = lineFileChop(lf, words)) > 0) { if (wordCount < 5) errAbort("Short line %d of %s", lf->lineIx, lf->fileName); if (words[4][0] == 'N' || words[4][0] == 'U') { int len = strlen(words[0]); if (len > maxChromNameSize) { maxChromNameSize = len; if (maxChromNameSize > 254) errAbort("ERROR: chrom name size is over 254(%d) characters: " "'%s'", maxChromNameSize, words[0]); } struct agpGap gap; agpGapStaticLoad(words, &gap); gap.chromStart -= 1; fprintf(tabFile, "%u\t", hFindBin(gap.chromStart, gap.chromEnd)); agpGapTabOut(&gap, tabFile); } } lineFileClose(&lf); fclose(tabFile); if (! noLoad) { struct dyString *ds = newDyString(2048); if (unsplit) sqlDyStringPrintf(ds, createGapUnsplit, gapTableName, maxChromNameSize, maxChromNameSize); else sqlDyStringPrintf(ds, createGapSplit, gapTableName); char query[1024]; sqlRemakeTable(conn, gapTableName, ds->string); sqlSafef(query, sizeof(query), "LOAD data local infile '%s' into table %s", tabFileName, gapTableName); sqlUpdate(conn, query); remove(tabFileName); freeDyString(&ds); } }
struct vcfRecord *vcfNextRecord(struct vcfFile *vcff)
/* Chop the next line of vcff's lineFile into words and turn it into a
 * vcfRecord with vcfRecordFromRow.  Returns NULL at end of file.
 * The line is passed through lineFileExpectWords with 8, or with
 * 9 + genotypeCount when the file has genotypes.
 * Note: this does not store record in vcff->records! */
{
char *row[VCF_MAX_COLUMNS];
int got = lineFileChop(vcff->lf, row);
if (got <= 0)
    return NULL;
int needed = (vcff->genotypeCount > 0) ? 9 + vcff->genotypeCount : 8;
lineFileExpectWords(vcff->lf, needed, got);
return vcfRecordFromRow(vcff, row);
}
struct scaffold *readScaffoldsFromAgp(char *fileName) /* Read in agp file and return as list of scaffolds. */ { struct hash *scaffoldHash = newHash(17); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[9]; int wordCount; struct scaffold *scaffoldList = NULL, *scaffold; struct agpFrag *frag; int size; for (;;) { wordCount = lineFileChop(lf, row); if (wordCount <= 0) break; if (wordCount < 8) lineFileShort(lf); if (row[4][0] == 'N' || row[4][0] == 'U') continue; if (wordCount < 9) lineFileShort(lf); frag = agpFragLoad(row); frag->chromStart -= 1; frag->fragStart -= 1; size = frag->fragEnd - frag->fragStart; if (size != frag->chromEnd - frag->chromStart) errAbort("scaffold/contig size mismatch line %d of %s", lf->lineIx, lf->fileName); if (frag->strand[0] != '+') errAbort("Strand not + line %d of %s", lf->lineIx, lf->fileName); scaffold = hashFindVal(scaffoldHash, frag->chrom); if (scaffold == NULL) { AllocVar(scaffold); hashAdd(scaffoldHash, frag->chrom, scaffold); slAddHead(&scaffoldList, scaffold); } slAddHead(&scaffold->list, frag); if (frag->chromEnd > scaffold->size) scaffold->size = frag->chromEnd; } slReverse(&scaffoldList); for (scaffold = scaffoldList; scaffold != NULL; scaffold = scaffold->next) slReverse(&scaffold->list); printf("Got %d scaffolds in %s\n", slCount(scaffoldList), lf->fileName); lineFileClose(&lf); hashFree(&scaffoldHash); return scaffoldList; }
int firstLinePos(char *fileName)
/* Return position of first line.
 * The position is the number in the first column of the first data line of
 * a two column file.  Returns 0 when the file has no data lines.  Aborts if
 * that first line does not have exactly two columns. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[3];
int retVal = 0;
int wordCount = lineFileChop(lf, words);

if (wordCount > 0)
    {
    if (wordCount != 2)
        errAbort("%s is not a two column file", fileName);
    retVal = lineFileNeedNum(lf, words, 0);
    }
/* Close unconditionally; the file used to stay open when it was empty. */
lineFileClose(&lf);
return retVal;
}
struct rmskOut2 *rmskOut2ReadNext(struct lineFile *lf) /* Read next record from repeat masker file. Return NULL at EOF. */ { char *words[32]; int wordCount; char id; struct rmskOut2 *ret; char *class, *family; if ((wordCount = lineFileChop(lf, words)) == 0) return NULL; if (wordCount != 15 ) errAbort("Expecting 15 words - line %d of %s", lf->lineIx, lf->fileName); id = words[14][0]; AllocVar(ret); ret->swScore = lineFileNeedNum(lf, words, 0); ret->milliDiv = round(10.0*atof(words[1])); ret->milliDel = round(10.0*atof(words[2])); ret->milliIns = round(10.0*atof(words[3])); ret->genoName = cloneString(words[4]); ret->genoStart = lineFileNeedNum(lf, words, 5)-1; ret->genoEnd = lineFileNeedNum(lf, words, 6); ret->genoLeft = -negParenNum2(lf, words[7]); if (sameString(words[8], "C")) ret->strand[0] = '-'; else if (sameString(words[8], "+")) ret->strand[0] = '+'; else errAbort("Unexpected strand char line %d of %s", lf->lineIx, lf->fileName); ret->repName = cloneString(words[9]); parseClassAndFamily(words[10], &class, &family); ret->repClass = cloneString(class); ret->repFamily = cloneString(family); if (sameString(words[8], "C")) { ret->repStart = negParenNum2(lf, words[11])-1; ret->repEnd = sqlSigned(words[12]); ret->repLeft = -negParenNum2(lf, words[13]); }else { ret->repLeft = -negParenNum2(lf, words[11]); ret->repEnd = sqlSigned(words[12]); ret->repStart = negParenNum2(lf, words[13])-1; } return ret; }
struct hash *hashTwoColumnFile(char *fileName)
/* Build a hash from a two column (key, value) file.  Every line is passed
 * through lineFileExpectWords with 2.  The value strings are cloned into
 * the hash's own local memory (hash->lm). */
{
struct hash *table = hashNew(16);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *cols[3];
int colCount;

for (;;)
    {
    char *valueCopy;

    colCount = lineFileChop(lf, cols);
    if (colCount == 0)
        break;
    lineFileExpectWords(lf, 2, colCount);
    valueCopy = lmCloneString(table->lm, cols[1]);
    hashAdd(table, cols[0], valueCopy);
    }
lineFileClose(&lf);
return table;
}
void doContig(char *dir, char *chrom, char *contig) /* Sniff contig for dupes. */ { char fileName[512]; char *words[16]; int wordCount; struct lineFile *lf; sprintf(fileName, "%s/%s", dir, goldName); lf = lineFileOpen(fileName, TRUE); while ((wordCount = lineFileChop(lf, words)) != 0) { if (wordCount < 8) errAbort("Short line %d of %s", lf->lineIx, lf->fileName); if (words[4][0] != 'N' && words[4][0] != 'U') { char *frag = words[5]; char cloneName[256]; struct cloneLoc *cl; strcpy(cloneName, frag); chopSuffix(cloneName); cl = hashFindVal(fragHash, frag); if (cl != NULL) { printf("%s duplicated in %s/%s and %s/%s\n", frag, cl->chrom, cl->contig, chrom, contig); ++errCount; } else { cl = hashFindVal(cloneHash, cloneName); if (cl != NULL && !sameString(contig, cl->contig)) { printf("%s duplicated in %s/%s and %s/%s\n", cloneName, cl->chrom, cl->contig, chrom, contig); ++errCount; } if (cl == NULL) { cl = cloneLocNew(cloneName, contig, chrom); hashAdd(cloneHash, cloneName, cl); } hashAdd(fragHash, frag, cl); } } } lineFileClose(&lf); }
struct rgi *readRgi(char *inName)
/* Load the rgi records in inName as a list in file order.  Every line is
 * passed through lineFileExpectWords with 4 and loaded with rgiLoad.  Each
 * record is also echoed through uglyf as it is read. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct rgi *head = NULL;
char *fields[8];
int fieldCount;

for (;;)
    {
    struct rgi *item;

    fieldCount = lineFileChop(lf, fields);
    if (fieldCount == 0)
        break;
    lineFileExpectWords(lf, 4, fieldCount);
    item = rgiLoad(fields);
    slAddHead(&head, item);
    uglyf("%s %s: min %d, max %d\n",
        item->a, item->b, item->minDistance, item->maxDistance);
    }
lineFileClose(&lf);
slReverse(&head);
return head;
}
static void vcfParseData(struct vcfFile *vcff, int maxRecords) /* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned * at the beginning of a data row, parse and store all data rows from lineFile. */ { if (vcff == NULL) return; int recCount = 0, expected = 8; if (vcff->genotypeCount > 0) expected = 9 + vcff->genotypeCount; char *words[VCF_MAX_COLUMNS]; int wordCount; while ((wordCount = lineFileChop(vcff->lf, words)) > 0) { if (maxRecords >= 0 && recCount >= maxRecords) break; lineFileExpectWords(vcff->lf, expected, wordCount); struct vcfRecord *record; AllocVar(record); record->file = vcff; record->chrom = vcfFilePooledStr(vcff, words[0]); record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1; // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn. record->chromEnd = record->chromStart+1; record->name = vcfFilePooledStr(vcff, words[2]); parseRefAndAlt(vcff, record, words[3], words[4]); record->qual = vcfFilePooledStr(vcff, words[5]); parseFilterColumn(vcff, record, words[6]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } slAddHead(&(vcff->records), record); recCount++; } slReverse(&(vcff->records)); lineFileClose(&(vcff->lf)); }
void aveNoQuartiles(char *fileName) /* aveNoQuartiles - Compute only min,max,mean,stdDev no quartiles */ { bits64 count = 0; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *words[128], *word; int wordCount; int wordIx = col-1; double sumData = 0.0, sumSquares = 0.0; double minVal = DBL_MAX, maxVal = -DBL_MAX; while ((wordCount = lineFileChop(lf, words)) > 0) { word = words[wordIx]; if (word[0] == '-' || isdigit(word[0])) { double val = sqlDouble(word); if (minVal > val) minVal = val; if (maxVal < val) maxVal = val; sumData += val; sumSquares += val * val; ++count; } } if (count == 0) errAbort("No numerical data column %d of %s", col, fileName); double average = sumData/count; double stdDev = calcStdFromSums(sumData, sumSquares, count); if (tableOut) { printf("# min max mean N sum stddev\n"); printf("%g %g %g %llu %g %g\n", minVal, maxVal, average, count, sumData, stdDev); } else { printf("average %f\n", average); printf("min %f\n", minVal); printf("max %f\n", maxVal); printf("count %llu\n", count); printf("total %f\n", sumData); printf("standard deviation %f\n", stdDev); } }
struct hash *readAgp(char *fileName)
/* Load an AGP file into a new hash.  Only lines with exactly nine words
 * whose fifth word is not "N" contribute: their sixth word is passed to
 * addCloneToHash.  All other lines are ignored.  The line count is echoed
 * through uglyf. */
{
struct hash *cloneTable = newHash(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *fields[16];
int fieldCount;

for (;;)
    {
    fieldCount = lineFileChop(lf, fields);
    if (fieldCount <= 0)
        break;
    if (fieldCount != 9)
        continue;
    if (sameString(fields[4], "N"))
        continue;
    addCloneToHash(cloneTable, fields[5]);
    }
uglyf("Read %d lines in %s\n", lf->lineIx, fileName);
lineFileClose(&lf);
return cloneTable;
}
struct liftSpec *readLifts(char *fileName) /* Read in lift file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; char *words[16]; struct liftSpec *list = NULL, *el; while ((wordCount = lineFileChop(lf, words)) != 0) { char *offs; if (wordCount < 5) errAbort("Need at least 5 words line %d of %s", lf->lineIx, lf->fileName); offs = words[0]; if (!isdigit(offs[0]) && !(offs[0] == '-' && isdigit(offs[1]))) errAbort("Expecting number in first field line %d of %s", lf->lineIx, lf->fileName); if (!isdigit(words[4][0])) errAbort("Expecting number in fifth field line %d of %s", lf->lineIx, lf->fileName); AllocVar(el); el->offset = atol(offs); el->oldName = cloneString(words[1]); el->oldSize = atoi(words[2]); el->newName = cloneString(words[3]); el->newSize = atoi(words[4]); if (wordCount >= 6) { char c = words[5][0]; if (c == '+' || c == '-') el->strand = c; else errAbort("Expecting + or - field 6, line %d of %s", lf->lineIx, lf->fileName); } else el->strand = '+'; slAddHead(&list, el); } slReverse(&list); lineFileClose(&lf); if (list == NULL) errAbort("Empty liftSpec file %s", fileName); return list; }
void tempLower(char *inName, char *insertsFile, char *outName) /* tempLower - Remove centromeres etc. from oo.18 cytobands. */ { struct lineFile *lf = lineFileOpen(inName, TRUE); FILE *f = mustOpen(outName, "w"); int wordCount, i; int start, end, offset; char *chrom; char *words[128]; int count = 0, liftCount = 0; struct chromGaps *cg; setupHugeGaps(insertsFile); while ((wordCount = lineFileChop(lf, words)) != 0) { chrom = words[0]; start = atoi(words[1]); end = atoi(words[2]); cg = hashFindVal(hugeHash, chrom); if (cg != NULL) { offset = gapOffset(cg, start); if (offset != 0) { start += offset; end += offset; liftCount += 1; } } fprintf(f, "%s\t%d\t%d", chrom, start, end); for (i=3; i<wordCount; ++i) fprintf(f, "\t%s", words[i]); fprintf(f, "\n"); ++count; } printf("Lifted %d of %d lines of %s to %s\n", liftCount, count, inName, outName); fclose(f); lineFileClose(&lf); }
struct clone *readTrans(char *fileName) /* Read info in trans file. */ { char cloneName[128], lastCloneName[128]; struct clone *cloneList = NULL, *clone = NULL; struct frag *frag; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *words[8], *parts[4], *subParts[3]; int wordCount, partCount, subCount; strcpy(lastCloneName, ""); while ((wordCount = lineFileChop(lf, words)) != 0) { lineFileExpectWords(lf, 3, wordCount); partCount = chopString(words[2], "(:)", parts, ArraySize(parts)); if (partCount != 2) errAbort("Badly formatted third field line %d of %s", lf->lineIx, lf->fileName); subCount = chopString(parts[1], ".", subParts, ArraySize(subParts)); if (subCount != 2) errAbort("Badly formatted third field line %d of %s (expecting start..end)", lf->lineIx, lf->fileName); fragToCloneName(words[0], cloneName); if (!sameString(cloneName, lastCloneName)) { AllocVar(clone); clone->name = cloneString(cloneName); slAddHead(&cloneList, clone); } AllocVar(frag); frag->name = cloneString(words[0]); frag->ffaName = cloneString(words[1]); frag->start = lineFileNeedNum(lf, subParts, 0) - 1; frag->end = lineFileNeedNum(lf, subParts, 1); slAddTail(&clone->fragList, frag); strcpy(lastCloneName, cloneName); } lineFileClose(&lf); slReverse(&cloneList); return cloneList; }
struct chain *chainReadChainLine(struct lineFile *lf) /* Read line that starts with chain. Allocate memory * and fill in values. However don't read link lines. */ { char *row[13]; int wordCount; struct chain *chain; wordCount = lineFileChop(lf, row); if (wordCount == 0) return NULL; if (wordCount < 12) errAbort("Expecting at least 12 words line %d of %s", lf->lineIx, lf->fileName); if (!sameString(row[0], "chain")) errAbort("Expecting 'chain' line %d of %s", lf->lineIx, lf->fileName); AllocVar(chain); chain->score = atof(row[1]); chain->tName = cloneString(row[2]); chain->tSize = lineFileNeedNum(lf, row, 3); if (wordCount >= 13) chain->id = lineFileNeedNum(lf, row, 12); else chainIdNext(chain); /* skip tStrand for now, always implicitly + */ chain->tStart = lineFileNeedNum(lf, row, 5); chain->tEnd = lineFileNeedNum(lf, row, 6); chain->qName = cloneString(row[7]); chain->qSize = lineFileNeedNum(lf, row, 8); chain->qStrand = row[9][0]; chain->qStart = lineFileNeedNum(lf, row, 10); chain->qEnd = lineFileNeedNum(lf, row, 11); if (chain->qStart >= chain->qEnd || chain->tStart >= chain->tEnd) errAbort("End before start line %d of %s", lf->lineIx, lf->fileName); if (chain->qStart < 0 || chain->tStart < 0) errAbort("Start before zero line %d of %s", lf->lineIx, lf->fileName); if (chain->qEnd > chain->qSize || chain->tEnd > chain->tSize) errAbort("Past end of sequence line %d of %s", lf->lineIx, lf->fileName); return chain; }
struct agpFrag *readAgpFile(char *agpName)
/* Read agps from file.
 * Returns the fragment lines of agpName as a list of agpFrags in file order.
 * Lines whose fifth word starts with 'N' or 'U' are gaps and are skipped;
 * every other line is passed through lineFileExpectWords with 9 and loaded
 * with agpFragLoad.  Aborts on lines with fewer than five words. */
{
struct lineFile *lf = lineFileOpen(agpName, TRUE);
int wordCount;
char *words[16];
struct agpFrag *list = NULL, *el;

while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* The component type in words[4] is examined first, so make sure the
     * line is long enough to have one. */
    lineFileExpectAtLeast(lf, 5, wordCount);
    /* 'U' lines are treated as gaps too; they used to fail the nine word
     * check below. */
    if (words[4][0] != 'N' && words[4][0] != 'U')
        {
        lineFileExpectWords(lf, 9, wordCount);
        el = agpFragLoad(words);
        slAddHead(&list, el);
        }
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}