void gapFileToTable(struct sqlConnection *conn, char *gapFileName, char *gapTableName) /* Build a single gap table from a single gap file. */ { struct lineFile *lf = lineFileOpen(gapFileName, TRUE); char tabFileName[256]; FILE *tabFile = NULL; char *words[16]; int wordCount; safef(tabFileName, sizeof(tabFileName), "%s.tab", gapTableName); tabFile = mustOpen(tabFileName, "w"); while ((wordCount = lineFileChop(lf, words)) > 0) { if (wordCount < 5) errAbort("Short line %d of %s", lf->lineIx, lf->fileName); if (words[4][0] == 'N' || words[4][0] == 'U') { int len = strlen(words[0]); if (len > maxChromNameSize) { maxChromNameSize = len; if (maxChromNameSize > 254) errAbort("ERROR: chrom name size is over 254(%d) characters: " "'%s'", maxChromNameSize, words[0]); } struct agpGap gap; agpGapStaticLoad(words, &gap); gap.chromStart -= 1; fprintf(tabFile, "%u\t", hFindBin(gap.chromStart, gap.chromEnd)); agpGapTabOut(&gap, tabFile); } } lineFileClose(&lf); fclose(tabFile); if (! noLoad) { struct dyString *ds = newDyString(2048); if (unsplit) sqlDyStringPrintf(ds, createGapUnsplit, gapTableName, maxChromNameSize, maxChromNameSize); else sqlDyStringPrintf(ds, createGapSplit, gapTableName); char query[1024]; sqlRemakeTable(conn, gapTableName, ds->string); sqlSafef(query, sizeof(query), "LOAD data local infile '%s' into table %s", tabFileName, gapTableName); sqlUpdate(conn, query); remove(tabFileName); freeDyString(&ds); } }
void getSeqGapsUnsplit(struct sqlConnection *conn, struct hash *chromHash) /* Return a tree of ranges for sequence gaps in all chromosomes, * assuming an unsplit gap table -- when the table is unsplit, it's * probably for a scaffold assembly where we *really* don't want * to do one query per scaffold! */ { struct rbTreeNode **stack = lmAlloc(qLm, 256 * sizeof(stack[0])); struct rbTree *tree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack); int rowOffset = hOffsetPastBin(sqlGetDatabase(conn), NULL, "gap"); struct sqlResult *sr; char **row; char *prevChrom = NULL; sr = sqlGetResult(conn, "NOSQLINJ select * from gap order by chrom"); while ((row = sqlNextRow(sr)) != NULL) { struct agpGap gap; struct simpleRange *range; agpGapStaticLoad(row+rowOffset, &gap); if (prevChrom == NULL) prevChrom = cloneString(gap.chrom); else if (! sameString(prevChrom, gap.chrom)) { setNGap(prevChrom, chromHash, tree); freeMem(prevChrom); stack = lmAlloc(qLm, 256 * sizeof(stack[0])); tree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack); prevChrom = cloneString(gap.chrom); } lmAllocVar(tree->lm, range); range->start = gap.chromStart; range->end = gap.chromEnd; rbTreeAdd(tree, range); } if (prevChrom != NULL) { setNGap(prevChrom, chromHash, tree); freeMem(prevChrom); } sqlFreeResult(&sr); }
struct rbTree *getSeqGaps(struct sqlConnection *conn, char *chrom) /* Return a tree of ranges for sequence gaps in chromosome */ { struct rbTree *tree = rbTreeNew(simpleRangeCmp); int rowOffset; struct sqlResult *sr = hChromQuery(conn, "gap", chrom, NULL, &rowOffset); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct agpGap gap; struct simpleRange *range; agpGapStaticLoad(row+rowOffset, &gap); lmAllocVar(tree->lm, range); range->start = gap.chromStart; range->end = gap.chromEnd; rbTreeAdd(tree, range); } sqlFreeResult(&sr); return tree; }
int countBases(struct sqlConnection *conn, char *chrom, int chromSize, char *database) /* Count bases, generally not including gaps, in chromosome. */ { static boolean gapsLoaded = FALSE; struct sqlResult *sr; int totalGaps = 0; char **row; int rowOffset; if (countGaps) return chromSize; /* If doing all chroms, then load up all the gaps and be done with * it instead of re-reading the gap table for every chrom */ if (sameWord(clChrom,"all")) { if (!gapsLoaded) gapHash = loadAllGaps(conn, database); gapsLoaded = TRUE; totalGaps = hashIntValDefault(gapHash, chrom, 0); } else { sr = hChromQuery(conn, "gap", chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { int gapSize; struct agpGap gap; agpGapStaticLoad(row+rowOffset, &gap); gapSize = gap.chromEnd - gap.chromStart; totalGaps += gapSize; } sqlFreeResult(&sr); } return chromSize - totalGaps; }
long long gapsInRegion(struct sqlConnection *conn, struct region *regionList, int limit)
/* Return the total number of gap bases that fall inside the regions of
 * regionList.  At most limit regions are examined; limit == 0 means no
 * limit.  Gaps extending past a region boundary are clipped to the region
 * before being counted.  Returns 0 when the gap table for this database
 * (as named by chromTable) does not exist. */
{
long long total = 0;
char *gapTable = chromTable(conn, "gap");
struct region *region = regionList;
int seen = 0;

if (!sqlTableExists(conn, gapTable))
    {
    freez(&gapTable);
    return total;
    }

while (region != NULL && (!limit || seen < limit))
    {
    int rowOffset;
    char **row;
    struct agpGap gap;
    struct sqlResult *sr = hRangeQuery(conn, "gap", region->chrom,
                                       region->start, region->end,
                                       NULL, &rowOffset);

    for (row = sqlNextRow(sr);  row != NULL;  row = sqlNextRow(sr))
        {
        agpGapStaticLoad(row + rowOffset, &gap);
        /* Clip the gap to the region. */
        if (gap.chromStart < region->start)
            gap.chromStart = region->start;
        if (gap.chromEnd > region->end)
            gap.chromEnd = region->end;
        total += gap.chromEnd - gap.chromStart;
        }
    sqlFreeResult(&sr);

    region = region->next;
    ++seen;
    }

freez(&gapTable);
return total;
}
static struct hash *loadAllGaps(struct sqlConnection *conn, char *db) /* working on all chroms, fetch all per-chrom gap counts at once * returns hash by chrom name to gap counts for that chrom */ { struct chromInfo *cInfo; struct sqlResult *sr; char **row; struct hash *ret; int totalGapSize = 0; int gapCount = 0; ret = newHash(0); /* If not split, read in whole gulp, create per-chrom hash of sizes */ if (hTableExists(db, "gap")) { char *prevChrom = NULL; int totalGapsThisChrom = 0; sr = sqlGetResult(conn, NOSQLINJ "select chrom,chromStart,chromEnd from gap order by chrom"); while ((row = sqlNextRow(sr)) != NULL) { int gapSize = sqlUnsigned(row[2]) - sqlUnsigned(row[1]); ++gapCount; if (prevChrom && sameWord(prevChrom,row[0])) { totalGapsThisChrom += gapSize; totalGapSize += gapSize; } else { if (prevChrom) { hashAddInt(ret, prevChrom, totalGapsThisChrom); freeMem(prevChrom); prevChrom = cloneString(row[0]); totalGapsThisChrom = gapSize; totalGapSize += gapSize; } else { prevChrom = cloneString(row[0]); totalGapsThisChrom = gapSize; totalGapSize += gapSize; } } } /* and the last one */ if (prevChrom && (totalGapsThisChrom > 0)) { hashAddInt(ret, prevChrom, totalGapsThisChrom); freeMem(prevChrom); } sqlFreeResult(&sr); } else { /* for each chrom name, fetch the gap count */ for (cInfo = chromInfoList; cInfo != NULL; cInfo = cInfo->next) { int rowOffset; int totalGapsThisChrom = 0; sr = hChromQuery(conn, "gap", cInfo->chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { int gapSize; struct agpGap gap; ++gapCount; agpGapStaticLoad(row+rowOffset, &gap); gapSize = gap.chromEnd - gap.chromStart; totalGapsThisChrom += gapSize; totalGapSize += gapSize; } sqlFreeResult(&sr); hashAddInt(ret, cInfo->chrom, totalGapsThisChrom); } } verbose(2,"#\tloaded %d gaps covering %d bases\n", gapCount, totalGapSize); return ret; }
void splitAgp(char *agpName, char *goldFileName, char *gapFileName) /* Split up agp file into gold and gap files. */ { struct lineFile *lf; char *words[16]; int wordCount; FILE *goldTab, *gapTab; /* Scan through .agp file splitting it into gold * and gap components. */ goldTab = mustOpen(goldFileName, "w"); gapTab = mustOpen(gapFileName, "w"); lf = lineFileOpen(agpName, TRUE); while ((wordCount = lineFileChop(lf, words)) > 0) { int start, end; if (wordCount < 5) errAbort("Short line %d of %s", lf->lineIx, lf->fileName); int len = strlen(words[0]); if (len > maxChromNameSize) { maxChromNameSize = len; if (maxChromNameSize > 254) errAbort("ERROR: chrom name size is over 254(%d) characters: " "'%s'", maxChromNameSize, words[0]); } start = sqlUnsigned(words[1])-1; end = sqlUnsigned(words[2]); if (words[4][0] == 'N' || words[4][0] == 'U') { struct agpGap gap; agpGapStaticLoad(words, &gap); gap.chromStart -= 1; fprintf(gapTab, "%u\t", hFindBin(start, end)); agpGapTabOut(&gap, gapTab); verbose(3,"#GAP\t%s:%d-%d\n", gap.chrom, gap.chromStart, gap.chromEnd); } else { struct agpFrag gold; agpFragStaticLoad(words, &gold); agpFragValidate(&gold); len = strlen(words[5]); if (len > maxFragNameSize) { maxFragNameSize = len; if (maxFragNameSize > 254) errAbort("ERROR: fragment name size is over 254(%d) " "characters: '%s'", maxFragNameSize, words[5]); } // file is 1-based. agpFragLoad() now assumes 0-based. // and agpFragTabOut() will assume 1-based, but we will load // the generated file straight into the database, so // subtract 2: gold.chromStart -= 2; gold.fragStart -= 2; fprintf(goldTab, "%u\t", hFindBin(start, end)); agpFragTabOut(&gold, goldTab); } } lineFileClose(&lf); carefulClose(&goldTab); carefulClose(&gapTab); }