void doOneChrom(char *database, char *chrom, char *rnaTable, char *expTable, FILE *f)
/* Process one chromosome. */
{
int chromSize = hChromSize(database, chrom);
struct binKeeper *bk = binKeeperNew(0, chromSize);
struct sqlConnection *conn = hAllocConn(database);
struct sqlResult *sr;
char **row;
struct bed *exp, *rna;
int rowOffset;
struct binElement *be, *beList;
int oneCount;

/* Load up expTable into bin-keeper. */
sr = hChromQuery(conn, expTable, chrom, NULL, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
    {
    exp = bedLoadN(row + rowOffset, 12);
    binKeeperAdd(bk, exp->chromStart, exp->chromEnd, exp);
    }
sqlFreeResult(&sr);

/* Loop through rnaTable and look at intersections. */
sr = hChromQuery(conn, rnaTable, chrom, NULL, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
    {
    rna = bedLoadN(row + rowOffset, 12);
    beList = binKeeperFind(bk, rna->chromStart, rna->chromEnd);
    oneCount = 0;
    for (be = beList; be != NULL; be = be->next)
        {
	exp = be->val;
	if (exp->strand[0] == rna->strand[0])
	    {
	    ++oneCount;
	    ++hitCount;
//	    fprintf(f, "%s:%d-%d\t%s\t%s\n", 
//	    	rna->chrom, rna->chromStart, rna->chromEnd, rna->name, exp->name);
	    }
	}
    slFreeList(&beList);
    if (oneCount == 0)
	{
        ++missCount;
	fprintf(f, "miss %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name);
	}
    else if (oneCount == 1)
	{
	fprintf(f, "uniq %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name);
        ++uniqCount;
	}
    else
	{
	fprintf(f, "dupe %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name);
        ++dupeCount;
	}
    }
sqlFreeResult(&sr);
hFreeConn(&conn);
}
void getBinKeeper(char *chromName)
/* put SNPs in binKeeper */
{
char query[512];
struct sqlConnection *conn = hAllocConn();
struct sqlResult *sr;
char **row;

int start = 0;
int end = 0;
char *rsId = NULL;

int chromSize = hChromSize(chromName);

verbose(1, "constructing binKeeper...\n");
snps = binKeeperNew(0, chromSize);
safef(query, sizeof(query), 
      "select chromStart, chromEnd, name from %s where chrom = '%s'", snpTable, chromName);

sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    start = sqlUnsigned(row[0]);
    end = sqlUnsigned(row[1]);
    rsId = cloneString(row[2]);
    binKeeperAdd(snps, start, end, rsId);
    }

sqlFreeResult(&sr);
hFreeConn(&conn);
}
Ejemplo n.º 3
0
struct hash *readChainToBinKeeper(char *sizeFileName, char *fileName)
{
struct binKeeper *bk; 
struct chain *chain;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct lineFile *sf = lineFileOpen(sizeFileName, TRUE);
struct hash *hash = newHash(0);
char *chromRow[2];

while (lineFileRow(sf, chromRow))
    {
    char *name = chromRow[0];
    int size = lineFileNeedNum(sf, chromRow, 1);

    if (hashLookup(hash, name) != NULL)
        warn("Duplicate %s, ignoring all but first\n", name);
    else
        {
        bk = binKeeperNew(0, size);
        assert(size > 1);
	hashAdd(hash, name, bk);
        }
    }
while ((chain = chainRead(lf)) != NULL)
    {
    bk = hashMustFindVal(hash, chain->tName);
    binKeeperAdd(bk, chain->tStart, chain->tEnd, chain);
    }
lineFileClose(&lf);
return hash;
}
struct binKeeper *fbToBinKeeper(struct featureBits *fbList, int chromSize)
/* Make a binKeeper filled with fbList. */
{
struct binKeeper *bk = binKeeperNew(0, chromSize);
struct featureBits *fb;
for (fb = fbList; fb != NULL; fb = fb->next)
    binKeeperAdd(bk, fb->start, fb->end, fb);
return bk;
}
Ejemplo n.º 5
0
struct altGraphX *agFromAlignments(char *db, struct ggMrnaAli *maList, struct dnaSeq *seq, struct sqlConnection *conn,
				   int chromStart, int chromEnd, FILE *out )
/** Custer overlaps from maList into altGraphX structure. */
{
struct altGraphX *ag = NULL, *agList = NULL;
struct ggMrnaCluster *mcList=NULL, *mc=NULL;
struct ggMrnaInput *ci = NULL;
struct geneGraph *gg = NULL;
static int count = 0;
ci = ggMrnaInputFromAlignments(maList, seq);
mcList = ggClusterMrna(ci);
if(mcList == NULL)
    {	
    freeGgMrnaInput(&ci);
    return NULL;
    }    

clusterCount++;
for(mc = mcList; mc != NULL; mc = mc->next)
    {
    if(optionExists("consensus"))
	{
	gg = ggGraphConsensusCluster(db, mc, ci, tissLibHash, !optionExists("skipTissues"));
	}
    else
	gg = ggGraphCluster(db, mc,ci);
    assert(checkEvidenceMatrix(gg));
    ag = ggToAltGraphX(gg);
    if(ag != NULL)
	{
	char name[256];
	freez(&ag->name);
	safef(name, sizeof(name), "%s.%d", ag->tName, count++);
	ag->name = cloneString(name);
	/* Convert back to genomic coordinates. */
	altGraphXoffset(ag, chromStart);
	/* Sort vertices so that they are chromosomal order */
 	altGraphXVertPosSort(ag);
	/* write to file */
	binKeeperAdd(agxSeenBin, ag->tStart, ag->tEnd, ag);
	slAddHead(&agList, ag);
	}
    }

/* Sometimes get nested, partial transcripts. Want to filter 
   those out. */
for(ag = agList; ag != NULL; ag = ag->next)
    {
    if(!agxIsRedundant(ag))
       altGraphXTabOut(ag, out); 
    }
/* genoSeq and maList are freed with ci and gg */
ggFreeMrnaClusterList(&mcList);
freeGgMrnaInput(&ci);
freeGeneGraph(&gg);
return agList;
}
Ejemplo n.º 6
0
struct hash *readBed(char *fileName)
/* Read bed and return it as a hash keyed by chromName
 * with binKeeper values. */
{
char *row[5];
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *hash = newHash(0);
int expectedCols = bScore ? 5 : 3;

while (lineFileNextRow(lf, row, expectedCols))
    {
    struct binKeeper *bk;
    struct bed5 *bed;
    struct hashEl *hel = hashLookup(hash, row[0]);
    if (hel == NULL)
       {
       bk = binKeeperNew(0, 1024*1024*1024);
       hel = hashAdd(hash, row[0], bk);
       }
    bk = hel->val;
    AllocVar(bed);
    bed->chrom = hel->name;
    bed->start = lineFileNeedNum(lf, row, 1);
    bed->end = lineFileNeedNum(lf, row, 2);
    if (bScore)
	bed->score = lineFileNeedNum(lf, row, 4);
    if (bed->start > bed->end)
        errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
    if (bed->start == bed->end)
	{
	if (allowStartEqualEnd)
	    // Note we are tweaking binKeeper coords here, so use bed->start and bed->end.
	    binKeeperAdd(bk, max(0, bed->start-1), bed->end+1, bed);
	else
	    lineFileAbort(lf, "start==end (if this is legit, use -allowStartEqualEnd)");
	}
    else
	binKeeperAdd(bk, bed->start, bed->end, bed);
    }
lineFileClose(&lf);
return hash;
}
Ejemplo n.º 7
0
void bestProbeOverlap(struct sqlConnection *conn, char *probeTable, 
	struct genePred *gpList, struct hash *gpToProbeHash)
/* Create hash of most overlapping probe if any for each gene. Require
 * at least 100 base overlap. */
{
/* Create a hash of binKeepers filled with probes. */
struct hash *keeperHash = keepersForChroms(conn);
struct hashCookie it = hashFirst(keeperHash);
struct hashEl *hel;
int pslCount = 0;
while ((hel = hashNext(&it)) != NULL)
    {
    char *chrom = hel->name;
    struct binKeeper *bk = hel->val;
    int rowOffset;
    struct sqlResult *sr = hChromQuery(conn, probeTable, chrom, NULL, &rowOffset);
    char **row;
    while ((row = sqlNextRow(sr)) != NULL)
        {
	struct psl *psl = pslLoad(row+rowOffset);
	binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
	++pslCount;
	}
    sqlFreeResult(&sr);
    }
verbose(2, "Loaded %d psls from %s\n", pslCount, probeTable);

/* Loop through gene list, finding best probe if any for each gene. */
struct genePred *gp;
for (gp = gpList; gp != NULL; gp = gp->next)
    {
    struct rbTree *rangeTree = genePredToRangeTree(gp, FALSE);
    struct psl *bestPsl = NULL;
    int bestOverlap = 99;	/* MinOverlap - 1 */
    struct binKeeper *bk = hashMustFindVal(keeperHash, gp->chrom);
    struct binElement *bin, *binList = binKeeperFind(bk, gp->txStart, gp->txEnd);
    for (bin = binList; bin != NULL; bin = bin->next)
        {
	struct psl *psl = bin->val;
	if (psl->strand[0] == gp->strand[0])
	    {
	    int overlap = pslRangeTreeOverlap(psl, rangeTree);
	    if (overlap > bestOverlap)
		{
		bestOverlap = overlap;
		bestPsl = psl;
		}
	    }
	}
    if (bestPsl != NULL)
        hashAdd(gpToProbeHash, gp->name, bestPsl->qName);
    }
}
Ejemplo n.º 8
0
struct hash *readBed(char *fileName)
/* Read in bed file into hash of binKeepers keyed by
 * target. */
{
struct lineFile *lf = NULL;
struct hash *hash = newHash(0);
char *row[3];
struct chromInfo *ciList = NULL, *ci;
int count = 0, chromCount = 0;

/* Make first pass through just figuring out maximum size
 * of each chromosome info. */
lf = lineFileOpen(fileName, TRUE);
while (lineFileRow(lf, row))
    {
    char *chrom = row[0];
    int e = lineFileNeedNum(lf, row, 2);
    ci = hashFindVal(hash, chrom);
    if (ci == NULL)
        {
	AllocVar(ci);
	hashAddSaveName(hash, chrom, ci, &ci->name);
	slAddHead(&ciList, ci);
	++chromCount;
	}
    if (e > ci->maxEnd)
        ci->maxEnd = e;
    ++count;
    }
lineFileClose(&lf);

/* Allocate binKeeper on each chromosome. */
for (ci = ciList; ci != NULL; ci = ci->next)
    {
    ci->bk = binKeeperNew(0, ci->maxEnd);
    }

/* Make second pass filling in binKeeper */
lf = lineFileOpen(fileName, TRUE);
while (lineFileRow(lf, row))
    {
    char *chrom = row[0];
    int s = lineFileNeedNum(lf, row, 1);
    int e = lineFileNeedNum(lf, row, 2);
    ci = hashMustFindVal(hash, chrom);
    binKeeperAdd(ci->bk, s, e, NULL);
    }
lineFileClose(&lf);
printf("Read %d items in %d target chromosomes from %s\n", 
	count, chromCount, fileName);
return hash;
}
Ejemplo n.º 9
0
struct hash *bedsIntoKeeperHash(struct bed *bedList)
/* Create a hash full of bin keepers (one for each chromosome or contig.
 * The binKeepers are full of beds. */
{
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *bkHash = minChromSizeKeeperHash(sizeHash);
struct bed *bed;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    struct binKeeper *bk = hashMustFindVal(bkHash, bed->chrom);
    binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
    }
hashFree(&sizeHash);
return bkHash;
}
Ejemplo n.º 10
0
struct mouseChromCache *newMouseChromCache(char *chrom, int chromSize, 
	char *ratMouseDir)
/* Create a new chromCache. */
{
struct mouseChromCache *mcc;
char fileName[512];
struct lineFile *lf;
char *row[3];
int start,end;
long long *pPos;

/* Open up file with actual alignments.  Warn and return NULL
 * if it doesn't exist. */
sprintf(fileName, "%s/%s.axt", ratMouseDir, chrom);
lf = lineFileMayOpen(fileName, TRUE);

/* Allocate structure and store basic info in it. */
AllocVar(mcc);
mcc->name = cloneString(chrom);
mcc->size = chromSize;
mcc->lf = lf;
if (lf == NULL)
    {
    warn("%s doesn't exist", fileName);
    if (!noDieMissing)
        noWarnAbort(); 
    return mcc;
    }

/* Read index file into bk. */
sprintf(fileName, "%s/%s.axt.ix", ratMouseDir, chrom);
mcc->bk = binKeeperNew(0, chromSize);
lf = lineFileOpen(fileName, TRUE);
verbose(1, "Reading %s\n", fileName);
while (lineFileRow(lf, row))
    {
    start = lineFileNeedNum(lf, row, 0);
    end = lineFileNeedNum(lf, row, 1) + start;
    AllocVar(pPos);
    *pPos = atoll(row[2]);
    binKeeperAdd(mcc->bk, start, end, pPos);
    }
lineFileClose(&lf);

/* Return initialized object. */
return mcc;
}
Ejemplo n.º 11
0
struct binKeeper *loadAxtsIntoRange(char *fileName, char *tPrefix, char *qPrefix)
/* Read in an axt file and shove it into a bin-keeper. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct binKeeper *bk = binKeeperNew(0, maxChromSize);
struct axt *axt;
int count = 0;

while ((axt = axtRead(lf)) != NULL)
    {
    binKeeperAdd(bk, axt->tStart, axt->tEnd, axt);
    ++count;
    }
uglyf("LOaded %d from %s\n", count, fileName);
lineFileClose(&lf);
return bk;
}
void chromKeeperAdd(char *chrom, int chromStart, int chromEnd, void *val)
/* Add an item to the chromKeeper. */
{
int i=0;
boolean added = FALSE;
for(i=0; i<chromCount; i++)
    {
    if(sameString(chrom,chromNames[i])) 
	{
	binKeeperAdd(chromRanges[i], chromStart, chromEnd, val);
	added=TRUE;
	break;
	}
    }
if(!added)
    errAbort("chromKeeper::chromKeeperAdd() - Don't recognize chrom %s", chrom);
}
Ejemplo n.º 13
0
struct hash *netToBkHash(char *netFile)
/* Read net file into a hash full of binKeepers keyed by chromosome.
 * The binKeepers are full of nets. */
{
struct hash *netHash = hashNew(0);
struct lineFile *lf = lineFileOpen(netFile, TRUE);
struct chainNet *net, *netList = chainNetRead(lf);
for (net = netList; net != NULL; net = net->next)
    {
    if (hashLookup(netHash, net->name))
        errAbort("%s has multiple %s records", netFile, net->name);
    struct binKeeper *bk = binKeeperNew(0, net->size);
    hashAdd(netHash, net->name, bk);
    struct cnFill *fill;
    for(fill=net->fillList; fill != NULL; fill = fill->next)
	binKeeperAdd(bk, fill->tStart, fill->tStart+fill->tSize, fill);
    }
lineFileClose(&lf);
return netHash;                
}
Ejemplo n.º 14
0
struct binKeeper *readRepeats2(char *chrom, char *rmskFileName, struct hash *tSizeHash)
/* read all repeats for a chromosome of size size, returns results in binKeeper structure for fast query*/
{
    boolean rmskRet;
    struct lineFile *rmskF = NULL;
    struct rmskOut2 *rmsk;
    struct binKeeper *bk; 
    int size;

    size = hashIntVal(tSizeHash, chrom);
    bk = binKeeperNew(0, size);
    assert(size > 1);
    rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet);
    while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL)
        {
        binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk);
        }
    lineFileClose(&rmskF);
    return bk;
}
Ejemplo n.º 15
0
struct hash *bedsIntoHashOfKeepers(struct bed *bedList)
/* Return a hash full of binKeepers, keyed by chromosome (or contig)
 * that contains the bedList */
{
struct hash *sizeHash = chromMinSizeHash(bedList);
struct hash *keeperHash = hashNew(16);
struct bed *bed;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    struct binKeeper *keeper = hashFindVal(keeperHash, bed->chrom);
    if (keeper == NULL)
        {
	struct minChromSize *chrom = hashMustFindVal(sizeHash, bed->chrom);
	keeper = binKeeperNew(0, chrom->minSize);
	hashAdd(keeperHash, chrom->name, keeper);
	}
    binKeeperAdd(keeper, bed->chromStart, bed->chromEnd, bed);
    }
hashFree(&sizeHash);
return keeperHash;
}
Ejemplo n.º 16
0
static struct cdsExon *loadExon(struct gene *gene, struct binKeeper *chrBins,
                                struct genePred *gp, int iExon, int start, int end,
                                int cdsOff)
/* load information about an exon into various structures */
{
struct cdsExon *exon;
checkOverlap(gene->genes, chrBins, gp, start, end);
lmAllocVar(gene->genes->memPool, exon);
exon->gene = gene;
exon->chromStart = start;
exon->chromEnd = end;
exon->frame = gp->exonFrames[iExon];
if (exon->gene->strand == '+')
    exon->exonNum = iExon;
else
    exon->exonNum = (gp->exonCount-1) - iExon;
exon->cdsOff = cdsOff;
binKeeperAdd(chrBins, start, end, exon);
slAddHead(&gene->exons, exon);
return exon;
}
Ejemplo n.º 17
0
void loadPslsFromFile(char *pslFile, char *chrom, struct sqlConnection *conn)
/** Load the psls from the directed file (instead of the database. */
{
struct psl *psl = NULL, *pslNext = NULL, *pslList = NULL;
pslList = pslLoadAll(pslFile);
for(psl = pslList; psl != NULL; psl = psl->next)
    {
    minPslStart = min(psl->tStart, minPslStart);
    maxPslEnd = max(psl->tEnd, maxPslEnd);
    }
chromPslBin = binKeeperNew(minPslStart, maxPslEnd);
agxSeenBin = binKeeperNew(minPslStart, maxPslEnd);
for(psl = pslList; psl != NULL; psl = pslNext)
    {
    pslNext = psl->next;
    if(sameString(psl->tName, chrom))
	binKeeperAdd(chromPslBin, psl->tStart, psl->tEnd, psl);
    else
	pslFree(&psl);
    }
}
struct genePred *loadGenePred(char *database, char *chrom, char *track, struct binKeeper *bk)
/* Load in a gene prediction track to bk. */
{
struct sqlConnection *conn = hAllocConn(database);
struct sqlResult *sr;
char **row;
int rowOffset;
struct genePred *list = NULL, *el;

sr = hChromQuery(conn, track, chrom, NULL, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
    {
    el = genePredLoad(row + rowOffset);
    binKeeperAdd(bk, el->txStart, el->txEnd, el);
    slAddHead(&list, el);
    }
sqlFreeResult(&sr);
hFreeConn(&conn);
slReverse(&list);
return list;
}
Ejemplo n.º 19
0
struct hash *txgIntoKeeperHash(struct txGraph *txgList)
/* Create a hash full of bin keepers (one for each chromosome or contig.
 * The binKeepers are full of txGraphs. */
{
struct hash *sizeHash = txgChromMinSizeHash(txgList);
struct hash *bkHash = hashNew(16);
struct txGraph *txg;
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    struct binKeeper *bk = hashFindVal(bkHash, txg->tName);
    if (bk == NULL)
        {
	struct minChromSize *chrom = hashMustFindVal(sizeHash, txg->tName);
	verbose(3, "New binKeeper for %s\n", txg->tName);
	bk = binKeeperNew(0, chrom->minSize);
	hashAdd(bkHash, txg->tName, bk);
	}
    binKeeperAdd(bk, txg->tStart, txg->tEnd, txg);
    }
hashFree(&sizeHash);
return bkHash;
}
Ejemplo n.º 20
0
struct hash *readLiftOverMapChainHash(char *fileName)
/* taken from kent/src/hg/lib/liftOver.c */
/* Read map file into hashes. */
{
    struct hash *chainHash = hashNew(10);
    struct lineFile *lf = lineFileOpen(fileName, TRUE);
    struct chain *chain;
    struct liftOverChromMap *map;
    
    while ((chain = chainRead(lf)) != NULL)
    {
	if ((map = hashFindVal(chainHash, chain->tName)) == NULL)
	{
	    AllocVar(map);
	    map->bk = binKeeperNew(0, chain->tSize);
	    hashAddSaveName(chainHash, chain->tName, map, &map->name);
	}
	binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain);
    }
    lineFileClose(&lf);
    return chainHash;
}
void addGenePred(struct hash *chromHash, char **row)
/* add a genePred's exons to the approriate binkeeper object in hash */
{
struct genePred *gene = genePredLoad(row);
int iExon;
struct binKeeper *chromBins = getChromBins(chromHash, gene->chrom,
                                           gene->strand);
struct geneLoc *geneLoc = geneLocNew(chromHash->lm, gene->name, gene->chrom,
                                     gene->strand, gene->txStart, gene->txEnd);
for (iExon = 0; iExon < gene->exonCount; iExon++)
    {
    int exonStart = gene->exonStarts[iExon];
    int exonEnd = gene->exonEnds[iExon];
    if (gCdsOnly)
        {
        exonStart = max(exonStart, gene->cdsStart);
        exonEnd = min(exonEnd, gene->cdsEnd);
        }
    if (exonStart < exonEnd)
        binKeeperAdd(chromBins, exonStart, exonEnd, geneLoc);
    }
genePredFree(&gene);
}
Ejemplo n.º 22
0
void loadPslsFromDatabase(struct sqlConnection *conn, char *db, char *chrom) 
/** Load all of the desired alignments into the chromkeeper structure
    from the desired pslTables. */
{
int i = 0;
struct sqlResult *sr = NULL;
char **row = NULL;
int rowOffset = 0;
struct psl *pslList = NULL, *psl = NULL;
for(i = 0; i < numDbTables; i++)
    {
    sr = hChromQuery(conn, dbTables[i], chrom, NULL, &rowOffset); 
    while((row = sqlNextRow(sr)) != NULL)
	{
	psl = pslLoad(row+rowOffset);
	slAddHead(&pslList, psl);
	minPslStart = min(psl->tStart, minPslStart);
	maxPslEnd = max(psl->tEnd, maxPslEnd);
	/* This just adds the mrna twice to the list, cheat way to add more
	   weight to certain tables. */
	if(weightMrna && (stringIn("refSeqAli", dbTables[i]) || stringIn("mrna", dbTables[i])))
	    {
	    psl = clonePsl(psl);
	    slAddHead(&pslList, psl);
	    }
	}
    sqlFreeResult(&sr);
    }

chromPslBin = binKeeperNew(minPslStart, maxPslEnd);
agxSeenBin = binKeeperNew(minPslStart, maxPslEnd);
for(psl = pslList; psl != NULL; psl = psl->next)
    {
    binKeeperAdd(chromPslBin, psl->tStart, psl->tEnd, psl);
    }
}
Ejemplo n.º 23
0
struct hash *readRepeatsAll2(char *sizeFileName, char *rmskDir)
/* read all repeats for a all chromosomes getting sizes from sizeFileNmae , returns results in hash of binKeeper structure for fast query*/
{
boolean rmskRet;
struct binKeeper *bk; 
struct lineFile *rmskF = NULL;
struct rmskOut2 *rmsk;
struct lineFile *lf = lineFileOpen(sizeFileName, TRUE);
struct hash *hash = newHash(0);
char *row[2];
char rmskFileName[256];

while (lineFileRow(lf, row))
    {
    char *name = row[0];
    int size = lineFileNeedNum(lf, row, 1);

    if (hashLookup(hash, name) != NULL)
        warn("Duplicate %s, ignoring all but first\n", name);
    else
        {
        bk = binKeeperNew(0, size);
        assert(size > 1);
        safef(rmskFileName, sizeof(rmskFileName), "%s/%s.fa.out",rmskDir,name);
        rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet);
        while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL)
            {
            binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk);
            }
        lineFileClose(&rmskF);
	hashAdd(hash, name, bk);
        }
    }
lineFileClose(&lf);
return hash;
}
void oneChrom(char *database, char *chrom, char *refAliTrack, char *bedTrack,
              struct hash *otherHash, struct stats *stats)
/* Process one chromosome. */
{
    struct bed *bedList = NULL, *bed;
    struct sqlConnection *conn = hAllocConn(database);
    struct sqlResult *sr;
    char **row;
    int rowOffset;
    int chromSize = hChromSize(database, chrom);
    struct binKeeper *bk = binKeeperNew(0, chromSize);
    struct psl *pslList = NULL;
    struct dnaSeq *chromSeq = NULL;

    if (endsWith(bedTrack, ".bed"))
    {
        struct lineFile *lf = lineFileOpen(bedTrack, TRUE);
        char *row[3];
        while (lineFileRow(lf, row))
        {
            if (sameString(chrom, row[0]))
            {
                bed = bedLoad3(row);
                slAddHead(&bedList, bed);
            }
        }
        lineFileClose(&lf);
    }
    else
    {
        sr = hChromQuery(conn, bedTrack, chrom, NULL, &rowOffset);
        while ((row = sqlNextRow(sr)) != NULL)
        {
            bed = bedLoad3(row+rowOffset);
            slAddHead(&bedList, bed);
        }
        sqlFreeResult(&sr);
    }
    slReverse(&bedList);
    uglyf("Loaded beds\n");

    sr = hChromQuery(conn, refAliTrack, chrom, NULL, &rowOffset);
    while ((row = sqlNextRow(sr)) != NULL)
    {
        struct psl *psl = pslLoad(row + rowOffset);
        slAddHead(&pslList, psl);
        binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
    }
    sqlFreeResult(&sr);
    uglyf("Loaded psls\n");

    chromSeq = hLoadChrom(database, chrom);
    /* Fetch entire chromosome into memory. */
    uglyf("Loaded human seq\n");

    for (bed = bedList; bed != NULL; bed = bed->next)
    {
        struct binElement *el, *list = binKeeperFind(bk, bed->chromStart, bed->chromEnd);
        for (el = list; el != NULL; el = el->next)
        {
            struct psl *fullPsl = el->val;
            struct psl *psl = pslTrimToTargetRange(fullPsl,
                                                   bed->chromStart, bed->chromEnd);
            if (psl != NULL)
            {
                foldPslIntoStats(psl, chromSeq, otherHash, stats);
                pslFree(&psl);
            }
        }
        slFreeList(&list);
        stats->bedCount += 1;
        stats->bedBaseCount += bed->chromEnd - bed->chromStart;
        sqlFreeResult(&sr);
    }
    freeDnaSeq(&chromSeq);
    pslFreeList(&pslList);
    binKeeperFree(&bk);
    hFreeConn(&conn);
}
void txGeneCanonical(char *codingCluster, char *infoFile, 
	char *noncodingGraph, char *genesBed, char *nearCoding, 
	char *outCanonical, char *outIsoforms, char *outClusters)
/* txGeneCanonical - Pick a canonical version of each gene - that is the form
 * to use when just interested in a single splicing varient. Produces final
 * transcript clusters as well. */
{
/* Read in input into lists in memory. */
struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster);
struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph);
struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12);
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct bed *nearList = bedLoadNAll(nearCoding, 12);

/* Make hash of all beds. */
struct hash *bedHash = hashNew(18);
for (bed = bedList; bed != NULL; bed = bed->next)
    hashAdd(bedHash, bed->name, bed);

/* Make has of all info. */
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Make a binKeeper structure that we'll populate with coding genes. */
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *keeperHash = minChromSizeKeeperHash(sizeHash);

/* Make list of coding genes and toss them into binKeeper.
 * This will eat up bed list, but bedHash is ok. */
struct gene *gene, *geneList = NULL;
for (coding = codingList; coding != NULL; coding = coding->next)
    {
    gene = geneFromCluster(coding, bedHash, infoHash);
    slAddHead(&geneList, gene);
    struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom);
    binKeeperAdd(bk, gene->start, gene->end, gene);
    }

/* Go through near-coding genes and add them to the coding gene
 * they most overlap. */
for (bed = nearList; bed != NULL; bed = nextBed)
    {
    nextBed = bed->next;
    gene = mostOverlappingGene(keeperHash, bed);
    if (gene == NULL)
        errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name);
    geneAddBed(gene, bed);
    }

/* Add non-coding genes. */
for (graph = graphList; graph != NULL; graph = graph->next)
    {
    gene = geneFromGraph(graph, bedHash);
    slAddHead(&geneList, gene);
    }

/* Sort so it all looks nicer. */
slSort(&geneList, geneCmp);

/* Open up output files. */
FILE *fCan = mustOpen(outCanonical, "w");
FILE *fIso = mustOpen(outIsoforms, "w");
FILE *fClus = mustOpen(outClusters, "w");

/* Loop through, making up gene name, and writing output. */
int geneId = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Make up name. */
    char name[16];
    safef(name, sizeof(name), "g%05d", ++geneId);

    /* Reverse transcript list just to make it look better. */
    slReverse(&gene->txList);

    /* Write out canonical file output */
    bed = hashMustFindVal(bedHash, gene->niceTx->name);
    fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n",
    	bed->chrom, bed->chromStart, bed->chromEnd, geneId,
	gene->niceTx->name, gene->niceTx->name);

    /* Write out isoforms output. */
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fIso, "%d\t%s\n", geneId, bed->name);

    /* Write out cluster output, starting with bed 6 standard fields. */
    fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t",
    	gene->chrom, gene->start, gene->end, name, 0, gene->strand);

    /* Write out thick-start/thick end. */
    if (gene->isCoding)
        {
	int thickStart = gene->end, thickEnd  = gene->start;
	for (bed = gene->txList; bed != NULL; bed = bed->next)
	    {
	    if (bed->thickStart < bed->thickEnd)
	        {
		thickStart = min(thickStart, bed->thickStart);
		thickEnd = max(thickEnd, bed->thickEnd);
		}
	    }
	fprintf(fClus, "%d\t%d\t", thickStart, thickEnd);
	}
    else
        {
	fprintf(fClus, "%d\t%d\t", gene->start, gene->start);
	}

    /* We got no rgb value, just write out zero. */
    fprintf(fClus, "0\t");

    /* Get exons from exonTree. */
    struct range *exon, *exonList = rangeTreeList(gene->exonTree);
    fprintf(fClus, "%d\t", slCount(exonList));
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->start - gene->start);
    fprintf(fClus, "\t");
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->end - exon->start);
    fprintf(fClus, "\t");

    /* Write out associated transcripts. */
    fprintf(fClus, "%d\t", slCount(gene->txList));
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fClus, "%s,", bed->name);
    fprintf(fClus, "\t");

    /* Write out nice value */
    fprintf(fClus, "%s\t", gene->niceTx->name);

    /* Write out coding/noncoding value. */
    fprintf(fClus, "%d\n", gene->isCoding);
    }

/* Close up files. */
carefulClose(&fCan);
carefulClose(&fIso);
carefulClose(&fClus);
}
void sortGenes(struct sqlConnection *conn)
/* Put up sort gene page. */
{
cartWebStart(cart, database, "Finding Candidate Genes for Gene Sorter");
if (!hgNearOk(database))
    errAbort("Sorry, gene sorter not available for this database.");

/* Get list of regions. */
struct genoGraph *gg = ggFirstVisible();
double threshold = getThreshold();
struct bed3 *bed, *bedList = regionsOverThreshold(gg);

/* Figure out what table and column are the sorter's main gene set. */
struct hash *genomeRa = hgReadRa(genome, database, "hgNearData", 
	"genome.ra", NULL);
char *geneTable = hashMustFindVal(genomeRa, "geneTable");
char *idColumn = hashMustFindVal(genomeRa, "idColumn");

/* if marker labels were present when the file was uploaded, they are saved here */
char cgmName[256];
safef(cgmName, sizeof(cgmName), "%s.cgm", gg->binFileName);
struct lineFile *m = lineFileMayOpen(cgmName, TRUE);
char *cgmRow[4];
cgmRow[0] = "";    /* dummy row */
cgmRow[1] = "";
cgmRow[2] = "0";
cgmRow[3] = "0";

FILE *g = NULL;
int markerCount = 0;
struct tempName snpTn;

if (m)
    {
    /* Create custom column output file. */
    trashDirFile(&snpTn, "hgg", "marker", ".mrk");  
    g = mustOpen(snpTn.forCgi, "w");
    fprintf(g, 
	"column name=\"%s Markers\" shortLabel=\"%s Markers over threshold\" longLabel=\"%s Markers in regions over threshold\" " 
	"visibility=on priority=99 "
        "\n"
        , gg->shortLabel
        , gg->shortLabel
        , gg->shortLabel
	);
    }

/*** Build up hash of all transcriptHash that are in region. */
struct hash *transcriptHash = hashNew(16);

/* This loop handles one chromosome at a time.  It depends on
 * the bedList being sorted by chromosome. */
for (bed = bedList; bed != NULL; )
    {

    /* Make binKeeper and stuff in all regions in this chromosome into it. */
    char *chrom = bed->chrom;
    int chromSize = hChromSize(database, chrom);
    struct binKeeper *bk = binKeeperNew(0, chromSize);
    while (bed != NULL && sameString(chrom, bed->chrom))
	{
	binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
	bed = bed->next;
	}

    struct binKeeper *bkGenes = NULL;
    if (m)
       bkGenes = binKeeperNew(0, chromSize);

    /* Query database to find out bounds of all genes on this chromosome
     * and if they overlap any of the regions then put them in the hash. */
    char query[512];
    safef(query, sizeof(query), 
    	"select name,txStart,txEnd from %s where chrom='%s'", geneTable, chrom);
    struct sqlResult *sr = sqlGetResult(conn, query);
    char **row;
    while ((row = sqlNextRow(sr)) != NULL)
        {
	char *name = row[0];
	int start = sqlUnsigned(row[1]);
	int end = sqlUnsigned(row[2]);
	if (binKeeperAnyOverlap(bk, start, end))
	    {
	    hashStore(transcriptHash, name);
	    if (m)
		binKeeperAdd(bkGenes, start, end, cloneString(name));
	    }
	}
    sqlFreeResult(&sr);

    if (m)
	{
	/* Read cgm file if it exists, looking at all markers on this chromosome
	 * and if they overlap any of the regions and genes then output them. */
	do 
	    {
	    // marker, chrom, chromStart, val
	    char *marker = cgmRow[0];
	    char *chr = cgmRow[1];
	    int start = sqlUnsigned(cgmRow[2]);
	    int end = start+1;
	    double val = sqlDouble(cgmRow[3]);
            int cmp = strcmp(chr,chrom);
            if (cmp > 0)
                break;
            if (cmp == 0)
		{
		if (val >= threshold)
		    {
		    struct binElement *el, *bkList = binKeeperFind(bkGenes, start, end);
		    for (el = bkList; el; el=el->next)
			{
			/* output to custom column trash file */
			fprintf(g, "%s %s\n", (char *)el->val, marker);
			}
		    if (bkList)
			{
			++markerCount;
			slFreeList(&bkList);
			}
		    }
		}
	    }
	while (lineFileRow(m, cgmRow));
	}

    /* Clean up for this chromosome. */
    binKeeperFree(&bk);

    if (m)
	{
	/* For speed, we do not free up the values (cloned the kg names earlier) */
	binKeeperFree(&bkGenes);  
	}

    }

/* Get list of all transcripts in regions. */
struct hashEl *el, *list = hashElListHash(transcriptHash);

/* Create file with all matching gene IDs. */
struct tempName keyTn;
trashDirFile(&keyTn, "hgg", "key", ".key");
FILE *f = mustOpen(keyTn.forCgi, "w");
for (el = list; el != NULL; el = el->next)
    fprintf(f, "%s\n", el->name);
carefulClose(&f);

/* Print out some info. */
hPrintf("Thresholding <i>%s</i> at %g. ", gg->shortLabel, threshold);
hPrintf("There are %d regions covering %lld bases.<BR>\n",
    slCount(bedList), bedTotalSize((struct bed*)bedList) );
hPrintf("Installed a Gene Sorter filter that selects only genes in these regions.<BR>\n");
if (m)
    {
    hPrintf("There are %d markers in the regions over threshold that overlap knownGenes.<BR>\n", markerCount);
    hPrintf("Installed a Gene Sorter custom column called \"%s Markers\" with these markers.<BR>\n", gg->shortLabel);
    }

/* close custom column output file */
if (m)
    {
    lineFileClose(&m);
    carefulClose(&g);
    }

/* Stuff cart variable with name of file. */
char keyCartName[256];
safef(keyCartName, sizeof(keyCartName), "%s%s.keyFile",
	advFilterPrefix, idColumn);
cartSetString(cart, keyCartName, keyTn.forCgi);

cartSetString(cart, customFileVarName, snpTn.forCgi);

char snpVisCartNameTemp[256];
char *snpVisCartName = NULL;
safef(snpVisCartNameTemp, sizeof(snpVisCartNameTemp), "%s%s Markers.vis",
	colConfigPrefix, gg->shortLabel);
snpVisCartName = replaceChars(snpVisCartNameTemp, " ", "_");
cartSetString(cart, snpVisCartName, "1");
freeMem(snpVisCartName);

hPrintf("<FORM ACTION=\"../cgi-bin/hgNear\" METHOD=GET>\n");
cartSaveSession(cart);
hPrintf("<CENTER>");
cgiMakeButton("submit", "go to gene sorter");
hPrintf("</CENTER>");
hPrintf("</FORM>");

cartWebEnd();
}