Exemple #1
0
void bedIntersect(char *aFile, char *bFile, char *outFile)
/* bedIntersect - Intersect two bed files.
 * Every line of aFile is chopped into words (tab-only when strictTab is
 * set) and its chrom/start/end are used to query the per-chromosome
 * binKeeper that readBed() built from bFile.  With aHitAny set, the a-line
 * is written at most once, for the first b item whose getCov() coverage
 * reaches minCoverage; b items that fall short are reported via verbose().
 * Otherwise the a-line is written once for each qualifying b item. */
{
struct lineFile *lf = lineFileOpen(aFile, TRUE);
struct hash *bHash = readBed(bFile);
FILE *f = mustOpen(outFile, "w");
char *row[40];
int wordCount;

for (;;)
    {
    if (strictTab)
        wordCount = lineFileChopTab(lf, row);
    else
        wordCount = lineFileChop(lf, row);
    if (wordCount == 0)
        break;

    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
        errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
    if (start == end && !allowStartEqualEnd)
        lineFileAbort(lf, "start==end (if this is legit, use -allowStartEqualEnd)");

    struct binKeeper *bk = hashFindVal(bHash, chrom);
    if (bk == NULL)
        continue;   /* Nothing stored for this chromosome. */

    /* A zero-length item is looked up with one base of padding on
     * each side of it. */
    int pad = (allowStartEqualEnd && start == end) ? 1 : 0;
    struct binElement *hitList = binKeeperFind(bk, start - pad, end + pad);
    struct binElement *hit;

    if (!aHitAny)
        {
        /* Report every b item that is covered well enough. */
        for (hit = hitList; hit != NULL; hit = hit->next)
            {
            if (getCov(start, end, hit->val) >= minCoverage)
                outputBed(f, row, wordCount, start, end, hit->val);
            }
        }
    else
        {
        /* Report only the first b item that is covered well enough. */
        for (hit = hitList; hit != NULL; hit = hit->next)
            {
            float cov = getCov(start, end, hit->val);
            if (!(cov >= minCoverage))
                {
                struct bed5 *b = hit->val;
                verbose(1, "filter out %s %d %d %d %d overlap %d %d %d %.3f\n",
                        chrom, start, end, b->start, b->end,
                        positiveRangeIntersection(start, end, b->start, b->end),
                        end-start, b->end-b->start, cov);
                continue;
                }
            outputBed(f, row, wordCount, start, end, hit->val);
            break;
            }
        }
    slFreeList(&hitList);
    }
}
Exemple #2
0
boolean hasRetainedIntron(struct bed *bed, struct hash *altSpliceHash)
/* Return TRUE if some block of bed strictly encloses, on the same strand,
 * an item named "retainedIntron" held in the binKeeper that altSpliceHash
 * stores for bed's chromosome.  Returns FALSE when there is no keeper
 * for that chromosome. */
{
struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom);
if (keeper == NULL)
    return FALSE;

boolean found = FALSE;
int blockIx = 0;
while (!found && blockIx < bed->blockCount)
    {
    int start = bed->chromStarts[blockIx] + bed->chromStart;
    int end = start + bed->blockSizes[blockIx];
    struct binElement *hitList = binKeeperFind(keeper, start, end);
    struct binElement *hit = hitList;
    while (hit != NULL && !found)
        {
        struct bed *intron = hit->val;
        if (sameString(intron->name, "retainedIntron")
            && intron->strand[0] == bed->strand[0]
            && start < intron->chromStart
            && end > intron->chromEnd)
            found = TRUE;
        hit = hit->next;
        }
    slFreeList(&hitList);
    ++blockIx;
    }
return found;
}
Exemple #3
0
int countMatchingIntrons(struct bed *bed, char *type, struct hash *altSpliceHash)
/* Count the gaps between consecutive blocks of bed for which the keeper
 * for bed's chromosome holds an item named type, on the same strand, with
 * exactly the same start and end as the gap.  Gaps of three bases or less
 * are never counted, and each gap is counted at most once. */
{
struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom);
if (keeper == NULL)
    return 0;

int matches = 0;
int gapCount = bed->blockCount - 1;
int gapIx;
for (gapIx = 0; gapIx < gapCount; ++gapIx)
    {
    int start = bed->chromStarts[gapIx] + bed->blockSizes[gapIx] + bed->chromStart;
    int end = bed->chromStart + bed->chromStarts[gapIx+1];
    struct binElement *hitList = binKeeperFind(keeper, start, end);
    struct binElement *hit;
    for (hit = hitList; hit != NULL; hit = hit->next)
        {
        struct bed *intron = hit->val;
        if (!sameString(intron->name, type))
            continue;
        if (intron->strand[0] != bed->strand[0])
            continue;
        if (start != intron->chromStart || end != intron->chromEnd)
            continue;
        if (end - start > 3)
            {
            ++matches;
            break;
            }
        }
    slFreeList(&hitList);
    }
return matches;
}
Exemple #4
0
struct bed *mostOverlappingBed(struct bed *ref, struct hash *geneHash, double *retRatio)
/* Find most overlapping gene to ref.
 * Looks in the binKeeper that geneHash holds for ref's chromosome and
 * considers only items on the same strand as ref.  Returns the item with
 * the largest bedOverlapRatio() against ref, or NULL when nothing has a
 * ratio above zero.  *retRatio is always set: to the winning ratio, or to
 * 0 when NULL is returned.  The returned bed is owned by the keeper. */
{
struct bed *bestBed = NULL;
double bestRatio = 0;
struct binKeeper *bk = hashFindVal(geneHash, ref->chrom);
if (bk != NULL)
    {
    struct binElement *el, *list = binKeeperFind(bk, ref->chromStart, ref->chromEnd);
    for (el = list; el != NULL; el = el->next)
        {
        struct bed *bed = el->val;
        if (bed->strand[0] != ref->strand[0])
            continue;
        double ratio = bedOverlapRatio(ref, bed);
        if (ratio > bestRatio)
            {
            bestRatio = ratio;
            bestBed = bed;
            }
        }
    /* The hit list is allocated for us by binKeeperFind; only the
     * wrappers are freed here, not the beds they point to. */
    slFreeList(&list);
    }
*retRatio = bestRatio;
return bestBed;
}
struct bed *findCompatible(struct bed *newBed, struct hash *oldHash, struct hash *usedHash)
/* Try and find an old bed compatible with new bed.
 * Candidates are the items overlapping newBed in the keeper oldHash holds
 * for its chromosome.  A candidate must be on the same strand, must not
 * have its name in usedHash, and must pass bedCompatibleExtension() or
 * endUtrChangeOnly().  Among those, the one whose total block size is
 * closest to newBed's wins (first one found on ties).  Returns NULL if
 * there is no keeper or no acceptable candidate. */
{
struct binKeeper *bk = hashFindVal(oldHash, newBed->chrom);
int smallestDiff = BIGNUM;
struct bed *closest = NULL;
if (bk == NULL)
    return NULL;

struct binElement *hitList = binKeeperFind(bk, newBed->chromStart, newBed->chromEnd);
struct binElement *hit;
for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct bed *oldBed = hit->val;
    if (oldBed->strand[0] != newBed->strand[0])
        continue;
    if (hashLookup(usedHash, oldBed->name))
        continue;
    if (!(bedCompatibleExtension(oldBed, newBed) || endUtrChangeOnly(oldBed, newBed)))
        continue;

    int sizeDiff = bedTotalBlockSize(oldBed) - bedTotalBlockSize(newBed);
    if (sizeDiff < 0)
        sizeDiff = -sizeDiff;
    if (sizeDiff < smallestDiff)
        {
        smallestDiff = sizeDiff;
        closest = oldBed;
        }
    }
slFreeList(&hitList);
return closest;
}
struct bed *findMostOverlapping(struct bed *bed, struct hash *keeperHash)
/* Try find most overlapping thing to bed in keeper hash.
 * Only same-strand items in the keeper for bed's chromosome are scored,
 * using bedSameStrandOverlap().  Returns the item with the largest
 * positive overlap, or NULL if there is no keeper or no overlap above 0. */
{
struct binKeeper *bk = hashFindVal(keeperHash, bed->chrom);
if (bk == NULL)
    return NULL;

struct bed *winner = NULL;
int winnerOverlap = 0;
struct binElement *hitList = binKeeperFind(bk, bed->chromStart, bed->chromEnd);
struct binElement *hit;
for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct bed *candidate = hit->val;
    if (candidate->strand[0] != bed->strand[0])
        continue;
    int overlap = bedSameStrandOverlap(candidate, bed);
    if (overlap > winnerOverlap)
        {
        winnerOverlap = overlap;
        winner = candidate;
        }
    }
slFreeList(&hitList);
return winner;
}
Exemple #7
0
void axtAndBed(char *inAxt, char *inBed, char *outAxt)
/* axtAndBed - Intersect an axt with a bed file and output axt.
 * For every axt record read from inAxt, the bed items overlapping its
 * target range are merged into maximal covered runs, and the part of the
 * alignment under each run is written to outAxt with axtSubsetOnT(). */
{
struct hash *tHash = readBed(inBed); /* target keyed */
struct lineFile *lf = lineFileOpen(inAxt, TRUE);
FILE *f = mustOpen(outAxt, "w");
struct axtScoreScheme *ss = axtScoreSchemeDefault();
struct axt *axt;

while ((axt = axtRead(lf)) != NULL)
    {
    struct binElement *hitList = NULL, *hit;
    struct chromInfo *ci = hashFindVal(tHash, axt->tName);
    if (ci != NULL)
        hitList = binKeeperFind(ci->bk, axt->tStart, axt->tEnd);
    if (hitList != NULL)
        {
        int tStart = axt->tStart;
        int tSize = axt->tEnd - tStart;

        /* One flag per target base, plus a trailing zero slot so that
         * a run reaching the last base is still closed by the scan. */
        char *covered = NULL;
        AllocArray(covered, tSize+1);
        for (hit = hitList; hit != NULL; hit = hit->next)
            {
            int relStart = hit->start - tStart;
            int relEnd = hit->end - tStart;
            if (relStart < 0)
                relStart = 0;
            if (relEnd > tSize)
                relEnd = tSize;
            if (relEnd - relStart > 0)
                memset(covered + relStart, 1, relEnd - relStart);
            }

        /* Emit a subset alignment each time a run of 1s ends. */
        int pos, runStart = 0;
        char prev = 0;
        for (pos = 0; pos <= tSize; ++pos)
            {
            char cur = covered[pos];
            if (cur && !prev)
                {
                runStart = pos;
                prev = cur;
                }
            else if (!cur && prev)
                {
                axtSubsetOnT(axt, runStart+tStart, pos+tStart, ss, f);
                prev = cur;
                }
            }
        freez(&covered);
        slFreeList(&hitList);
        }
    axtFree(&axt);
    }
}
void doOneChrom(char *database, char *chrom, char *rnaTable, char *expTable, FILE *f)
/* Process one chromosome. */
{
int chromSize = hChromSize(database, chrom);
struct binKeeper *bk = binKeeperNew(0, chromSize);
struct sqlConnection *conn = hAllocConn(database);
struct sqlResult *sr;
char **row;
struct bed *exp, *rna;
int rowOffset;
struct binElement *be, *beList;
int oneCount;

/* Load up expTable into bin-keeper. */
sr = hChromQuery(conn, expTable, chrom, NULL, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
    {
    exp = bedLoadN(row + rowOffset, 12);
    binKeeperAdd(bk, exp->chromStart, exp->chromEnd, exp);
    }
sqlFreeResult(&sr);

/* Loop through rnaTable and look at intersections. */
sr = hChromQuery(conn, rnaTable, chrom, NULL, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
    {
    rna = bedLoadN(row + rowOffset, 12);
    beList = binKeeperFind(bk, rna->chromStart, rna->chromEnd);
    oneCount = 0;
    for (be = beList; be != NULL; be = be->next)
        {
	exp = be->val;
	if (exp->strand[0] == rna->strand[0])
	    {
	    ++oneCount;
	    ++hitCount;
//	    fprintf(f, "%s:%d-%d\t%s\t%s\n", 
//	    	rna->chrom, rna->chromStart, rna->chromEnd, rna->name, exp->name);
	    }
	}
    slFreeList(&beList);
    if (oneCount == 0)
	{
        ++missCount;
	fprintf(f, "miss %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name);
	}
    else if (oneCount == 1)
	{
	fprintf(f, "uniq %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name);
        ++uniqCount;
	}
    else
	{
	fprintf(f, "dupe %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name);
        ++dupeCount;
	}
    }
sqlFreeResult(&sr);
hFreeConn(&conn);
}
struct bed *findExact(struct bed *newBed, struct hash *oldHash, struct hash *usedHash)
/* Try and find an old bed identical with new bed.
 * Scans the items overlapping newBed in the keeper oldHash holds for its
 * chromosome and returns the first one that is on the same strand, whose
 * name is not in usedHash, and for which bedExactMatch() is true.
 * Returns NULL if there is no keeper or no such item. */
{
struct binKeeper *bk = hashFindVal(oldHash, newBed->chrom);
if (bk == NULL)
    return NULL;

struct bed *found = NULL;
struct binElement *hitList = binKeeperFind(bk, newBed->chromStart, newBed->chromEnd);
struct binElement *hit;
for (hit = hitList; hit != NULL && found == NULL; hit = hit->next)
    {
    struct bed *oldBed = hit->val;
    if (oldBed->strand[0] != newBed->strand[0])
        continue;
    if (hashLookup(usedHash, oldBed->name))
        continue;
    if (bedExactMatch(oldBed, newBed))
        found = oldBed;
    }
slFreeList(&hitList);
return found;
}
char *findCommonName(struct bed *range, struct binKeeper *knownBk,
        struct hash *refLinkHash)
/* Try and find a common name for range based on overlap with
 * known genes.  If exactly one same-strand gene in knownBk overlaps range
 * and refLinkHash has an entry under that gene's name, the entry's name
 * is returned; in every other case range's own name is returned. */
{
struct refLink *link = NULL;
int sameStrandCount = 0;
struct binElement *hitList = binKeeperFind(knownBk, range->chromStart, range->chromEnd);
struct binElement *hit;

for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct genePred *gp = hit->val;
    if (gp->strand[0] != range->strand[0])
        continue;
    ++sameStrandCount;
    link = hashFindVal(refLinkHash, gp->name);
    }
slFreeList(&hitList);

if (sameStrandCount != 1 || link == NULL)
    return range->name;
return link->name;
}
Exemple #11
0
enum remapResult remapBase(struct hash *chainHash, char *orig_chrom, int orig_base, char **dest_chrom, int *dest_base)
{
    struct liftOverChromMap *map = hashFindVal(chainHash, orig_chrom);
    struct binElement *list = NULL;
    struct chain *chainHit = NULL;
    struct chain *toFree;
    struct chain *subChain;
    int start = orig_base, end = start+1;
    if (map)
	list = binKeeperFind(map->bk, start, start+1);
    if (!list)
	return deleted;
    else if (list->next != NULL)
    {
	slFreeList(&list);
	return duplicated;    
    }
    chainHit = list->val;
    if (!mapThroughChain(chainHit, 1, &start, &end, &subChain, &toFree))
    {
	slFreeList(&list);
	return problem;
    }
    chainFree(&toFree);
    *dest_base = start;
    *dest_chrom = chainHit->qName;
    slFreeList(&list);
    return lifted;
}
void bestProbeOverlap(struct sqlConnection *conn, char *probeTable, 
        struct genePred *gpList, struct hash *gpToProbeHash)
/* Create hash of most overlapping probe if any for each gene. Require
 * at least 100 base overlap.
 * All psl rows of probeTable are loaded into per-chromosome binKeepers.
 * Then, for each gene in gpList, the same-strand psl with the largest
 * pslRangeTreeOverlap() against the gene's range tree is chosen and its
 * qName is stored in gpToProbeHash under the gene's name.  The stored
 * value is the psl's own qName pointer, so the loaded psls and their
 * keepers are deliberately left allocated. */
{
/* Create a hash of binKeepers filled with probes. */
struct hash *keeperHash = keepersForChroms(conn);
struct hashCookie it = hashFirst(keeperHash);
struct hashEl *hel;
int pslCount = 0;
while ((hel = hashNext(&it)) != NULL)
    {
    char *chrom = hel->name;
    struct binKeeper *bk = hel->val;
    int rowOffset;
    struct sqlResult *sr = hChromQuery(conn, probeTable, chrom, NULL, &rowOffset);
    char **row;
    while ((row = sqlNextRow(sr)) != NULL)
        {
        struct psl *psl = pslLoad(row+rowOffset);
        binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
        ++pslCount;
        }
    sqlFreeResult(&sr);
    }
verbose(2, "Loaded %d psls from %s\n", pslCount, probeTable);

/* Loop through gene list, finding best probe if any for each gene. */
const int minOverlap = 100;
struct genePred *gp;
for (gp = gpList; gp != NULL; gp = gp->next)
    {
    struct rbTree *rangeTree = genePredToRangeTree(gp, FALSE);
    struct psl *bestPsl = NULL;
    int bestOverlap = minOverlap - 1;   /* Only strictly larger overlaps win. */
    struct binKeeper *bk = hashMustFindVal(keeperHash, gp->chrom);
    struct binElement *bin, *binList = binKeeperFind(bk, gp->txStart, gp->txEnd);
    for (bin = binList; bin != NULL; bin = bin->next)
        {
        struct psl *psl = bin->val;
        if (psl->strand[0] != gp->strand[0])
            continue;
        int overlap = pslRangeTreeOverlap(psl, rangeTree);
        if (overlap > bestOverlap)
            {
            bestOverlap = overlap;
            bestPsl = psl;
            }
        }
    if (bestPsl != NULL)
        hashAdd(gpToProbeHash, gp->name, bestPsl->qName);

    /* Per-gene scratch data: the hit list wrappers and the range tree. */
    slFreeList(&binList);
    rbTreeFree(&rangeTree);
    }
}
int bkCountOverlappingRange(struct binKeeper *bk, int start, int end)
/* Return biggest overlap of anything in binKeeper with given range.
 * The overlap of each item is measured with rangeIntersection(); the
 * result is 0 when nothing in bk has a positive overlap with start-end. */
{
struct binElement *el, *list = binKeeperFind(bk, start, end);
int bestOverlap = 0;

for (el = list; el != NULL; el = el->next)
    {
    int overlap = rangeIntersection(el->start, el->end, start, end);
    if (overlap > bestOverlap)
        bestOverlap = overlap;
    }
/* binKeeperFind hands back a freshly allocated list of wrappers. */
slFreeList(&list);
return bestOverlap;
}
Exemple #14
0
struct psl *getPslsFromCache(char *chrom, int chromStart, int chromEnd)
/** Collect the psls stored in the global chromPslBin keeper that overlap
    chromStart-chromEnd.  The chrom argument is not consulted.  The psls
    are chained together through their own next fields, each new hit being
    placed at the head of the returned list. */
{
struct psl *resultList = NULL;
struct binElement *hitList = binKeeperFind(chromPslBin, chromStart, chromEnd);
struct binElement *hit;
for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct psl *cached = hit->val;
    slAddHead(&resultList, cached);
    }
slFreeList(&hitList);
return resultList;
}
void checkForClusters(char *chromName)
/* describe collisions
 * For every row of snpTable on chromName, look up the items in the global
 * snps keeper that overlap the row's range and write one tab-separated
 * line (chrom, start, end, name of the overlapping item) to
 * outputFileHandle for each overlapping item whose name differs from the
 * row's own name.  Candidate and match totals are reported via verbose. */
{
char query[512];
struct sqlConnection *conn = hAllocConn();
struct sqlResult *sr;
char **row;
int candidateCount = 0;
int matchCount = 0;

verbose(1, "checking for collisions...\n");
safef(query, sizeof(query), 
      "select chromStart, chromEnd, name from %s where chrom = '%s'", snpTable, chromName);

sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    int start = sqlUnsigned(row[0]);
    int end = sqlUnsigned(row[1]);
    /* The row stays valid until the next sqlNextRow call, so the name
     * can be compared in place without making a copy. */
    char *rsId = row[2];
    struct binElement *el, *elList = binKeeperFind(snps, start, end);

    candidateCount++;
    for (el = elList; el != NULL; el = el->next)
        {
        char *matchName = (char *)el->val;
        /* skip self hits */
        if (sameString(matchName, rsId))
            continue;
        fprintf(outputFileHandle, "%s\t%d\t%d\t%s\n", chromName, start, end, matchName);
        matchCount++;
        }
    slFreeList(&elList);
    }

sqlFreeResult(&sr);
hFreeConn(&conn);
verbose(1, "  candidate count = %d\n", candidateCount);
verbose(1, "  match count = %d\n", matchCount);
}
Exemple #16
0
int addIntronBleed(struct bed *bed, struct hash *altSpliceHash)
/* Return the number of bases at start or end that bleed into introns
 * of other, probably better, transcripts.
 * Only the first and the last block of bed are examined.  For each, the
 * first overlapping same-strand item named "bleedingExon" is the only one
 * looked at: it contributes its length if it shares the block's start and
 * ends before it (first block), or shares the block's end and starts
 * after its start (last block).  Single-block beds always give 0. */
{
struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom);
if (keeper == NULL)
    return 0;

int lastBlock = bed->blockCount - 1;
if (lastBlock <= 0)
    return 0;   /* Single exon (or empty) case. */

int total = 0;
int which;
for (which = 0; which < 2; ++which)
    {
    int isFirst = (which == 0);
    int blockIx = isFirst ? 0 : lastBlock;
    int start = bed->chromStarts[blockIx] + bed->chromStart;
    int end = start + bed->blockSizes[blockIx];
    struct binElement *hitList = binKeeperFind(keeper, start, end);
    struct binElement *hit;
    for (hit = hitList; hit != NULL; hit = hit->next)
        {
        struct bed *bleeder = hit->val;
        if (!sameString(bleeder->name, "bleedingExon"))
            continue;
        if (bleeder->strand[0] != bed->strand[0])
            continue;
        if (isFirst)
            {
            /* First block, want start to be same */
            if (bleeder->chromStart == start && bleeder->chromEnd < end)
                total += bleeder->chromEnd - bleeder->chromStart;
            }
        else
            {
            /* Last block, want end to be same */
            if (bleeder->chromEnd == end && bleeder->chromStart > start)
                total += bleeder->chromEnd - bleeder->chromStart;
            }
        break;  /* Only the first same-strand bleeder is considered. */
        }
    slFreeList(&hitList);
    }
return total;
}
Exemple #17
0
struct cnFill *netFillAt(char *chrom, int start, int end, struct hash *netHash)
/* Get list of highest level fill for net at given position.
 * The fills are the values stored in the keeper netHash holds for chrom;
 * they are linked through their own next fields, in the same order in
 * which binKeeperFind reported them.  Returns NULL if chrom has no keeper
 * or nothing overlaps start-end. */
{
struct binKeeper *bk = hashFindVal(netHash, chrom);
if (bk == NULL)
    return NULL;

struct cnFill *result = NULL;
struct binElement *hitList = binKeeperFind(bk, start, end);
struct binElement *hit;
for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct cnFill *fill = hit->val;
    slAddHead(&result, fill);
    }
slReverse(&result);
slFreeList(&hitList);
return result;
}
void findOverlapingExons(struct geneLoc **geneLocList,
                         struct binKeeper *chromBins,
                         int exonStart, int exonEnd)
/* Find overlaping exons, add their genes to the list if not already there.
 * Each item of chromBins overlapping exonStart-exonEnd is tested with
 * overlapsByThreshold(); for those that pass, the reported overlap length
 * is added to the gene's numOverlap and the gene is put at the head of
 * *geneLocList unless containsGeneLoc() says it is already present. */
{
struct binElement *overExons = binKeeperFind(chromBins, exonStart, exonEnd);
struct binElement *overExon;
int overLen;
for (overExon = overExons; overExon != NULL; overExon = overExon->next)
    {
    if (overlapsByThreshold(overExon, exonStart, exonEnd, &overLen))
        {
        struct geneLoc *gl = overExon->val;
        gl->numOverlap += overLen;
        if (!containsGeneLoc(geneLocList, gl))
            slAddHead(geneLocList, gl);
        }
    }
/* Free only the hit wrappers; the geneLocs they point to live on. */
slFreeList(&overExons);
}
Exemple #19
0
struct txGraph *agxForCoordinates(char *chrom, int chromStart, int chromEnd, char strand, 
                                    struct hash *orthoChromHash)
/* Get list of graphs that cover a particular region.
 * Graphs come from the keeper orthoChromHash holds for chrom and are kept
 * only when the first character of their strand equals strand.  They are
 * linked through their own next fields.  Returns NULL when chrom has no
 * keeper or nothing qualifies. */
{
struct binKeeper *bk = hashFindVal(orthoChromHash, chrom);
if (bk == NULL)
    return NULL;

struct txGraph *graphList = NULL;
struct binElement *hitList = binKeeperFind(bk, chromStart, chromEnd);
struct binElement *hit;
for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct txGraph *graph = hit->val;
    if (graph->strand[0] == strand)
        slSafeAddHead(&graphList, graph);
    }
slReverse(&graphList);
slFreeList(&hitList);
return graphList;
}
Exemple #20
0
boolean agxIsRedundant(struct altGraphX *agx)
/** Return TRUE if there has already been an altGraphX record that
    was a superSet of this data. */
{
struct binElement *be = NULL, *beList = NULL;
struct altGraphX *agxSeen = NULL;
boolean alreadySeen = FALSE;
beList = binKeeperFind(agxSeenBin, agx->tStart, agx->tEnd);
for(be = beList; be != NULL; be = be->next)
    {
    agxSeen = (struct altGraphX *)be->val;
    if(agxSeen == agx) 
	continue;
    if(agxIsSubset(agx, agxSeen)) 
	{
	alreadySeen = TRUE;
	break;
	}
    }
slFreeList(&beList);
return alreadySeen;
}
struct gene *mostOverlappingGene(struct hash *keeperHash, struct bed *bed)
/* Find most overlapping gene in hash of keepers of genes.
 * The keeper for bed's chromosome must exist (hashMustFindVal).  Only
 * genes whose strand equals the first character of bed's strand are
 * scored, with bedRangeTreeOverlap() against the gene's exonTree.
 * Returns the gene with the largest positive overlap, or NULL. */
{
struct binKeeper *bk = hashMustFindVal(keeperHash, bed->chrom);
struct binElement *hitList = binKeeperFind(bk, bed->chromStart, bed->chromEnd);
struct binElement *hit;
struct gene *winner = NULL;
int winnerOverlap = 0;
for (hit = hitList; hit != NULL; hit = hit->next)
    {
    struct gene *candidate = hit->val;
    if (candidate->strand != bed->strand[0])
        continue;
    int overlap = bedRangeTreeOverlap(bed, candidate->exonTree);
    if (overlap > winnerOverlap)
        {
        winnerOverlap = overlap;
        winner = candidate;
        }
    }
slFreeList(&hitList);
return winner;
}
struct binElement *chromKeeperFind(char *chrom, int chromStart, int chromEnd)
/* Return a list of all items in chromKeeper that intersect range.
   Free this list with slFreeList.
   The keeper is chosen by matching chrom against the chromNames table.
   If chrom is not in the table NULL is returned, and a warning is issued
   the first time this happens (later misses are silent). */
{
static boolean warned = FALSE;
int ix;
for (ix = 0; ix < chromCount; ix++)
    {
    if (sameString(chromNames[ix], chrom))
        return binKeeperFind(chromRanges[ix], chromStart, chromEnd);
    }
if (!warned)
    {
    warn("chromKeeper::chromKeeperFind() - Don't recognize chrom %s", chrom);
    warned = TRUE;
    }
return NULL;
}
Exemple #23
0
void checkExp(char *bedFileName, char *tNibDir, char *nibList)
/* For each pseudoGeneLink row read from bedFileName:
 *   1. log (verbose level 2) the mRNAs in mrnaHash overlapping the retro
 *      locus and those overlapping the parent gene locus;
 *   2. take the first chain from chainHash (after sorting with
 *      chainCmpScoreDesc) whose query side lies on the parent chromosome
 *      and intersects the parent range, convert it to axt and feed each
 *      axt to addMisMatch() to build misMatchList;
 *   3. write the mRNAs overlapping the retro locus to temporary psl/fasta
 *      files under /tmp, run the external pslPretty command on them and
 *      read back the axt alignments it produced;
 *   4. for every such mRNA, walk its target range and, at each position
 *      for which matchFound() returns a mismatch record, count whether
 *      the mRNA base equals the retro base, the parent base, or neither;
 *      one tab-separated summary line per mRNA goes to outFile.
 * Uses the globals nibHash, mrnaHash, chainHash, twoBitFile, maxGap and
 * outFile.  Aborts via errAbort if a sequence is missing or pslPretty
 * returns non-zero. */
{
struct lineFile *bf = lineFileOpen(bedFileName , TRUE), *af = NULL;
char *row[PSEUDOGENELINK_NUM_COLS] ;
struct pseudoGeneLink *ps;
char *tmpName[512], cmd[512];
/* NOTE(review): mAxt and seqList are declared outside the row loop and
 * are never reset inside it, so they carry over from one input row to
 * the next - confirm this is intended. */
struct axt *axtList = NULL, *axt, *mAxt = NULL;
struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seqList = NULL;
struct nibInfo *qNib = NULL, *tNib = NULL;
FILE *op;
int ret;

if (nibHash == NULL)
    nibHash = hashNew(0);
while (lineFileNextRow(bf, row, ArraySize(row)))
    {
    struct misMatch *misMatchList = NULL;
    struct binKeeper *bk = NULL;
    struct binElement *el, *elist = NULL;
    struct psl *mPsl = NULL, *rPsl = NULL, *pPsl = NULL, *psl ;
    struct misMatch *mf = NULL;
    ps = pseudoGeneLinkLoad(row);
    /* The chopped pieces in tmpName are not read again in this function. */
    tmpName[0] = cloneString(ps->name);
    /* NOTE(review): sizeof(tmpName) is the array size in bytes, not its
     * 512 elements; if chopByChar expects an element count this should
     * probably be ArraySize(tmpName) - confirm against chopByChar. */
    chopByChar(tmpName[0], '.', tmpName, sizeof(tmpName));
    verbose(2,"name %s %s:%d-%d\n",
            ps->name, ps->chrom, ps->chromStart,ps->chromEnd);
    /* get expressed retro from hash */
    /* NOTE(review): bk is not checked for NULL before any of the
     * binKeeperFind/binKeeperFindSorted calls below, and none of the
     * elist results are freed in this function - verify. */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart, ps->chromEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        rPsl = el->val;
        verbose(2,"retroGene %s %s:%d-%d\n",rPsl->qName, ps->chrom, ps->chromStart,ps->chromEnd);
        }
    /* find mrnas that overlap parent gene */
    bk = hashFindVal(mrnaHash, ps->gChrom);
    elist = binKeeperFindSorted(bk, ps->gStart , ps->gEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        pPsl = el->val;
        verbose(2,"parent %s %s:%d %d,%d\n",
                pPsl->qName, pPsl->tName,pPsl->tStart,
                pPsl->match, pPsl->misMatch);
        }
    /* find self chain */
    bk = hashFindVal(chainHash, ps->chrom);
    elist = binKeeperFind(bk, ps->chromStart , ps->chromEnd ) ;
    slSort(&elist, chainCmpScoreDesc);
    /* Only the first acceptable chain is processed: the loop body ends
     * in an unconditional break, and unsuitable chains are skipped with
     * continue. */
    for (el = elist; el != NULL ; el = el->next)
        {
        struct chain *chain = el->val, *subChain, *retChainToFree, *retChainToFree2;
        int qs = chain->qStart;
        int qe = chain->qEnd;
        int id = chain->id;
        /* For a minus-strand query, flip the coordinates using qSize
         * before comparing with the parent range. */
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        if (!sameString(chain->qName , ps->gChrom) || 
                !positiveRangeIntersection(qs, qe, ps->gStart, ps->gEnd))
            {
            verbose(2," wrong chain %s:%d-%d %s:%d-%d parent %s:%d-%d\n", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd,
                ps->gChrom,ps->gStart,ps->gEnd);
            continue;
            }
        verbose(2,"chain id %d %4.0f",chain->id, chain->score);
        /* Trim the chain to the retro locus, pulled in by 7 bases at each
         * end, then to the parent range on the query side. */
        chainSubsetOnT(chain, ps->chromStart+7, ps->chromEnd-7, 
            &subChain,  &retChainToFree);
        if (subChain != NULL)
            chain = subChain;
        chainSubsetOnQ(chain, ps->gStart, ps->gEnd, 
            &subChain,  &retChainToFree2);
        if (subChain != NULL)
            chain = subChain;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        verbose(2," %s:%d-%d %s:%d-%d ", 
                chain->qName, qs, qe, 
                chain->tName,chain->tStart,chain->tEnd);
        if (subChain != NULL)
            verbose(2,"subChain %s:%d-%d %s:%d-%d\n",
                    subChain->qName, subChain->qStart, subChain->qEnd, 
                    subChain->tName,subChain->tStart,subChain->tEnd);

	/* Both the query and the target nib are looked up under tNibDir. */
	qNib = nibInfoFromCache(nibHash, tNibDir, chain->qName);
	tNib = nibInfoFromCache(nibHash, tNibDir, chain->tName);
	tSeq = nibInfoLoadStrand(tNib, chain->tStart, chain->tEnd, '+');
	qSeq = nibInfoLoadStrand(qNib, chain->qStart, chain->qEnd, chain->qStrand);
	axtList = chainToAxt(chain, qSeq, chain->qStart, tSeq, chain->tStart,
	    maxGap, BIGNUM);
        verbose(2,"axt count %d misMatch cnt %d\n",slCount(axtList), slCount(misMatchList));
        for (axt = axtList; axt != NULL ; axt = axt->next)
            {
            addMisMatch(&misMatchList, axt, chain->qSize);
            }
        verbose(2,"%d in mismatch list %s id %d \n",slCount(misMatchList), chain->qName, id);
        chainFree(&retChainToFree);
        chainFree(&retChainToFree2);
        break;
        }
    /* create axt of each expressed retroGene to parent gene */
        /* get alignment for each mrna overlapping retroGene */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart , ps->chromEnd ) ;
    {
    /* Temporary file names depend only on the chromosome name. */
    char queryName[512];
    char axtName[512];
    char pslName[512];
    safef(queryName, sizeof(queryName), "/tmp/query.%s.fa", ps->chrom);
    safef(axtName, sizeof(axtName), "/tmp/tmp.%s.axt", ps->chrom);
    safef(pslName, sizeof(pslName), "/tmp/tmp.%s.psl", ps->chrom);
    /* NOTE(review): the fopen result is used without a NULL check. */
    op = fopen(pslName,"w");
    for (el = elist ; el != NULL ; el = el->next)
        {
        psl = el->val;
        pslOutput(psl, op, '\t','\n');
        qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0);

        if (qSeq != NULL)
            slAddHead(&seqList, qSeq);
        else
            errAbort("seq %s not found \n", psl->qName);
        }
    fclose(op);
    faWriteAll(queryName, seqList);
    /* Run the external aligner; a non-zero exit status is fatal. */
    safef(cmd,sizeof(cmd),"pslPretty -long -axt %s %s %s %s",pslName , nibList, queryName, axtName);
    ret = system(cmd);
    if (ret != 0)
        errAbort("ret is %d %s\n",ret,cmd);
    verbose(2, "ret is %d %s\n",ret,cmd);
    af = lineFileOpen(axtName, TRUE);
    while ((axt = axtRead(af)) != NULL)
        slAddHead(&mAxt, axt);
    lineFileClose(&af);
    }
    /* slAddHead built the list back to front; restore file order. */
    slReverse(&mAxt);
    /* for each parent/retro pair, count bases matching retro and parent better */
    /* The mRNA list and the axt list are walked in step: mAxt advances by
     * one for each mRNA, and the assert below checks that names agree. */
    for (el = elist; el != NULL ; el = el->next)
        {
        int i, scoreRetro=0, scoreParent=0, scoreNeither=0;
        struct dyString *parentMatch = newDyString(16*1024);
        struct dyString *retroMatch = newDyString(16*1024);
        mPsl = el->val;

        if (mAxt != NULL)
            {
            verbose(2,"mrna %s %s:%d %d,%d axt %s\n",
                    mPsl->qName, mPsl->tName,mPsl->tStart,
                    mPsl->match, mPsl->misMatch, 
                    mAxt->qName);
            assert(sameString(mPsl->qName, mAxt->qName));
            for (i = 0 ; i< (mPsl->tEnd-mPsl->tStart) ; i++)
                {
                /* j is the offset of the axt start relative to the psl
                 * start; it is added to i when indexing qSym/tSym. */
                int j = mAxt->tStart - mPsl->tStart;
                /* NOTE(review): this slCount gets the address of the list
                 * head, whereas the slCount calls above get the list
                 * itself - check which one is meant. */
                verbose(5, "listLen = %d\n",slCount(&misMatchList));
                if ((mf = matchFound(&misMatchList, (mPsl->tStart)+i)) != NULL)
                    {
                    /* Case-insensitive comparison of the mRNA base with
                     * the retro base first, then the parent base. */
                    if (toupper(mf->retroBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match retro[%d] %d %c == %c parent %c %d\n",
                                i,mf->retroLoc, mf->retroBase, mAxt->qSym[j+i], 
                                mf->parentBase, mf->parentLoc);
                        dyStringPrintf(retroMatch, "%d,", mf->retroLoc);
                        scoreRetro++;
                        }
                    else if (toupper(mf->parentBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match parent[%d] %d %c == %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->qSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        dyStringPrintf(parentMatch, "%d,", mf->parentLoc);
                        scoreParent++;
                        }
                    else
                        {
                        verbose (3,"match neither[%d] %d %c != %c retro %c %d\n",
                                i,mf->parentLoc, mf->parentBase, mAxt->tSym[j+i], 
                                mf->retroBase, mf->retroLoc);
                        scoreNeither++;
                        }
                    }
                }
            verbose(2,"final score %s parent %d retro %d  neither %d\n",
                    mPsl->qName, scoreParent, scoreRetro, scoreNeither);
            /* Summary line: retro locus, mRNA alignment, the three scores
             * and the comma-separated lists of matching positions. */
            fprintf(outFile,"%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n",
                    ps->chrom, ps->chromStart, ps->chromEnd, ps->name, ps->score, 
                    mPsl->tName, mPsl->tStart, mPsl->tEnd, mPsl->qName, 
                    scoreParent, scoreRetro, scoreNeither, parentMatch->string, retroMatch->string);
            mAxt = mAxt->next;
            }
        dyStringFree(&parentMatch);
        dyStringFree(&retroMatch);
        }
    }
}
void oneChromInput(char *database, char *chrom, int chromSize,
        char *rangeTrack, char *expTrack,
        struct hash *refLinkHash, struct hash *erHash, FILE *f)
/* Read in info for one chromosome.
 * Loads the range track, the expression track and refGene for chrom into
 * three binKeepers.  A rangeInfo is built for every range, with a common
 * name taken from findCommonName().  Ranges that share an expression item
 * with at least one other range are flagged as split.  For each range
 * that is not split and overlaps some expression item, outputAveraged()
 * is called to write to f.  All data loaded here is freed on return. */
{
struct binKeeper *rangeBk = binKeeperNew(0, chromSize);
struct binKeeper *expBk = binKeeperNew(0, chromSize);
struct binKeeper *knownBk = binKeeperNew(0, chromSize);
struct bed *rangeList = NULL;
struct bed *expList = NULL;
struct genePred *knownList = NULL;
struct rangeInfo *riList = NULL, *ri;
struct hash *riHash = hashNew(0); /* rangeInfo values. */
struct binElement *rangeBeList = NULL, *rangeBe;
struct binElement *expBeList = NULL, *expBe;

/* Load up data from database. */
rangeList = loadBed(database, chrom, rangeTrack, 12, rangeBk);
expList = loadBed(database, chrom, expTrack, 15, expBk);
knownList = loadGenePred(database, chrom, "refGene", knownBk);

/* Build range info basics. */
rangeBeList = binKeeperFindAll(rangeBk);
for (rangeBe = rangeBeList; rangeBe != NULL; rangeBe = rangeBe->next)
    {
    struct bed *range = rangeBe->val;
    AllocVar(ri);
    slAddHead(&riList, ri);
    hashAddSaveName(riHash, range->name, ri, &ri->id);
    ri->range = range;
    ri->commonName = findCommonName(range, knownBk, refLinkHash);
    }
slReverse(&riList);

/* Mark split ones. */
expBeList = binKeeperFindAll(expBk);
for (expBe = expBeList; expBe != NULL; expBe = expBe->next)
    {
    struct bed *exp = expBe->val;
    struct binElement *subList = binKeeperFind(rangeBk, 
        exp->chromStart, exp->chromEnd);
    if (slCount(subList) > 1)
        {
        struct binElement *sub;
        for (sub = subList; sub != NULL; sub = sub->next)
            {
            struct bed *splitRange = sub->val;
            struct rangeInfo *splitRi = hashMustFindVal(riHash, splitRange->name);
            splitRi->isSplit = TRUE;
            }
        }
    slFreeList(&subList);
    }
/* Free the full expression list here; previously the same variable was
 * reused in the loop below and this list was lost. */
slFreeList(&expBeList);

/* Output the nice ones: not split and having some expression info. */
for (ri = riList; ri != NULL; ri = ri->next)
    {
    if (!ri->isSplit)
        {
        struct bed *range = ri->range;
        struct binElement *hitList = binKeeperFind(expBk, range->chromStart, range->chromEnd);
        if (hitList != NULL)
            outputAveraged(f, ri, erHash, hitList);
        slFreeList(&hitList);
        }
    }

/* Clean up time! */
freeHash(&riHash);
genePredFreeList(&knownList);
/* rangeList and expList are lists, so free every element, not just the
 * head. */
bedFreeList(&rangeList);
bedFreeList(&expList);
slFreeList(&rangeBeList);
slFreeList(&riList);
binKeeperFree(&rangeBk);
binKeeperFree(&expBk);
binKeeperFree(&knownBk);
}
void sortGenes(struct sqlConnection *conn)
/* Put up sort gene page.
 *
 * Takes the genome graph returned by ggFirstVisible(), gets its regions
 * from regionsOverThreshold(), and queries the gene sorter's main gene
 * table (through conn) for every gene whose txStart..txEnd overlaps one of
 * those regions.  The gene names are written to a trash ".key" file whose
 * name is stored in the cart as a gene sorter filter.
 *
 * If a "<binFileName>.cgm" marker file can be opened, markers whose value
 * is at or above the threshold and which fall inside one of the selected
 * genes are also written to a trash ".mrk" file, and the cart is pointed
 * at that file.  Finally prints a summary and a button leading to hgNear. */
{
cartWebStart(cart, database, "Finding Candidate Genes for Gene Sorter");
if (!hgNearOk(database))
    errAbort("Sorry, gene sorter not available for this database.");

/* Get list of regions. */
struct genoGraph *gg = ggFirstVisible();
double threshold = getThreshold();
struct bed3 *bed, *bedList = regionsOverThreshold(gg);

/* Figure out what table and column are the sorter's main gene set. */
struct hash *genomeRa = hgReadRa(genome, database, "hgNearData", 
	"genome.ra", NULL);
char *geneTable = hashMustFindVal(genomeRa, "geneTable");
char *idColumn = hashMustFindVal(genomeRa, "idColumn");

/* If marker labels were present when the file was uploaded, they are saved
 * in a file next to the graph's binary file.  m stays NULL when that file
 * is absent, and every marker-related step below is skipped. */
char cgmName[256];
safef(cgmName, sizeof(cgmName), "%s.cgm", gg->binFileName);
struct lineFile *m = lineFileMayOpen(cgmName, TRUE);

/* The marker loop below is a do/while that examines cgmRow before reading
 * the first real line, so seed it with a dummy row that matches no
 * chromosome and parses as numbers. */
char *cgmRow[4];
cgmRow[0] = "";    /* dummy row */
cgmRow[1] = "";
cgmRow[2] = "0";
cgmRow[3] = "0";

FILE *g = NULL;
int markerCount = 0;
struct tempName snpTn;	/* Only filled in when m is non-NULL. */

if (m)
    {
    /* Create custom column output file. */
    trashDirFile(&snpTn, "hgg", "marker", ".mrk");  
    g = mustOpen(snpTn.forCgi, "w");
    fprintf(g, 
	"column name=\"%s Markers\" shortLabel=\"%s Markers over threshold\" longLabel=\"%s Markers in regions over threshold\" " 
	"visibility=on priority=99 "
        "\n"
        , gg->shortLabel
        , gg->shortLabel
        , gg->shortLabel
	);
    }

/*** Build up hash of all transcripts that are in a region. */
struct hash *transcriptHash = hashNew(16);

/* This loop handles one chromosome at a time.  It depends on
 * the bedList being sorted by chromosome.  The inner while loop
 * advances bed, which is why the for statement has no increment. */
for (bed = bedList; bed != NULL; )
    {

    /* Make binKeeper and stuff in all regions in this chromosome into it. */
    char *chrom = bed->chrom;
    int chromSize = hChromSize(database, chrom);
    struct binKeeper *bk = binKeeperNew(0, chromSize);
    while (bed != NULL && sameString(chrom, bed->chrom))
	{
	binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
	bed = bed->next;
	}

    /* Second binKeeper holding the selected genes, needed only to look up
     * which genes a marker falls in. */
    struct binKeeper *bkGenes = NULL;
    if (m)
       bkGenes = binKeeperNew(0, chromSize);

    /* Query database to find out bounds of all genes on this chromosome
     * and if they overlap any of the regions then put them in the hash. */
    char query[512];
    safef(query, sizeof(query), 
    	"select name,txStart,txEnd from %s where chrom='%s'", geneTable, chrom);
    struct sqlResult *sr = sqlGetResult(conn, query);
    char **row;
    while ((row = sqlNextRow(sr)) != NULL)
        {
	char *name = row[0];
	int start = sqlUnsigned(row[1]);
	int end = sqlUnsigned(row[2]);
	if (binKeeperAnyOverlap(bk, start, end))
	    {
	    hashStore(transcriptHash, name);
	    if (m)
		binKeeperAdd(bkGenes, start, end, cloneString(name));
	    }
	}
    sqlFreeResult(&sr);

    if (m)
	{
	/* Read cgm file if it exists, looking at all markers on this chromosome
	 * and if they overlap any of the regions and genes then output them.
	 * The row left in cgmRow by the previous chromosome (or the dummy row
	 * the first time) is examined before a new line is read. */
	do 
	    {
	    // marker, chrom, chromStart, val
	    char *marker = cgmRow[0];
	    char *chr = cgmRow[1];
	    int start = sqlUnsigned(cgmRow[2]);
	    int end = start+1;
	    double val = sqlDouble(cgmRow[3]);
            int cmp = strcmp(chr,chrom);
	    /* Marker is on a chromosome that compares after this one: stop and
	     * keep the row for a later pass.  NOTE(review): this presumes the
	     * cgm file is ordered by chromosome in strcmp order - confirm. */
            if (cmp > 0)
                break;
            if (cmp == 0)
		{
		if (val >= threshold)
		    {
		    struct binElement *el, *bkList = binKeeperFind(bkGenes, start, end);
		    for (el = bkList; el; el=el->next)
			{
			/* output to custom column trash file */
			fprintf(g, "%s %s\n", (char *)el->val, marker);
			}
		    if (bkList)
			{
			++markerCount;
			slFreeList(&bkList);
			}
		    }
		}
	    }
	while (lineFileRow(m, cgmRow));
	}

    /* Clean up for this chromosome. */
    binKeeperFree(&bk);

    if (m)
	{
	/* For speed, we do not free up the values (cloned the kg names earlier) */
	binKeeperFree(&bkGenes);  
	}

    }

/* Get list of all transcripts in regions. */
struct hashEl *el, *list = hashElListHash(transcriptHash);

/* Create file with all matching gene IDs. */
struct tempName keyTn;
trashDirFile(&keyTn, "hgg", "key", ".key");
FILE *f = mustOpen(keyTn.forCgi, "w");
for (el = list; el != NULL; el = el->next)
    fprintf(f, "%s\n", el->name);
carefulClose(&f);

/* Print out some info. */
hPrintf("Thresholding <i>%s</i> at %g. ", gg->shortLabel, threshold);
hPrintf("There are %d regions covering %lld bases.<BR>\n",
    slCount(bedList), bedTotalSize((struct bed*)bedList) );
hPrintf("Installed a Gene Sorter filter that selects only genes in these regions.<BR>\n");
if (m)
    {
    hPrintf("There are %d markers in the regions over threshold that overlap knownGenes.<BR>\n", markerCount);
    hPrintf("Installed a Gene Sorter custom column called \"%s Markers\" with these markers.<BR>\n", gg->shortLabel);
    }

/* Close custom column output file and tell the cart about it.  The cart
 * variable is set here, inside the marker-file branch, because snpTn is
 * only filled in when the marker file was opened; reading it otherwise
 * would use an uninitialized struct.  It is done in this block rather than
 * in a later if (m) test because lineFileClose() is handed the address of
 * m and may clear it. */
if (m)
    {
    lineFileClose(&m);
    carefulClose(&g);
    cartSetString(cart, customFileVarName, snpTn.forCgi);
    }

/* Stuff cart variable with name of file. */
char keyCartName[256];
safef(keyCartName, sizeof(keyCartName), "%s%s.keyFile",
	advFilterPrefix, idColumn);
cartSetString(cart, keyCartName, keyTn.forCgi);

/* Turn on visibility of the markers column.  The variable name is built
 * from the graph label with blanks replaced by underscores. */
char snpVisCartNameTemp[256];
char *snpVisCartName = NULL;
safef(snpVisCartNameTemp, sizeof(snpVisCartNameTemp), "%s%s Markers.vis",
	colConfigPrefix, gg->shortLabel);
snpVisCartName = replaceChars(snpVisCartNameTemp, " ", "_");
cartSetString(cart, snpVisCartName, "1");
freeMem(snpVisCartName);

hPrintf("<FORM ACTION=\"../cgi-bin/hgNear\" METHOD=GET>\n");
cartSaveSession(cart);
hPrintf("<CENTER>");
cgiMakeButton("submit", "go to gene sorter");
hPrintf("</CENTER>");
hPrintf("</FORM>");

cartWebEnd();
}
void oneChrom(char *database, char *chrom, char *refAliTrack, char *bedTrack,
              struct hash *otherHash, struct stats *stats)
/* Process one chromosome.
 *
 * Loads the beds on chrom from bedTrack (read as a file when the name ends
 * in ".bed", otherwise queried as a database track), loads the psl
 * alignments on chrom from refAliTrack, and loads the chromosome sequence.
 * For each bed, every alignment overlapping it is trimmed to the bed's
 * range and folded into stats together with otherHash.  Also adds to
 * stats->bedCount and stats->bedBaseCount for each bed. */
{
    struct bed *bedList = NULL, *bed;
    struct sqlConnection *conn = hAllocConn(database);
    struct sqlResult *sr;
    char **row;
    int rowOffset;
    int chromSize = hChromSize(database, chrom);
    struct binKeeper *bk = binKeeperNew(0, chromSize);
    struct psl *pslList = NULL;
    struct dnaSeq *chromSeq = NULL;

    /* Load the beds for this chromosome, from a file or from the database. */
    if (endsWith(bedTrack, ".bed"))
    {
        struct lineFile *lf = lineFileOpen(bedTrack, TRUE);
        char *row[3];   /* Shadows the outer row; holds one three-column line. */
        while (lineFileRow(lf, row))
        {
            /* The file may cover several chromosomes; keep only ours. */
            if (sameString(chrom, row[0]))
            {
                bed = bedLoad3(row);
                slAddHead(&bedList, bed);
            }
        }
        lineFileClose(&lf);
    }
    else
    {
        sr = hChromQuery(conn, bedTrack, chrom, NULL, &rowOffset);
        while ((row = sqlNextRow(sr)) != NULL)
        {
            bed = bedLoad3(row+rowOffset);
            slAddHead(&bedList, bed);
        }
        sqlFreeResult(&sr);
    }
    /* Items were added at the head, so reverse to restore input order. */
    slReverse(&bedList);
    uglyf("Loaded beds\n");

    /* Load the alignments, keeping them both on a list (for freeing) and in
     * the binKeeper (for range lookup). */
    sr = hChromQuery(conn, refAliTrack, chrom, NULL, &rowOffset);
    while ((row = sqlNextRow(sr)) != NULL)
    {
        struct psl *psl = pslLoad(row + rowOffset);
        slAddHead(&pslList, psl);
        binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
    }
    sqlFreeResult(&sr);
    uglyf("Loaded psls\n");

    /* Fetch entire chromosome into memory. */
    chromSeq = hLoadChrom(database, chrom);
    uglyf("Loaded human seq\n");

    for (bed = bedList; bed != NULL; bed = bed->next)
    {
        struct binElement *el, *list = binKeeperFind(bk, bed->chromStart, bed->chromEnd);
        for (el = list; el != NULL; el = el->next)
        {
            struct psl *fullPsl = el->val;
            /* Restrict the alignment to the part inside this bed; the
             * result is NULL-checked before use. */
            struct psl *psl = pslTrimToTargetRange(fullPsl,
                                                   bed->chromStart, bed->chromEnd);
            if (psl != NULL)
            {
                foldPslIntoStats(psl, chromSeq, otherHash, stats);
                pslFree(&psl);
            }
        }
        slFreeList(&list);
        stats->bedCount += 1;
        stats->bedBaseCount += bed->chromEnd - bed->chromStart;
        /* No query is open inside this loop: sr was released right after
         * the psl query above, so nothing needs freeing per bed. */
    }

    /* NOTE(review): bedList is not released here - confirm whether that is
     * intentional or the beds should be freed along with the psls. */
    freeDnaSeq(&chromSeq);
    pslFreeList(&pslList);
    binKeeperFree(&bk);
    hFreeConn(&conn);
}