static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
/* convert a genePred to a psl and CDS */
int chromSize = hashIntValDefault(chromHash, gp->chrom, 0);
if (chromSize == 0)
    errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom);
int qSize = 0;
if (qSizes != NULL)
    qSize = hashIntValDefault(qSizeHash, gp->name, 0);
struct psl *psl = genePredToPsl(gp, chromSize, qSize);
pslTabOut(psl, pslFh);
if (gp->cdsStart < gp->cdsEnd)
    cnvGenePredCds(gp, qSize, cdsFh);
static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
/* convert a genePred to a psl and CDS */
int chromSize = hashIntValDefault(chromHash, gp->chrom, 0);
if (chromSize == 0)
    errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom);
int e = 0, qSize=0;

for (e = 0; e < gp->exonCount; ++e)
    qSize+=(gp->exonEnds[e] - gp->exonStarts[e]);
struct psl *psl = pslNew(gp->name, qSize, 0, qSize,
                         gp->chrom, chromSize, gp->txStart, gp->txEnd,
                         gp->strand, gp->exonCount, 0);
psl->blockCount = gp->exonCount;		    
for (e = 0; e < gp->exonCount; ++e)
    psl->blockSizes[e] = (gp->exonEnds[e] - gp->exonStarts[e]);
    psl->qStarts[e] = e==0 ? 0 : psl->qStarts[e-1] + psl->blockSizes[e-1];
    psl->tStarts[e] = gp->exonStarts[e];
psl->match = qSize;	
psl->tNumInsert = psl->blockCount-1; 
psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize;
pslTabOut(psl, pslFh);
if (gp->cdsStart < gp->cdsEnd)
    cnvGenePredCds(gp, qSize, cdsFh);
void gensatFixFull(char *captionFile)
/* Fix missing captions. */
struct lineFile *lf = lineFileOpen(captionFile, TRUE);
char *row[2];
struct dyString *sql = dyStringNew(0);
struct sqlConnection *conn = sqlConnect(database);
struct hash *capHash = newHash(16);
while (lineFileRowTab(lf, row))
    int captionId;
    char *submitId = row[0];
    char *caption = row[1];
    captionId = hashIntValDefault(capHash, caption, 0);
    if (captionId == 0)
	dyStringAppend(sql, "insert into caption values(default, \"");
	dyStringAppend(sql, caption);
	dyStringAppend(sql, "\")");
	sqlUpdate(conn, sql->string);
	verbose(1, "%s\n", sql->string);
	captionId = sqlLastAutoId(conn);
	hashAddInt(capHash, caption, captionId);
    dyStringPrintf(sql, "update imageFile set caption=%d ", captionId);
    dyStringPrintf(sql, "where submissionSet=%d ", gensatId);
    dyStringPrintf(sql, "and submitId = \"%s\"", submitId);
    sqlUpdate(conn, sql->string);
    verbose(1, "%s\n", sql->string);
/* Example #4 */
static int idLookup(struct hash *hash, void *obj)
/* Look up object in hash.  Return 0 if can't find it.
 * Otherwise return object id.
 *   hash - table keyed by the "%p" text of object addresses
 *   obj  - object whose address is used as the key */
{
/* "%p" output is implementation-defined; a full-width 64-bit rendering such
 * as "0x" plus 16 hex digits needs 19 bytes with the terminator, which the
 * previous 17-byte buffer could not hold.  The key text itself is unchanged. */
char buf[32];
safef(buf, sizeof(buf), "%p", obj);
return hashIntValDefault(hash, buf, 0);
}
double scoreLiftOverChain(struct liftOverChain *chain,
    char *fromOrg, char *fromDb, char *toOrg, char *toDb,
    char *cartOrg, char *cartDb, struct hash *dbRank )
/* Score the chain in terms of best match for cart settings */
double score = 0;

char *chainFromOrg = hArchiveOrganism(chain->fromDb);
char *chainToOrg = hArchiveOrganism(chain->toDb);
int fromRank = hashIntValDefault(dbRank, chain->fromDb, 0);  /* values up to approx. #assemblies */
int toRank = hashIntValDefault(dbRank, chain->toDb, 0);
int maxRank = hashIntVal(dbRank, "maxRank"); 

if (sameOk(fromOrg,chainFromOrg) &&
    sameOk(fromDb,chain->fromDb) && 
    sameOk(toOrg,chainToOrg) &&
    score += 10000000;

if (sameOk(fromOrg,chainFromOrg)) 
    score += 2000000;
if (sameOk(fromDb,chain->fromDb)) 
    score += 1000000;

if (sameOk(toOrg,chainToOrg))
    score += 200000;
if (sameOk(toDb,chain->toDb))
    score += 100000;

if (sameOk(cartDb,chain->fromDb)) 
    score +=  20000;
if (sameOk(cartDb,chain->toDb)) 
    score +=  10000;

if (sameOk(cartOrg,chainFromOrg)) 
    score +=  2000;
if (sameOk(cartOrg,chainToOrg)) 
    score +=  1000;

score += 10*(maxRank-fromRank);
score += (maxRank - toRank);

return score;
static boolean columnIsIncluded(struct annoFormatTab *self, char *sourceName, char *colName)
// Return TRUE if column has not been explicitly deselected.
if (self->columnVis)
    char fullName[PATH_LEN];
    makeFullColumnName(fullName, sizeof(fullName), sourceName, colName);
    int vis = hashIntValDefault(self->columnVis, fullName, 1);
    if (vis == 0)
        return FALSE;
return TRUE;
double calcDistanceFromCluster(struct rnaBinder *rb, struct clusterMember *cmList,
			       struct dMatrix *sjIndex, struct dMatrix *psInten)
/* Calculate the distance from the rnaBinder intensity measurement to
   the sjIndexes of the cluster members. If no intensity present use
   0 as it will fall in the middle of [-1,1]. */
double sum = 0;
int count = 0;
int sjIx = 0, gsIx = 0;
struct clusterMember *cm = NULL;
double corr = 0;
if(sjIndex->colCount != psInten->colCount)
    errAbort("Splice Junction and Intensity files must have same number of columns.");

/* Get the index of the gene set in the intensity file. */
gsIx = hashIntValDefault(psInten->nameIndex, rb->psName, -1);
if(gsIx == -1)
/*     warn("Probe Set %s not found in intensitiy file."); */
    return 0;
for(cm = cmList; cm != NULL; cm = cm->next)
    /* For each member get the index in the splice junction file. */
    sjIx = hashIntValDefault(sjIndex->nameIndex, cm->psName, -1);
    if(sjIx == -1)
	errAbort("Probe Set %s not found in SJ index file.");
    corr = correlation(psInten->matrix[gsIx], sjIndex->matrix[sjIx], sjIndex->colCount);
    sum += corr;
if(count == 0)
    errAbort("No junctions in cluster.");
sum = sum / (double) count;
return sum;
/* Example #8 */
struct psl *removeKillList(struct psl* pslList)
/* Remove all of the psls that are in the kill hash. */
struct psl *psl = NULL, *pslNext = NULL,  *pslNewList = NULL;
if(killHash == NULL)
    return pslList;
for(psl = pslList; psl != NULL; psl = pslNext)
    pslNext = psl->next;
    /* If the accession is in the kill list with value 1
       don't add it. */
    if(hashIntValDefault(killHash, psl->qName, 0) == 0)
	slAddHead(&pslNewList, psl);
return pslNewList;
/* Example #9 */
int oneHubTrackSettings(char *hubUrl, struct hash *totals)
/* Read hub trackDb files, noting settings used.
 *   hubUrl - hub to open
 *   totals - optional hash of per-setting tallies to add to
 * Returns 1 when the hub could not be opened, 0 otherwise. */
struct trackHub *hub = NULL;
struct errCatch *errCatch = errCatchNew();
/* NOTE(review): no matching end/free of errCatch is visible in this block. */
if (errCatchStart(errCatch))
    hub = trackHubOpen(hubUrl, "hub_0");

if (hub == NULL)
    return 1;

printf("%s (%s)\n", hubUrl, hub->shortLabel);
struct trackHubGenome *genome;
struct hash *counts;
/* NOTE(review): as written the second assignment always replaces the first;
 * an `else` between them appears to be missing. */
if (totals)
    counts = totals;
    counts = newHash(0);
struct hashEl *el;
/* Walk every genome of the hub, every track of the genome, and every
 * setting name of the track. */
for (genome = hub->genomeList; genome != NULL; genome = genome->next)
    struct trackDb *tdb, *tdbs = trackHubTracksForGenome(hub, genome);
    for (tdb = tdbs; tdb != NULL; tdb = tdb->next)
        struct hashCookie cookie = hashFirst(trackDbHashSettings(tdb));
        verbose(2, "    track: %s\n", tdb->shortLabel);
        while ((el = hashNext(&cookie)) != NULL)
            /* NOTE(review): count is stored back unchanged, so the tally
             * never moves off 0 - an increment appears to be missing. */
            int count = hashIntValDefault(counts, el->name, 0);
            hashReplace(counts, el->name, intToPt(count));
/* NOTE(review): no statement of its own is visible under this `if`; what
 * should happen with a locally created hash cannot be told from here. */
if (!totals)
return 0;
/* Example #10 */
int countBases(struct sqlConnection *conn, char *chrom, int chromSize,
    char *database)
/* Count bases, generally not including gaps, in chromosome. */
static boolean gapsLoaded = FALSE;
struct sqlResult *sr;
int totalGaps = 0;
char **row;
int rowOffset;

if (countGaps)
    return chromSize;

/*	If doing all chroms, then load up all the gaps and be done with
 *	it instead of re-reading the gap table for every chrom
if (sameWord(clChrom,"all"))
    if (!gapsLoaded)
	gapHash = loadAllGaps(conn, database);
    gapsLoaded = TRUE;
    totalGaps = hashIntValDefault(gapHash, chrom, 0);
    sr = hChromQuery(conn, "gap", chrom, NULL, &rowOffset);
    while ((row = sqlNextRow(sr)) != NULL)
	int gapSize;
	struct agpGap gap;
	agpGapStaticLoad(row+rowOffset, &gap);
	gapSize = gap.chromEnd - gap.chromStart;
	totalGaps += gapSize;
return chromSize - totalGaps;
/* Example #11 */
int main(int argc, char *argv[])
/* Process command line. */
optionInit(&argc, argv, options);
if (argc != 4)

char *lrgFile = argv[1];
char *chromSizes = argv[2];
char *pslFile = argv[3];

struct hash *chromHash = hChromSizeHashFromFile(chromSizes);
struct lrg *lrg, *lrgList = lrgLoadAllByTab(lrgFile);
FILE *f = mustOpen(pslFile, "w");
for (lrg = lrgList;  lrg != NULL;  lrg = lrg->next)
    int chromSize = hashIntValDefault(chromHash, lrg->chrom, 0);
    if (chromSize == 0)
	errAbort("Can't find size of '%s' in chrom.sizes file %s.", lrg->chrom, chromSizes);
    struct psl *psl = lrgToPsl(lrg, chromSize);
    pslTabOut(psl, f);
return 0;
/* Example #12 */
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table..
 *   txBedFile        - primary input, read row by row as 12-column beds
 *   cdsEvFile        - loaded with cdsEvidenceLoadAll and hashed by name
 *   txCdsPredictFile - loaded with cdsEvidenceLoadAll and hashed by name
 *   altSpliceFile    - loaded with bedLoadNAll(..., 6)
 *   exceptionFile    - passed to genbankExceptionsHash
 *   sizePolyAFile    - loaded with hashNameIntFile
 *   pslFile          - alignments hashed by qName; one name may have several
 *   flipFile         - loaded with hashWordsInFile
 *   outFile          - receives one txInfoTabOut row per input bed */
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    /* NOTE(review): no zeroing statement is visible after this declaration,
     * yet several fields below are only set conditionally. */
    struct txInfo info;

    /* NOTE(review): the next line has lost its left-hand side; presumably it
     * assigned bed->name to a field of info - confirm against the original. */
    /* Figure out name, sourceAcc, and isRefSeq from bed->name */ = bed->name;
    info.category = "n/a";
    /* NOTE(review): as written the second sourceAcc assignment always
     * overrides the first; an `else` appears to be missing. */
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	info.sourceAcc = cloneString(bed->name);
	info.sourceAcc = txAccFromTempName(bed->name);
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	/* NOTE(review): the alignment search below reads like the alternative
	 * to the faked values above; an `else` appears to be missing here. */
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		bestCoverage = coverage;
		bestPsl = psl;
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		/* Strand is toggled in place for the retry and toggled back
		 * afterwards, so the psl ends up unchanged. */
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    bestCoverage = coverage;
		    bestPsl = psl;
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	/* Source size is the aligned query size minus the recorded polyA
	 * length (0 when the transcript has no polyA entry). */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;

    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
	/* Gap between the end of block i and the start of block i+1. */
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	/* NOTE(review): no `break` or `default` label is visible in this
	 * switch; the exonCount increment presumably belonged to a default
	 * case for ordinary intron-sized gaps - confirm. */
	switch (gapSize)
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
	    case 3:
	        info.genomicStop = TRUE;
	        exonCount += 1;
    info.exonCount = exonCount;

    /* Write info, free bed. */
    /* NOTE(review): only the write is visible; no free of bed follows. */
    txInfoTabOut(&info, f);

/* Clean up and go home. */
/* NOTE(review): no cleanup statements are visible after this comment. */
/* Example #13 */
void jsonQuery(char *inFile, char *path, char *outFile)
/* jsonQuery - Use a path syntax to retrieve elements/values from each line of JSON input. */
struct lineFile *lf = lineFileOpen(inFile, TRUE);
struct hash *uniqHash = NULL;
boolean countUniq = optionExists("countUniq");
boolean uniq = optionExists("uniq") || countUniq;
if (uniq)
    uniqHash = hashNew(0);
struct dyString *dy = dyStringNew(0);
FILE *outF = mustOpen(outFile, "w");
char *line;
while (lineFileNextReal(lf, &line))
    struct lm *lm = lmInit(1<<16);
    struct jsonElement *topEl = jsonParseLm(line, lm);
    struct slRef topRef; = NULL;
    topRef.val = topEl;
    char desc[1024];
    safef(desc, sizeof desc, "line %d of %s", lf->lineIx, inFile);
    struct slRef *results = jsonQueryElementList(&topRef, desc, path, lm);
    struct slRef *result;
    for (result = results;  result != NULL;  result = result->next)
        struct jsonElement *el = result->val;
        if (uniq)
            jsonDyStringPrint(dy, el, NULL, -1);
            char *elStr = dy->string;
            int count = hashIntValDefault(uniqHash, elStr, 0);
            if (count < 1)
                hashAddInt(uniqHash, elStr, 1);
                verbose(2, "line %d: %s\n", lf->lineIx, elStr);
                if (!countUniq)
                    fprintf(outF, "%s\n", elStr);
                hashIncInt(uniqHash, elStr);
            jsonPrintToFile(el, NULL, outF, 2);
if (countUniq)
    struct hashEl *hel;
    struct hashCookie cookie = hashFirst(uniqHash);
    while ((hel = hashNext(&cookie)) != NULL)
        fprintf(outF, "%10d %s\n", ptToInt(hel->val), hel->name);