Esempio n. 1
0
void txOrtho(char *inAgx, char *inChain, char *inNet, char *orthoAgx, char *outEdges)
/* txOrtho - Produce list of shared edges between two transcription graphs in
 * two species.  Graphs come from inAgx, chains from inChain, nets from inNet
 * and the other species' graphs from orthoAgx.  Whatever writeOrthoEdges
 * emits for each input graph is collected in the file outEdges. */
{
struct txGraph *queryGraphs, *otherGraphs, *cur;
struct hash *chains, *nets, *otherKeepers;
FILE *out;

/* Read each input in turn, reporting how much was loaded. */
queryGraphs = txGraphLoadAll(inAgx);
verbose(1, "Loaded %d input graphs in %s\n", slCount(queryGraphs), inAgx);

chains = allChainsHash(inChain);
verbose(1, "Read %d chains from %s\n", chains->elCount, inChain);

nets = netToBkHash(inNet);
verbose(1, "Read %d nets from %s\n", nets->elCount, inNet);

otherGraphs = txGraphLoadAll(orthoAgx);
verbose(1, "Loaded %d ortho graphs in %s\n", slCount(otherGraphs), orthoAgx);

otherKeepers = txgIntoKeeperHash(otherGraphs);
verbose(1, "%d ortho chromosomes/scaffolds\n", otherKeepers->elCount);

/* The output file is only created after every input has been read. */
out = mustOpen(outEdges, "w");

/* Hand each input graph, one at a time, to the edge writer. */
cur = queryGraphs;
while (cur != NULL)
    {
    verbose(2, "Processing %s %s:%d-%d strand %s\n", 
	cur->name, cur->tName, cur->tStart, cur->tEnd, cur->strand);
    writeOrthoEdges(cur, chains, nets, otherKeepers, out);
    cur = cur->next;
    }
carefulClose(&out);
}
void txgGoodEdges(char *inTxg, char *inWeights, char *asciiThreshold,
                  char *outType, char *outEdges)
/* txgGoodEdges - Get edges that are above a certain threshold.
 * Every graph loaded from inTxg is passed to processOneGraph together with
 * the weights loaded from inWeights, the numeric value of asciiThreshold,
 * outType, and a single output file opened at outEdges. */
{
    struct txGraph *graphs, *cur;
    struct hash *weights;
    double cutoff;
    FILE *out;

    graphs = txGraphLoadAll(inTxg);
    verbose(2, "LOaded %d txGraphs from %s\n", slCount(graphs), inTxg);

    weights = hashWeights(inWeights);
    verbose(2, "Loaded %d weights from %s\n", weights->elCount, inWeights);

    cutoff = sqlDouble(asciiThreshold);
    verbose(2, "Threshold %f\n", cutoff);

    /* Output is opened last, once all inputs have been read and parsed. */
    out = mustOpen(outEdges, "w");

    cur = graphs;
    while (cur != NULL)
    {
        verbose(2, "%s edgeCount %d\n", cur->name, cur->edgeCount);
        processOneGraph(cur, weights, cutoff, outType, out);
        cur = cur->next;
    }
    carefulClose(&out);
}
void txgTrim(char *inTxg, char *inWeights, char *asciiThreshold, char *outTxg)
/* txgTrim - Trim out parts of txGraph that are not of sufficient weight.
 * Each graph from inTxg goes through txGraphTrimOne with the weights from
 * inWeights and the numeric value of asciiThreshold; it is then written to
 * outTxg only if it still has at least one edge. */
{
struct txGraph *graphs, *cur;
struct hash *weights;
double cutoff;
FILE *out;

graphs = txGraphLoadAll(inTxg);
verbose(2, "LOaded %d txGraphs from %s\n", slCount(graphs), inTxg);

weights = hashWeights(inWeights);
verbose(2, "Loaded %d weights from %s\n", weights->elCount, inWeights);

cutoff = sqlDouble(asciiThreshold);
verbose(2, "Threshold %f\n", cutoff);

/* Output is opened last, once all inputs have been read and parsed. */
out = mustOpen(outTxg, "w");

for (cur = graphs; cur != NULL; cur = cur->next)
    {
    /* The edge count is logged before trimming. */
    verbose(2, "%s edgeCount %d\n", cur->name, cur->edgeCount);
    txGraphTrimOne(cur, weights, cutoff);

    /* Graphs left with no edges are dropped from the output. */
    if (cur->edgeCount <= 0)
	continue;
    txGraphTabOut(cur, out);
    }
carefulClose(&out);
}
void txGeneCanonical(char *codingCluster, char *infoFile, 
	char *noncodingGraph, char *genesBed, char *nearCoding, 
	char *outCanonical, char *outIsoforms, char *outClusters)
/* txGeneCanonical - Pick a canonical version of each gene - that is the form
 * to use when just interested in a single splicing varient. Produces final
 * transcript clusters as well. */
{
/* Read in input into lists in memory. */
struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster);
struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph);
struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12);
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct bed *nearList = bedLoadNAll(nearCoding, 12);

/* Make hash of all beds. */
struct hash *bedHash = hashNew(18);
for (bed = bedList; bed != NULL; bed = bed->next)
    hashAdd(bedHash, bed->name, bed);

/* Make has of all info. */
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Make a binKeeper structure that we'll populate with coding genes. */
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *keeperHash = minChromSizeKeeperHash(sizeHash);

/* Make list of coding genes and toss them into binKeeper.
 * This will eat up bed list, but bedHash is ok. */
struct gene *gene, *geneList = NULL;
for (coding = codingList; coding != NULL; coding = coding->next)
    {
    gene = geneFromCluster(coding, bedHash, infoHash);
    slAddHead(&geneList, gene);
    struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom);
    binKeeperAdd(bk, gene->start, gene->end, gene);
    }

/* Go through near-coding genes and add them to the coding gene
 * they most overlap. */
for (bed = nearList; bed != NULL; bed = nextBed)
    {
    nextBed = bed->next;
    gene = mostOverlappingGene(keeperHash, bed);
    if (gene == NULL)
        errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name);
    geneAddBed(gene, bed);
    }

/* Add non-coding genes. */
for (graph = graphList; graph != NULL; graph = graph->next)
    {
    gene = geneFromGraph(graph, bedHash);
    slAddHead(&geneList, gene);
    }

/* Sort so it all looks nicer. */
slSort(&geneList, geneCmp);

/* Open up output files. */
FILE *fCan = mustOpen(outCanonical, "w");
FILE *fIso = mustOpen(outIsoforms, "w");
FILE *fClus = mustOpen(outClusters, "w");

/* Loop through, making up gene name, and writing output. */
int geneId = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Make up name. */
    char name[16];
    safef(name, sizeof(name), "g%05d", ++geneId);

    /* Reverse transcript list just to make it look better. */
    slReverse(&gene->txList);

    /* Write out canonical file output */
    bed = hashMustFindVal(bedHash, gene->niceTx->name);
    fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n",
    	bed->chrom, bed->chromStart, bed->chromEnd, geneId,
	gene->niceTx->name, gene->niceTx->name);

    /* Write out isoforms output. */
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fIso, "%d\t%s\n", geneId, bed->name);

    /* Write out cluster output, starting with bed 6 standard fields. */
    fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t",
    	gene->chrom, gene->start, gene->end, name, 0, gene->strand);

    /* Write out thick-start/thick end. */
    if (gene->isCoding)
        {
	int thickStart = gene->end, thickEnd  = gene->start;
	for (bed = gene->txList; bed != NULL; bed = bed->next)
	    {
	    if (bed->thickStart < bed->thickEnd)
	        {
		thickStart = min(thickStart, bed->thickStart);
		thickEnd = max(thickEnd, bed->thickEnd);
		}
	    }
	fprintf(fClus, "%d\t%d\t", thickStart, thickEnd);
	}
    else
        {
	fprintf(fClus, "%d\t%d\t", gene->start, gene->start);
	}

    /* We got no rgb value, just write out zero. */
    fprintf(fClus, "0\t");

    /* Get exons from exonTree. */
    struct range *exon, *exonList = rangeTreeList(gene->exonTree);
    fprintf(fClus, "%d\t", slCount(exonList));
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->start - gene->start);
    fprintf(fClus, "\t");
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->end - exon->start);
    fprintf(fClus, "\t");

    /* Write out associated transcripts. */
    fprintf(fClus, "%d\t", slCount(gene->txList));
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fClus, "%s,", bed->name);
    fprintf(fClus, "\t");

    /* Write out nice value */
    fprintf(fClus, "%s\t", gene->niceTx->name);

    /* Write out coding/noncoding value. */
    fprintf(fClus, "%d\n", gene->isCoding);
    }

/* Close up files. */
carefulClose(&fCan);
carefulClose(&fIso);
carefulClose(&fClus);
}
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, 
	char *inputInfo, char *inputCds, char *outputCds, char *outputPp)
/* txCdsRepick - After we have clustered based on the preliminary coding 
 * regions we can make a more intelligent choice here about the final coding 
 * regions.
 *
 * Inputs:  inputBed (12-column transcript beds), inputTxg (transcript graphs),
 *          inputCluster (protein clusters), inputInfo (txInfo records) and
 *          inputCds (cds evidence).
 * Outputs: outputPp gets one "cluster <tab> best transcript" line per protein
 *          cluster; outputCds receives whatever pickCompatableCds writes for
 *          each transcript that has both evidence and a CDS in its bed.
 * A summary using the file-level pickedBetter/pickedNone counters is logged
 * at the end. */
{
/* Read input bed into hash.  Also calculate number with CDS set. */
struct hash *bedHash = hashNew(16);
struct bed *bed, *bedList = bedLoadNAll(inputBed, 12);
int txWithCdsCount = 0;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    if (bed->thickStart < bed->thickEnd)
        txWithCdsCount += 1;
    hashAdd(bedHash, bed->name, bed);
    }
verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed);

/* Read input transcript graphs into list, and into a hash
 * keyed by transcript names. */
struct hash *graphHash = hashNew(16);
struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg);
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    int i;
    for (i=0; i<txg->sourceCount; ++i)
        hashAdd(graphHash, txg->sources[i].accession, txg);
    }
verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList),
	graphHash->elCount, inputTxg);

/* Read input protein cluster into list, and into a hash
 * keyed by transcript name */
struct hash *clusterHash = hashNew(16);
struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster);
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    int i;
    for (i=0; i<cluster->txCount; ++i)
        hashAdd(clusterHash, cluster->txArray[i], cluster);
    }
verbose(2, "Read %d protein clusters (%d transcripts) from  %s\n", 
	slCount(clusterList), clusterHash->elCount, inputCluster);

/* Read in txInfo into a hash keyed by transcript name */
struct hash *infoHash = hashNew(16);
struct txInfo *info, *infoList = txInfoLoadAll(inputInfo);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);
verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, 
	inputInfo);

/* Read in input cds evidence into a hash keyed by transcript name
 * whose values are *lists* of evidence, kept in input-file order. */
struct hash *evHash = hashNew(16);
struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds);
int evCount = 0;
for  (ev = evList; ev != NULL; ev = nextEv)
    {
    /* Remember the successor first: the node is moved onto a per-name list. */
    nextEv = ev->next;
    struct hashEl *hel = hashLookup(evHash, ev->name);
    if (hel == NULL)
        hel = hashAdd(evHash, ev->name, NULL);
    slAddTail(&hel->val, ev);
    ++evCount;
    }
verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n",
	evCount, evHash->elCount, inputCds);

/* Create a hash containing what looks to be the best protein-coding
 * transcript in each protein cluster.  This is keyed by cluster name
 * with transcript names for values.  The same picks go to outputPp. */
FILE *f = mustOpen(outputPp, "w");
struct hash *bestInClusterHash = hashNew(16);
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    double bestScore = -BIGNUM;
    char *bestTx = NULL;
    int i;
    for (i=0; i<cluster->txCount; ++i)
        {
	char *tx = cluster->txArray[i];
	info = hashMustFindVal(infoHash, tx);
	double score = infoCodingScore(info, TRUE);
	/* Strict comparison: on a tie the earlier transcript is kept. */
	if (score > bestScore)
	    {
	    bestTx = tx;
	    bestScore = score;
	    }
	}
    hashAdd(bestInClusterHash, cluster->name, bestTx);
    fprintf(f, "%s\t%s\n", cluster->name, bestTx);
    }
carefulClose(&f);
verbose(2, "Picked best protein for each protein cluster\n");


/* Loop through each transcript cluster (graph).  Make a list of
 * protein clusters associated with that graph. Armed with this
 * information call repick routine on each transcript in the graph. */
f = mustOpen(outputCds, "w");
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    /* Build up list of protein clusters associated with transcript cluster. */
    struct slRef *protClusterRefList = NULL, *protClusterRef;
    int i;
    for (i=0; i<txg->sourceCount; ++i)
	{
	char *tx = txg->sources[i].accession;
	struct txCluster *protCluster = hashFindVal(clusterHash, tx);
	if (protCluster != NULL)
	    refAddUnique(&protClusterRefList, protCluster);
	}

    /* Figure out best scoring protein in RNA cluster, and set threshold
     * to eliminate ones scoring less than half this much. */
    double bestProtScore = 0;
    for (protClusterRef = protClusterRefList; protClusterRef != NULL;
    	protClusterRef = protClusterRef->next)
	{
	struct txCluster *protCluster = protClusterRef->val;
	char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name);
	struct txInfo *protInfo = hashMustFindVal(infoHash, protTx);
	double score = infoCodingScore(protInfo, FALSE);
	bestProtScore = max(score, bestProtScore);
	}
    double protScoreThreshold = bestProtScore * 0.5;

    /* Get list of references to beds of proteins over that threshold. */
    struct slRef *protRefList = NULL;
    for (protClusterRef = protClusterRefList; protClusterRef != NULL;
    	protClusterRef = protClusterRef->next)
	{
	struct txCluster *protCluster = protClusterRef->val;
	char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name);
	struct txInfo *protInfo = hashMustFindVal(infoHash, protTx);
	double score = infoCodingScore(protInfo, FALSE);
	if (score >= protScoreThreshold)
	    {
	    struct bed *protBed = hashMustFindVal(bedHash, protTx);
	    refAdd(&protRefList, protBed);
	    }
	}

    /* Go repick each CDS in RNA cluster.  Transcripts without evidence or
     * without a CDS in their bed are skipped. */
    for (i=0; i<txg->sourceCount; ++i)
        {
	char *tx = txg->sources[i].accession;
	struct bed *txBed = hashMustFindVal(bedHash, tx);
	struct cdsEvidence *txEvList = hashFindVal(evHash, tx);
	if (txEvList != NULL && txBed->thickStart < txBed->thickEnd)
	    {
	    info = hashMustFindVal(infoHash, txBed->name);
	    pickCompatableCds(txBed, protRefList, txEvList, info, f);
	    }
	}

    /* Release both per-graph reference lists.  Only the list nodes are
     * freed; the clusters and beds they refer to stay owned by the hashes.
     * NOTE(review): assumes pickCompatableCds does not keep protRefList
     * after returning - confirm against its definition. */
    slFreeList(&protRefList);
    slFreeList(&protClusterRefList);
    }
carefulClose(&f);
/* pickedBetter and pickedNone are counters defined outside this function
 * (presumably updated by pickCompatableCds - confirm). */
verbose(1, "repicked %d, removed %d, no change to %d\n",
    pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone);
}