Exemplo n.º 1
0
void txCdsRefBestEvOnly(char *inFile, char *outFile)
/* txCdsRefBestEvOnly - Go through a cdsEvidence file, and extract only the bits that refer to the native orf for a RefSeqReviewed transcript.. */
{
struct cdsEvidence *cds, *cdsList = cdsEvidenceLoadAll(inFile);
struct hash *nativeEvHash = hashNew(18);
FILE *f = mustOpen(outFile, "w");

/* Make one pass through list adding the native refseq reviewed records to hash. */
for (cds = cdsList; cds != NULL; cds = cds->next)
    {
    if (sameString(cds->source, "RefSeqReviewed"))
        {
	char *acc = strrchr(cds->name, '.');
	assert(acc != NULL);
	acc += 1;
	if (sameString(acc, cds->accession))
	    hashAdd(nativeEvHash, cds->name, cds);
	}
    }

/* Make another pass through outputting all lines that correspond to
 * reviewd refseq's ORF. */
for (cds = cdsList; cds != NULL; cds = cds->next)
    {
    struct cdsEvidence *native = hashFindVal(nativeEvHash, cds->name);
    if (native != NULL)
        {
	if (cds->start == native->start && cds->end == native->end)
	    cdsEvidenceTabOut(cds, f);
	}
    }
carefulClose(&f);
}
Exemplo n.º 2
0
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	info.sourceAcc = cloneString(bed->name);
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, 
	char *inputInfo, char *inputCds, char *outputCds, char *outputPp)
/* txCdsRepick - After we have clustered based on the preliminary coding 
 * regions we can make a more intelligent choice here about the final coding 
 * regions. */
{
/* Read input bed into hash.  Also calculate number with CDS set. */
struct hash *bedHash = hashNew(16);
struct bed *bed, *bedList = bedLoadNAll(inputBed, 12);
int txWithCdsCount = 0;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    if (bed->thickStart < bed->thickEnd)
        txWithCdsCount += 1;
    hashAdd(bedHash, bed->name, bed);
    }
verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed);

/* Read input transcript graphs into list, and into a hash
 * keyed by transcript names. */
struct hash *graphHash = hashNew(16);
struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg);
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    int i;
    for (i=0; i<txg->sourceCount; ++i)
        hashAdd(graphHash, txg->sources[i].accession, txg);
    }
verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList),
	graphHash->elCount, inputTxg);

/* Read input protein cluster into list, and into a hash
 * keyed by transcript name */
struct hash *clusterHash = hashNew(16);
struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster);
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    int i;
    for (i=0; i<cluster->txCount; ++i)
        hashAdd(clusterHash, cluster->txArray[i], cluster);
    }
verbose(2, "Read %d protein clusters (%d transcripts) from  %s\n", 
	slCount(clusterList), clusterHash->elCount, inputCluster);

/* Read in txInfo into a hash keyed by transcript name */
struct hash *infoHash = hashNew(16);
struct txInfo *info, *infoList = txInfoLoadAll(inputInfo);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);
verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, 
	inputInfo);

/* Read in input cds evidence into a hash keyed by transcript name
 * who's values are a sorted *list* of evidence. */
struct hash *evHash = hashNew(16);
struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds);
int evCount = 0;
for  (ev = evList; ev != NULL; ev = nextEv)
    {
    nextEv = ev->next;
    struct hashEl *hel = hashLookup(evHash, ev->name);
    if (hel == NULL)
        hel = hashAdd(evHash, ev->name, NULL);
    slAddTail(&hel->val, ev);
    ++evCount;
    }
verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n",
	evCount, evHash->elCount, inputCds);

/* Create a hash containing what looks to be the best protein-coding
 * transcript in each protein cluster.  This is keyed by cluster name
 * with transcript names for values. */
FILE *f = mustOpen(outputPp, "w");
struct hash *bestInClusterHash = hashNew(16);
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    double bestScore = -BIGNUM;
    char *bestTx = NULL;
    int i;
    for (i=0; i<cluster->txCount; ++i)
        {
	char *tx = cluster->txArray[i];
	info = hashMustFindVal(infoHash, tx);
	double score = infoCodingScore(info, TRUE);
	if (score > bestScore)
	    {
	    bestTx = tx;
	    bestScore = score;
	    }
	}
    hashAdd(bestInClusterHash, cluster->name, bestTx);
    fprintf(f, "%s\t%s\n", cluster->name, bestTx);
    }
carefulClose(&f);
verbose(2, "Picked best protein for each protein cluster\n");


/* Loop through each transcript cluster (graph).  Make a list of
 * protein clusters associated with that graph. Armed with this
 * information call repick routine on each transcript in the graph. */
f = mustOpen(outputCds, "w");
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    /* Build up list of protein clusters associated with transcript cluster. */
    struct slRef *protClusterRefList = NULL, *protClusterRef;
    int i;
    for (i=0; i<txg->sourceCount; ++i)
	{
	char *tx = txg->sources[i].accession;
	struct txCluster *protCluster = hashFindVal(clusterHash, tx);
	if (protCluster != NULL)
	    refAddUnique(&protClusterRefList, protCluster);
	}

    /* Figure out best scoring protein in RNA cluster, and set threshold
     * to eliminate ones scoring less than half this much. */
    double bestProtScore = 0;
    for (protClusterRef = protClusterRefList; protClusterRef != NULL;
    	protClusterRef = protClusterRef->next)
	{
	struct txCluster *protCluster = protClusterRef->val;
	char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name);
	struct txInfo *info = hashMustFindVal(infoHash, protTx);
	double score = infoCodingScore(info, FALSE);
	bestProtScore = max(score, bestProtScore);
	}
    double protScoreThreshold = bestProtScore * 0.5;

    /* Get list of references to beds of proteins over that threshold. */
    struct slRef *protRefList = NULL;
    for (protClusterRef = protClusterRefList; protClusterRef != NULL;
    	protClusterRef = protClusterRef->next)
	{
	struct txCluster *protCluster = protClusterRef->val;
	char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name);
	struct txInfo *info = hashMustFindVal(infoHash, protTx);
	double score = infoCodingScore(info, FALSE);
	if (score >= protScoreThreshold)
	    {
	    struct bed *bed = hashMustFindVal(bedHash, protTx);
	    refAdd(&protRefList, bed);
	    }
	}

    /* Go repick each CDS in RNA cluster */
    for (i=0; i<txg->sourceCount; ++i)
        {
	char *tx = txg->sources[i].accession;
	struct bed *bed = hashMustFindVal(bedHash, tx);
	struct cdsEvidence *evList = hashFindVal(evHash, tx);
	if (evList != NULL && bed->thickStart < bed->thickEnd)
	    {
	    info = hashMustFindVal(infoHash, bed->name);
	    pickCompatableCds(bed, protRefList, evList, info, f);
	    }
	}
    slFreeList(&protClusterRefList);
    }
carefulClose(&f);
verbose(1, "repicked %d, removed %d, no change to %d\n",
    pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone);
}