void pickCompatableCds(struct bed *bed, struct slRef *protRefList, 
	struct cdsEvidence *evList, struct txInfo *info, FILE *f)
/* Write to f the CDS evidence chosen for bed, if any.
 *   bed         - transcript being examined; it is passed to
 *                 cdsEvidenceSetBedThick for each candidate tried.
 *   protRefList - list the bed must be compatible with
 *                 (as judged by bedCompatibleWithList).
 *   evList      - candidate CDS evidence; the head is the current choice.
 *   info        - transcript info; only isRefSeq is consulted.
 *   f           - output file.
 * The head of evList is written unchanged when the transcript is a RefSeq
 * or the bed is already compatible.  Otherwise the entries after the head
 * are tried in list order and the first compatible one is written.  The
 * file-level counters pickedBetter / pickedNone record the outcome. */
{
struct cdsEvidence *candidate;

/* Nothing to repick: keep the head of the evidence list. */
if (info->isRefSeq || bedCompatibleWithList(bed, protRefList))
    {
    cdsEvidenceTabOut(evList, f);
    return;
    }

/* The head was rejected above, so start from the second entry. */
for (candidate = evList->next; candidate != NULL; candidate = candidate->next)
    {
    /* Only candidates with exactly one CDS are considered. */
    if (candidate->cdsCount != 1)
	continue;

    /* NOTE(review): presumably this rewrites the thick bounds of bed
     * from the candidate - confirm against cdsEvidenceSetBedThick. */
    cdsEvidenceSetBedThick(candidate, bed);
    if (!bedCompatibleWithList(bed, protRefList))
	continue;

    cdsEvidenceTabOut(candidate, f);
    verbose(3, "Repicking CDS for %s, new one is based on %s %s score %f\n", 
	    bed->name, candidate->source, candidate->accession, candidate->score);
    ++pickedBetter;
    return;
    }

/* Ran off the end of the list without finding a compatible CDS. */
verbose(3, "Removing CDS from %s\n", bed->name);
++pickedNone;
}
/* Example #2 */
/* 0 */
void txCdsRefBestEvOnly(char *inFile, char *outFile)
/* txCdsRefBestEvOnly - Read cdsEvidence records from inFile and write to
 * outFile only those whose start and end equal those of the "native"
 * RefSeqReviewed record stored for the same name.  A RefSeqReviewed record
 * is native when the text after the last '.' in its name equals its
 * accession. */
{
struct cdsEvidence *evList = cdsEvidenceLoadAll(inFile);
struct hash *nativeEvHash = hashNew(18);
FILE *out = mustOpen(outFile, "w");
struct cdsEvidence *ev;

/* Pass 1: index the native RefSeqReviewed records by transcript name. */
for (ev = evList; ev != NULL; ev = ev->next)
    {
    if (!sameString(ev->source, "RefSeqReviewed"))
	continue;

    /* The name is expected to contain a '.'; what follows the last one
     * is compared with the accession. */
    char *acc = strrchr(ev->name, '.');
    assert(acc != NULL);
    acc += 1;
    if (sameString(acc, ev->accession))
	hashAdd(nativeEvHash, ev->name, ev);
    }

/* Pass 2: emit every record whose coordinates coincide with the native
 * ORF recorded for its name.  Names without a native record are skipped. */
for (ev = evList; ev != NULL; ev = ev->next)
    {
    struct cdsEvidence *native = hashFindVal(nativeEvHash, ev->name);
    if (native != NULL && ev->start == native->start && ev->end == native->end)
	cdsEvidenceTabOut(ev, out);
    }

carefulClose(&out);
}
/* Example #3 */
/* 0 */
void txCdsPredict(char *inFa, char *outCds, char *nmdBed, char *mafFile, boolean anyStart)
/* txCdsPredict - Somewhat simple-minded ORF predictor using a weighting scheme.
 *   inFa     - fasta file of sequences to scan.
 *   outCds   - output file; gets the top-sorted ORF of each sequence that
 *              has at least one.
 *   nmdBed   - optional (may be NULL) 12-column bed file, hashed by name.
 *   mafFile  - optional (may be NULL) maf file, hashed by the src of each
 *              alignment's first component.
 *   anyStart - passed through unchanged to orfsOnRna. */
{
struct dnaSeq *seqList = faReadAllDna(inFa);
verbose(2, "Read %d sequences from %s\n", slCount(seqList), inFa);

/* Beds keyed by name.  The hash exists (empty) even without a bed file so
 * that orfsOnRna always receives a valid hash. */
struct hash *nmdHash = hashNew(18);
if (nmdBed != NULL)
    {
    struct bed *bed = bedLoadNAll(nmdBed, 12);
    while (bed != NULL)
	{
	hashAdd(nmdHash, bed->name, bed);
	bed = bed->next;
	}
    verbose(2, "Read %d beds from %s\n", nmdHash->elCount, nmdBed);
    }

/* Alignments keyed by the src of their first component, plus a count of
 * the distinct src values seen among all the remaining components. */
struct hash *mafHash = hashNew(18);
int otherSpeciesCount = 0;
if (mafFile != NULL)
    {
    struct mafFile *mf = mafReadAll(mafFile);
    struct hash *uniqSpeciesHash = hashNew(0);
    struct mafAli *ali;
    for (ali = mf->alignments; ali != NULL; ali = ali->next)
	{
	struct mafComp *first = ali->components;
	struct mafComp *other;
	hashAdd(mafHash, first->src, ali);
	/* Skip the first component; only the others are counted. */
	for (other = first->next; other != NULL; other = other->next)
	    hashStore(uniqSpeciesHash, other->src);
	}
    verbose(2, "Read %d alignments from %s\n", mafHash->elCount, mafFile);
    otherSpeciesCount = uniqSpeciesHash->elCount;
    verbose(2, "%d other species in %s\n", otherSpeciesCount, mafFile);
    }

/* Score the ORFs of each sequence and write out the one that sorts first. */
FILE *out = mustOpen(outCds, "w");
struct dnaSeq *seq;
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    verbose(3, "%s\n", seq->name);
    struct cdsEvidence *orfs = orfsOnRna(seq, nmdHash, mafHash, otherSpeciesCount, anyStart);
    if (orfs != NULL)
	{
	slSort(&orfs, cdsEvidenceCmpScore);
	cdsEvidenceTabOut(orfs, out);
	}
    cdsEvidenceFreeList(&orfs);
    }
carefulClose(&out);
}
void txCdsPick(char *inBed, char *inTce, char *refToPepTab, char *outTce, char *outPick)
/* txCdsPick - Pick best CDS if any for transcript given evidence.. */
{
struct hash *pepToRefHash, *refToPepHash;
hashRefToPep(refToPepTab, &refToPepHash, &pepToRefHash);
struct hash *txCdsInfoHash = loadAndWeighTce(inTce, refToPepHash, pepToRefHash);
verbose(2, "Read info on %d transcripts from %s\n", 
	txCdsInfoHash->elCount, inTce);
struct lineFile *lf = lineFileOpen(inBed, TRUE);
FILE *fTce = mustOpen(outTce, "w");
FILE *fPick = mustOpen(outPick, "w");
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    struct txCdsInfo *tx = hashFindVal(txCdsInfoHash, bed->name);
    struct cdsPick pick;
    ZeroVar(&pick);
    pick.name = bed->name;
    pick.refSeq = pick.refProt = pick.swissProt = pick.uniProt = pick.ccds = "";
    if (tx != NULL && tx->cdsList->score >= weightedThreshold)
        {
	struct cdsEvidence *cds, *bestCds = tx->cdsList;
	int bestSize = bestCds->end - bestCds->start;
	int minSize = bestSize*0.50;
	cdsEvidenceTabOut(bestCds, fTce);
	pick.start = bestCds->start;
	pick.end = bestCds->end;
	pick.source = bestCds->source;
	pick.score = bestCds->score;
	pick.startComplete = bestCds->startComplete;
	pick.endComplete = bestCds->endComplete;
	for (cds = tx->cdsList; cds != NULL; cds = cds->next)
	    {
	    char *source = cds->source;
	    if (rangeIntersection(bestCds->start, bestCds->end, cds->start, cds->end)
	    	>= minSize)
		{
		if (startsWith("RefPep", source))
		    {
		    if (pick.refProt[0] == 0)
			{
			pick.refProt = cds->accession;
			if (pick.refSeq[0] == 0)
			    pick.refSeq = hashMustFindVal(pepToRefHash, cds->accession);
			}
		    }
		else if (startsWith("RefSeq", source))
		    {
		    if (pick.refSeq[0] == 0)
		        pick.refSeq = cds->accession;
		    }
		else if (sameString("swissProt", source))
		    {
		    if (pick.swissProt[0] == 0)
			{
			pick.swissProt = cds->accession;
			if (pick.uniProt[0] == 0)
			    pick.uniProt = cds->accession;
			}
		    }
		else if (sameString("trembl", source))
		    {
		    if (pick.uniProt[0] == 0)
			pick.uniProt = cds->accession;
		    }
		else if (sameString("txCdsPredict", source))
		    {
		    }
		else if (sameString("genbankCds", source))
		    {
		    }
		else if (sameString("ccds", source))
		    {
		    if (pick.ccds[0] == 0)
		        pick.ccds = cds->accession;
		    }
		else
		    errAbort("Unknown source %s", source);
		}
	    }

	if (exceptionsOut)
	    transferExceptions(bestCds->accession, bestCds->source, pepToRefHash,
	    	bed->name, exceptionsOut);
	}
    else
        {
	pick.source = "noncoding";
	}
    cdsPickTabOut(&pick, fPick);
    bedFree(&bed);
    }
carefulClose(&fPick);
carefulClose(&fTce);
}