void pickCompatableCds(struct bed *bed, struct slRef *protRefList, struct cdsEvidence *evList, struct txInfo *info, FILE *f) /* Given a bed, a list of protein-coding beds to be compatible with, * and a sorted list of possible CDS's, write first CDS if any that * is compatible with any on list to file. */ { if (info->isRefSeq || bedCompatibleWithList(bed, protRefList)) { cdsEvidenceTabOut(evList, f); } else { struct cdsEvidence *ev; for (ev = evList->next; ev != NULL; ev = ev->next) { if (ev->cdsCount == 1) { cdsEvidenceSetBedThick(ev, bed); if (bedCompatibleWithList(bed, protRefList)) { cdsEvidenceTabOut(ev, f); verbose(3, "Repicking CDS for %s, new one is based on %s %s score %f\n", bed->name, ev->source, ev->accession, ev->score); ++pickedBetter; break; } } } if (ev == NULL) { verbose(3, "Removing CDS from %s\n", bed->name); ++pickedNone; } } }
void txCdsRefBestEvOnly(char *inFile, char *outFile)
/* txCdsRefBestEvOnly - Filter a cdsEvidence file down to the records
 * that agree with the native ORF of a RefSeqReviewed transcript.
 *
 * All records are loaded from inFile.  A record counts as "native"
 * when its source is "RefSeqReviewed" and its accession equals the
 * text following the last '.' in its name; native records are indexed
 * by name.  Every loaded record whose name has a native entry, and
 * whose start and end both equal that entry's, is then written to
 * outFile. */
{
struct cdsEvidence *evList = cdsEvidenceLoadAll(inFile);
struct cdsEvidence *ev;
struct hash *nativeByName = hashNew(18);
FILE *out = mustOpen(outFile, "w");

/* Pass 1: index the native RefSeqReviewed records by transcript name. */
for (ev = evList; ev != NULL; ev = ev->next)
    {
    char *lastDot;

    if (!sameString(ev->source, "RefSeqReviewed"))
	continue;

    /* A reviewed record's name is required to contain a '.'. */
    lastDot = strrchr(ev->name, '.');
    assert(lastDot != NULL);
    lastDot += 1;
    if (sameString(lastDot, ev->accession))
	hashAdd(nativeByName, ev->name, ev);
    }

/* Pass 2: emit every record sharing the native record's coordinates. */
for (ev = evList; ev != NULL; ev = ev->next)
    {
    struct cdsEvidence *native = hashFindVal(nativeByName, ev->name);

    if (native == NULL)
	continue;
    if (ev->start != native->start || ev->end != native->end)
	continue;
    cdsEvidenceTabOut(ev, out);
    }

carefulClose(&out);
}
void txCdsPredict(char *inFa, char *outCds, char *nmdBed, char *mafFile,
	boolean anyStart)
/* txCdsPredict - Somewhat simple-minded ORF predictor using a
 * weighting scheme.
 *
 * Reads every sequence in inFa, asks orfsOnRna() for candidate ORFs on
 * each one, sorts the candidates with cdsEvidenceCmpScore and passes
 * the sorted list to cdsEvidenceTabOut() for writing to outCds.
 *
 * nmdBed  - optional (may be NULL) file loaded with bedLoadNAll(.., 12);
 *           its records are hashed by name and handed to orfsOnRna().
 * mafFile - optional (may be NULL) alignment file; each alignment is
 *           hashed under the src of its first component, and the
 *           number of distinct src values among the remaining
 *           components is passed on as the other-species count.
 * anyStart - forwarded to orfsOnRna() unchanged.
 *
 * When an optional input is NULL the corresponding hash is still
 * created, just left empty.  None of the hashes or loaded lists are
 * freed here. */
{
struct dnaSeq *seqList = faReadAllDna(inFa);
struct dnaSeq *seq;
verbose(2, "Read %d sequences from %s\n", slCount(seqList), inFa);

/* Bed records, keyed by name, for NMD analysis. */
struct hash *nmdHash = hashNew(18);
if (nmdBed != NULL)
    {
    struct bed *bed = bedLoadNAll(nmdBed, 12);
    while (bed != NULL)
	{
	hashAdd(nmdHash, bed->name, bed);
	bed = bed->next;
	}
    verbose(2, "Read %d beds from %s\n", nmdHash->elCount, nmdBed);
    }

/* Alignments, keyed by first-component src, for conservation analysis. */
struct hash *mafHash = hashNew(18);
int otherSpeciesCount = 0;
if (mafFile != NULL)
    {
    struct mafFile *mf = mafReadAll(mafFile);
    struct mafAli *ali;

    for (ali = mf->alignments; ali != NULL; ali = ali->next)
	hashAdd(mafHash, ali->components->src, ali);
    verbose(2, "Read %d alignments from %s\n", mafHash->elCount, mafFile);

    /* Count distinct src values, skipping each alignment's first component. */
    struct hash *speciesSeen = hashNew(0);
    for (ali = mf->alignments; ali != NULL; ali = ali->next)
	{
	struct mafComp *other = ali->components->next;
	while (other != NULL)
	    {
	    hashStore(speciesSeen, other->src);
	    other = other->next;
	    }
	}
    otherSpeciesCount = speciesSeen->elCount;
    verbose(2, "%d other species in %s\n", otherSpeciesCount, mafFile);
    }

FILE *out = mustOpen(outCds, "w");
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    verbose(3, "%s\n", seq->name);
    struct cdsEvidence *orfs = orfsOnRna(seq, nmdHash, mafHash,
	    otherSpeciesCount, anyStart);
    if (orfs != NULL)
	{
	slSort(&orfs, cdsEvidenceCmpScore);
	cdsEvidenceTabOut(orfs, out);
	}
    cdsEvidenceFreeList(&orfs);
    }
carefulClose(&out);
}
void txCdsPick(char *inBed, char *inTce, char *refToPepTab, char *outTce, char *outPick) /* txCdsPick - Pick best CDS if any for transcript given evidence.. */ { struct hash *pepToRefHash, *refToPepHash; hashRefToPep(refToPepTab, &refToPepHash, &pepToRefHash); struct hash *txCdsInfoHash = loadAndWeighTce(inTce, refToPepHash, pepToRefHash); verbose(2, "Read info on %d transcripts from %s\n", txCdsInfoHash->elCount, inTce); struct lineFile *lf = lineFileOpen(inBed, TRUE); FILE *fTce = mustOpen(outTce, "w"); FILE *fPick = mustOpen(outPick, "w"); char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); struct txCdsInfo *tx = hashFindVal(txCdsInfoHash, bed->name); struct cdsPick pick; ZeroVar(&pick); pick.name = bed->name; pick.refSeq = pick.refProt = pick.swissProt = pick.uniProt = pick.ccds = ""; if (tx != NULL && tx->cdsList->score >= weightedThreshold) { struct cdsEvidence *cds, *bestCds = tx->cdsList; int bestSize = bestCds->end - bestCds->start; int minSize = bestSize*0.50; cdsEvidenceTabOut(bestCds, fTce); pick.start = bestCds->start; pick.end = bestCds->end; pick.source = bestCds->source; pick.score = bestCds->score; pick.startComplete = bestCds->startComplete; pick.endComplete = bestCds->endComplete; for (cds = tx->cdsList; cds != NULL; cds = cds->next) { char *source = cds->source; if (rangeIntersection(bestCds->start, bestCds->end, cds->start, cds->end) >= minSize) { if (startsWith("RefPep", source)) { if (pick.refProt[0] == 0) { pick.refProt = cds->accession; if (pick.refSeq[0] == 0) pick.refSeq = hashMustFindVal(pepToRefHash, cds->accession); } } else if (startsWith("RefSeq", source)) { if (pick.refSeq[0] == 0) pick.refSeq = cds->accession; } else if (sameString("swissProt", source)) { if (pick.swissProt[0] == 0) { pick.swissProt = cds->accession; if (pick.uniProt[0] == 0) pick.uniProt = cds->accession; } } else if (sameString("trembl", source)) { if (pick.uniProt[0] == 0) pick.uniProt = cds->accession; } else if 
(sameString("txCdsPredict", source)) { } else if (sameString("genbankCds", source)) { } else if (sameString("ccds", source)) { if (pick.ccds[0] == 0) pick.ccds = cds->accession; } else errAbort("Unknown source %s", source); } } if (exceptionsOut) transferExceptions(bestCds->accession, bestCds->source, pepToRefHash, bed->name, exceptionsOut); } else { pick.source = "noncoding"; } cdsPickTabOut(&pick, fPick); bedFree(&bed); } carefulClose(&fPick); carefulClose(&fTce); }