void mafMeFirst(char *inMaf, char *meFile, char *outMaf) /* mafMeFirst - Move component to top if it is one of the named ones. Useful * in conjunction with mafFrags when you don't want the one with the gene name * to be in the middle.. */ { struct hash *meHash = hashWordsInFile(meFile, 18); struct mafFile *mf = mafOpen(inMaf); FILE *f = mustOpen(outMaf, "w"); mafWriteStart(f, mf->scoring); struct mafAli *maf; while ((maf = mafNext(mf)) != NULL) { struct mafComp *comp = compInHash(maf, meHash); if (comp == NULL) errAbort("No components in %s in maf ending line %d of %s", meFile, mf->lf->lineIx, mf->lf->fileName); slRemoveEl(&maf->components, comp); slAddHead(&maf->components, comp); mafWrite(f, maf); mafAliFree(&maf); } mafWriteEnd(f); carefulClose(&f); }
void weedLines(char *weedFile, char *file, char *output, boolean invert, char *invertOutput)
/* weedLines - Selectively remove lines from file.
 *   weedFile     - file whose words mark a line for removal
 *   file         - input, read one line at a time
 *   output       - receives the lines that are kept
 *   invert       - if TRUE the keep/remove decision is reversed
 *   invertOutput - if non-NULL, receives the lines that were removed
 * By default a line is weeded when one of the words nextWord() splits out of
 * it is in weedFile.  With the "embedded" command line option it is weeded
 * when stringIn() finds any weed word in the line. */
{
struct hash *hash = hashWordsInFile(weedFile, 16);
struct hashEl *weedList = hashElListHash(hash);
verbose(2, "%d words in weed file %s\n", hash->elCount, weedFile);
struct lineFile *lf = lineFileOpen(file, TRUE);
char *line, *word;
FILE *f = mustOpen(output, "w");
FILE *fInvert = NULL;
boolean embedded = optionExists("embedded");

if (invertOutput != NULL)
    fInvert = mustOpen(invertOutput, "w");

while (lineFileNext(lf, &line, NULL))
    {
    boolean doWeed = FALSE;
    char *dupe = NULL;
    if (embedded)
        {
        struct hashEl *hel;
        /* One hit settles the question, so stop at the first match. */
        for (hel = weedList; hel != NULL && !doWeed; hel = hel->next)
            {
            if (stringIn(hel->name, line))
                doWeed = TRUE;
            }
        }
    else
        {
        /* nextWord() advances line (and presumably chops it up), so keep
         * an intact copy to print. */
        dupe = cloneString(line);
        while (!doWeed && (word = nextWord(&line)) != NULL)
            {
            if (hashLookup(hash, word))
                doWeed = TRUE;
            }
        line = dupe;
        }
    if (invert)
        doWeed = !doWeed;
    if (!doWeed)
        fprintf(f, "%s\n", line);
    else
        {
        if (fInvert != NULL)
            fprintf(fInvert, "%s\n", line);
        }
    freez(&dupe);
    }

/* Close the outputs explicitly rather than relying on process exit to
 * flush them. */
carefulClose(&f);
if (fInvert != NULL)
    carefulClose(&fInvert);
}
static struct hash *processFieldHash(struct joiner *joiner, char *inName, char *outName) /* Read in field hash from file if inName is non-NULL, * else read from database. If outName is non-NULL, * save it to file. */ { struct hash *fieldHash; if (inName != NULL) fieldHash = hashWordsInFile(inName, 18); else fieldHash = joinerAllFields(joiner); if (outName != NULL) { struct hashEl *el, *list = hashElListHash(fieldHash); FILE *f = mustOpen(outName, "w"); slSort(&list, hashElCmp); for (el = list; el != NULL; el = el->next) fprintf(f, "%s\n", el->name); slFreeList(&list); carefulClose(&f); } return fieldHash; }
void txGeneBakeOff(char *database, char *refRevFile, char *refClusterFile, char *geneTrack) /* txGeneBakeOff - Compare gene finder results to reference annotations.. */ { hSetDb(database); /* Make list of our clusters. */ struct refCluster *cluster, *clusterList = refClusterLoadAll(refClusterFile); /* Make list of only refseqs in reviewed list. */ struct hash *refRevOnlyHash = hashWordsInFile(refRevFile, 16); struct bed *refAll = bedThickOnlyList(hWholeTrackAsBedList("refGene")); struct bed *refList = NULL, *nextRef, *ref; struct hash *refBedHash = hashNew(18); for (ref = refAll; ref != NULL; ref = nextRef) { nextRef = ref->next; if (hashLookup(refRevOnlyHash, ref->name)) { slAddHead(&refList, ref); hashAdd(refBedHash, ref->name, ref); } } verbose(2, "%d of %d reviewed are still in refGene track\n", slCount(refList), refRevOnlyHash->elCount); /* Turn this into hash. */ struct hash *refHash = bedsIntoKeeperHash(refList); verbose(2, "Loaded %d items from %s into %d chromosomes\n", slCount(refList), refRevFile, refHash->elCount); struct bed *geneList = bedThickOnlyList(hWholeTrackAsBedList(geneTrack)); struct hash *geneHash = bedsIntoKeeperHash(geneList); verbose(2, "Loaded %d items from %s into %d chromosomes\n", slCount(geneList), geneTrack, geneHash->elCount); int allCount = 0, allMiss = 0; int allExact = 0, allClose = 0, allHalf = 0, allAny = 0; // struct hash *uniqHash = hashNew(0); for (ref = refList; ref != NULL; ref = ref->next) { double ratio; struct bed *gene = mostOverlappingBed(ref, geneHash, &ratio); if (gene != NULL) { ++allAny; if (ratio == 1.0) ++allExact; else if (ratio >= 0.80) ++allClose; else if (ratio >= 0.50) ++allHalf; } else ++allMiss; ++allCount; } printf("Exact match: %d (%4.2f%%)\n", allExact, 100.0 * allExact/allCount); printf("80%% match: %d (%4.2f%%)\n", allClose, 100.0 * allClose/allCount); //printf("50%% match: %d (%4.2f%%)\n", allHalf, 100.0 * allHalf/allCount); //printf("any match: %d (%4.2f%%)\n", allAny, 100.0 * allAny/allCount); 
//printf("Clean miss: %d (%4.2f%%)\n", allMiss, 100.0 * allMiss/allCount); int anyCount = 0, anyMiss = 0; int anyExact = 0, anyClose = 0, anyHalf = 0, anyAny = 0; for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { struct bed *gene, *ref; double ratio; if (findMostOverlappingInCluster(cluster, geneHash, refBedHash, &gene, &ref, &ratio)) { ++anyAny; if (ratio == 1.0) ++anyExact; else if (ratio >= 0.80) ++anyClose; else if (ratio >= 0.50) ++anyHalf; } else ++anyMiss; ++anyCount; } // printf("Total reviewed clusters: %d\n", anyCount); printf("Exact match any: %d (%4.2f%%)\n", anyExact, 100.0 * anyExact/anyCount); printf("80%% match any: %d (%4.2f%%)\n", anyClose, 100.0 * anyClose/anyCount); // printf("50%% match any: %d (%4.2f%%)\n", anyHalf, 100.0 * anyHalf/anyCount); // printf("any match any: %d (%4.2f%%)\n", anyAny, 100.0 * anyAny/anyCount); // printf("Clean miss any: %d (%4.2f%%)\n", anyMiss, 100.0 * anyMiss/anyCount); printf("Base coverage: (%4.2f%%)\n", 100.0*calcBaseCoverage(refHash, geneHash)); }
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile) /* txInfoAssemble - Assemble information from various sources into txInfo table.. */ { /* Build up hash of evidence keyed by transcript name. */ struct hash *cdsEvHash = hashNew(18); struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile); for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next) hashAddUnique(cdsEvHash, cdsEv->name, cdsEv); verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile); /* Build up hash of bestorf structures keyed by transcript name */ struct hash *predictHash = hashNew(18); struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile); for (predict = predictList; predict != NULL; predict = predict->next) hashAddUnique(predictHash, predict->name, predict); verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile); /* Build up structure for random access of retained introns */ struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6); verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile); struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList); /* Read in exception info. */ struct hash *selenocysteineHash, *altStartHash; genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash); /* Read in polyA sizes */ struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile); verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile); /* Read in psls */ struct hash *pslHash = hashNew(20); struct psl *psl, *pslList = pslLoadAll(pslFile); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(pslHash, psl->qName, psl); verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile); /* Read in accessions that we flipped for better splice sites. 
*/ struct hash *flipHash = hashWordsInFile(flipFile, 0); /* Open primary gene input and output. */ struct lineFile *lf = lineFileOpen(txBedFile, TRUE); FILE *f = mustOpen(outFile, "w"); /* Main loop - process each gene */ char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(3, "Processing %s\n", bed->name); /* Initialize info to zero */ struct txInfo info; ZeroVar(&info); /* Figure out name, sourceAcc, and isRefSeq from bed->name */ info.name = bed->name; info.category = "n/a"; if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL) { info.sourceAcc = cloneString(bed->name); } else { info.sourceAcc = txAccFromTempName(bed->name); } info.isRefSeq = startsWith("NM_", info.sourceAcc); if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL) { /* Fake up some things for antibody frag and CCDS that don't have alignments. */ info.sourceSize = bedTotalBlockSize(bed); info.aliCoverage = 1.0; info.aliIdRatio = 1.0; info. genoMapCount = 1; } else { /* Loop through all psl's associated with our RNA. Figure out * our overlap with each, and pick best one. */ struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc); if (firstPslHel == NULL) errAbort("%s is not in %s", info.sourceAcc, pslFile); int mapCount = 0; struct psl *psl, *bestPsl = NULL; int coverage, bestCoverage = 0; boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL); for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel)) { psl = hel->val; mapCount += 1; coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } /* If we flipped it, try it on the opposite strand too. */ if (isFlipped) { psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+'); coverage = pslBedOverlap(psl, bed); if (coverage > bestCoverage) { bestCoverage = coverage; bestPsl = psl; } psl->strand[0] = (psl->strand[0] == '+' ? 
'-' : '+'); } } if (bestPsl == NULL) errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile); /* Figure out and save alignment statistics. */ int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0); info.sourceSize = bestPsl->qSize - polyA; info.aliCoverage = (double)bestCoverage / info.sourceSize; info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch); info. genoMapCount = mapCount; } /* Get orf size and start/end complete from cdsEv. */ if (bed->thickStart < bed->thickEnd) { cdsEv = hashFindVal(cdsEvHash, bed->name); if (cdsEv != NULL) { info.orfSize = cdsEv->end - cdsEv->start; info.startComplete = cdsEv->startComplete; info.endComplete = cdsEv->endComplete; } } /* Get score from prediction. */ predict = hashFindVal(predictHash, bed->name); if (predict != NULL) info.cdsScore = predict->score; /* Figure out nonsense-mediated-decay from bed itself. */ info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed); /* Figure out if retained intron from bed and alt-splice keeper hash */ info.retainedIntron = hasRetainedIntron(bed, altSpliceHash); info.strangeSplice = countStrangeSplices(bed, altSpliceHash); info.atacIntrons = countAtacIntrons(bed, altSpliceHash); info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash); /* Look up selenocysteine info. */ info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL); /* Loop through bed looking for small gaps indicative of frame shift/stop */ int i, lastBlock = bed->blockCount-1; int exonCount = 1; for (i=0; i < lastBlock; ++i) { int gapStart = bed->chromStarts[i] + bed->blockSizes[i]; int gapEnd = bed->chromStarts[i+1]; int gapSize = gapEnd - gapStart; switch (gapSize) { case 1: case 2: info.genomicFrameShift = TRUE; break; case 3: info.genomicStop = TRUE; break; default: exonCount += 1; break; } } info.exonCount = exonCount; /* Write info, free bed. 
*/ txInfoTabOut(&info, f); bedFree(&bed); } /* Clean up and go home. */ carefulClose(&f); }