void txGeneAccession(char *oldBedFile, char *lastIdFile, char *newBedFile, char *txToAccFile, char *oldToNewFile) /* txGeneAccession - Assign permanent accession number to genes. */ { /* Read in all input. */ struct bed *oldList = bedLoadNAll(oldBedFile, 12); verbose(2, "Read %d from %s\n", slCount(oldList), oldBedFile); struct bed *newList = bedLoadNAll(newBedFile, 12); verbose(2, "Read %d from %s\n", slCount(newList), newBedFile); int txId = readNumberFromFile(lastIdFile); verbose(2, "Last txId used was %d (from %s)\n", txId, lastIdFile); /* Make a random-access data structure for old list. */ struct hash *oldHash = bedsIntoKeeperHash(oldList); /* Make a little hash to help prevent us from reusing an * old accession twice (which might happen if we extend it * in two incompatible ways). */ struct hash *usedHash = hashNew(16); /* Record our decisions in hash as well as file. */ struct hash *idToAccHash = hashNew(16); /* Loop through new list first looking for exact matches. Record * exact matches in hash so we don't look for them again during * the next, "compatable" match phase. */ struct hash *oldExactHash = hashNew(16), *newExactHash = hashNew(16); struct bed *oldBed, *newBed; FILE *f = mustOpen(txToAccFile, "w"); FILE *fOld = mustOpen(oldToNewFile, "w"); for (newBed = newList; newBed != NULL; newBed = newBed->next) { oldBed = findExact(newBed, oldHash, usedHash); if (oldBed != NULL) { hashAdd(oldExactHash, oldBed->name, oldBed); hashAdd(newExactHash, newBed->name, newBed); hashAdd(usedHash, oldBed->name, NULL); fprintf(f, "%s\t%s\n", newBed->name, oldBed->name); hashAdd(idToAccHash, newBed->name, oldBed->name); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd, oldBed->name, oldBed->name, "exact"); } } /* Loop through new bed looking for compatible things. If * we can't find anything compatable, make up a new accession. */ for (newBed = newList; newBed != NULL; newBed = newBed->next) { if (!hashLookup(newExactHash, newBed->name)) { oldBed = findCompatible(newBed, oldHash, usedHash); if (oldBed == NULL) { char newAcc[16]; txGeneAccFromId(++txId, newAcc); strcat(newAcc, ".1"); fprintf(f, "%s\t%s\n", newBed->name, newAcc); hashAdd(idToAccHash, newBed->name, cloneString(newAcc)); oldBed = findMostOverlapping(newBed, oldHash); char *oldAcc = (oldBed == NULL ? "" : oldBed->name); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd, oldAcc, newAcc, "new"); } else { char *acc = cloneString(oldBed->name); char *ver = strchr(acc, '.'); if (ver == NULL) errAbort("No version found in %s", oldBed->name); *ver++ = 0; int version = sqlUnsigned(ver); char newAcc[16]; safef(newAcc, sizeof(newAcc), "%s.%d", acc, version+1); hashAdd(usedHash, oldBed->name, NULL); fprintf(f, "%s\t%s\n", newBed->name, newAcc); hashAdd(idToAccHash, newBed->name, newAcc); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd, oldBed->name, newAcc, "compatible"); } } } carefulClose(&f); /* Make a random-access data structure for old list. */ struct hash *newHash = bedsIntoKeeperHash(newList); /* Write record of ones that don't map. */ for (oldBed = oldList; oldBed != NULL; oldBed = oldBed->next) { if (!hashLookup(usedHash, oldBed->name)) { char *newAcc = ""; struct bed *newBed = findMostOverlapping(oldBed, newHash); if (newBed != NULL) newAcc = hashMustFindVal(idToAccHash, newBed->name); fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd, oldBed->name, newAcc, "lost"); } } carefulClose(&fOld); if (!optionExists("test")) { FILE *fId = mustOpen(lastIdFile, "w"); fprintf(fId, "%d\n", txId); carefulClose(&fId); } }
void txGeneBakeOff(char *database, char *refRevFile, char *refClusterFile, char *geneTrack) /* txGeneBakeOff - Compare gene finder results to reference annotations.. */ { hSetDb(database); /* Make list of our clusters. */ struct refCluster *cluster, *clusterList = refClusterLoadAll(refClusterFile); /* Make list of only refseqs in reviewed list. */ struct hash *refRevOnlyHash = hashWordsInFile(refRevFile, 16); struct bed *refAll = bedThickOnlyList(hWholeTrackAsBedList("refGene")); struct bed *refList = NULL, *nextRef, *ref; struct hash *refBedHash = hashNew(18); for (ref = refAll; ref != NULL; ref = nextRef) { nextRef = ref->next; if (hashLookup(refRevOnlyHash, ref->name)) { slAddHead(&refList, ref); hashAdd(refBedHash, ref->name, ref); } } verbose(2, "%d of %d reviewed are still in refGene track\n", slCount(refList), refRevOnlyHash->elCount); /* Turn this into hash. */ struct hash *refHash = bedsIntoKeeperHash(refList); verbose(2, "Loaded %d items from %s into %d chromosomes\n", slCount(refList), refRevFile, refHash->elCount); struct bed *geneList = bedThickOnlyList(hWholeTrackAsBedList(geneTrack)); struct hash *geneHash = bedsIntoKeeperHash(geneList); verbose(2, "Loaded %d items from %s into %d chromosomes\n", slCount(geneList), geneTrack, geneHash->elCount); int allCount = 0, allMiss = 0; int allExact = 0, allClose = 0, allHalf = 0, allAny = 0; // struct hash *uniqHash = hashNew(0); for (ref = refList; ref != NULL; ref = ref->next) { double ratio; struct bed *gene = mostOverlappingBed(ref, geneHash, &ratio); if (gene != NULL) { ++allAny; if (ratio == 1.0) ++allExact; else if (ratio >= 0.80) ++allClose; else if (ratio >= 0.50) ++allHalf; } else ++allMiss; ++allCount; } printf("Exact match: %d (%4.2f%%)\n", allExact, 100.0 * allExact/allCount); printf("80%% match: %d (%4.2f%%)\n", allClose, 100.0 * allClose/allCount); //printf("50%% match: %d (%4.2f%%)\n", allHalf, 100.0 * allHalf/allCount); //printf("any match: %d (%4.2f%%)\n", allAny, 100.0 * allAny/allCount); //printf("Clean miss: %d (%4.2f%%)\n", allMiss, 100.0 * allMiss/allCount); int anyCount = 0, anyMiss = 0; int anyExact = 0, anyClose = 0, anyHalf = 0, anyAny = 0; for (cluster = clusterList; cluster != NULL; cluster = cluster->next) { struct bed *gene, *ref; double ratio; if (findMostOverlappingInCluster(cluster, geneHash, refBedHash, &gene, &ref, &ratio)) { ++anyAny; if (ratio == 1.0) ++anyExact; else if (ratio >= 0.80) ++anyClose; else if (ratio >= 0.50) ++anyHalf; } else ++anyMiss; ++anyCount; } // printf("Total reviewed clusters: %d\n", anyCount); printf("Exact match any: %d (%4.2f%%)\n", anyExact, 100.0 * anyExact/anyCount); printf("80%% match any: %d (%4.2f%%)\n", anyClose, 100.0 * anyClose/anyCount); // printf("50%% match any: %d (%4.2f%%)\n", anyHalf, 100.0 * anyHalf/anyCount); // printf("any match any: %d (%4.2f%%)\n", anyAny, 100.0 * anyAny/anyCount); // printf("Clean miss any: %d (%4.2f%%)\n", anyMiss, 100.0 * anyMiss/anyCount); printf("Base coverage: (%4.2f%%)\n", 100.0*calcBaseCoverage(refHash, geneHash)); }