void txGeneAccession(char *oldBedFile, char *lastIdFile, char *newBedFile, char *txToAccFile,
	char *oldToNewFile)
/* txGeneAccession - Assign permanent accession number to genes.
 *   oldBedFile   - beds from the previous build; their names serve as the old accessions
 *   lastIdFile   - holds the last numeric txId handed out.  Rewritten with the
 *                  updated value at the end unless the "test" option is set.
 *   newBedFile   - beds from the current build that need accessions
 *   txToAccFile  - output: <new bed name> <tab> <accession>, one line per new bed
 *   oldToNewFile - output: <chrom:start-end> <tab> <old acc> <tab> <new acc> <tab> <type>
 *                  where type is one of exact, compatible, new or lost. */
{
/* Read in all input. */
struct bed *oldList = bedLoadNAll(oldBedFile, 12);
verbose(2, "Read %d from %s\n", slCount(oldList), oldBedFile);
struct bed *newList = bedLoadNAll(newBedFile, 12);
verbose(2, "Read %d from %s\n", slCount(newList), newBedFile);
int txId = readNumberFromFile(lastIdFile);
verbose(2, "Last txId used was %d (from %s)\n", txId, lastIdFile);

/* Make a random-access data structure for old list. */
struct hash *oldHash = bedsIntoKeeperHash(oldList);

/* Make a little hash to help prevent us from reusing an
 * old accession twice (which might happen if we extend it
 * in two incompatible ways). */
struct hash *usedHash = hashNew(16);

/* Record our decisions in hash as well as file.  Values are accession
 * strings that must stay valid until the "lost" pass below has run, so
 * they are either old bed names or heap copies, never stack buffers. */
struct hash *idToAccHash = hashNew(16);

/* Loop through new list first looking for exact matches. Record
 * exact matches in hash so we don't look for them again during
 * the next, "compatible" match phase. */
struct hash *newExactHash = hashNew(16);
struct bed *oldBed, *newBed;
FILE *f = mustOpen(txToAccFile, "w");
FILE *fOld = mustOpen(oldToNewFile, "w");
for (newBed = newList; newBed != NULL; newBed = newBed->next)
    {
    oldBed = findExact(newBed, oldHash, usedHash);
    if (oldBed != NULL)
        {
        hashAdd(newExactHash, newBed->name, newBed);
        hashAdd(usedHash, oldBed->name, NULL);
        fprintf(f, "%s\t%s\n", newBed->name, oldBed->name);
        hashAdd(idToAccHash, newBed->name, oldBed->name);
        fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd,
                oldBed->name, oldBed->name, "exact");
        }
    }

/* Loop through new bed looking for compatible things.  If
 * we can't find anything compatible, make up a new accession. */
for (newBed = newList; newBed != NULL; newBed = newBed->next)
    {
    if (!hashLookup(newExactHash, newBed->name))
        {
        oldBed = findCompatible(newBed, oldHash, usedHash);
        if (oldBed == NULL)
            {
            /* Nothing to inherit from: mint a fresh accession at version 1. */
            char newAcc[16];
            txGeneAccFromId(++txId, newAcc);
            strcat(newAcc, ".1");
            fprintf(f, "%s\t%s\n", newBed->name, newAcc);
            hashAdd(idToAccHash, newBed->name, cloneString(newAcc));
            oldBed = findMostOverlapping(newBed, oldHash);
            char *oldAcc = (oldBed == NULL ? "" : oldBed->name);
            fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd,
                    oldAcc, newAcc, "new");
            }
        else
            {
            /* Old accession looks like <base>.<version>.  Keep the base
             * and bump the version. */
            char *dot = strchr(oldBed->name, '.');
            if (dot == NULL)
                errAbort("No version found in %s", oldBed->name);
            int baseSize = (int)(dot - oldBed->name);
            int version = sqlUnsigned(dot + 1);
            char newAcc[16];
            safef(newAcc, sizeof(newAcc), "%.*s.%d", baseSize, oldBed->name, version+1);
            hashAdd(usedHash, oldBed->name, NULL);
            fprintf(f, "%s\t%s\n", newBed->name, newAcc);
            /* newAcc goes out of scope at the end of this block, but the
             * hash value is read again in the "lost" pass, so store a
             * heap copy rather than the address of the stack buffer. */
            hashAdd(idToAccHash, newBed->name, cloneString(newAcc));
            fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd,
                    oldBed->name, newAcc, "compatible");
            }
        }
    }
carefulClose(&f);

/* Make a random-access data structure for new list. */
struct hash *newHash = bedsIntoKeeperHash(newList);

/* Write record of ones that don't map. */
for (oldBed = oldList; oldBed != NULL; oldBed = oldBed->next)
    {
    if (!hashLookup(usedHash, oldBed->name))
        {
        char *newAcc = "";
        struct bed *newBed = findMostOverlapping(oldBed, newHash);
        if (newBed != NULL)
            newAcc = hashMustFindVal(idToAccHash, newBed->name);
        fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd,
                oldBed->name, newAcc, "lost");
        }
    }
carefulClose(&fOld);

/* Persist the highest id handed out, except in test runs. */
if (!optionExists("test"))
    {
    FILE *fId = mustOpen(lastIdFile, "w");
    fprintf(fId, "%d\n", txId);
    carefulClose(&fId);
    }
}
Example #2
0
static double bakeOffPercent(int part, int total)
/* Return part as a percentage of total.  Returns 0 when total is 0 so
 * that empty inputs print 0.00% instead of dividing by zero. */
{
if (total == 0)
    return 0.0;
return 100.0 * part / total;
}

void txGeneBakeOff(char *database, char *refRevFile, char *refClusterFile, char *geneTrack)
/* txGeneBakeOff - Compare gene finder results to reference annotations.
 *   database       - database passed to hSetDb
 *   refRevFile     - file of words naming the reviewed refGene items to keep
 *   refClusterFile - file loaded with refClusterLoadAll
 *   geneTrack      - name of the track holding the gene predictions to score
 * Prints to stdout the number and percentage of exact and >=80% (but not
 * exact) matches, first per reviewed refGene item and then per cluster,
 * followed by the base coverage. */
{
hSetDb(database);

/* Make list of our clusters. */
struct refCluster *cluster, *clusterList = refClusterLoadAll(refClusterFile);

/* Make list of only refseqs in reviewed list.  Kept items are moved
 * from refAll onto refList, so save the next pointer before relinking. */
struct hash *refRevOnlyHash = hashWordsInFile(refRevFile, 16);
struct bed *refAll = bedThickOnlyList(hWholeTrackAsBedList("refGene"));
struct bed *refList = NULL, *nextRef, *ref;
struct hash *refBedHash = hashNew(18);
for (ref = refAll; ref != NULL; ref = nextRef)
    {
    nextRef = ref->next;
    if (hashLookup(refRevOnlyHash, ref->name))
        {
        slAddHead(&refList, ref);
        hashAdd(refBedHash, ref->name, ref);
        }
    }
verbose(2, "%d of %d reviewed are still in refGene track\n", 
	slCount(refList), refRevOnlyHash->elCount);

/* Turn this into hash. */
struct hash *refHash = bedsIntoKeeperHash(refList);
verbose(2, "Loaded %d items from %s into %d chromosomes\n", 
	slCount(refList), refRevFile, refHash->elCount);

struct bed *geneList = bedThickOnlyList(hWholeTrackAsBedList(geneTrack));
struct hash *geneHash = bedsIntoKeeperHash(geneList);
verbose(2, "Loaded %d items from %s into %d chromosomes\n", 
	slCount(geneList), geneTrack, geneHash->elCount);

/* Score every reviewed reference item against its most overlapping gene. */
int allCount = 0, allExact = 0, allClose = 0;
for (ref = refList; ref != NULL; ref = ref->next)
    {
    double ratio;
    struct bed *gene = mostOverlappingBed(ref, geneHash, &ratio);
    if (gene != NULL)
        {
        if (ratio == 1.0)
            ++allExact;
        else if (ratio >= 0.80)
            ++allClose;
        }
    ++allCount;
    }
printf("Exact match:    %d (%4.2f%%)\n", allExact, bakeOffPercent(allExact, allCount));
printf("80%% match:     %d (%4.2f%%)\n", allClose, bakeOffPercent(allClose, allCount));

/* Same buckets again, but one score per cluster. */
int anyCount = 0, anyExact = 0, anyClose = 0;
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    struct bed *clusterGene, *clusterRef;
    double ratio;
    if (findMostOverlappingInCluster(cluster, geneHash, refBedHash,
                                     &clusterGene, &clusterRef, &ratio))
        {
        if (ratio == 1.0)
            ++anyExact;
        else if (ratio >= 0.80)
            ++anyClose;
        }
    ++anyCount;
    }
printf("Exact match any:    %d (%4.2f%%)\n", anyExact, bakeOffPercent(anyExact, anyCount));
printf("80%% match any:     %d (%4.2f%%)\n", anyClose, bakeOffPercent(anyClose, anyCount));

printf("Base coverage:  (%4.2f%%)\n", 100.0*calcBaseCoverage(refHash, geneHash));

}