void mafMeFirst(char *inMaf, char *meFile, char *outMaf)
/* mafMeFirst - Rewrite inMaf to outMaf with, in every alignment, the
 * component picked by compInHash() against the words of meFile moved to
 * the head of the component list.  Handy after mafFrags when the component
 * carrying the gene name should not sit in the middle.  Aborts if an
 * alignment has no component selected from meFile. */
{
struct hash *wantedHash = hashWordsInFile(meFile, 18);
struct mafFile *input = mafOpen(inMaf);
FILE *out = mustOpen(outMaf, "w");
struct mafAli *ali;

mafWriteStart(out, input->scoring);
for (ali = mafNext(input); ali != NULL; ali = mafNext(input))
    {
    struct mafComp *leader = compInHash(ali, wantedHash);
    if (leader == NULL)
        errAbort("No components in %s in maf ending line %d of %s",
		meFile, input->lf->lineIx, input->lf->fileName);

    /* Unlink the chosen component, then put it back at the front. */
    slRemoveEl(&ali->components, leader);
    slAddHead(&ali->components, leader);

    mafWrite(out, ali);
    mafAliFree(&ali);
    }
mafWriteEnd(out);
carefulClose(&out);
}
void weedLines(char *weedFile, char *file, char *output, 
	boolean invert, char *invertOutput)
/* weedLines - Selectively remove lines from file.
 * The words of weedFile form the weed set.  A line of file is weeded when
 * one of its words (as split by nextWord) is in the set or, when the
 * "embedded" option is present, when any weed word is found in the line by
 * stringIn().  If invert is set the decision is reversed.  Lines that are
 * not weeded are written to output; weeded lines are written to
 * invertOutput when it is non-NULL and dropped otherwise. */
{
struct hash *hash = hashWordsInFile(weedFile, 16);
struct hashEl *weedList = hashElListHash(hash);
verbose(2, "%d words in weed file %s\n", hash->elCount, weedFile);
struct lineFile *lf = lineFileOpen(file, TRUE);
char *line, *word;
FILE *f = mustOpen(output, "w");
FILE *fInvert = NULL;
boolean embedded = optionExists("embedded");
if (invertOutput != NULL)
    fInvert = mustOpen(invertOutput, "w");

while (lineFileNext(lf, &line, NULL))
    {
    boolean doWeed = FALSE;
    char *dupe = NULL;
    if (embedded)
	{
	struct hashEl *hel;
	for (hel = weedList; hel != NULL; hel = hel->next)
	    {
	    if (stringIn(hel->name, line))
		{
		doWeed = TRUE;
		break;	/* One hit is enough; no need to scan the rest. */
		}
	    }
	}
    else
	{
	/* Split a scratch copy into words so that line itself stays
	 * intact for output. */
	char *scan;
	dupe = cloneString(line);
	scan = dupe;
	while ((word = nextWord(&scan)) != NULL)
	    {
	    if (hashLookup(hash, word))
		{
		doWeed = TRUE;
		break;
		}
	    }
	}
    if (invert)
	doWeed = !doWeed;
    if (!doWeed)
	fprintf(f, "%s\n", line);
    else
	{
	if (fInvert != NULL)
	    fprintf(fInvert, "%s\n", line);
	}
    freez(&dupe);
    }

/* Close the outputs the same way the other commands in this file do,
 * rather than leaving the flush to process exit, and release the input. */
carefulClose(&f);
if (fInvert != NULL)
    carefulClose(&fInvert);
lineFileClose(&lf);
}
static struct hash *processFieldHash(struct joiner *joiner, char *inName,
                                     char *outName)
/* Produce the field hash: built from the words of file inName when inName
 * is given, otherwise obtained from joinerAllFields().  When outName is
 * given, the hash element names are also written to that file, one per
 * line, after sorting with hashElCmp.  Returns the hash. */
{
    struct hash *fields = (inName != NULL)
        ? hashWordsInFile(inName, 18)
        : joinerAllFields(joiner);

    /* Nothing to save: hand the hash straight back. */
    if (outName == NULL)
        return fields;

    struct hashEl *sorted = hashElListHash(fields);
    FILE *out = mustOpen(outName, "w");
    struct hashEl *cur;

    slSort(&sorted, hashElCmp);
    cur = sorted;
    while (cur != NULL)
    {
        fprintf(out, "%s\n", cur->name);
        cur = cur->next;
    }
    slFreeList(&sorted);
    carefulClose(&out);

    return fields;
}
Exemple #4
0
static double bakeOffPercent(int part, int total)
/* Return part as a percentage of total.  Yields 0 when total is zero so
 * that an empty input is reported as 0.00% instead of nan. */
{
if (total == 0)
    return 0.0;
return 100.0 * part / total;
}

void txGeneBakeOff(char *database, char *refRevFile, char *refClusterFile, char *geneTrack)
/* txGeneBakeOff - Compare gene finder results to reference annotations.
 *   database       - passed to hSetDb() before any track is loaded
 *   refRevFile     - file of names; only refGene items whose name appears
 *                    here take part in the comparison
 *   refClusterFile - clusters loaded with refClusterLoadAll()
 *   geneTrack      - track whose items are compared against the references
 * Prints to stdout, per reference and then per cluster, how many had an
 * exact best match (ratio == 1.0) and how many had a best match with ratio
 * of at least 0.80 but short of exact, followed by the base coverage. */
{
hSetDb(database);

/* Make list of our clusters. */
struct refCluster *cluster, *clusterList = refClusterLoadAll(refClusterFile);

/* Make list of only refseqs in reviewed list.  Items not on the reviewed
 * list are simply left off refList (they are not freed). */
struct hash *refRevOnlyHash = hashWordsInFile(refRevFile, 16);
struct bed *refAll = bedThickOnlyList(hWholeTrackAsBedList("refGene"));
struct bed *refList = NULL, *nextRef, *ref;
struct hash *refBedHash = hashNew(18);
for (ref = refAll; ref != NULL; ref = nextRef)
    {
    /* Save next before slAddHead rewrites ref->next. */
    nextRef = ref->next;
    if (hashLookup(refRevOnlyHash, ref->name))
        {
	slAddHead(&refList, ref);
	hashAdd(refBedHash, ref->name, ref);
	}
    }
verbose(2, "%d of %d reviewed are still in refGene track\n", 
	slCount(refList), refRevOnlyHash->elCount);

/* Turn this into hash. */
struct hash *refHash = bedsIntoKeeperHash(refList);
verbose(2, "Loaded %d items from %s into %d chromosomes\n", 
	slCount(refList), refRevFile, refHash->elCount);

struct bed *geneList = bedThickOnlyList(hWholeTrackAsBedList(geneTrack));
struct hash *geneHash = bedsIntoKeeperHash(geneList);
verbose(2, "Loaded %d items from %s into %d chromosomes\n", 
	slCount(geneList), geneTrack, geneHash->elCount);

/* Per-reference tally: best overlapping gene for each reviewed reference. */
int allCount = 0, allExact = 0, allClose = 0;
for (ref = refList; ref != NULL; ref = ref->next)
    {
    double ratio;
    struct bed *gene = mostOverlappingBed(ref, geneHash, &ratio);
    if (gene != NULL)
        {
        if (ratio == 1.0)
            ++allExact;
        else if (ratio >= 0.80)
            ++allClose;
        }
    ++allCount;
    }
printf("Exact match:    %d (%4.2f%%)\n", allExact, bakeOffPercent(allExact, allCount));
printf("80%% match:     %d (%4.2f%%)\n", allClose, bakeOffPercent(allClose, allCount));

/* Per-cluster tally: best gene/reference pairing found within each cluster. */
int anyCount = 0, anyExact = 0, anyClose = 0;
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    /* Distinct names so the function-level ref is not shadowed. */
    struct bed *clusterGene, *clusterRef;
    double ratio;
    if (findMostOverlappingInCluster(cluster, geneHash, refBedHash,
	    &clusterGene, &clusterRef, &ratio))
        {
        if (ratio == 1.0)
            ++anyExact;
        else if (ratio >= 0.80)
            ++anyClose;
        }
    ++anyCount;
    }
printf("Exact match any:    %d (%4.2f%%)\n", anyExact, bakeOffPercent(anyExact, anyCount));
printf("80%% match any:     %d (%4.2f%%)\n", anyClose, bakeOffPercent(anyClose, anyCount));

printf("Base coverage:  (%4.2f%%)\n", 100.0*calcBaseCoverage(refHash, geneHash));

}
Exemple #5
0
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.
 * Reads txBedFile one 12-column row at a time and, for each transcript,
 * fills in a struct txInfo from the side inputs and writes it to outFile
 * with txInfoTabOut().  Side inputs:
 *   cdsEvFile        - cdsEvidence records; looked up by bed name for ORF
 *                      size and start/end completeness
 *   txCdsPredictFile - cdsEvidence records; looked up by bed name for cdsScore
 *   altSpliceFile    - beds loaded with bedLoadNAll(.., 6), handed to the
 *                      intron/splice helper calls
 *   exceptionFile    - source of the selenocysteine and alt-start hashes
 *   sizePolyAFile    - loaded with hashNameIntFile(); polyA size by bed name
 *   pslFile          - alignments, hashed by query name (qName)
 *   flipFile         - words naming accessions that were strand-flipped
 * Aborts if a transcript that needs an alignment has no entry in pslFile,
 * or has none that overlaps its bed. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info.
 * NOTE(review): altStartHash is filled in here but never read in this
 * function - confirm whether that is intentional. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls.  Plain hashAdd (not hashAddUnique) is used, and the lookup
 * below walks every entry for a name with hashLookupNext, so one query name
 * may carry several alignments. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output.
 * NOTE(review): lf is not closed anywhere in this function. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name.
     * info.name borrows bed->name, so info must be written out before
     * the bed is freed at the bottom of the loop. */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	/* Rfam and tRNA names are used whole as the source accession.
	 * NOTE(review): this clone is not freed in this function. */
	info.sourceAcc = cloneString(bed->name);
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	/* Note: this psl shadows the function-level psl used while loading. */
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    /* Strictly greater: the first alignment seen wins ties, and an
	     * alignment with zero overlap is never chosen. */
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		/* Toggle the strand in place for the second measurement... */
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		/* ...and toggle it back so the shared psl is left unchanged. */
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics.  The polyA size (0 if
	 * the name is absent) is subtracted from the query size.
	 * NOTE(review): nothing here guards against sourceSize, or the
	 * match+misMatch+repMatch sum, being zero before the divisions. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv.  Only attempted when
     * the bed has a non-empty thick region; otherwise these fields keep
     * the zero from ZeroVar. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * Each gap runs from the end of block i to the start of block i+1.
     * A gap of 1 or 2 sets genomicFrameShift, a gap of 3 sets genomicStop,
     * and any other gap size counts as a boundary between exons. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}