예제 #1
0
void bedFirstCodingExonSize(char *inBed, char *overBed, char *underBed, char *outSize)
/* bedFirstCodingExonSize - For every record of inBed (read as 12-column BED)
 * with thickStart < thickEnd, write "name<tab>size" to outSize, where size is
 * what bedFirstCdsSize() returns.  Records whose size is at least the
 * externally defined 'threshold' are copied to overBed, the others to
 * underBed.  Either overBed or underBed may be NULL to skip that output. */
{
FILE *sizeOut = mustOpen(outSize, "w");
FILE *overOut = (overBed ? mustOpen(overBed, "w") : NULL);
FILE *underOut = (underBed ? mustOpen(underBed, "w") : NULL);
struct bed *allBeds = bedLoadNAll(inBed, 12);
struct bed *cur;

for (cur = allBeds; cur != NULL; cur = cur->next)
    {
    /* Records with an empty thick region are not reported anywhere. */
    if (cur->thickStart >= cur->thickEnd)
        continue;

    int cdsSize = bedFirstCdsSize(cur);
    fprintf(sizeOut, "%s\t%d\n", cur->name, cdsSize);

    /* Pick the side of the threshold; the chosen file may not be open. */
    FILE *dest = (cdsSize >= threshold ? overOut : underOut);
    if (dest != NULL)
        bedTabOutN(cur, 12, dest);
    }

carefulClose(&sizeOut);
carefulClose(&overOut);
carefulClose(&underOut);
}
예제 #2
0
struct hash *loadRegions(char *file)
/* Read regions from a BED file (4 columns when the external 'outDir' is set,
 * otherwise 3), sort them with bedCmp, and return a hash keyed by chromosome
 * name whose values are the per-chromosome lists of regions. */
{
struct hash *byChrom = newHash(6);
struct bed *all = bedLoadNAll(file, outDir ? 4 : 3);
struct bed *cur, *following, *runHead;

/* After sorting, each chromosome's regions form one contiguous run. */
slSort(&all, bedCmp);
verbose(2, "found %d regions\n", slCount(all));

runHead = all;
cur = all;
while (cur != NULL)
    {
    verbose(3, "region %s:%d-%d\n", cur->chrom, cur->chromStart+1, cur->chromEnd);
    following = cur->next;
    if (following == NULL || differentString(cur->chrom, following->chrom))
	{
	/* cur is the last region of its run: cut the list after it and
	 * store the run under its chromosome name. */
	cur->next = NULL;
	hashAdd(byChrom, cur->chrom, runHead);
	verbose(2, "just added %d regions on %s\n", slCount(runHead), cur->chrom);
	runHead = following;
	}
    cur = following;
    }
return byChrom;
}
예제 #3
0
void cgapSageBedAddFreqs(char *oldBedFile, char *freqFile, char *libsFile, char *newBedFile)
/* cgapSageBedAddFreqs - Add frequency data to the bed.  Loads the library
 * totals (libsFile), the 8-column bed mappings (oldBedFile) and the
 * frequencies (freqFile), builds a cgapSage list from them, writes it to
 * newBedFile, then releases everything that was loaded. */
{
verbose(1, "Loading libraries...\n");
struct hash *libTotals = getTotTagsHash(libsFile);
verbose(1, "Loaded libraries.\n");

verbose(1, "Loading mappings...\n");
struct bed *mapList = bedLoadNAll(oldBedFile, 8);
verbose(1, "Loaded mappings.\n");

verbose(1, "Loading frequencies...\n");
struct hash *freqs = getFreqHash(freqFile);
verbose(1, "Loaded frequencies.\n");

verbose(1, "Building new bed list...\n");
struct cgapSage *sages = makeCgapSageList(freqs, libTotals, mapList);
verbose(1, "Built new bed list.\n");

verbose(1, "Writing output...\n");
writeCgapSageFile(sages, newBedFile);
verbose(1, "Wrote output. All done!\n");

/* Clean up in the same order as before. */
freeFreqHash(&freqs);
hashFree(&libTotals);
cgapSageFreeList(&sages);
slFreeList(&mapList);
}
예제 #4
0
void txCdsPredict(char *inFa, char *outCds, char *nmdBed, char *mafFile, boolean anyStart)
/* txCdsPredict - Somewhat simple-minded ORF predictor using a weighting scheme.
 * Reads the sequences in inFa and, for each one, writes the ORFs returned by
 * orfsOnRna() to outCds sorted with cdsEvidenceCmpScore.  nmdBed and mafFile
 * are optional (may be NULL); when absent the corresponding hash is simply
 * left empty and the other-species count stays zero. */
{
struct dnaSeq *seqList = faReadAllDna(inFa);
verbose(2, "Read %d sequences from %s\n", slCount(seqList), inFa);

/* Index the optional bed records by name for NMD analysis. */
struct hash *nmdHash = hashNew(18);
if (nmdBed != NULL)
    {
    struct bed *nmd = bedLoadNAll(nmdBed, 12);
    while (nmd != NULL)
        {
        hashAdd(nmdHash, nmd->name, nmd);
        nmd = nmd->next;
        }
    verbose(2, "Read %d beds from %s\n", nmdHash->elCount, nmdBed);
    }

/* Index the optional alignments by the src of their first component, and
 * count the distinct src values among the remaining components. */
struct hash *mafHash = hashNew(18);
int otherSpeciesCount = 0;
if (mafFile != NULL)
    {
    struct mafFile *mf = mafReadAll(mafFile);
    struct mafAli *ali;

    for (ali = mf->alignments; ali != NULL; ali = ali->next)
	hashAdd(mafHash, ali->components->src, ali);
    verbose(2, "Read %d alignments from %s\n", mafHash->elCount, mafFile);

    struct hash *speciesSeen = hashNew(0);
    for (ali = mf->alignments; ali != NULL; ali = ali->next)
        {
	struct mafComp *other = ali->components->next;
	while (other != NULL)
	    {
	    hashStore(speciesSeen, other->src);
	    other = other->next;
	    }
	}
    otherSpeciesCount = speciesSeen->elCount;
    verbose(2, "%d other species in %s\n", otherSpeciesCount, mafFile);
    }

/* Predict on every sequence and stream the results out. */
FILE *out = mustOpen(outCds, "w");
struct dnaSeq *rna = seqList;
while (rna != NULL)
    {
    verbose(3, "%s\n", rna->name);
    struct cdsEvidence *orfs = orfsOnRna(rna, nmdHash, mafHash, otherSpeciesCount, anyStart);
    if (orfs != NULL)
	{
	slSort(&orfs, cdsEvidenceCmpScore);
	cdsEvidenceTabOut(orfs, out);
	}
    cdsEvidenceFreeList(&orfs);
    rna = rna->next;
    }
carefulClose(&out);
}
void bedOrBlocks(char *inFile, char *outFile)
/* bedOrBlocks - Create a bed that is the union of all blocks of a list of beds.
 * The input (12-column BED) is sorted with bedCmpChromStrandStart, split into
 * runs that share chromosome and strand, and each run is handed to doStrand()
 * as a half-open [first, end) pair together with the output file. */
{
struct bed *sorted = bedLoadNAll(inFile, 12);
FILE *out = mustOpen(outFile, "w");
slSort(&sorted, bedCmpChromStrandStart);

struct bed *runStart = sorted;
while (runStart != NULL)
    {
    /* Advance runEnd to the first record on another chromosome or strand
     * (or to NULL at the end of the list). */
    struct bed *runEnd = runStart->next;
    while (runEnd != NULL
	&& sameString(runStart->chrom, runEnd->chrom)
	&& runStart->strand[0] == runEnd->strand[0])
	{
	runEnd = runEnd->next;
	}
    doStrand(runStart, runEnd, out);
    runStart = runEnd;
    }
carefulClose(&out);
}
예제 #6
0
struct genePred *convertBedsToGps(char *bedFile)
/* Load beds from a file and convert to bare bones genePredictions.
 * Each bed (read as 6-column BED) becomes one genePred with cloned chrom and
 * name, txStart/cdsStart set to chromStart, txEnd/cdsEnd set to chromEnd and
 * the strand copied.  The result list is in input order.  Returns NULL when
 * the file holds no records. */
{
struct genePred *gpList = NULL, *gp =NULL;
struct bed *bedList=NULL, *bed=NULL;
bedList = bedLoadNAll(bedFile, 6);
/* An empty file yields an empty list; bail out before the strand check
 * below dereferences the list head. */
if(bedList == NULL)
    return NULL;
if(bedList->strand == NULL)
    errAbort("Beds must have strand information.");
for(bed=bedList; bed!=NULL; bed=bed->next)
    {
    AllocVar(gp);
    gp->chrom = cloneString(bed->chrom);
    gp->txStart = gp->cdsStart = bed->chromStart;
    gp->txEnd = gp->cdsEnd = bed->chromEnd;
    gp->name = cloneString(bed->name);
    safef(gp->strand, sizeof(gp->strand), "%s", bed->strand);
    /* Built in reverse; put back into input order after the loop. */
    slAddHead(&gpList, gp);
    }
bedFreeList(&bedList);
slReverse(&gpList);
return gpList;
}
예제 #7
0
struct hash *bedLoadNInHash(char *filename, int fields)
/* Load the first 'fields' columns of a BED file, sort the records with bedCmp,
 * and return a hash keyed by chromosome name whose values are the
 * per-chromosome lists of records. */
{
	struct hash *chromHash = newHash(6);
	struct bed *sorted = bedLoadNAll(filename, fields);
	struct bed *groupHead, *cur, *following;

	slSort(&sorted, bedCmp);
	groupHead = sorted;
	cur = sorted;
	while(cur != NULL)
	{
		following = cur->next;
		if(following == NULL || differentString(cur->chrom, following->chrom))
		{
			/* cur ends this chromosome's group: cut the list here
			 * and store the group under the chromosome name. */
			cur->next = NULL;
			hashAdd(chromHash, cur->chrom, groupHead);
			groupHead = following;
		}
		cur = following;
	}
	return chromHash;
}
예제 #8
0
void ultraPcrRegions(char *database, char *bedFile, char *outFa)
/* ultraPcrRegions - Get regions to PCR up and some surrounding sequence.
 * For each record of bedFile (read as 4-column BED) the region plus up to
 * extraSize bases on either side, clipped to [0, chromSize], is fetched with
 * hChromSeqMixed() and written to outFa through faWriteNext() under the
 * bed's name, with '(' and ')' inserted around the original region. */
{
int extraSize = 1000;
FILE *f = mustOpen(outFa, "w");
struct bed *bed, *bedList = bedLoadNAll(bedFile, 4);
hSetDb(database);
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    int bedSize = bed->chromEnd - bed->chromStart;
    int chromSize = hChromSize(bed->chrom);
    int seqSize;
    int seqStart = bed->chromStart - extraSize;
    int seqEnd = bed->chromEnd + extraSize;
    int firstParenPos, secondParenPos;
    struct dyString *dy;
    struct dnaSeq *seq;

    /* Clip the flanked window to the chromosome. */
    if (seqStart < 0)
        seqStart = 0;
    if (seqEnd > chromSize)
        seqEnd = chromSize;
    seqSize = seqEnd - seqStart;

    /* Offsets of the original region inside the fetched window. */
    firstParenPos = bed->chromStart - seqStart;
    secondParenPos = firstParenPos + bedSize;

    seq = hChromSeqMixed(bed->chrom, seqStart, seqEnd);
    dy = dyStringNew(seqSize+2);	/* +2 for the two parentheses */
    dyStringAppendN(dy, seq->dna, firstParenPos);
    dyStringAppendC(dy, '(');
    dyStringAppendN(dy, seq->dna+firstParenPos, secondParenPos-firstParenPos);
    dyStringAppendC(dy, ')');
    dyStringAppendN(dy, seq->dna+secondParenPos, seqSize - secondParenPos);
    faWriteNext(f, bed->name, dy->string, dy->stringSize);
    /* NOTE(review): seq and dy are allocated on every iteration and are not
     * released anywhere in this function. */
    }
carefulClose(&f);
}
void txGeneAccession(char *oldBedFile, char *lastIdFile, char *newBedFile, char *txToAccFile,
	char *oldToNewFile)
/* txGeneAccession - Assign permanent accession number to genes.
 *   oldBedFile   - 12-column bed of the previous gene set (names are accessions)
 *   lastIdFile   - file holding the last numeric id handed out; rewritten with
 *                  the new last id unless the "test" option is present
 *   newBedFile   - 12-column bed of the new gene set
 *   txToAccFile  - output: new transcript name <tab> accession
 *   oldToNewFile - output: position, old accession, new accession and one of
 *                  "exact", "compatible", "new" or "lost" */
{
/* Read in all input. */
struct bed *oldList = bedLoadNAll(oldBedFile, 12);
verbose(2, "Read %d from %s\n", slCount(oldList), oldBedFile);
struct bed *newList = bedLoadNAll(newBedFile, 12);
verbose(2, "Read %d from %s\n", slCount(newList), newBedFile);
int txId = readNumberFromFile(lastIdFile);
verbose(2, "Last txId used was %d (from %s)\n", txId, lastIdFile);

/* Make a random-access data structure for old list. */
struct hash *oldHash = bedsIntoKeeperHash(oldList);

/* Make a little hash to help prevent us from reusing an
 * old accession twice (which might happen if we extend it
 * in two incompatible ways). */
struct hash *usedHash = hashNew(16);

/* Record our decisions in hash as well as file.  Values stored here are
 * read again in the final "lost" loop, so they must outlive the loops
 * that create them. */
struct hash *idToAccHash = hashNew(16);

/* Loop through new list first looking for exact matches. Record
 * exact matches in hash so we don't look for them again during
 * the next, "compatible" match phase. */
struct hash *oldExactHash = hashNew(16), *newExactHash = hashNew(16);
struct bed *oldBed, *newBed;
FILE *f = mustOpen(txToAccFile, "w");
FILE *fOld = mustOpen(oldToNewFile, "w");
for (newBed = newList; newBed != NULL; newBed = newBed->next)
    {
    oldBed = findExact(newBed, oldHash, usedHash);
    if (oldBed != NULL)
        {
	hashAdd(oldExactHash, oldBed->name, oldBed);
	hashAdd(newExactHash, newBed->name, newBed);
	hashAdd(usedHash, oldBed->name, NULL);
        fprintf(f, "%s\t%s\n", newBed->name, oldBed->name);
	hashAdd(idToAccHash, newBed->name, oldBed->name);
	fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd,
		oldBed->name, oldBed->name, "exact");
	}
    }

/* Loop through new bed looking for compatible things.  If
 * we can't find anything compatible, make up a new accession. */
for (newBed = newList; newBed != NULL; newBed = newBed->next)
    {
    if (!hashLookup(newExactHash, newBed->name))
	{
	oldBed = findCompatible(newBed, oldHash, usedHash);
	if (oldBed == NULL)
	    {
	    /* Brand new accession: next id, version 1. */
	    char newAcc[16];
	    txGeneAccFromId(++txId, newAcc);
	    strcat(newAcc, ".1");
	    fprintf(f, "%s\t%s\n", newBed->name, newAcc);
	    hashAdd(idToAccHash, newBed->name, cloneString(newAcc));
	    oldBed = findMostOverlapping(newBed, oldHash);
	    char *oldAcc = (oldBed == NULL ? "" : oldBed->name);
	    fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd,
	    	oldAcc, newAcc, "new");
	    }
	else
	    {
	    /* Same accession as the old one, version bumped by one. */
	    char *acc = cloneString(oldBed->name);
	    char *ver = strchr(acc, '.');
	    if (ver == NULL)
	        errAbort("No version found in %s", oldBed->name);
	    *ver++ = 0;
	    int version = sqlUnsigned(ver);
	    char newAcc[16];
	    safef(newAcc, sizeof(newAcc), "%s.%d", acc, version+1);
	    hashAdd(usedHash, oldBed->name, NULL);
	    fprintf(f, "%s\t%s\n", newBed->name, newAcc);
	    /* newAcc is a stack buffer that dies at the end of this block;
	     * store a heap copy so the later lookup sees valid memory. */
	    hashAdd(idToAccHash, newBed->name, cloneString(newAcc));
	    fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", newBed->chrom, newBed->chromStart, newBed->chromEnd,
	    	oldBed->name, newAcc, "compatible");
	    }
	}
    }
carefulClose(&f);

/* Make a random-access data structure for new list. */
struct hash *newHash = bedsIntoKeeperHash(newList);

/* Write record of ones that don't map. */
for (oldBed = oldList; oldBed != NULL; oldBed = oldBed->next)
    {
    if (!hashLookup(usedHash, oldBed->name))
	{
	char *newAcc = "";
	struct bed *newBed = findMostOverlapping(oldBed, newHash);
	if (newBed != NULL)
	    newAcc = hashMustFindVal(idToAccHash, newBed->name);
	fprintf(fOld, "%s:%d-%d\t%s\t%s\t%s\n", oldBed->chrom, oldBed->chromStart, oldBed->chromEnd,
		oldBed->name, newAcc, "lost");
	}
    }
carefulClose(&fOld);

/* Persist the last id used, unless this is a test run. */
if (!optionExists("test"))
    {
    FILE *fId = mustOpen(lastIdFile, "w");
    fprintf(fId, "%d\n", txId);
    carefulClose(&fId);
    }
}
예제 #10
0
void liftOverMerge(char *oldFile, char *newFile)
/* liftOverMerge - Merge regions in BED5 generated by liftOver -multiple.
 * Reads oldFile as 5-column BED, groups consecutive records that share a
 * name into one "region", merges records within a region that are on the
 * same chromosome and overlap or lie within the external 'mergeGap' of each
 * other, and writes the survivors to newFile with the score field replaced
 * by a 1-based index within the region. */
{
    struct bed *bedList = NULL, *bed = NULL, *otherBed = NULL, *nextBed = NULL;
    struct bedList *bedListHeaders = NULL, *bedListHeader = NULL;
    /* NOTE(review): f is opened here but never closed in this function;
     * other functions in this file finish with carefulClose(). */
    FILE *f = mustOpen(newFile, "w");

    bedList = bedLoadNAll(oldFile, 5);

    /* break down bed list into a list of lists, one per "region", where region
     * is the name field in the bed */
    for (bed = bedList; bed != NULL; bed = nextBed)
    {
        verbose(3, "%s:%d-%d %s %d\n", bed->chrom, bed->chromStart, bed->chromEnd,
                bed->name, bed->score);
        /* A new header is started whenever the name differs from that of the
         * most recently created header, so only consecutive records with the
         * same name end up in the same region. */
        if (bedListHeader == NULL ||
                differentString(bed->name, bedListHeader->name))
        {
            verbose(2, "region %s\n", bed->name);
            AllocVar(bedListHeader);
            bedListHeader->name = cloneString(bed->name);
            slAddHead(&bedListHeaders, bedListHeader);
        }
        /* Remember the successor before the record is moved onto the
         * header's own list. */
        nextBed = bed->next;
        slAddHead(&bedListHeader->bed, bed);
    }
    /* Headers were pushed at the head; reverse the list before output. */
    slReverse(&bedListHeaders);

    for (bedListHeader = bedListHeaders; bedListHeader != NULL;
            bedListHeader = bedListHeader->next)
    {
        int ix = 1;
        verbose(3, "region %s\n", bedListHeader->name);
        /* The per-region list was also built by pushing at the head. */
        slReverse(&bedListHeader->bed);

        /* traverse list of bed lists, merging overlapping entries
         * for each region */
        for (bed = bedListHeader->bed; bed != NULL; bed = bed->next)
        {
            for (otherBed = bed->next; otherBed != NULL; otherBed = nextBed)
            {
                /* Saved first because otherBed may be removed below. */
                nextBed = otherBed->next;
                if (sameString(bed->chrom, otherBed->chrom) &&
                        (max(bed->chromStart, otherBed->chromStart) <=
                         min(bed->chromEnd, otherBed->chromEnd) + mergeGap))
                {
                    /* these regions overlap (or are within the merge gap),
                     * so create one that is a merge, and drop the other */
                    verbose(2,"merging %s:%d-%d, %s:%d-%d (overlap=%d)",
                            otherBed->chrom, otherBed->chromStart, otherBed->chromEnd,
                            bed->chrom, bed->chromStart, bed->chromEnd,
                            min(bed->chromEnd, otherBed->chromEnd) -
                            max(bed->chromStart, otherBed->chromStart));
                    bed->chromStart = min(otherBed->chromStart, bed->chromStart);
                    bed->chromEnd = max(otherBed->chromEnd, bed->chromEnd);
                    verbose(2," to %s:%d-%d\n",
                            bed->chrom, bed->chromStart, bed->chromEnd);
                    slRemoveEl(&bedListHeader->bed, otherBed);
                }
            }
        }
        /* Write what is left of the region, numbering the records from 1 in
         * the score column. */
        for (otherBed = bedListHeader->bed; otherBed != NULL;
                otherBed = otherBed->next)
        {
            otherBed->score = ix++;
            bedOutputN(otherBed, 5, f, '\t', '\n');
        }
    }
}
예제 #11
0
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.
 *   txBedFile        - 12-column bed of transcripts, streamed one row at a time
 *   cdsEvFile        - cdsEvidence records, hashed by name (names must be unique)
 *   txCdsPredictFile - cdsEvidence records used for the cds score, hashed by name
 *   altSpliceFile    - 6-column bed loaded into a keeper hash for intron checks
 *   exceptionFile    - source of the selenocysteine and alt-start hashes
 *   sizePolyAFile    - name/int file of polyA sizes
 *   pslFile          - alignments, hashed by qName (several per name allowed)
 *   flipFile         - words naming accessions that were flipped
 *   outFile          - output, one txInfo row per input bed */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info.  altStartHash is filled in but not used below. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output.
 * NOTE(review): lf is not closed anywhere in this function. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
	{
	/* Rfam and tRNA names are used as the source accession unchanged. */
	info.sourceAcc = cloneString(bed->name);
	}
    else 
	{
	info.sourceAcc = txAccFromTempName(bed->name);
	}
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) 
	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
	|| stringIn("tRNA", info.sourceAcc) != NULL)
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	/* This psl shadows the function-level psl used while loading. */
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		/* Toggle the strand in place, measure, then toggle it back so
		 * the shared psl record is left as it was. */
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	/* bestPsl stays NULL when no alignment had a positive overlap. */
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	/* Gaps of 1 or 2 flag a frame shift and a gap of 3 flags a stop;
	 * only gaps of any other size count as separating two exons. */
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed.  info.name points into bed, so the row has to
     * be written before the bed is freed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
예제 #12
0
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, 
	char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome)
/* txGeneCdsMap - Create mapping between CDS region of gene and genome.
 * Every record of inBed (12-column BED) with thickStart < thickEnd gets a
 * CDS-to-RNA mapping in cdsToRna and an RNA-to-genome mapping in rnaToGenome.
 * For transcripts flagged isRefSeq in inInfo the CDS-to-RNA mapping comes
 * from findAndMapPsl() on the alignments in refPepToTxPsl; all others get a
 * faked one. */
{
/* Index txInfo records by name. */
struct hash *infoHash = hashNew(18);
struct txInfo *info, *infoList = txInfoLoadAll(inInfo);
info = infoList;
while (info != NULL)
    {
    hashAdd(infoHash, info->name, info);
    info = info->next;
    }

/* Index picks by name.  These are read row by row rather than with
 * cdsPicksLoadAll because empty fields cause that autoSql-generated
 * routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *pickFile = lineFileOpen(inPicks, TRUE);
char *fields[CDSPICK_NUM_COLS];
while (lineFileRowTab(pickFile, fields))
    {
    pick = cdsPickLoad(fields);
    hashAdd(pickHash, pick->name, pick);
    }
lineFileClose(&pickFile);

/* Index refPep/tx alignments by target (transcript) name. */
struct hash *refPslHash = hashNew(18);
struct psl *ali, *aliList = pslLoadAll(refPepToTxPsl);
for (ali = aliList; ali != NULL; ali = ali->next)
    hashAdd(refPslHash, ali->tName, ali);

struct hash *refToPepHash = hashTwoColumnFile(refToPepTab);
struct hash *chromSizeHash = hashNameIntFile(chromSizes);

/* Load the transcripts themselves. */
struct bed *tx, *txList = bedLoadNAll(inBed, 12);

/* Open both outputs and stream through the transcripts. */
FILE *cdsOut = mustOpen(cdsToRna, "w");
FILE *genomeOut = mustOpen(rnaToGenome, "w");
int refSeen = 0, refMapped = 0;
for (tx = txList; tx != NULL; tx = tx->next)
    {
    /* Only transcripts with a non-empty thick region are mapped. */
    if (tx->thickStart >= tx->thickEnd)
	continue;

    int chromSize = hashIntVal(chromSizeHash, tx->chrom);
    info = hashMustFindVal(infoHash, tx->name);
    /* The pick is looked up here but its value is not used below. */
    pick = hashMustFindVal(pickHash, tx->name);

    if (!info->isRefSeq)
	{
	fakeCdsToMrna(tx, cdsOut);
	}
    else
	{
	char *refAcc = txAccFromTempName(tx->name);
	if (!startsWith("NM_", refAcc))
	    errAbort("Don't think I did find that refSeq acc, got %s", refAcc);
	char *protAcc = hashMustFindVal(refToPepHash, refAcc);
	++refSeen;
	if (findAndMapPsl(tx, protAcc, refPslHash, chromSize, cdsOut))
	    ++refMapped;
	}
    fakeRnaToGenome(tx, chromSize, genomeOut);
    }
verbose(1, "Missed %d of %d refSeq protein mappings.  A small number of RefSeqs just map\n"
           "to genome in the UTR.\n", refSeen - refMapped, refSeen);
carefulClose(&cdsOut);
carefulClose(&genomeOut);
}
예제 #13
0
void txGeneSeparateNoncoding(char *inBed, char *inInfo,
	char *outCoding, char *outNearCoding, char *outNearCodingJunk,
	char *outAntisense, char *outNoncoding, char *outInfo)
/* txGeneSeparateNoncoding - Separate genes into four piles - coding, 
 * non-coding that overlap coding, antisense to coding, and independent non-coding. */
{
/* Read in txInfo into a hash keyed by transcript name */
struct hash *infoHash = hashNew(16);
struct txInfo *info, *infoList = txInfoLoadAll(inInfo);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);
verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, 
	inInfo);

/* Read in bed, and sort so we can process it easily a 
 * strand of one chromosome at a time. */
struct bed *inBedList = bedLoadNAll(inBed, 12);
slSort(&inBedList, bedCmpChromStrandStart);

/* Open up output files. */
FILE *fCoding = mustOpen(outCoding, "w");
FILE *fNearCoding = mustOpen(outNearCoding, "w");
FILE *fNearCodingJunk = mustOpen(outNearCodingJunk, "w");
FILE *fNoncoding = mustOpen(outNoncoding, "w");
FILE *fAntisense = mustOpen(outAntisense, "w");

/* Go through input one chromosome strand at a time. */
struct chrom *chrom, *chromList = chromsForBeds(inBedList);
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    verbose(2, "chrom %s\n", chrom->name); 

    /* Do the separation. */
    struct bed *codingList, *nearCodingList, *nearCodingJunkList, *antisenseList, *noncodingList;
    separateChrom(chrom, infoHash, &codingList, &nearCodingList, 
    	&nearCodingJunkList, &antisenseList, &noncodingList);
    verbose(2, "%d coding, %d near, %d anti, %d non\n", 
    	slCount(codingList), slCount(nearCodingList), slCount(antisenseList), slCount(noncodingList));

    /* Write lists to respective files. */
    writeBedList(codingList, fCoding);
    writeBedList(nearCodingList, fNearCoding);
    writeBedList(nearCodingJunkList, fNearCodingJunk);
    writeBedList(antisenseList, fAntisense);
    writeBedList(noncodingList, fNoncoding);
    }
carefulClose(&fCoding);
carefulClose(&fNearCoding);
carefulClose(&fNearCodingJunk);
carefulClose(&fNoncoding);
carefulClose(&fAntisense);

verbose(1, "coding %d, codingJunk %d, nearCoding %d, junk %d, antisense %d, noncoding %d\n",
	codingCount, codingJunkCount, nearCodingCount, junkCount, antisenseCount, noncodingCount);
/* Write out updated info file */
FILE *f = mustOpen(outInfo, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    txInfoTabOut(info, f);
    }
carefulClose(&f);
}
void txGeneCanonical(char *codingCluster, char *infoFile, 
	char *noncodingGraph, char *genesBed, char *nearCoding, 
	char *outCanonical, char *outIsoforms, char *outClusters)
/* txGeneCanonical - Pick a canonical version of each gene - that is the form
 * to use when just interested in a single splicing variant. Produces final
 * transcript clusters as well.
 *   codingCluster  - txCluster records, one gene is built from each
 *   infoFile       - txInfo records, hashed by name
 *   noncodingGraph - txGraph records, one gene is built from each
 *   genesBed       - 12-column bed of all transcripts, hashed by name
 *   nearCoding     - 12-column bed of transcripts to attach to the coding
 *                    gene they most overlap
 *   outCanonical   - output: one line per gene for its niceTx transcript
 *   outIsoforms    - output: gene id <tab> transcript name
 *   outClusters    - output: one bed-like line per gene plus extra columns */
{
/* Read in input into lists in memory. */
struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster);
struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph);
struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12);
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct bed *nearList = bedLoadNAll(nearCoding, 12);

/* Make hash of all beds. */
struct hash *bedHash = hashNew(18);
for (bed = bedList; bed != NULL; bed = bed->next)
    hashAdd(bedHash, bed->name, bed);

/* Make hash of all info. */
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Make a binKeeper structure that we'll populate with coding genes. */
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *keeperHash = minChromSizeKeeperHash(sizeHash);

/* Make list of coding genes and toss them into binKeeper.
 * This will eat up bed list, but bedHash is ok. */
struct gene *gene, *geneList = NULL;
for (coding = codingList; coding != NULL; coding = coding->next)
    {
    gene = geneFromCluster(coding, bedHash, infoHash);
    slAddHead(&geneList, gene);
    struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom);
    binKeeperAdd(bk, gene->start, gene->end, gene);
    }

/* Go through near-coding genes and add them to the coding gene
 * they most overlap. */
for (bed = nearList; bed != NULL; bed = nextBed)
    {
    /* Successor is saved before the bed is handed to geneAddBed(). */
    nextBed = bed->next;
    gene = mostOverlappingGene(keeperHash, bed);
    if (gene == NULL)
        errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name);
    geneAddBed(gene, bed);
    }

/* Add non-coding genes. */
for (graph = graphList; graph != NULL; graph = graph->next)
    {
    gene = geneFromGraph(graph, bedHash);
    slAddHead(&geneList, gene);
    }

/* Sort so it all looks nicer. */
slSort(&geneList, geneCmp);

/* Open up output files. */
FILE *fCan = mustOpen(outCanonical, "w");
FILE *fIso = mustOpen(outIsoforms, "w");
FILE *fClus = mustOpen(outClusters, "w");

/* Loop through, making up gene name, and writing output. */
int geneId = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Make up name: "g" plus the zero-padded 1-based gene id. */
    char name[16];
    safef(name, sizeof(name), "g%05d", ++geneId);

    /* Reverse transcript list just to make it look better. */
    slReverse(&gene->txList);

    /* Write out canonical file output */
    bed = hashMustFindVal(bedHash, gene->niceTx->name);
    fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n",
    	bed->chrom, bed->chromStart, bed->chromEnd, geneId,
	gene->niceTx->name, gene->niceTx->name);

    /* Write out isoforms output. */
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fIso, "%d\t%s\n", geneId, bed->name);

    /* Write out cluster output, starting with bed 6 standard fields. */
    fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t",
    	gene->chrom, gene->start, gene->end, name, 0, gene->strand);

    /* Write out thick-start/thick end. */
    if (gene->isCoding)
        {
	/* Start from an inverted range so the first transcript with a
	 * non-empty thick region sets both ends; the result spans the thick
	 * regions of all such transcripts. */
	int thickStart = gene->end, thickEnd  = gene->start;
	for (bed = gene->txList; bed != NULL; bed = bed->next)
	    {
	    if (bed->thickStart < bed->thickEnd)
	        {
		thickStart = min(thickStart, bed->thickStart);
		thickEnd = max(thickEnd, bed->thickEnd);
		}
	    }
	fprintf(fClus, "%d\t%d\t", thickStart, thickEnd);
	}
    else
        {
	/* Non-coding: empty thick region at the gene start. */
	fprintf(fClus, "%d\t%d\t", gene->start, gene->start);
	}

    /* We got no rgb value, just write out zero. */
    fprintf(fClus, "0\t");

    /* Get exons from exonTree: count, then starts relative to the gene
     * start, then sizes, each as a comma-terminated list. */
    struct range *exon, *exonList = rangeTreeList(gene->exonTree);
    fprintf(fClus, "%d\t", slCount(exonList));
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->start - gene->start);
    fprintf(fClus, "\t");
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->end - exon->start);
    fprintf(fClus, "\t");

    /* Write out associated transcripts. */
    fprintf(fClus, "%d\t", slCount(gene->txList));
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fClus, "%s,", bed->name);
    fprintf(fClus, "\t");

    /* Write out nice value */
    fprintf(fClus, "%s\t", gene->niceTx->name);

    /* Write out coding/noncoding value. */
    fprintf(fClus, "%d\n", gene->isCoding);
    }

/* Close up files. */
carefulClose(&fCan);
carefulClose(&fIso);
carefulClose(&fClus);
}
void txCdsRepick(char *inputBed, char *inputTxg, char *inputCluster, 
	char *inputInfo, char *inputCds, char *outputCds, char *outputPp)
/* txCdsRepick - After we have clustered based on the preliminary coding 
 * regions we can make a more intelligent choice here about the final coding 
 * regions. 
 *    inputBed     - transcripts, loaded as 12-column bed
 *    inputTxg     - transcript graphs (txGraph), one per transcript cluster
 *    inputCluster - protein clusters (txCluster)
 *    inputInfo    - txInfo records, one per transcript
 *    inputCds     - cdsEvidence records, possibly several per transcript
 *    outputCds    - file handed to pickCompatableCds for the repicked CDS
 *    outputPp     - tab-separated <cluster name> <best transcript> lines
 * The objects loaded here are not freed; this routine is written to be run
 * once per process. */
{
/* Read input bed into hash.  Also calculate number with CDS set. */
struct hash *bedHash = hashNew(16);
struct bed *bed, *bedList = bedLoadNAll(inputBed, 12);
int txWithCdsCount = 0;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    /* thickStart < thickEnd is the test used throughout for "has a CDS". */
    if (bed->thickStart < bed->thickEnd)
        txWithCdsCount += 1;
    hashAdd(bedHash, bed->name, bed);
    }
verbose(2, "Read %d beds from %s\n", bedHash->elCount, inputBed);

/* Read input transcript graphs into list, and into a hash
 * keyed by transcript names. */
struct hash *graphHash = hashNew(16);
struct txGraph *txg, *txgList = txGraphLoadAll(inputTxg);
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    int i;
    for (i=0; i<txg->sourceCount; ++i)
        hashAdd(graphHash, txg->sources[i].accession, txg);
    }
verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList),
	graphHash->elCount, inputTxg);

/* Read input protein cluster into list, and into a hash
 * keyed by transcript name */
struct hash *clusterHash = hashNew(16);
struct txCluster *cluster, *clusterList = txClusterLoadAll(inputCluster);
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    int i;
    for (i=0; i<cluster->txCount; ++i)
        hashAdd(clusterHash, cluster->txArray[i], cluster);
    }
verbose(2, "Read %d protein clusters (%d transcripts) from  %s\n", 
	slCount(clusterList), clusterHash->elCount, inputCluster);

/* Read in txInfo into a hash keyed by transcript name */
struct hash *infoHash = hashNew(16);
struct txInfo *info, *infoList = txInfoLoadAll(inputInfo);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);
verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, 
	inputInfo);

/* Read in input cds evidence into a hash keyed by transcript name
 * whose values are *lists* of evidence.  Each item is appended to the
 * tail of its transcript's list, so the per-transcript order follows
 * the order of the input file. */
struct hash *evHash = hashNew(16);
struct cdsEvidence *ev, *nextEv, *evList = cdsEvidenceLoadAll(inputCds);
int evCount = 0;
for  (ev = evList; ev != NULL; ev = nextEv)
    {
    /* Save next pointer first: slAddTail relinks ev into the hash's list. */
    nextEv = ev->next;
    struct hashEl *hel = hashLookup(evHash, ev->name);
    if (hel == NULL)
        hel = hashAdd(evHash, ev->name, NULL);
    slAddTail(&hel->val, ev);
    ++evCount;
    }
verbose(2, "Read %d pieces of cdsEvidence on %d transcripts from %s\n",
	evCount, evHash->elCount, inputCds);

/* Create a hash containing what looks to be the best protein-coding
 * transcript in each protein cluster.  This is keyed by cluster name
 * with transcript names for values.  The same pairs are written to
 * outputPp. */
FILE *f = mustOpen(outputPp, "w");
struct hash *bestInClusterHash = hashNew(16);
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    double bestScore = -BIGNUM;
    char *bestTx = NULL;
    int i;
    for (i=0; i<cluster->txCount; ++i)
        {
	char *tx = cluster->txArray[i];
	info = hashMustFindVal(infoHash, tx);
	/* Note the second argument is TRUE here and FALSE in the per-graph
	 * scoring below; its meaning is defined by infoCodingScore. */
	double score = infoCodingScore(info, TRUE);
	if (score > bestScore)
	    {
	    bestTx = tx;
	    bestScore = score;
	    }
	}
    hashAdd(bestInClusterHash, cluster->name, bestTx);
    fprintf(f, "%s\t%s\n", cluster->name, bestTx);
    }
carefulClose(&f);
verbose(2, "Picked best protein for each protein cluster\n");


/* Loop through each transcript cluster (graph).  Make a list of
 * protein clusters associated with that graph. Armed with this
 * information call repick routine on each transcript in the graph. */
f = mustOpen(outputCds, "w");
for (txg = txgList; txg != NULL; txg = txg->next)
    {
    /* Build up list of protein clusters associated with transcript cluster. */
    struct slRef *protClusterRefList = NULL, *protClusterRef;
    int i;
    for (i=0; i<txg->sourceCount; ++i)
	{
	char *tx = txg->sources[i].accession;
	struct txCluster *protCluster = hashFindVal(clusterHash, tx);
	if (protCluster != NULL)
	    refAddUnique(&protClusterRefList, protCluster);
	}

    /* Figure out best scoring protein in RNA cluster, and set threshold
     * to eliminate ones scoring less than half this much.  The best score
     * starts at 0, so the threshold is never negative. */
    double bestProtScore = 0;
    for (protClusterRef = protClusterRefList; protClusterRef != NULL;
    	protClusterRef = protClusterRef->next)
	{
	struct txCluster *protCluster = protClusterRef->val;
	char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name);
	struct txInfo *protInfo = hashMustFindVal(infoHash, protTx);
	double score = infoCodingScore(protInfo, FALSE);
	bestProtScore = max(score, bestProtScore);
	}
    double protScoreThreshold = bestProtScore * 0.5;

    /* Get list of references to beds of proteins over that threshold. */
    struct slRef *protRefList = NULL;
    for (protClusterRef = protClusterRefList; protClusterRef != NULL;
    	protClusterRef = protClusterRef->next)
	{
	struct txCluster *protCluster = protClusterRef->val;
	char *protTx = hashMustFindVal(bestInClusterHash, protCluster->name);
	struct txInfo *protInfo = hashMustFindVal(infoHash, protTx);
	double score = infoCodingScore(protInfo, FALSE);
	if (score >= protScoreThreshold)
	    {
	    struct bed *protBed = hashMustFindVal(bedHash, protTx);
	    refAdd(&protRefList, protBed);
	    }
	}

    /* Go repick each CDS in RNA cluster.  Only transcripts that have both
     * CDS evidence and a CDS already set in their bed are considered. */
    for (i=0; i<txg->sourceCount; ++i)
        {
	char *tx = txg->sources[i].accession;
	struct bed *txBed = hashMustFindVal(bedHash, tx);
	struct cdsEvidence *txEvList = hashFindVal(evHash, tx);
	if (txEvList != NULL && txBed->thickStart < txBed->thickEnd)
	    {
	    info = hashMustFindVal(infoHash, txBed->name);
	    pickCompatableCds(txBed, protRefList, txEvList, info, f);
	    }
	}

    /* Release both per-graph reference lists.  Only the slRef nodes are
     * freed; the beds and clusters they point to stay owned by the lists
     * loaded above.  (Previously protRefList was leaked on every graph.) */
    slFreeList(&protRefList);
    slFreeList(&protClusterRefList);
    }
carefulClose(&f);
verbose(1, "repicked %d, removed %d, no change to %d\n",
    pickedBetter, pickedNone, txWithCdsCount - pickedBetter - pickedNone);
}
예제 #16
0
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg)
/* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID). 
 *    inBed     - transcripts, loaded as 12-column bed
 *    inPicks   - tab-separated cdsPick rows, keyed here by pick name
 *    ucscFa    - fasta of UCSC proteins, looked up by bed name
 *    uniProtFa - fasta looked up by both the swissProt and uniProt fields of a pick
 *    refPepFa  - fasta looked up by the refProt field of a pick
 *    outKg     - output file, one outputKg() record per input bed
 * For a bed with a CDS (thickStart < thickEnd) the protein accession is chosen
 * in this order: swissProt, uniProt, then refProt when its sequence is
 * identical to the UCSC protein; otherwise uniProt if set, else refProt.
 * Beds without a CDS are written with an empty accession. */
{
/* Load protein sequence into hashes */
struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper);
struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper);
struct hash *refProtHash =faReadAllIntoHash(refPepFa, dnaUpper);

/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(inPicks, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }
/* Done with the picks file; release it rather than holding it open for
 * the rest of the run.  The loaded picks are kept in pickHash. */
lineFileClose(&lf);

/* Load in bed */
struct bed *bed, *bedList = bedLoadNAll(inBed, 12);

/* Do reformatting and write output. */
FILE *f = mustOpen(outKg, "w");
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    char *protAcc = NULL;
    if (bed->thickStart < bed->thickEnd)
	{
	/* Coding transcript: a pick and a UCSC protein are both required,
	 * the hashMustFindVal lookups do not tolerate a missing entry. */
        pick = hashMustFindVal(pickHash, bed->name);
	struct dnaSeq *spSeq = NULL, *uniSeq = NULL, *refPep = NULL, *ucscSeq;
	ucscSeq = hashMustFindVal(ucscProtHash, bed->name);
	/* An empty string in a pick field means "no such accession". */
	if (pick->swissProt[0])
	    spSeq = hashMustFindVal(uniProtHash, pick->swissProt);
	if (pick->uniProt[0])
	    uniSeq = hashMustFindVal(uniProtHash, pick->uniProt);
	if (pick->refProt[0])
	    refPep = hashMustFindVal(refProtHash, pick->refProt);

	/* First we look for an exact match between the ucsc protein and
	 * something from swissProt/uniProt, then from refProt. */
	if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna))
	    protAcc = pick->swissProt;
	if (protAcc == NULL && uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna))
	    protAcc = pick->uniProt;
	if (protAcc == NULL && refPep != NULL && sameString(ucscSeq->dna, refPep->dna))
	    {
	    /* chopSuffix edits in place, so work on a private copy to leave
	     * the pick untouched.  The copy is not freed in this routine. */
	    protAcc = cloneString(pick->refProt);
	    chopSuffix(protAcc);
	    }

	/* No exact match: fall back to uniProt if present, else refProt
	 * (which may itself be empty). */
	if (protAcc == NULL)
	    {
	    if (pick->uniProt[0])
	        protAcc = pick->uniProt;
	    else 
		{
	        protAcc = cloneString(pick->refProt);
		chopSuffix(protAcc);
		}
	    }
	}
    outputKg(bed, emptyForNull(protAcc), f);
    }
carefulClose(&f);
}