void txCdsToGene(char *txBed, char *txFa, char *txCds, char *outGtf, char *outFa)
/* txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and 
 * protein sequence.
 *    txBed  - input transcripts, read as one 12-field bed per row
 *    txFa   - transcript sequences, looked up by bed name
 *    txCds  - cdsEvidence records, looked up by bed name
 *    outGtf - output file that bedToGtf writes to
 *    outFa  - output file that outputProtein writes to
 * Aborts if a bed name has no sequence in txFa or contains no '.'.
 * A transcript without cdsEvidence is still passed to bedToGtf, with a
 * NULL cdsSource.
 * fTweaked and fBed are declared outside this function; each is only
 * written to when it is non-NULL. */
{
struct hash *txSeqHash = faReadAllIntoHash(txFa, dnaLower);
verbose(2, "Read %d transcript sequences from %s\n", txSeqHash->elCount, txFa);
struct hash *cdsHash = cdsEvidenceReadAllIntoHash(txCds);
verbose(2, "Read %d cdsEvidence from %s\n", cdsHash->elCount, txCds);
struct lineFile *lf = lineFileOpen(txBed, TRUE);
FILE *fGtf = mustOpen(outGtf, "w");
FILE *fFa = mustOpen(outFa, "w");
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(2, "processing %s\n", bed->name);
    struct cdsEvidence *cds = hashFindVal(cdsHash, bed->name);
    struct dnaSeq *txSeq = hashFindVal(txSeqHash, bed->name);
    char *cdsSource = NULL;
    if (txSeq == NULL)
        errAbort("%s is in %s but not %s", bed->name, txBed, txFa);
    if (cds != NULL)
	{
        outputProtein(cds, txSeq, fFa);
	if (cds->cdsCount > 1)
	    {
	    /* More than one CDS block: swap in the bed produced by
	     * breakUpBedAtCdsBreaks and record its name if requested. */
	    struct bed *newBed = breakUpBedAtCdsBreaks(cds, bed);
	    if (fTweaked)
	        fprintf(fTweaked, "%s\n", newBed->name);
	    bedFree(&bed);
	    bed = newBed;
	    }
	/* Use the accession as the source unless it is the "." placeholder. */
	cdsSource = cds->accession;
	if (sameString(cds->accession, "."))
	    cdsSource = cds->source;
	}

    /* Set bed CDS bounds and optionally output bed.  Note cds may be NULL here. */
    cdsEvidenceSetBedThick(cds, bed);
    if (fBed)
        bedTabOutN(bed, 12, fBed);

    /* Parse out bed name, which is in format chrom.geneId.txId.accession.
     * This is input validation, so report it with errAbort rather than an
     * assert that disappears (leaving a NULL dereference) under NDEBUG. */
    char *geneName = cloneString(bed->name);
    char *accession = strrchr(geneName, '.');
    if (accession == NULL)
        errAbort("Expecting name in chrom.geneId.txId.accession format, got '%s' in %s",
		bed->name, txBed);
    *accession++ = 0;
    chopSuffix(geneName);

    /* Output as GTF */
    bedToGtf(bed, accession, cdsSource, geneName, fGtf);

    /* Clean up for next iteration of loop. */
    freez(&geneName);
    bedFree(&bed);
    }
lineFileClose(&lf);
carefulClose(&fFa);
carefulClose(&fGtf);
}
static void makeDirFasta(char *regionsFile, char *hg18FastaFile, char *dir, int num) {
	FILE *fp, *sq;
	char buf[500], dirName[500], seqName[500], chr1[500], chr2[500];
	int b1, e1, b2, e2, i, len;
	char ori1, ori2;
	struct hash *seqHash = NULL;
	struct dnaSeq *seq1, *seq2;
	struct stat st;
	DNA *s1, *s2;

	seqHash = faReadAllIntoHash(hg18FastaFile, dnaUpper);
	if (stat(dir, &st) != 0)
		do_cmd("mkdir %s", dir);

	fp = mustOpen(regionsFile, "r");
	i = 0;
	while (fgets(buf, 500, fp)) {
		if (sscanf(buf, "%[^:]:%d-%d %[^:]:%d-%d [%c %c]", chr1, &b1, &e1, chr2, &b2, &e2, &ori1, &ori2) != 8)
			errAbort("error: %s", buf);
		++i;
		if (i != num) 
			continue;
		sprintf(dirName, "%s/R%d", dir, i);
		if (stat(dirName, &st) != 0)
			do_cmd("mkdir %s", dir);
		sprintf(seqName, "%s/ref.fa", dirName);
		sq = mustOpen(seqName, "w");
		fprintf(sq, ">%s:%d-%d+%s:%d-%d[%c%c]\n", chr1, b1, e1, chr2, b2, e2, ori1, ori2);
		seq1 = (struct dnaSeq *)hashFindVal(seqHash, chr1);
		assert(e1 <= seq1->size);
		len = e1 - b1 + 1;
		if (ori1 == '-') {
			s1 = cloneStringZExt(seq1->dna + b1 - 1, len, len+1);
			reverseComplement(s1, len);
			writeSeqWithBreaks(sq, s1, len, 80);
			freeMem(s1);
		}
		else
			writeSeqWithBreaks(sq, seq1->dna + b1 - 1, e1 - b1 + 1, 80);
		seq2 = (struct dnaSeq *)hashFindVal(seqHash, chr2);
		assert(e2 <= seq2->size);
		len = e2 - b2 + 1;
		if (ori2 == '-') {
			s2 = cloneStringZExt(seq2->dna + b2 - 1, len, len+1);
			reverseComplement(s2, len);
			writeSeqWithBreaks(sq, s2, len, 80);
			freeMem(s2);
		}
		else
			writeSeqWithBreaks(sq, seq2->dna + b2 - 1, e2 - b2 + 1, 80);
		fclose(sq);
	}
	fclose(fp);
	//FIXME: free space
} 
示例#3
0
void txGeneAltProt(char *pepFile, char *isoformsFile, char *outFile)
/* txGeneAltProt - Figure out statistics on number of alternative proteins produced by alt-splicing.
 *    pepFile      - protein sequences, looked up by transcript name
 *    isoformsFile - rows of two fields: numeric cluster id, transcript name
 *    outFile      - receives one outputCluster call per cluster
 * A new cluster starts whenever the id differs from the previous row's, so
 * rows of one cluster are expected to be adjacent.  Within a cluster a
 * transcript is listed only if its protein sequence has not been seen yet
 * in that cluster; transcripts with no entry in pepFile are skipped.
 * The count of distinct protein sequences over all clusters is reported
 * through verbose(1). */
{
struct hash *pepHash = faReadAllIntoHash(pepFile, dnaUpper);
struct hash *totalUniqHash = hashNew(18);
verbose(2, "Read %d from %s\n", pepHash->elCount, pepFile);
int lastClusterId = -1;
struct hash *uniqHash = NULL;	/* Sequences seen in current cluster; NULL before first row. */
struct slName *clusterList = NULL;
FILE *f = mustOpen(outFile, "w");
struct lineFile *lf = lineFileOpen(isoformsFile, TRUE);
char *row[2];
while (lineFileRow(lf, row))
    {
    int clusterId = lineFileNeedNum(lf, row, 0);
    char *tx = row[1];
    /* Testing uniqHash as well keeps the first row from being mistaken for a
     * continuation when its id happens to equal the initial lastClusterId. */
    if (uniqHash == NULL || clusterId != lastClusterId)
        {
	if (uniqHash != NULL)
	    {
	    outputCluster(lastClusterId, clusterList, f);
	    hashFree(&uniqHash);
	    slFreeList(&clusterList);
	    }
	uniqHash = hashNew(0);
	}
    lastClusterId = clusterId;
    struct dnaSeq *pep = hashFindVal(pepHash, tx);
    if (pep != NULL)
         {
	 if (!hashLookup(uniqHash, pep->dna))
	     {
	     hashAdd(uniqHash, pep->dna, NULL);
	     slNameAddTail(&clusterList, tx);
	     }
	 if (!hashLookup(totalUniqHash, pep->dna))
	     hashAdd(totalUniqHash, pep->dna, NULL);
	 }
    }
/* Flush the last cluster, but only if at least one row was read, so empty
 * input does not produce a cluster with the placeholder id. */
if (uniqHash != NULL)
    {
    outputCluster(lastClusterId, clusterList, f);
    hashFree(&uniqHash);
    slFreeList(&clusterList);
    }
verbose(1, "%d total unique proteins\n", totalUniqHash->elCount);
/* NOTE(review): lf, pepHash and totalUniqHash are never released here. */
carefulClose(&f);
}
void txCdsOrfInfo(char *inCds, char *inFa, char *outInfo)
/* txCdsOrfInfo - For each row of inCds (sequence name, start, end) find the
 * named sequence in inFa and hand it, with the two numbers, to outputOneRa,
 * which writes to outInfo.  Aborts when a named sequence is not in inFa. */
{
    struct hash *seqByName = faReadAllIntoHash(inFa, dnaLower);
    struct lineFile *cdsLf = lineFileOpen(inCds, TRUE);
    FILE *out = mustOpen(outInfo, "w");
    char *fields[3];

    for (;;)
    {
        int orfStart, orfEnd;
        struct dnaSeq *tx;

        if (!lineFileRow(cdsLf, fields))
            break;

        /* Numeric columns are checked before the sequence lookup. */
        orfStart = lineFileNeedNum(cdsLf, fields, 1);
        orfEnd = lineFileNeedNum(cdsLf, fields, 2);

        tx = hashFindVal(seqByName, fields[0]);
        if (!tx)
            errAbort("%s is in %s but not %s", fields[0], inCds, inFa);

        outputOneRa(tx, orfStart, orfEnd, out);
    }

    /* NOTE(review): cdsLf and seqByName are never released here. */
    carefulClose(&out);
}
void txCdsEvFromBorf(char *inBorf, char *txFa, char *outTce)
/* txCdsEvFromBorf - Convert borfBig format to txCdsEvidence (tce) in an effort 
 * to annotate the coding regions.
 *    inBorf - tab-separated borf rows
 *    txFa   - transcript sequences, looked up by borf name
 *    outTce - output, one tab-separated line per accepted borf row
 * Only rows on the '+' strand with score >= 50 are converted; aborts if
 * such a row names a transcript missing from txFa. */
{
struct lineFile *lf = lineFileOpen(inBorf, TRUE);
struct hash *txHash = faReadAllIntoHash(txFa, dnaLower);
char *row[BORF_NUM_COLS];
FILE *f = mustOpen(outTce, "w");
while (lineFileRowTab(lf, row))
    {
    struct borf b;
    borfStaticLoad(row, &b);
    if (b.strand[0] == '+' && b.score >= 50)
	{
	/* Verify the transcript exists before anything touches txSeq. */
	struct dnaSeq *txSeq = hashFindVal(txHash, b.name);
	if (txSeq == NULL)
	    errAbort("%s is in %s but not %s", b.name, inBorf, txFa);

	/* When there is room after the CDS, test the next three bases for a
	 * stop codon and extend the CDS end over them (whether or not they
	 * are a stop).
	 * NOTE(review): '<' skips a codon that ends exactly at the last
	 * base of the transcript; confirm whether '<=' was intended. */
	boolean hasStop = FALSE;
	if (b.cdsEnd + 3 < txSeq->size)
	    {
	    hasStop = isStopCodon(txSeq->dna + b.cdsEnd);
	    b.cdsEnd += 3;
	    }

	/* Rescale the borf score and clamp to 0..1000. */
	int score = (b.score - 45)*5;
	if (score > 1000) score = 1000;
	if (score < 0) score = 0;

	fprintf(f, "%s\t", b.name);
	fprintf(f, "%d\t", b.cdsStart);
	fprintf(f, "%d\t", b.cdsEnd);
	fprintf(f, "%s\t", "bestorf");
	fprintf(f, "%s\t", ".");
	fprintf(f, "%d\t", score);
	fprintf(f, "%d\t", startsWith("atg", txSeq->dna + b.cdsStart));	/* begins with atg? */
	fprintf(f, "%d\t", hasStop);
	fprintf(f, "%d\t", 1);						/* single block */
	fprintf(f, "%d,\t", b.cdsStart);				/* block start list */
	fprintf(f, "%d,\n", b.cdsEnd - b.cdsStart);			/* block size list */
	}
    }
lineFileClose(&lf);
carefulClose(&f);
}
示例#6
0
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg)
/* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID).
 * For every bed with a non-empty thick region a protein accession is chosen
 * from its cdsPick record; other beds are written with an empty accession.
 * Preference order: swissProt, uniProt, then refProt when that protein's
 * sequence equals the ucsc protein; failing all three, uniProt if present,
 * else refProt.  refProt accessions are passed through chopSuffix. */
{
/* Protein sequences, one hash per source file. */
struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper);
struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper);
struct hash *refProtHash = faReadAllIntoHash(refPepFa, dnaUpper);

/* Picks are loaded row by row and hashed by name.  cdsPicksLoadAll is
 * avoided because empty fields cause that autoSql-generated routine
 * problems. */
struct hash *pickHash = newHash(18);
struct lineFile *lf = lineFileOpen(inPicks, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    struct cdsPick *loaded = cdsPickLoad(row);
    hashAdd(pickHash, loaded->name, loaded);
    }
/* NOTE(review): lf is never closed here. */

/* Read all beds, then reformat each one into the output. */
struct bed *bedList = bedLoadNAll(inBed, 12);
FILE *f = mustOpen(outKg, "w");
struct bed *bed = bedList;
while (bed != NULL)
    {
    char *protAcc = NULL;
    if (bed->thickStart < bed->thickEnd)
	{
	/* Lookups stay in this order so a missing entry aborts on the same key. */
	struct cdsPick *pick = hashMustFindVal(pickHash, bed->name);
	struct dnaSeq *ucscSeq = hashMustFindVal(ucscProtHash, bed->name);
	struct dnaSeq *spSeq = NULL;
	struct dnaSeq *uniSeq = NULL;
	struct dnaSeq *refPep = NULL;
	int useRefProt = 0;	/* Set when the accession comes from refProt. */

	if (pick->swissProt[0])
	    spSeq = hashMustFindVal(uniProtHash, pick->swissProt);
	if (pick->uniProt[0])
	    uniSeq = hashMustFindVal(uniProtHash, pick->uniProt);
	if (pick->refProt[0])
	    refPep = hashMustFindVal(refProtHash, pick->refProt);

	/* Exact sequence matches win, in the order swissProt, uniProt,
	 * refProt; with no exact match fall back on uniProt, then refProt. */
	if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna))
	    protAcc = pick->swissProt;
	else if (uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna))
	    protAcc = pick->uniProt;
	else if (refPep != NULL && sameString(ucscSeq->dna, refPep->dna))
	    useRefProt = 1;
	else if (pick->uniProt[0])
	    protAcc = pick->uniProt;
	else
	    useRefProt = 1;

	if (useRefProt)
	    {
	    /* Work on a copy so chopSuffix leaves the pick record intact. */
	    protAcc = cloneString(pick->refProt);
	    chopSuffix(protAcc);
	    }
	}
    outputKg(bed, emptyForNull(protAcc), f);
    bed = bed->next;
    }
carefulClose(&f);
}