struct cdsPick *cdsPickLoadAllByChar(char *fileName, char chopper) 
/* Load all cdsPick from a chopper separated file.
 * Dispose of this with cdsPickFreeList(). */
{
struct cdsPick *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[12];

while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
    {
    el = cdsPickLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}
struct cdsPick *cdsPickLoadAll(char *fileName) 
/* Load all cdsPick from a whitespace-separated file.
 * Dispose of this with cdsPickFreeList(). */
{
struct cdsPick *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[12];

while (lineFileRow(lf, row))
    {
    el = cdsPickLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}
Beispiel #3
0
void txGeneColor(char *uniProtDb, char *infoFile, char *pickFile, char *outFile)
/* txGeneColor - Figure out color to draw gene in.. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }

/* Open uniprot database connection. */
struct sqlConnection *uConn = sqlConnect(uniProtDb);

#ifdef OLD
/* Figure out our light and medium colors. */
mediumBlue.r = (6*trueBlue.r + 4*255)/10;
mediumBlue.g = (6*trueBlue.g + 4*255)/10;
mediumBlue.b = (6*trueBlue.b + 4*255)/10;
lightBlue.r = (1*trueBlue.r + 2*255)/3;
lightBlue.g = (1*trueBlue.g + 2*255)/3;
lightBlue.b = (1*trueBlue.b + 2*255)/3;
#endif /* OLD */

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    struct rgbColor *col;
    pick = hashFindVal(pickHash, info->name);
    if (pick != NULL)
        {
	char *source = pick->source;
	if (sameString(source, "RefPepValidated"))
	    col = &trueBlue;
	else if (sameString(source, "ccds"))
	    col = &trueBlue;
	else if (sameString(source, "RefPepReviewed"))
	    col = &trueBlue;
	else if (sameString(source, "RefSeqValidated"))
	    col = &trueBlue;
	else if (sameString(source, "RefSeqReviewed"))
	    col = &trueBlue;
	else if (sameString(source, "swissProt"))
	    col = &trueBlue;
	else if (startsWith("Ref", source))
	    col = &mediumBlue;
	else
	    col = &lightBlue;
	if (pick->swissProt[0] != 0)
	    {
	    char *acc = spLookupPrimaryAcc(uConn, pick->swissProt);
	    struct slName *pdbList = spPdbAccs(uConn, acc);
	    if (pdbList != NULL)
	        col = &black;
	    slFreeList(&pdbList);
	    }
	}
    else
        col = &lightBlue;
    fprintf(f, "%s\t%d\t%d\t%d\n", info->name, col->r, col->g, col->b);
    }
carefulClose(&f);
}
Beispiel #4
0
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, 
	char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome)
/* txGeneCdsMap - Create mapping between CDS region of gene and genome. */
{
/* Load info into hash. */
struct hash *infoHash = hashNew(18);
struct txInfo *info, *infoList = txInfoLoadAll(inInfo);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(inPicks, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }
lineFileClose(&lf);

/* Load refPep/tx alignments into hash keyed by tx. */
struct hash *refPslHash = hashNew(18);
struct psl *psl, *pslList  = pslLoadAll(refPepToTxPsl);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(refPslHash, psl->tName, psl);

struct hash *refToPepHash = hashTwoColumnFile(refToPepTab);
struct hash *chromSizeHash = hashNameIntFile(chromSizes);

/* Load in bed. */
struct bed *bed, *bedList = bedLoadNAll(inBed, 12);

/* Open output, and stream through bedList, writing output. */
FILE *fCdsToRna = mustOpen(cdsToRna, "w");
FILE *fRnaToGenome = mustOpen(rnaToGenome, "w");
int refTotal = 0, refFound = 0;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    if (bed->thickStart < bed->thickEnd)
	{
	char *chrom = bed->chrom;
	int chromSize = hashIntVal(chromSizeHash, chrom);
	info = hashMustFindVal(infoHash, bed->name);
	pick = hashMustFindVal(pickHash, bed->name);
	if (info->isRefSeq)
	    {
	    char *refAcc = txAccFromTempName(bed->name);
	    if (!startsWith("NM_", refAcc))
		errAbort("Don't think I did find that refSeq acc, got %s", refAcc);
	    char *protAcc = hashMustFindVal(refToPepHash, refAcc);
	    ++refTotal;
	    if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna))
	        ++refFound;
	    }
	else
	    {
	    fakeCdsToMrna(bed, fCdsToRna);
	    }
	fakeRnaToGenome(bed, chromSize, fRnaToGenome);
	}
    }
verbose(1, "Missed %d of %d refSeq protein mappings.  A small number of RefSeqs just map\n"
           "to genome in the UTR.\n", refTotal - refFound, refTotal);
carefulClose(&fCdsToRna);
carefulClose(&fRnaToGenome);
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
       {
       /* Fill in the relatively straightforward fields. */
       refseq = pick->refSeq;
       if (info->orfSize > 0)
	    {
	    protAcc = pick->refProt;
	    spID = proteinId;
	    if (sameString(protAcc, spID))
		spID = pick->uniProt;
	    if (spID[0] != 0)
	       spDisplayID = spAnyAccToId(uConn, spID);
	    }

       /* Fill in gene symbol and description from refseq if possible. */
       if (refseq[0] != 0)
           {
	   struct sqlResult *sr;
	   safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'",
	   	refseq);
	   sr = sqlGetResult(gConn, query);
	   char **row = sqlNextRow(sr);
	   if (row != NULL)
	       {
	       geneSymbol = cloneString(row[0]);
	       if (!sameWord("unknown protein", row[1]))
		   description = cloneString(row[1]);
	       }
	    sqlFreeResult(&sr);
	   }

       /* If need be try uniProt for gene symbol and description. */
       if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
           {
	   char *acc = spLookupPrimaryAcc(uConn, spID);
	   if (description == NULL)
	       description = spDescription(uConn, acc);
	   if (geneSymbol == NULL)
	       {
	       struct slName *nameList = spGenes(uConn, acc);
	       if (nameList != NULL)
		   geneSymbol = cloneString(nameList->name);
	       slFreeList(&nameList);
	       }
	   }

       }

    /* If it's an antibody fragment use that as name. */
    if (isAb)
        {
	geneSymbol = cloneString("abParts");
	description = cloneString("Parts of antibodies, mostly variable regions.");
	isAb = TRUE;
	}

    if (ev == NULL)
	{
	mRNA = cloneString("");
	if (!isAb)
	    {
	    errAbort("%s is %s but not %s\n", info->name, infoFile, evFile);
	    }
	}
    else
	{
	mRNA = cloneString(ev->primary);
	chopSuffix(mRNA);
	}

    /* Still no joy? Try genbank RNA records. */
    if (geneSymbol == NULL || description == NULL)
	{
	if (ev != NULL)
	    {
	    int i;
	    for (i=0; i<ev->accCount; ++i)
		{
		char *acc = ev->accs[i];
		chopSuffix(acc);
		if (geneSymbol == NULL)
		    {
		    safef(query, sizeof(query), 
			"select geneName.name from gbCdnaInfo,geneName "
			"where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
		    geneSymbol = sqlQuickString(gConn, query);
		    if (geneSymbol != NULL)
			{
			if (sameString(geneSymbol, "n/a"))
			   geneSymbol = NULL;
			}
		    }
		if (description == NULL)
		    {
		    safef(query, sizeof(query), 
			"select description.name from gbCdnaInfo,description "
			"where description.id=gbCdnaInfo.description "
			"and gbCdnaInfo.acc = '%s'", acc);
		    description = sqlQuickString(gConn, query);
		    if (description != NULL)
			{
			if (sameString(description, "n/a"))
			   description = NULL;
			}
		    }
		}
	    }
	}
    if (geneSymbol == NULL)
        geneSymbol = mRNA;
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
}
Beispiel #6
0
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg)
/* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID). */
{
/* Load protein sequence into hashes */
struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper);
struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper);
struct hash *refProtHash =faReadAllIntoHash(refPepFa, dnaUpper);

/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(inPicks, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }

/* Load in bed */
struct bed *bed, *bedList = bedLoadNAll(inBed, 12);

/* Do reformatting and write output. */
FILE *f = mustOpen(outKg, "w");
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    char *protAcc = NULL;
    if (bed->thickStart < bed->thickEnd)
	{
        pick = hashMustFindVal(pickHash, bed->name);
	struct dnaSeq *spSeq = NULL, *uniSeq = NULL, *refPep = NULL, *ucscSeq;
	ucscSeq = hashMustFindVal(ucscProtHash, bed->name);
	if (pick->swissProt[0])
	    spSeq = hashMustFindVal(uniProtHash, pick->swissProt);
	if (pick->uniProt[0])
	    uniSeq = hashMustFindVal(uniProtHash, pick->uniProt);
	if (pick->refProt[0])
	    refPep = hashMustFindVal(refProtHash, pick->refProt);

	/* First we look for an exact match between the ucsc protein and
	 * something from swissProt/uniProt. */
	if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna))
	    protAcc = pick->swissProt;
	if (protAcc == NULL && uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna))
	    protAcc = pick->uniProt;
	if (protAcc == NULL && refPep != NULL && sameString(ucscSeq->dna, refPep->dna))
	    {
	    protAcc = cloneString(pick->refProt);
	    chopSuffix(protAcc);
	    }

	if (protAcc == NULL)
	    {
	    if (pick->uniProt[0])
	        protAcc = pick->uniProt;
	    else 
		{
	        protAcc = cloneString(pick->refProt);
		chopSuffix(protAcc);
		}
	    }
	}
    outputKg(bed, emptyForNull(protAcc), f);
    }
carefulClose(&f);
}