void gensatFixFull(char *captionFile)
/* Fix missing captions.  captionFile is read as two tab-separated columns,
 * <submitId> <caption>.  Each distinct caption is inserted into the caption
 * table once (its id is remembered in a hash), and the imageFile row for
 * gensatId/submitId is updated to reference that caption id.  Every SQL
 * statement issued is also echoed through verbose(1, ...). */
{
struct lineFile *lf = lineFileOpen(captionFile, TRUE);
char *row[2];
struct dyString *sql = dyStringNew(0);
struct sqlConnection *conn = sqlConnect(database);
struct hash *capHash = newHash(16);
while (lineFileRowTab(lf, row))
    {
    char *submitId = row[0];
    char *caption = row[1];
    /* Zero means this caption text has not been inserted yet. */
    int captionId = hashIntValDefault(capHash, caption, 0);
    if (captionId == 0)
        {
        /* Escape the free-text caption so that quotes or backslashes in it
         * cannot terminate the string literal inside the statement. */
        char *escCaption = sqlEscapeString(caption);
        dyStringClear(sql);
        dyStringAppend(sql, "insert into caption values(default, \"");
        dyStringAppend(sql, escCaption);
        dyStringAppend(sql, "\")");
        freeMem(escCaption);
        sqlUpdate(conn, sql->string);
        verbose(1, "%s\n", sql->string);
        captionId = sqlLastAutoId(conn);
        hashAddInt(capHash, caption, captionId);
        }

    /* Point the image at its caption. */
    char *escSubmitId = sqlEscapeString(submitId);
    dyStringClear(sql);
    dyStringPrintf(sql, "update imageFile set caption=%d ", captionId);
    dyStringPrintf(sql, "where submissionSet=%d ", gensatId);
    dyStringPrintf(sql, "and submitId = \"%s\"", escSubmitId);
    freeMem(escSubmitId);
    sqlUpdate(conn, sql->string);
    verbose(1, "%s\n", sql->string);
    }

/* Release everything acquired above. */
dyStringFree(&sql);
freeHash(&capHash);
sqlDisconnect(&conn);
lineFileClose(&lf);
}
Esempio n. 2
0
void scopCollapse(char *inFeat, char *inModel, char *outFeat, char *outDesc, 
	char *outKnownTo)
/* scopCollapse - Convert SCOP model to SCOP ID. Also make id/name converter file.
 * inModel is read as 5 tab-separated columns; inFeat as 6 whitespace-separated
 * columns.  Writes outDesc while reading inModel, then writes outFeat and
 * outKnownTo via outputProt(), one call per protein seen in inFeat. */
{
/* First pass: the model file.  Remember model->seed and seed->scop, and
 * write one description line the first time each seed is seen. */
struct hash *seedForModel = hashNew(18);
struct hash *scopForSeed = hashNew(16);
struct lineFile *in = lineFileOpen(inModel, TRUE);
FILE *out = mustOpen(outDesc, "w");
char *modelCols[5];
while (lineFileRowTab(in, modelCols))
    {
    char *model = modelCols[0];
    char *scop = modelCols[1];
    char *seed = modelCols[2];
    hashAdd(seedForModel, model, cloneString(seed));
    if (hashLookup(scopForSeed, seed))
        continue;       /* Seed already described. */
    hashAdd(scopForSeed, seed, cloneString(scop));
    fprintf(out, "%s\t%s\t%s\n", scop, seed, modelCols[4]);
    }
carefulClose(&out);
lineFileClose(&in);

/* Second pass: the feature file.  Group features under a protInfo per
 * protein, keeping proteins in order of first appearance. */
struct hash *proteins = hashNew(18);
struct protInfo *owner, *ownerList = NULL;
in = lineFileOpen(inFeat, TRUE);
char *featCols[6];
while (lineFileRow(in, featCols))
    {
    char *protName = featCols[0];
    owner = hashFindVal(proteins, protName);
    if (owner == NULL)
        {
        AllocVar(owner);
        hashAddSaveName(proteins, protName, owner, &owner->name);
        slAddHead(&ownerList, owner);
        }
    struct protFeature *feat;
    AllocVar(feat);
    feat->protein = owner->name;
    feat->start = lineFileNeedNum(in, featCols, 1);
    feat->end = lineFileNeedNum(in, featCols, 2);
    feat->name = hashMustFindVal(seedForModel, featCols[3]);
    feat->eVal = lineFileNeedDouble(in, featCols, 4);
    feat->score = lineFileNeedDouble(in, featCols, 5);
    slAddHead(&owner->featureList, feat);
    }
lineFileClose(&in);
slReverse(&ownerList);

/* Output pass: one outputProt() call per protein. */
out = mustOpen(outFeat, "w");
FILE *knownToOut = mustOpen(outKnownTo, "w");
owner = ownerList;
while (owner != NULL)
    {
    outputProt(owner, scopForSeed, out, knownToOut);
    owner = owner->next;
    }
carefulClose(&out);
carefulClose(&knownToOut);
}
Esempio n. 3
0
struct hash *readCsizeHash(char *filename)
/* Load a two-column tab-separated chrom sizes file into a hash that maps
 * the first column (name) to the second column parsed as a signed int. */
{
    struct hash *sizes;
    char *cols[2];
    struct lineFile *in = lineFileOpen(filename, TRUE);
    sizes = hashNew(10);
    while (lineFileRowTab(in, cols))
        {
        char *chrom = cols[0];
        int size = sqlSigned(cols[1]);
        hashAddInt(sizes, chrom, size);
        }
    lineFileClose(&in);
    return sizes;
}
struct hash *makeGeneToProtHash(char *fileName)
/* Build a hash keyed by gene name (column 0) whose values are freshly
 * cloned protein names (column 10).  Input is expected to be an extended
 * gene pred file with 11 tab-separated columns. */
{
struct hash *geneToProt = newHash(18);
char *cols[11];
struct lineFile *in = lineFileOpen(fileName, TRUE);
while (lineFileRowTab(in, cols))
    {
    char *gene = cols[0];
    char *protein = cloneString(cols[10]);
    hashAdd(geneToProt, gene, protein);
    }
lineFileClose(&in);
return geneToProt;
}
void hprdP2p(char *hprdBinaryPPI, char *hprdComplexes, char *outTab)
/* hprdP2p - Create hprd.p2p tab file from HPRD flat files for use with hgNetDist. */
{
FILE *f = mustOpen(outTab, "w");
char *row[8];
char *row2[6];
char *ids[100];
struct lineFile *lf = lineFileOpen(hprdBinaryPPI, TRUE);
while (lineFileRowTab(lf, row))
    {
    char *hprdId1 = row[1];
    char *hprdId2 = row[4];
    fprintf(f,"%s\t%s\t1.0\n",hprdId1,hprdId2);
    }
lineFileClose(&lf);

lf = lineFileOpen(hprdComplexes, TRUE);
char *lastComplex = "";
int i = 0;
while (lineFileRowTab(lf, row2))
    {
    char *complexId = row2[0];
    char *hprdId = row2[1];
    if (sameString(hprdId,"None"))
	continue;
    if (!sameString(complexId,lastComplex))
	{
    	iterateComplex(ids, i, f, lastComplex);
	i = 0;
	lastComplex = complexId;
	}
    ids[i++] = cloneString(hprdId);
    }
iterateComplex(ids, i, f, lastComplex);
lineFileClose(&lf);

carefulClose(&f);

}
Esempio n. 6
0
struct hash *loadNewToOldHash(char *oldToNewFile)
/* Read through 4 column file <position> <old> <new> <type> and make hash of old accessions 
 * keyed by new accession, only containing elements where new and old are different.
 * Rows with an empty new accession are skipped.  Values are cloned strings. */
{
struct lineFile *lf = lineFileOpen(oldToNewFile, TRUE);
char *row[4];
struct hash *hash = hashNew(16);
while (lineFileRowTab(lf, row))
    {
    char *oldAcc = row[1], *newAcc = row[2];
    if (newAcc[0] != 0 && !sameString(oldAcc, newAcc))
        hashAdd(hash, newAcc, cloneString(oldAcc));
    }
/* Values were cloned, so the file can be released before returning. */
lineFileClose(&lf);
return hash;
}
void txCdsEvFromBorf(char *inBorf, char *txFa, char *outTce)
/* txCdsEvFromBorf - Convert borfBig format to txCdsEvidence (tce) in an effort 
 * to annotate the coding regions.
 * Only '+' strand records with score >= 50 are written.  The output score is
 * (borf score - 45)*5 clamped to the range 0..1000.  Aborts if a record that
 * would be written names a sequence that is not present in txFa. */
{
struct lineFile *lf = lineFileOpen(inBorf, TRUE);
struct hash *txHash = faReadAllIntoHash(txFa, dnaLower);
char *row[BORF_NUM_COLS];
FILE *f = mustOpen(outTce, "w");
while (lineFileRowTab(lf, row))
    {
    struct borf b;
    borfStaticLoad(row, &b);
    if (b.strand[0] == '+' && b.score >= 50)
        {
        struct dnaSeq *txSeq = hashFindVal(txHash, b.name);
        /* Check for a missing sequence before touching txSeq at all. */
        if (txSeq == NULL)
            errAbort("%s is in %s but not %s", b.name, inBorf, txFa);

        /* If three more bases fit after the CDS, test them for a stop
         * codon and extend cdsEnd over them. */
        boolean hasStop = FALSE;
        if (b.cdsEnd + 3 < txSeq->size)
            {
            hasStop = isStopCodon(txSeq->dna + b.cdsEnd);
            b.cdsEnd += 3;
            }

        int score = (b.score - 45)*5;
        if (score > 1000) score = 1000;
        if (score < 0) score = 0;
        fprintf(f, "%s\t", b.name);
        fprintf(f, "%d\t", b.cdsStart);
        fprintf(f, "%d\t", b.cdsEnd);
        fprintf(f, "%s\t", "bestorf");
        fprintf(f, "%s\t", ".");
        fprintf(f, "%d\t", score);
        /* Sequence was loaded lower case, hence the lower-case "atg". */
        fprintf(f, "%d\t", startsWith("atg", txSeq->dna + b.cdsStart));
        fprintf(f, "%d\t", hasStop);
        fprintf(f, "%d\t", 1);
        fprintf(f, "%d,\t", b.cdsStart);
        fprintf(f, "%d,\n", b.cdsEnd - b.cdsStart);
        }
    }
lineFileClose(&lf);
carefulClose(&f);
}
struct hash *getFreqHash(char *freqFile)
/* Load the frequency file into a new hash and return it.  The first line is
 * treated as a header and skipped.  Each remaining row has 3 tab-separated
 * columns; columns 1 and 2 must be complete numbers, and each row is passed
 * to addFreqToHash().  Every hash element is then run through
 * sortSlPairList. */
{
struct hash *table = newHash(23);
struct lineFile *in = lineFileOpen(freqFile, TRUE);
char *cols[3];

lineFileSkip(in, 1);    /* Header line. */
for (;;)
    {
    if (!lineFileRowTab(in, cols))
        break;
    /* Validate both numeric columns before converting. */
    lineFileNeedFullNum(in, cols, 1);
    lineFileNeedFullNum(in, cols, 2);
    int count = (int)sqlUnsigned(cols[2]);
    addFreqToHash(table, cols[0], cols[1], count);
    }
lineFileClose(&in);
hashTraverseEls(table, sortSlPairList);
return table;
}
Esempio n. 9
0
void encodeUserDbCrawl(char *input, char *output)
/* encodeUserDbCrawl - Mine user DB for ENCODE info.
 * input is read as 6 tab-separated columns: column 1 is the contents string,
 * column 4 the last-use date and column 5 the use count.  Only rows used more
 * than once, last used in a date starting with "2011-1", and mentioning
 * db=hg18 or db=hg19 are parsed.  Per-track statistics go to output, one line
 * per track variable; a summary count goes to stdout. */
{
struct lineFile *lf = lineFileOpen(input, TRUE);
FILE *f = mustOpen(output, "w");
char *row[6];
struct hash *varHash = hashNew(0);
struct trackVar *tvList = NULL, *tv;
int totalCount = 0;
int wgEncodeCount = 0;
while (lineFileRowTab(lf, row))
    {
    char *contents = row[1];
    char *lastUse = row[4];
    int useCount = atoi(row[5]);
    if (useCount > 1 && startsWith("2011-1", lastUse) && (stringIn("db=hg18", contents) || stringIn("db=hg19", contents)))
        {
        /* Both flags are filled in by parseContents(). */
        boolean anyTrack, isEncode;
        parseContents(contents, varHash, &tvList, &anyTrack, &isEncode);
        if (isEncode)
            wgEncodeCount++;
        if (anyTrack)
            ++totalCount;
        }
    }
lineFileClose(&lf);

slSort(&tvList, trackVarCmp);
for (tv = tvList; tv != NULL; tv = tv->next)
    {
    fprintf(f, "%s\t%f\t%d\t%d\t%d\t%d\t%d\t%d\n",
        tv->name, percentOn(tv), tv->full, tv->pack, tv->squish, 
        tv->dense, tv->show, tv->hide);
    }
printf("wgEncode in %d of %d\n", wgEncodeCount, totalCount);
carefulClose(&f);
}
Esempio n. 10
0
void txGeneColor(char *uniProtDb, char *infoFile, char *pickFile, char *outFile)
/* txGeneColor - Figure out color to draw gene in.
 * Writes one line per entry of infoFile to outFile: name and r,g,b.
 * Color rules, in order:
 *   - no pick for the gene: lightBlue
 *   - pick source is one of the validated/reviewed RefSeq sources, ccds or
 *     swissProt: trueBlue
 *   - any other source starting with "Ref": mediumBlue
 *   - anything else: lightBlue
 *   - and, overriding the above, black when the pick's swissProt accession
 *     has at least one PDB accession in uniProtDb. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }
lineFileClose(&lf);

/* Open uniprot database connection. */
struct sqlConnection *uConn = sqlConnect(uniProtDb);

#ifdef OLD
/* Figure out our light and medium colors. */
mediumBlue.r = (6*trueBlue.r + 4*255)/10;
mediumBlue.g = (6*trueBlue.g + 4*255)/10;
mediumBlue.b = (6*trueBlue.b + 4*255)/10;
lightBlue.r = (1*trueBlue.r + 2*255)/3;
lightBlue.g = (1*trueBlue.g + 2*255)/3;
lightBlue.b = (1*trueBlue.b + 2*255)/3;
#endif /* OLD */

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    struct rgbColor *col = &lightBlue;   /* Default when there is no pick. */
    pick = hashFindVal(pickHash, info->name);
    if (pick != NULL)
        {
        char *source = pick->source;
        if (sameString(source, "RefPepValidated")
            || sameString(source, "ccds")
            || sameString(source, "RefPepReviewed")
            || sameString(source, "RefSeqValidated")
            || sameString(source, "RefSeqReviewed")
            || sameString(source, "swissProt"))
            col = &trueBlue;
        else if (startsWith("Ref", source))
            col = &mediumBlue;
        else
            col = &lightBlue;

        /* A known PDB structure trumps the source-based color. */
        if (pick->swissProt[0] != 0)
            {
            char *acc = spLookupPrimaryAcc(uConn, pick->swissProt);
            struct slName *pdbList = spPdbAccs(uConn, acc);
            if (pdbList != NULL)
                col = &black;
            slFreeList(&pdbList);
            }
        }
    fprintf(f, "%s\t%d\t%d\t%d\n", info->name, col->r, col->g, col->b);
    }
carefulClose(&f);
sqlDisconnect(&uConn);
}
Esempio n. 11
0
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, 
	char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome)
/* txGeneCdsMap - Create mapping between CDS region of gene and genome.
 * For every bed in inBed that has a non-empty thick region, one record is
 * written to cdsToRna (via findAndMapPsl() for RefSeq entries, via
 * fakeCdsToMrna() otherwise) and one to rnaToGenome (via fakeRnaToGenome()).
 * Aborts if a coding bed is missing from inInfo or inPicks. */
{
/* Info records, keyed by name. */
struct hash *infoByName = hashNew(18);
struct txInfo *info, *infoList = txInfoLoadAll(inInfo);
info = infoList;
while (info != NULL)
    {
    hashAdd(infoByName, info->name, info);
    info = info->next;
    }

/* CDS picks, keyed by name.  Loaded row by row rather than with
 * cdsPicksLoadAll because empty fields cause that autoSql-generated
 * routine problems. */
struct hash *pickByName = newHash(18);
struct lineFile *pickIn = lineFileOpen(inPicks, TRUE);
char *pickCols[CDSPICK_NUM_COLS];
while (lineFileRowTab(pickIn, pickCols))
    {
    struct cdsPick *loaded = cdsPickLoad(pickCols);
    hashAdd(pickByName, loaded->name, loaded);
    }
lineFileClose(&pickIn);

/* refPep/tx alignments, keyed by target (tx) name. */
struct hash *pslByTx = hashNew(18);
struct psl *aln, *alnList = pslLoadAll(refPepToTxPsl);
for (aln = alnList; aln != NULL; aln = aln->next)
    hashAdd(pslByTx, aln->tName, aln);

struct hash *pepForRef = hashTwoColumnFile(refToPepTab);
struct hash *sizeForChrom = hashNameIntFile(chromSizes);

/* The beds themselves. */
struct bed *bed, *bedList = bedLoadNAll(inBed, 12);

/* Stream through the beds writing both outputs. */
FILE *cdsOut = mustOpen(cdsToRna, "w");
FILE *genomeOut = mustOpen(rnaToGenome, "w");
int refSeen = 0, refMapped = 0;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    /* Noncoding: nothing to map. */
    if (bed->thickStart >= bed->thickEnd)
        continue;

    int chromSize = hashIntVal(sizeForChrom, bed->chrom);
    info = hashMustFindVal(infoByName, bed->name);
    /* Result is not used; the lookup is kept because it aborts when the
     * bed has no pick. */
    hashMustFindVal(pickByName, bed->name);

    if (!info->isRefSeq)
        {
        fakeCdsToMrna(bed, cdsOut);
        }
    else
        {
        char *refAcc = txAccFromTempName(bed->name);
        if (!startsWith("NM_", refAcc))
            errAbort("Don't think I did find that refSeq acc, got %s", refAcc);
        char *protAcc = hashMustFindVal(pepForRef, refAcc);
        ++refSeen;
        if (findAndMapPsl(bed, protAcc, pslByTx, chromSize, cdsOut))
            ++refMapped;
        }
    fakeRnaToGenome(bed, chromSize, genomeOut);
    }
verbose(1, "Missed %d of %d refSeq protein mappings.  A small number of RefSeqs just map\n"
           "to genome in the UTR.\n", refSeen - refMapped, refSeen);
carefulClose(&cdsOut);
carefulClose(&genomeOut);
}
Esempio n. 12
0
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg)
/* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID).
 * For each coding bed (thickStart < thickEnd) a protein accession is chosen:
 *   1. the pick's swissProt, uniProt or refProt accession (in that order)
 *      whose sequence exactly equals the UCSC protein;
 *   2. failing that, the pick's uniProt accession if it has one;
 *   3. failing that, the pick's refProt accession.
 * refProt accessions are emitted with their last dotted suffix removed.
 * Noncoding beds are written with an empty accession. */
{
/* Load protein sequence into hashes */
struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper);
struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper);
struct hash *refProtHash =faReadAllIntoHash(refPepFa, dnaUpper);

/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(inPicks, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    hashAdd(pickHash, pick->name, pick);
    }
lineFileClose(&lf);

/* Load in bed */
struct bed *bed, *bedList = bedLoadNAll(inBed, 12);

/* Do reformatting and write output. */
FILE *f = mustOpen(outKg, "w");
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    char *protAcc = NULL;
    if (bed->thickStart < bed->thickEnd)
        {
        pick = hashMustFindVal(pickHash, bed->name);
        struct dnaSeq *spSeq = NULL, *uniSeq = NULL, *refPep = NULL, *ucscSeq;
        ucscSeq = hashMustFindVal(ucscProtHash, bed->name);
        if (pick->swissProt[0])
            spSeq = hashMustFindVal(uniProtHash, pick->swissProt);
        if (pick->uniProt[0])
            uniSeq = hashMustFindVal(uniProtHash, pick->uniProt);
        if (pick->refProt[0])
            refPep = hashMustFindVal(refProtHash, pick->refProt);

        /* First we look for an exact match between the ucsc protein and
         * something from swissProt/uniProt. */
        if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna))
            protAcc = pick->swissProt;
        if (protAcc == NULL && uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna))
            protAcc = pick->uniProt;
        if (protAcc == NULL && refPep != NULL && sameString(ucscSeq->dna, refPep->dna))
            {
            /* Work on a copy so the pick itself keeps its full accession. */
            protAcc = cloneString(pick->refProt);
            chopSuffix(protAcc);
            }

        /* No exact match: fall back to uniProt, then refProt. */
        if (protAcc == NULL)
            {
            if (pick->uniProt[0])
                protAcc = pick->uniProt;
            else 
                {
                protAcc = cloneString(pick->refProt);
                chopSuffix(protAcc);
                }
            }
        }
    outputKg(bed, emptyForNull(protAcc), f);
    }
carefulClose(&f);
}
void writeTab(struct hash *stageHash, struct hash *seqHash, 
	char *sourceImageDir, char *parsedTab, 
	struct hash *nameHash, char *outName)
/* Synthesize data and write out tab-separated file with one line for
 * each image.
 * parsedTab is read as 6 tab-separated columns: clone, stage, body part,
 * dir, subdir, file.  The gene name comes from nameHash (falling back to the
 * clone name), the sequence from seqHash (empty if absent), and the image
 * dimensions from jpegSize() on sourceImageDir/dir/subdir/file.  A stage of
 * "mixed" is written as age 1, minAge 0, maxAge 3; any other stage must be in
 * stageHash and its value is used for all three age columns. */
{
char sourceImage[PATH_LEN];
FILE *f = mustOpen(outName, "w");
struct lineFile *lf = lineFileOpen(parsedTab, TRUE);
char *row[6];

/* Write header. */
fprintf(f, "#");
fprintf(f, "gene\t");
fprintf(f, "submitId\t");
fprintf(f, "fileName\t");
fprintf(f, "imageWidth\t");
fprintf(f, "imageHeight\t");
fprintf(f, "bodyPart\t");
fprintf(f, "age\t");
fprintf(f, "minAge\t");
fprintf(f, "maxAge\t");
fprintf(f, "seq\n");

while (lineFileRowTab(lf, row))
    {
    char *clone = row[0];
    char *stage = row[1];
    char *part = row[2];
    char *dir = row[3];
    char *subdir = row[4];
    char *file = row[5];
    char *gene = hashFindVal(nameHash, clone);
    struct dnaSeq *seq = hashFindVal(seqHash, clone);
    int width, height;

    safef(sourceImage, sizeof(sourceImage), "%s/%s/%s/%s",
        sourceImageDir, dir, subdir, file);
    jpegSize(sourceImage, &width, &height);

    if (gene == NULL)
        gene = clone;
    fprintf(f, "%s\t", gene);
    fprintf(f, "%s\t", clone);
    fprintf(f, "%s/%s\t", subdir, file);
    fprintf(f, "%d\t", width);
    fprintf(f, "%d\t", height);
    fprintf(f, "%s\t", part);
    if (sameString(stage, "mixed"))
        fprintf(f, "1\t0\t3\t");
    else
        {
        /* Single-stage image: age, minAge and maxAge are all the same. */
        char *age = hashMustFindVal(stageHash, stage);
        fprintf(f, "%s\t", age);
        fprintf(f, "%s\t", age);
        fprintf(f, "%s\t", age);
        }
    if (seq != NULL)
        fprintf(f, "%s\n", seq->dna);
    else
        fprintf(f, "\n");
    }

lineFileClose(&lf);
carefulClose(&f);
}
Esempio n. 14
0
void writeTab(
	struct hash *imageHash, 
	struct hash *seqHash, 
	char *sourceImageDir, char *allenTab, 
	struct hash *nameHash, char *outName)
/* Synthesize data and write out tab-separated file with one line for
 * each image.
 * allenTab is read as 5 tab-separated columns: gene, gene name (unused),
 * entrez id, refSeq, url.  A row is written only when imageHash has a
 * relative image path for the gene.  submitId is the part of the url after
 * its first '=' (empty if the url has none); an entrez id of "0" is written
 * as empty.  The probe id is looked up in nameHash by refSeq and, when found,
 * its sequence in seqHash. */
{
char sourceImage[PATH_LEN];
FILE *f = mustOpen(outName, "w");
struct lineFile *lf = lineFileOpen(allenTab, TRUE);
char *row[5];

/* Write header. */
fprintf(f, "#");
fprintf(f, "gene\t");
fprintf(f, "refSeq\t");
fprintf(f, "locusLink\t"); 
fprintf(f, "submitId\t");    /* egeneid=68323 or genesym=1110003F05Rik */
fprintf(f, "fileName\t");
fprintf(f, "imageWidth\t");
fprintf(f, "imageHeight\t");
fprintf(f, "probeId\t");     /* actually, this not supported yet but would be great. */
fprintf(f, "seq\n");

while (lineFileRowTab(lf, row))
    {
    char *gene = row[0];
    /* char *geneName = row[1]; */
    char *entrez = row[2];
    char *refSeq = row[3];
    char *url = row[4];
    char *probeId = hashFindVal(nameHash, refSeq);
    struct dnaSeq *seq = NULL; 
    int width=0, height=0;
    char *relPath = hashFindVal(imageHash, gene);
    char *submitId = strchr(url,'='); 
    if (probeId)
        seq = hashFindVal(seqHash, probeId);
    if (submitId)
        ++submitId;  /* we want the string following first '=' */
    if (sameString(entrez,"0"))
        entrez = NULL;

    if (relPath)
        {
        safef(sourceImage, sizeof(sourceImage), "%s/%s", 
            sourceImageDir, relPath);
        jpegSize(sourceImage, &width, &height);

        fprintf(f, "%s\t", gene);
        fprintf(f, "%s\t", refSeq);
        fprintf(f, "%s\t", entrez?entrez:"");
        /* submitId is NULL when the url had no '='; never hand NULL to %s. */
        fprintf(f, "%s\t", submitId?submitId:"");
        fprintf(f, "%s\t", relPath);
        fprintf(f, "%d\t", width);
        fprintf(f, "%d\t", height);
        fprintf(f, "%s\t", probeId?probeId:"");
        if (seq != NULL)
            fprintf(f, "%s\n", seq->dna);
        else
            fprintf(f, "\n");
        }
    }
lineFileClose(&lf);    
carefulClose(&f);
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.
 * Writes one 8-column line per entry of infoFile: kgID, mRNA, spID,
 * spDisplayID, geneSymbol, refseq, protAcc, description.
 * Gene symbol and description are taken from the first source that has them:
 * refLink (by the pick's refSeq), then uniProt (by spID), then genbank
 * gbCdnaInfo (by the evidence accessions), and finally the mRNA accession
 * itself.  Entries in category "antibodyParts" get a fixed symbol and
 * description.  Aborts if a non-antibody entry has no evidence in evFile or
 * if an entry is missing from genePredFile. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }
lineFileClose(&lf);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
        {
        /* Fill in the relatively straightforward fields. */
        refseq = pick->refSeq;
        if (info->orfSize > 0)
            {
            protAcc = pick->refProt;
            spID = proteinId;
            /* If the gene pred's protein is just the refProt accession,
             * use the pick's uniProt accession as spID instead. */
            if (sameString(protAcc, spID))
                spID = pick->uniProt;
            if (spID[0] != 0)
                spDisplayID = spAnyAccToId(uConn, spID);
            }

        /* Fill in gene symbol and description from refseq if possible. */
        if (refseq[0] != 0)
            {
            struct sqlResult *sr;
            safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'",
                refseq);
            sr = sqlGetResult(gConn, query);
            char **linkRow = sqlNextRow(sr);
            if (linkRow != NULL)
                {
                geneSymbol = cloneString(linkRow[0]);
                if (!sameWord("unknown protein", linkRow[1]))
                    description = cloneString(linkRow[1]);
                }
            sqlFreeResult(&sr);
            }

        /* If need be try uniProt for gene symbol and description. */
        if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
            {
            char *acc = spLookupPrimaryAcc(uConn, spID);
            if (description == NULL)
                description = spDescription(uConn, acc);
            if (geneSymbol == NULL)
                {
                struct slName *nameList = spGenes(uConn, acc);
                if (nameList != NULL)
                    geneSymbol = cloneString(nameList->name);
                slFreeList(&nameList);
                }
            }
        }

    /* If it's an antibody fragment use that as name. */
    if (isAb)
        {
        geneSymbol = cloneString("abParts");
        description = cloneString("Parts of antibodies, mostly variable regions.");
        }

    /* mRNA is always a private copy because it may be edited in place
     * below when it doubles as the gene symbol. */
    if (ev == NULL)
        {
        mRNA = cloneString("");
        if (!isAb)
            {
            errAbort("%s is in %s but not %s\n", info->name, infoFile, evFile);
            }
        }
    else
        {
        mRNA = cloneString(ev->primary);
        chopSuffix(mRNA);
        }

    /* Still no joy? Try genbank RNA records. */
    if (geneSymbol == NULL || description == NULL)
        {
        if (ev != NULL)
            {
            int i;
            for (i=0; i<ev->accCount; ++i)
                {
                /* Note: the suffix is chopped from the evidence record's
                 * own string, in place. */
                char *acc = ev->accs[i];
                chopSuffix(acc);
                if (geneSymbol == NULL)
                    {
                    safef(query, sizeof(query), 
                        "select geneName.name from gbCdnaInfo,geneName "
                        "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
                    geneSymbol = sqlQuickString(gConn, query);
                    if (geneSymbol != NULL)
                        {
                        if (sameString(geneSymbol, "n/a"))
                           geneSymbol = NULL;
                        }
                    }
                if (description == NULL)
                    {
                    safef(query, sizeof(query), 
                        "select description.name from gbCdnaInfo,description "
                        "where description.id=gbCdnaInfo.description "
                        "and gbCdnaInfo.acc = '%s'", acc);
                    description = sqlQuickString(gConn, query);
                    if (description != NULL)
                        {
                        if (sameString(description, "n/a"))
                           description = NULL;
                        }
                    }
                }
            }
        }

    /* Last resort: the mRNA accession stands in for whatever is missing. */
    if (geneSymbol == NULL)
        geneSymbol = mRNA;
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long: keep 37 chars and append "...". */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
sqlDisconnect(&uConn);
sqlDisconnect(&gConn);
}
Esempio n. 16
0
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, 
	char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile)
/* txGeneAlias - Make kgAlias and kgProtAlias tables.
 * Streams through xrefFile (one kgXref row per gene).  For each gene, aliases
 * written to aliasFile come from the xref fields, the old accession from
 * oldToNew, uniProt's otherAcc and gene names, and genbank gene names of the
 * gene's evidence accessions.  Protein aliases written to protAliasFile are
 * produced only when the xref's spID resolves to a primary uniProt
 * accession. */
{
/* Read and hash oldToNew */
struct hash *newToOldHash = loadNewToOldHash(oldToNew);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);
struct sqlResult *sr;
char **row;
char query[256];

/* Open files. */
struct lineFile *lf = lineFileOpen(xrefFile, TRUE);
FILE *fAlias = mustOpen(aliasFile, "w");
FILE *fProt = mustOpen(protAliasFile, "w");

/* Stream through xref file, which has much of the info we need,
 * and which contains a line for each gene. */
char *words[KGXREF_NUM_COLS];
while (lineFileRowTab(lf, words))
    {
    /* Load the xref, and output most of its fields as aliases. */
    struct kgXref *x = kgXrefLoad(words);
    char *id = x->kgID;
    outAlias(fAlias, id, x->kgID);
    outAlias(fAlias, id, x->mRNA);
    outAlias(fAlias, id, x->spID);
    outAlias(fAlias, id, x->spDisplayID);
    outAlias(fAlias, id, x->geneSymbol);
    outAlias(fAlias, id, x->refseq);
    outAlias(fAlias, id, x->protAcc);
    char *old = hashFindVal(newToOldHash, id);
    if (old != NULL)
        outAlias(fAlias, id, old);

    /* If we've got a uniProt ID, use that to get more info from uniProt. */
    char *acc = x->spID;
    if ((acc[0] != 0)  && (acc = spLookupPrimaryAccMaybe(uConn, acc)) != NULL)
        {
        /* Get current accession and output a bunch of easy protein aliases. */
        outProt(fProt, id, acc, acc);
        outProt(fProt, id, acc, x->spDisplayID);
        outProt(fProt, id, acc, x->geneSymbol);
        outProt(fProt, id, acc, x->protAcc);
        if (old != NULL)
            outProt(fProt, id, acc, old);

        /* Throw in old swissProt accessions. */
        sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc);
        sr = sqlGetResult(uConn, query);
        while ((row = sqlNextRow(sr)) != NULL)
            {
            outAlias(fAlias, id, row[0]);
            outProt(fProt, id, acc, row[0]);
            }
        sqlFreeResult(&sr);

        /* Throw in gene names that SwissProt knows about */
        struct slName *gene, *geneList = spGenes(uConn, acc);
        for (gene = geneList; gene != NULL; gene = gene->next)
            {
            outAlias(fAlias, id, gene->name);
            outProt(fProt, id, acc, gene->name);
            }
        slFreeList(&geneList);
        }

    /* Throw in gene names from genbank. */
    /* At some point we may want to restrict this to the primary transcript in a cluster. */
    ev = hashFindVal(evHash,  id);
    if (ev != NULL)
        {
        int i;
        for (i=0; i<ev->accCount; ++i)
            {
            /* Query by this evidence accession, not by the uniProt accession
             * left over from above (which may even be NULL here).  The
             * suffix is chopped in place before the lookup, as txGeneXref
             * does for the same table.
             * NOTE(review): assumes gbCdnaInfo.acc is unversioned - confirm. */
            char *rnaAcc = ev->accs[i];
            chopSuffix(rnaAcc);
            sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", rnaAcc);
            int nameId = sqlQuickNum(gConn, query);
            if (nameId != 0)
                {
                char name[64];
                sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId);
                if (sqlQuickQuery(gConn, query, name, sizeof(name)))
                    outAlias(fAlias, id, name);
                }
            }
        }

    kgXrefFree(&x);
    }

lineFileClose(&lf);
carefulClose(&fAlias);
carefulClose(&fProt);
sqlDisconnect(&uConn);
sqlDisconnect(&gConn);
}
Esempio n. 17
0
void edwFixReplaced(char *database, char *inTab, char *spikedTab, char *outSql, char *outRa)
/* edwFixReplaced - Clean up files that were replaced in ENCODE2.
 * inTab is read as two tab-separated columns: old file name and object
 * status.  For each row the matching edwFile record(s) are looked up under
 * submitFileName '<database>/.../<name>' and:
 *   - outSql receives an update setting deprecated=<status> on each old
 *     record, plus an update setting replacedBy when a newer version exists;
 *   - outRa receives an objStatus/oldFile/newFile stanza per old record
 *     (newFile is "n/a" when no replacement was found).
 * Names found in spikedTab are renamed first, and a ",xxx.bai" tail is cut
 * off.  ".fastq.tgz" names are matched file-by-file against the unpacked
 * contents of the V2 archive; all other names are matched against the next
 * existing version (V<n+1> up to V6) of the same root and suffix.
 * Nothing is written to the database itself. */
{
struct sqlConnection *conn = edwConnect();
struct lineFile *lf = lineFileOpen(inTab, TRUE);
FILE *fSql = mustOpen(outSql, "w");
FILE *fRa = mustOpen(outRa, "w");
char *row[2];
struct hash *renameHash = rootRenameHash();
struct hash *spikedHash = hashTwoColumnFile(spikedTab);
int depCount = 0, repCount = 0;
while (lineFileRowTab(lf, row))
    {
    /* Get fields in local variables. */
    char *oldFileName = row[0];
    char *objStatus = row[1];

    /* Do spikein rename lookup. */
    char *spiked = hashFindVal(spikedHash, oldFileName);
    if (spiked != NULL)
        {
        verbose(2, "renaming spikeing %s to %s\n", oldFileName, spiked);
        oldFileName = spiked;
        }

    /* Get rid of bai name for bam,bai pairs. */
    char *comma = strchr(oldFileName, ',');
    if (comma != NULL)
        {
        if (!endsWith(comma, ".bai"))
            errAbort("Unexpected conjoining of files line %d of %s", lf->lineIx, lf->fileName);
        *comma = 0;
        }

    /* For .fastq.tgz files we got to unpack them. */
    if (endsWith(oldFileName, ".fastq.tgz"))
        {
        /* Get root name - name minus the two-part suffix */
        char *oldRoot = cloneString(oldFileName);
        chopSuffix(oldRoot);
        chopSuffix(oldRoot);
        verbose(2, "Processing fastq.tgz %s %s\n", oldFileName, oldRoot);

        // Find records for old version.
        char query[512];
        sqlSafef(query, sizeof(query), 
            "select * from edwFile where submitFileName like '%s/%%/%s.fastq.tgz.dir/%%'"
            " order by submitFileName",
            database, oldRoot);
        struct edwFile *oldList = edwFileLoadByQuery(conn, query);
        int oldCount = slCount(oldList);
        if (oldCount == 0)
            errAbort("No records match %s", query);


        // Find record for replaced version.
        // Fortunately all of the fastq.tgz's are just V2, which simplifies code a bit
        sqlSafef(query, sizeof(query), 
            "select * from edwFile where submitFileName like '%s/%%/%sV2.fastq.tgz.dir/%%'"
            " order by submitFileName",
            database, oldRoot);
        struct edwFile *newList = edwFileLoadByQuery(conn, query);
        int newCount = slCount(newList);
        if (newCount == 0)
            errAbort("No records match %s", query);

        // Make a hash of new records keyed by new file name inside of tgz
        struct edwFile *newEf;
        struct hash *newHash = hashNew(0);
        for (newEf = newList; newEf != NULL; newEf = newEf->next)
            {
            char fileName[FILENAME_LEN];
            splitPath(newEf->submitFileName, NULL, fileName, NULL);
            hashAdd(newHash, fileName, newEf);
            verbose(2, " %s\n", fileName);
            }
        verbose(2, "%d in oldList, %d in newList\n", oldCount, newCount);

        // Loop through old records trying to find corresponding new record
        struct edwFile *oldEf;
        for (oldEf = oldList; oldEf != NULL; oldEf = oldEf->next)
            {
            char fileName[FILENAME_LEN];
            splitPath(oldEf->submitFileName, NULL, fileName, NULL);
            struct edwFile *newEf = hashFindVal(newHash, fileName);
            char *newName = "n/a";
            fprintf(fSql, "update edwFile set deprecated='%s' where id=%u;\n", objStatus, oldEf->id);
            ++depCount;
            if (newEf != NULL)
                {
                fprintf(fSql, "update edwFile set replacedBy=%u where id=%u;\n", newEf->id, oldEf->id);
                newName = newEf->submitFileName;
                ++repCount;
                }
            fprintf(fRa, "objStatus %s\n", objStatus);
            fprintf(fRa, "oldFile %s\n", oldEf->submitFileName);
            fprintf(fRa, "newFile %s\n", newName);
            fprintf(fRa, "\n");
            verbose(2, "%s -> %s\n", oldEf->submitFileName, newName);
            }
        }
    else
        {

        /* Figure out new file name by either adding V2 at end, or if there is already a V#,
         * replacing it. */
        int oldVersion = 1;
        char *noVersion = NULL;
            {
            /* Split old file name into root and suffix. */
            char *suffix = edwFindDoubleFileSuffix(oldFileName);
            if (suffix == NULL)
                errAbort("No suffix in %s line %d of %s", oldFileName, lf->lineIx, lf->fileName);
            char *oldRoot = cloneStringZ(oldFileName, suffix - oldFileName);
            char *renamed = hashFindVal(renameHash, oldRoot);
            if (renamed != NULL)
                {
                verbose(2, "Overriding %s with %s\n", oldRoot, renamed);
                oldRoot = cloneString(renamed);
                }


            /* Look for V# at end of old root, and if it's there chop it off and update oldVersion */
            noVersion = oldRoot;  // If no V, we done. */
            char *vPos = strrchr(oldRoot, 'V');
            if (vPos != NULL)
                {
                char *numPos = vPos + 1;
                int numSize = strlen(numPos);
                /* Only a V followed by one or two characters counts as a
                 * version tag; anything else is left as part of the root. */
                if (numSize == 1 || numSize == 2)
                    {
                    if (isAllDigits(numPos))
                        {
                        oldVersion = atoi(numPos);
                        *vPos = 0;
                        }
                    else
                        errAbort("Expecting numbers after V in file name got %s line %d of %s",
                            numPos, lf->lineIx, lf->fileName);
                    }
                }
            verbose(2, "%s parses to  %s %d %s\n", oldFileName, noVersion, oldVersion, suffix);

            /* Find record for old file. */
            char query[512];
            sqlSafef(query, sizeof(query), 
                "select * from edwFile where submitFileName like '%s/%%/%s'", 
                database, oldFileName);
            struct edwFile *oldEf = edwFileLoadByQuery(conn, query);
            if (slCount(oldEf) != 1)
                errAbort("Expecting one result got %d for %s\n", slCount(oldEf), query);
            fprintf(fSql, "# %s %s\n", oldFileName, objStatus);
            verbose(2, "%s: %s\n", oldFileName, objStatus);

            /* Find record for new file: first later version that exists,
             * trying versions up to V6. */
            struct edwFile *newEf = NULL;
            int newVersion;
            for (newVersion = oldVersion+1; newVersion < 7; ++newVersion)
                {
                sqlSafef(query, sizeof(query), 
                    "select * from edwFile where submitFileName like '%s/%%/%sV%d%s'",
                    database, noVersion, newVersion, suffix); 
                newEf = edwFileLoadByQuery(conn, query);
                if (newEf != NULL)
                    break;
                }
            if (newEf == NULL)
                verbose(2, "Could not find next version of %s (%s)\n", oldFileName, oldRoot);
            if (slCount(newEf) > 1)
                errAbort("Expecting one result got %d for %s\n", slCount(newEf), query);

            long long oldId = oldEf->id;
            fprintf(fSql, "update edwFile set deprecated='%s' where id=%lld;\n", objStatus, oldId);
            ++depCount;
            char *newName = "n/a";
            if (newEf != NULL)
                {
                long long newId = newEf->id;
                fprintf(fSql, "update edwFile set replacedBy=%lld where id=%lld;\n", newId, oldId);
                newName = newEf->submitFileName;
                ++repCount;
                }
            fprintf(fRa, "objStatus %s\n", objStatus);
            fprintf(fRa, "oldFile %s\n", oldEf->submitFileName);
            fprintf(fRa, "newFile %s\n", newName);
            fprintf(fRa, "\n");
            verbose(2, "%s -> %s\n", oldEf->submitFileName, newName);
            }
        }
    }
verbose(1, "%d deprecated, %d replaced\n", depCount, repCount);
lineFileClose(&lf);
carefulClose(&fSql);
carefulClose(&fRa);
}