static struct joinedTables *joinedTablesCreate( struct joiner *joiner,
        char *primaryDb, char *primaryTable,
        struct joinerDtf *fieldList, struct joinerDtf *filterTables,
        int maxRowCount, struct region *regionList)
/* Create joinedTables structure from fields. */
{
    struct tableJoiner *tj, *tjList = bundleFieldsIntoTables(fieldList, filterTables);
    struct joinerPair *routeList = NULL, *route;
    struct joinedTables *joined = NULL;
    struct hash *tableHash = newHash(8);
    int totalKeyCount = 0, totalFieldCount = 0;
    int curKeyCount = 0, curFieldCount = 0;
    struct joinerDtf *tableDtfs;

    for (tj = tjList; tj != NULL; tj = tj->next)
    {
        char buf[256];
        safef(buf, sizeof(buf), "%s.%s", tj->database, tj->table);
        hashAdd(tableHash, buf, tj);
    }
    orderTables(&tjList, primaryDb, primaryTable);
    tableDtfs = tableToDtfs(tjList);
    routeList = joinerFindRouteThroughAll(joiner, tableDtfs);
    if (routeList == NULL)
        errAbort("Can't find route from %s to %s via all.joiner", primaryTable, tjList->next->table);
    addOutKeys(tableHash, routeList, &tjList);

    /* If first table is non-positional then it will lead to a lot
     * of n/a's in later fields unless we treat the genome-wide. */
    if (!isPositional(tjList->database, tjList->table))
        regionList = getRegionsFullGenome();
    /* Count up total fields and keys. */
    for (tj = tjList; tj != NULL; tj = tj->next)
    {
        totalKeyCount += slCount(tj->keysOut);
        totalFieldCount += slCount(tj->fieldList);
    }

    /* Do first table.  This one uses identifier hash if any. */
    {
        joined = tjLoadFirst(regionList,
                             tjList, totalFieldCount, totalKeyCount, maxRowCount);
        curKeyCount = slCount(tjList->keysOut);
        curFieldCount = slCount(tjList->fieldList);
    }

    /* Follow routing list for rest. */
    if (!sameString(tjList->database, routeList->a->database))
        internalErr();
    if (!sameString(tjList->table, routeList->a->table))
        internalErr();
    for (route = routeList; route != NULL; route = route->next)
    {
        struct tableJoiner *tj = findTableJoiner(tjList,
                                 route->b->database, route->b->table);
        struct joinerField *jfA = NULL, *jfB = NULL;
        if (tj == NULL)
            internalErr();
        jfA = findJoinerField(route->identifier, route->a);
        if (jfA == NULL)
        {
            internalErr();
        }
        jfB = findJoinerField(route->identifier, route->b);
        if (jfB == NULL)
            internalErr();
        if (!tj->loaded)
        {
            int keyIx;
            struct hash *keyHash = NULL;
            keyIx = findDtfIndex(joined->keyList, route->a);
            if (keyIx < 0)
                internalErr();
            keyHash = hashKeyField(joined, keyIx, jfA);
            tjLoadSome(regionList, joined, curFieldCount, curKeyCount,
                       route->b->field, keyHash,
                       jfB->chopBefore, jfB->chopAfter,
                       tj, isPositional(tj->database, tj->table),  FALSE);
            curKeyCount += slCount(tj->keysOut);
            curFieldCount += slCount(tj->fieldList);
            hashFree(&keyHash);
        }
    }
    joinerDtfFreeList(&tableDtfs);
    hashFree(&tableHash);
    tableJoinerFreeList(&tjList);
    return joined;
}
struct genoLay *genoLayNew(struct genoLayChrom *chromList,
	MgFont *font, int picWidth, int betweenChromHeight,
	int minLeftLabelWidth, int minRightLabelWidth,
	char *how)
/* Figure out layout.  For human and most mammals this will be
 * two columns with sex chromosomes on bottom.  This is complicated
 * by the platypus having a bunch of sex chromosomes. */
{
int margin = 3;
struct slRef *refList = NULL, *ref, *left, *right;
struct genoLayChrom *chrom;
struct genoLay *gl;
int autoCount, halfCount, bases, chromInLine;
int leftLabelWidth=0, rightLabelWidth=0, labelWidth;
int spaceWidth = mgFontCharWidth(font, ' ');
int extraLabelPadding = 0;
int autosomeOtherPixels=0, sexOtherPixels=0;
int autosomeBasesInLine=0;	/* Maximum bases in a line for autosome. */
int sexBasesInLine=0;		/* Bases in line for sex chromsome. */
double sexBasesPerPixel, autosomeBasesPerPixel, basesPerPixel;
int pos = margin;
int y = 0;
int fontHeight = mgFontLineHeight(font);
int chromHeight = fontHeight;
int lineHeight = chromHeight + betweenChromHeight;
boolean allOneLine = FALSE;

refList = refListFromSlList(chromList);

/* Allocate genoLay object and fill in simple fields. */
AllocVar(gl);
gl->chromList = chromList;
gl->chromHash = hashNew(0);
gl->font = font;
gl->picWidth = picWidth;
gl->margin = margin;
gl->spaceWidth = spaceWidth;
gl->lineHeight = lineHeight;
gl->betweenChromHeight = betweenChromHeight;
gl->betweenChromOffsetY = 0;
gl->chromHeight = chromHeight;
gl->chromOffsetY = lineHeight - chromHeight;


/* Save chromosomes in hash too, for easy access */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    hashAdd(gl->chromHash, chrom->fullName, chrom);

if (sameString(how, genoLayOnePerLine))
    {
    gl->leftList = refList;
    }
else if (sameString(how, genoLayAllOneLine))
    {
    gl->bottomList = refList;
    allOneLine = TRUE;
    }
else
    {
    /* Put sex chromosomes on bottom, and rest on left. */
    separateSexChroms(refList, &refList, &gl->bottomList);
    autoCount = slCount(refList);
    gl->leftList = refList;

    /* If there are a lot of chromosomes, then move later
     * (and smaller) chromosomes to a new right column */
    if (autoCount > 12)
	{
	halfCount = (autoCount+1)/2;
	ref = slElementFromIx(refList, halfCount-1);
	gl->rightList = ref->next;
	ref->next = NULL;
	slReverse(&gl->rightList);
	}
    }

if (allOneLine)
    {
    unsigned long totalBases = 0, bStart=0, bEnd;
    int chromCount = 0, chromIx=0;
    for (ref = gl->bottomList; ref != NULL; ref = ref->next)
        {
	chrom = ref->val;
	totalBases += chrom->size;
	chromCount += 1;
	}
    int availablePixels = picWidth - minLeftLabelWidth - minRightLabelWidth
       - 2*margin - (chromCount-1);
    double basesPerPixel = (double)totalBases/availablePixels;
    gl->picHeight = 2*margin + lineHeight + fontHeight;
    for (ref = gl->bottomList; ref != NULL; ref = ref->next)
        {
	chrom = ref->val;
	bEnd = bStart + chrom->size;
	int pixStart = round(bStart / basesPerPixel);
	int pixEnd = round(bEnd / basesPerPixel);
	chrom->width = pixEnd - pixStart;
	chrom->height = lineHeight;
	chrom->x = pixStart + margin + chromIx + minLeftLabelWidth;
	chrom->y = 0;
	chromIx += 1;
	bStart = bEnd;
	}
    gl->lineCount = 1;
    gl->picHeight = 2*margin + lineHeight + fontHeight + 1;
    gl->allOneLine = TRUE;
    gl->leftLabelWidth = minLeftLabelWidth;
    gl->rightLabelWidth = minRightLabelWidth;
    gl->basesPerPixel = basesPerPixel;
    gl->pixelsPerBase = 1.0/basesPerPixel;
    }
else
    {
    /* Figure out space needed for autosomes. */
    left = gl->leftList;
    right = gl->rightList;
    while (left || right)
	{
	bases = 0;
	chromInLine = 0;
	if (left)
	    {
	    chrom = left->val;
	    labelWidth = mgFontStringWidth(font, chrom->shortName) + spaceWidth;
	    if (leftLabelWidth < labelWidth)
		leftLabelWidth = labelWidth;
	    bases = chrom->size;
	    left = left->next;
	    }
	if (right)
	    {
	    chrom = right->val;
	    labelWidth = mgFontStringWidth(font, chrom->shortName) + spaceWidth;
	    if (rightLabelWidth < labelWidth)
		rightLabelWidth = labelWidth;
	    bases += chrom->size;
	    right = right->next;
	    }
	if (autosomeBasesInLine < bases)
	    autosomeBasesInLine = bases;
	gl->lineCount += 1;
	}

    /* Figure out space needed for bottom chromosomes. */
    if (gl->bottomList)
	{
	gl->lineCount += 1;
	sexOtherPixels = spaceWidth + 2*margin;
	for (ref = gl->bottomList; ref != NULL; ref = ref->next)
	    {
	    chrom = ref->val;
	    sexBasesInLine += chrom->size;
	    labelWidth = mgFontStringWidth(font, chrom->shortName) + spaceWidth;
	    if (ref == gl->bottomList )
		{
		if (leftLabelWidth < labelWidth)
		    leftLabelWidth  = labelWidth;
		sexOtherPixels = leftLabelWidth;
		}
	    else if (ref->next == NULL)
		{
		if (rightLabelWidth < labelWidth)
		    rightLabelWidth  = labelWidth;
		sexOtherPixels += rightLabelWidth + spaceWidth;
		}
	    else
		{
		sexOtherPixels += labelWidth + spaceWidth;
		}
	    }
	}

    /* Do some adjustments if side labels are bigger than needed for
     * chromosome names. */
    if (leftLabelWidth < minLeftLabelWidth)
	{
	extraLabelPadding += (minLeftLabelWidth - leftLabelWidth);
	leftLabelWidth = minLeftLabelWidth;
	}
    if (rightLabelWidth < minRightLabelWidth)
	{
	extraLabelPadding += (minRightLabelWidth - rightLabelWidth);
	rightLabelWidth = minRightLabelWidth;
	}
    sexOtherPixels += extraLabelPadding;

    /* Figure out the number of bases needed per pixel. */
    autosomeOtherPixels = 2*margin + spaceWidth + leftLabelWidth + rightLabelWidth;
    basesPerPixel = autosomeBasesPerPixel 
	    = autosomeBasesInLine/(picWidth-autosomeOtherPixels);
    if (gl->bottomList)
	{
	sexBasesPerPixel = sexBasesInLine/(picWidth-sexOtherPixels);
	if (sexBasesPerPixel > basesPerPixel)
	    basesPerPixel = sexBasesPerPixel;
	}

    /* Save positions and sizes of some things in layout structure. */
    gl->leftLabelWidth = leftLabelWidth;
    gl->rightLabelWidth = rightLabelWidth;
    gl->basesPerPixel = basesPerPixel;
    gl->pixelsPerBase = 1.0/basesPerPixel;

    /* Set pixel positions for left autosomes */
    for (ref = gl->leftList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	chrom->x = leftLabelWidth + margin;
	chrom->y = y;
	chrom->width = round(chrom->size/basesPerPixel);
	chrom->height = lineHeight;
	y += lineHeight;
	}

    /* Set pixel positions for right autosomes */
    y = 0;
    for (ref = gl->rightList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	chrom->width = round(chrom->size/basesPerPixel);
	chrom->height = lineHeight;
	chrom->x = picWidth - margin - rightLabelWidth - chrom->width;
	chrom->y = y;
	y += lineHeight;
	}
    gl->picHeight = 2*margin + lineHeight * gl->lineCount;
    y = gl->picHeight - margin - lineHeight;

    /* Set pixel positions for sex chromosomes */
    for (ref = gl->bottomList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	chrom->y = y;
	chrom->width = round(chrom->size/basesPerPixel);
	chrom->height = lineHeight;
	if (ref == gl->bottomList)
	    chrom->x = leftLabelWidth + margin;
	else if (ref->next == NULL)
	    chrom->x = picWidth - margin - rightLabelWidth - chrom->width;
	else
	    chrom->x = 2*spaceWidth+mgFontStringWidth(font,chrom->shortName) + pos;
	pos = chrom->x + chrom->width;
	}
    }
return gl;
}
Ejemplo n.º 3
0
struct fullExperiment *getFullExperimentList(struct sqlConnection *conn,
    struct edwExperiment *eeList, char *assembly, struct hash **retHash)
/* Given list of edwExperiments, return list of ones replicated with full file sets on
 * both replicates.  If optional retHash is non-NULL then return a hash full of same 
 * experiments keyed by experiment accession */
{
/* Build up a list of fullExperiments and a hash keyed by name. */
struct hash *hash = hashNew(14);
struct fullExperiment *fullList = NULL;
struct edwExperiment *ee;
for (ee = eeList; ee != NULL; ee = ee->next)
    {
    struct fullExperiment *full = hashFindVal(hash, ee->accession);
    if (full == NULL)
        {
	AllocVar(full);
	full->name = cloneString(ee->accession);
	full->exp = ee;
	slAddHead(&fullList, full);
	hashAdd(hash, full->name, full);
	}
    }
uglyf("Got %d in eeList, %d in fullList, %d in hash\n", slCount(eeList), slCount(fullList), hash->elCount);

/* Build up SQL query to efficiently fetch all good files and valid files from our experiment */
struct dyString *q = dyStringNew(16*1024);
sqlDyStringPrintf(q, "select edwValidFile.*,edwFile.*,eapOutput.* "
		  " from edwValidFile,edwFile,eapOutput "
                  " where edwValidFile.fileId = edwFile.id and edwFile.id = eapOutput.fileId "
		  " and edwFile.deprecated='' and edwFile.errorMessage='' "
		  " and edwValidFile.ucscDb != 'centro.hg19' "
		  " and edwValidFile.ucscDb like '%%%s' and edwValidFile.experiment in ("
		  , assembly);
for (ee = eeList; ee != NULL; ee = ee->next)
    {
    dyStringPrintf(q, "'%s'", ee->accession);
    if (ee->next != NULL)
        dyStringAppendC(q, ',');
    }
dyStringAppendC(q, ')');

/* Loop through this making up vFiles that ultimately are attached to replicates. */
int vCount = 0;
struct sqlResult *sr = sqlGetResult(conn, q->string);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
    {
    ++vCount;
    struct edwValidFile *valid = edwValidFileLoad(row);
    fixOutputType(valid);
    struct edwFile *file = edwFileLoad(row + EDWVALIDFILE_NUM_COLS);
    struct eapOutput *eapOutput = eapOutputLoad(row + EDWVALIDFILE_NUM_COLS + EDWFILE_NUM_COLS);
    struct vFile *vf = vFileNew(file, valid, eapOutput);
    struct fullExperiment *full = hashMustFindVal(hash, valid->experiment);
    struct replicate *rep = findOrMakeReplicate(valid->replicate, &full->repList);
    char *format = valid->format;
    if (sameString(format, "bam"))
        slAddHead(&rep->bamList, vf);
    else if (sameString(format, "bigWig"))
        slAddHead(&rep->bigWigList, vf);
    else if (sameString(format, "narrowPeak") && !sameString(valid->outputType, "replicated_narrowPeak"))
        slAddHead(&rep->narrowList, vf);
    else if (sameString(format, "broadPeak") && !sameString(valid->outputType, "replicated_broadPeak"))
        slAddHead(&rep->broadList, vf);
    }
sqlFreeResult(&sr);
uglyf("Got %d vFiles\n", vCount);

dyStringFree(&q);

/* Free hash or return it, and return list. */
if (retHash == NULL)
    hashFree(&hash);
else
    *retHash = hash;
return fullList;
}
Ejemplo n.º 4
0
void knownToVisiGene(char *database)
/* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */
{
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);
struct sqlConnection *hConn = sqlConnect(database);
struct sqlConnection *iConn = sqlConnect(visiDb);
struct sqlResult *sr;
char **row;
struct hash *geneImageHash = newHash(18);
struct hash *locusLinkImageHash = newHash(18);
struct hash *refSeqImageHash = newHash(18);
struct hash *genbankImageHash = newHash(18);
struct hash *probeImageHash = newHash(18);
struct hash *knownToLocusLinkHash = newHash(18);
struct hash *knownToRefSeqHash = newHash(18);
struct hash *knownToGeneHash = newHash(18);
struct hash *favorHugoHash = newHash(18);
struct hash *knownToProbeHash = newHash(18);
struct hash *knownToAllProbeHash = newHash(18);
struct genePred *knownList = NULL, *known;
struct hash *dupeHash = newHash(17);


probesDb  = optionVal("probesDb", database);
struct sqlConnection *probesConn = sqlConnect(probesDb);
vgProbes = sqlTableExists(probesConn,"vgProbes");
vgAllProbes = sqlTableExists(probesConn,"vgAllProbes");

/* Go through and make up hashes of images keyed by various fields. */
sr = sqlGetResult(iConn,
        "NOSQLINJ select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank"
	",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id"
	" from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap"
	" where image.imageFile = imageFile.id"
	" and image.id = imageProbe.image"
	" and imageProbe.probe = probe.id"
	" and probe.gene = gene.id"
	" and image.submissionSet=submissionSet.id"
	" and vgPrbMap.probe = probe.id");

while ((row = sqlNextRow(sr)) != NULL)
    {
    int id = sqlUnsigned(row[0]);
    float priority = atof(row[1]);
    int privateUser = sqlSigned(row[7]);
    char vgPrb_Id[256];
    safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]);
    int geneId = sqlUnsigned(row[9]);
    if (privateUser == 0)
	{
	addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id);
	addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]);
	addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]);
	addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]);
	addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]);
	}
    }
verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d"
           ", genbankImageHash %d probeImageHash %d\n", 
            geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, 
	    genbankImageHash->elCount, probeImageHash->elCount);
sqlFreeResult(&sr);

/* Build up list of known genes. */
sr = sqlGetResult(hConn, "NOSQLINJ select * from knownGene");
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *known = genePredLoad(row);
    if (!hashLookup(dupeHash, known->name))
        {
	hashAdd(dupeHash, known->name, NULL);
	slAddHead(&knownList, known);
	}
    }
slReverse(&knownList);
sqlFreeResult(&sr);
verbose(2, "Got %d known genes\n", slCount(knownList));

/* Build up hashes from knownGene to other things. */
if (vgProbes)
    bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash);
if (vgAllProbes)
    bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash);

foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE);
foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE);
foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE);
foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);
foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);

verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", 
   knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount,
   knownToProbeHash->elCount, knownToAllProbeHash->elCount);

/* Try and find an image for each gene. */
for (known = knownList; known != NULL; known = known->next)
    {
    char *name = known->name;
    struct prioritizedImage *best = NULL;
    {
    best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash);
    if (!best)
	best = bestImage(name, knownToRefSeqHash, refSeqImageHash);
    if (!best)
	{
	best = hashFindVal(genbankImageHash, name);
	}
    if (!best)
	best = bestImage(name, knownToGeneHash, geneImageHash);
    if (vgProbes && !best)
	best = bestImage(name, knownToProbeHash, probeImageHash);
    if (vgAllProbes && !best)
	best = bestImage(name, knownToAllProbeHash, probeImageHash);
    }	    
    if (best)
        {
	fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId);
	}
    }

createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}
void ffaToFa(char *inFile, char *outDir, char *outTabName)
/* convert Greg Schulers .ffa fasta files to our .fa files */
{
struct lineFile *in;
FILE *out = NULL, *tab;
int lineSize;
char *line;
char ucscName[128];
char path[512];
static char lastPath[512];
int outFileCount = 0;
struct hash *uniqClone = newHash(16);
struct hash *uniqFrag = newHash(19);
boolean ignore = FALSE;

makeDir(outDir);
errLog = mustOpen("ffaToFa.err", "w");
tab = mustOpen(outTabName, "w");
printf("Converting %s", inFile);
fflush(stdout);
if (sameString(inFile, "stdin"))
    in = lineFileStdin(TRUE);
else
    in = lineFileOpen(inFile, TRUE);
while (lineFileNext(in, &line, &lineSize))
    {
    if (line[0] == '>')
	{
	ignore = FALSE;
	gsToUcsc(line+1, ucscName);
	faRecNameToFaFileName(outDir, ucscName, path);
	if (hashLookup(uniqFrag, ucscName))
	    {
	    ignore = TRUE;
	    warn("Duplicate %s in %s, ignoring all but first",
	    	ucscName, inFile);
	    }
	else
	    {
	    hashAdd(uniqFrag, ucscName, NULL);
	    }
	if (!sameString(path, lastPath))
	    {
	    strcpy(lastPath, path);
	    carefulClose(&out);
	    if (hashLookup(uniqClone, path))
		{
		warn("Duplicate %s in %s ignoring all but first", 
		    ucscName, inFile);
		}
	    else
		{
		hashAdd(uniqClone, path, NULL);
		out = mustOpen(path, "w");
		++outFileCount;
		if ((outFileCount&7) == 0)
		    {
		    putc('.', stdout);
		    fflush(stdout);
		    }
		}
	    }
	if (out != NULL && !ignore)
	    {
	    fprintf(out, ">%s\n", ucscName);
	    fprintf(tab, "%s\t%s\n", ucscName, line+1);
	    }
	}
    else
	{
	if (out != NULL && !ignore)
	    {
	    fputs(line, out);
	    fputc('\n', out);
	    }
	}
    }
carefulClose(&out);
fclose(tab);
lineFileClose(&in);
printf("Made %d .fa files in %s\n", outFileCount, outDir);
}
Ejemplo n.º 6
0
void txGeneCanonical(char *codingCluster, char *infoFile, 
	char *noncodingGraph, char *genesBed, char *nearCoding, 
	char *outCanonical, char *outIsoforms, char *outClusters)
/* txGeneCanonical - Pick a canonical version of each gene - that is the form
 * to use when just interested in a single splicing varient. Produces final
 * transcript clusters as well. */
{
/* Read in input into lists in memory. */
struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster);
struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph);
struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12);
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct bed *nearList = bedLoadNAll(nearCoding, 12);

/* Make hash of all beds. */
struct hash *bedHash = hashNew(18);
for (bed = bedList; bed != NULL; bed = bed->next)
    hashAdd(bedHash, bed->name, bed);

/* Make has of all info. */
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Make a binKeeper structure that we'll populate with coding genes. */
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *keeperHash = minChromSizeKeeperHash(sizeHash);

/* Make list of coding genes and toss them into binKeeper.
 * This will eat up bed list, but bedHash is ok. */
struct gene *gene, *geneList = NULL;
for (coding = codingList; coding != NULL; coding = coding->next)
    {
    gene = geneFromCluster(coding, bedHash, infoHash);
    slAddHead(&geneList, gene);
    struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom);
    binKeeperAdd(bk, gene->start, gene->end, gene);
    }

/* Go through near-coding genes and add them to the coding gene
 * they most overlap. */
for (bed = nearList; bed != NULL; bed = nextBed)
    {
    nextBed = bed->next;
    gene = mostOverlappingGene(keeperHash, bed);
    if (gene == NULL)
        errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name);
    geneAddBed(gene, bed);
    }

/* Add non-coding genes. */
for (graph = graphList; graph != NULL; graph = graph->next)
    {
    gene = geneFromGraph(graph, bedHash);
    slAddHead(&geneList, gene);
    }

/* Sort so it all looks nicer. */
slSort(&geneList, geneCmp);

/* Open up output files. */
FILE *fCan = mustOpen(outCanonical, "w");
FILE *fIso = mustOpen(outIsoforms, "w");
FILE *fClus = mustOpen(outClusters, "w");

/* Loop through, making up gene name, and writing output. */
int geneId = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Make up name. */
    char name[16];
    safef(name, sizeof(name), "g%05d", ++geneId);

    /* Reverse transcript list just to make it look better. */
    slReverse(&gene->txList);

    /* Write out canonical file output */
    bed = hashMustFindVal(bedHash, gene->niceTx->name);
    fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n",
    	bed->chrom, bed->chromStart, bed->chromEnd, geneId,
	gene->niceTx->name, gene->niceTx->name);

    /* Write out isoforms output. */
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fIso, "%d\t%s\n", geneId, bed->name);

    /* Write out cluster output, starting with bed 6 standard fields. */
    fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t",
    	gene->chrom, gene->start, gene->end, name, 0, gene->strand);

    /* Write out thick-start/thick end. */
    if (gene->isCoding)
        {
	int thickStart = gene->end, thickEnd  = gene->start;
	for (bed = gene->txList; bed != NULL; bed = bed->next)
	    {
	    if (bed->thickStart < bed->thickEnd)
	        {
		thickStart = min(thickStart, bed->thickStart);
		thickEnd = max(thickEnd, bed->thickEnd);
		}
	    }
	fprintf(fClus, "%d\t%d\t", thickStart, thickEnd);
	}
    else
        {
	fprintf(fClus, "%d\t%d\t", gene->start, gene->start);
	}

    /* We got no rgb value, just write out zero. */
    fprintf(fClus, "0\t");

    /* Get exons from exonTree. */
    struct range *exon, *exonList = rangeTreeList(gene->exonTree);
    fprintf(fClus, "%d\t", slCount(exonList));
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->start - gene->start);
    fprintf(fClus, "\t");
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->end - exon->start);
    fprintf(fClus, "\t");

    /* Write out associated transcripts. */
    fprintf(fClus, "%d\t", slCount(gene->txList));
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fClus, "%s,", bed->name);
    fprintf(fClus, "\t");

    /* Write out nice value */
    fprintf(fClus, "%s\t", gene->niceTx->name);

    /* Write out coding/noncoding value. */
    fprintf(fClus, "%d\n", gene->isCoding);
    }

/* Close up files. */
carefulClose(&fCan);
carefulClose(&fIso);
carefulClose(&fClus);
}
Ejemplo n.º 7
0
void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
    char *fileName, int lineIx)
/* Process one row of GFF file (a non-comment line parsed by tabs normally). */
{
struct hashEl *hel;
struct gffLine *gl;

if (wordCount < 8)
    gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
AllocVar(gl);

if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
    {
    struct gffSeqName *el;
    AllocVar(el);
    hel = hashAdd(gff->seqHash, words[0], el);
    el->name = hel->name;
    slAddHead(&gff->seqList, el);
    }
gl->seq = hel->name;

if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
    {
    struct gffSource *el;
    AllocVar(el);
    hel = hashAdd(gff->sourceHash, words[1], el);
    el->name = hel->name;
    slAddHead(&gff->sourceList, el);
    }
gl->source = hel->name;

if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
    {
    struct gffFeature *el;
    AllocVar(el);
    hel = hashAdd(gff->featureHash, words[2], el);
    el->name = hel->name;
    slAddHead(&gff->featureList, el);
    }
gl->feature = hel->name;

if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
   gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");	
gl->start = atoi(words[3])-1 + baseOffset;
gl->end = atoi(words[4]) + baseOffset;
gl->score = atof(words[5]);
gl->strand = words[6][0];
gl->frame = words[7][0];

if (wordCount >= 9)
    {
    if (!gff->typeKnown)
	{
	gff->typeKnown = TRUE;
	gff->isGtf = isGtfGroup(words[8]);
	}
    if (gff->isGtf)
	{
	parseGtfEnd(words[8], gff, gl, fileName, lineIx);
	}
    else
	{
	char *tnName = gffTnName(gl->seq, trimSpaces(words[8]));
	if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
	    {
	    struct gffGroup *group;
	    AllocVar(group);
	    hel = hashAdd(gff->groupHash, tnName, group);
	    group->name = hel->name;
	    group->seq = gl->seq;
	    group->source = gl->source;
	    slAddHead(&gff->groupList, group);
	    }
	gl->group = hel->name;
	}
    }
slAddHead(&gff->lineList, gl);
}
Ejemplo n.º 8
0
void cartJsonRegisterHandler(struct cartJson *cj, char *command, CartJsonHandler *handler)
/* Associate handler with command; when javascript sends command, handler will be executed. */
{
hashAdd(cj->handlerHash, command, handler);
}
Ejemplo n.º 9
0
void doEnrichments(struct sqlConnection *conn, struct cdwFile *ef, char *path, 
    struct hash *assemblyToTarget)
/* Calculate enrichments on for all targets file. The targetList and the
 * grtList are in the same order. */
{
/* Get validFile from database. */
struct cdwValidFile *vf = cdwValidFileFromFileId(conn, ef->id);
if (vf == NULL)
    return;	/* We can only work if have validFile table entry */

if (!isEmpty(vf->enrichedIn) && !sameWord(vf->ucscDb, "unknown") && !isEmpty(vf->ucscDb)
    && !sameWord(vf->format, "unknown"))
    {
    /* Get our assembly */
    char *format = vf->format;
    char *ucscDb = vf->ucscDb;
    char *targetName = cdwSimpleAssemblyName(ucscDb);
    struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, targetName);

    struct target *targetList = hashFindVal(assemblyToTarget, assembly->name);
    if (targetList == NULL)
	{
	targetList = targetsForAssembly(conn, assembly);
	if (targetList == NULL)
	    errAbort("No targets for assembly %s", assembly->name);
	hashAdd(assemblyToTarget, assembly->name, targetList);
	}

    /* Loop through targetList zeroing out existing ovelaps. */
    struct target *target;
    boolean allSkip = TRUE;
    for (target = targetList; target != NULL; target = target->next)
	{
	target->overlapBases = target->uniqOverlapBases = 0;
	target->skip = enrichmentExists(conn, ef, target->target);
	if (!target->skip)
	    allSkip = FALSE;
	}

    /* Do a big dispatch based on format. */
    if (!allSkip)
	{
	if (sameString(format, "fastq"))
	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
	else if (sameString(format, "bigWig"))
	    doEnrichmentsFromBigWig(conn, ef, vf, assembly, targetList);
	else if (startsWith("bed_", format))
	    doEnrichmentsFromBed(conn, ef, vf, assembly, targetList);
	else if (cdwIsSupportedBigBedFormat(format) || sameString(format, "bigBed"))
	    doEnrichmentsFromBigBed(conn, ef, vf, assembly, targetList);
	else if (sameString(format, "gtf"))
	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
	else if (sameString(format, "gff"))
	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
	else if (sameString(format, "bam"))
	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
	else if (sameString(format, "vcf"))
	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
	else if (sameString(format, "idat"))
	    verbose(2, "Ignoring idat %s, in doEnrichments.", ef->cdwFileName);
	else if (sameString(format, "customTrack"))
	    verbose(2, "Ignoring customTrack %s, in doEnrichments.", ef->cdwFileName);
	else if (sameString(format, "rcc"))
	    verbose(2, "Ignoring rcc %s, in doEnrichments.", ef->cdwFileName);
	else if (sameString(format, "bam.bai"))
	    verbose(2, "Ignoring bam.bai %s, in doEnrichments - just and index file.", 
		ef->cdwFileName);
	else if (sameString(format, "vcf.gz.tbi"))
	    verbose(2, "Ignoring vcf.gz.tbi %s, in doEnrichments - just and index file.", 
		ef->cdwFileName);
	else if (sameString(format, "unknown"))
	    verbose(2, "Unknown format in doEnrichments(%s), that's ok.", ef->cdwFileName);
	else
	    errAbort("Unrecognized format %s in doEnrichments(%s)", format, path);
	}

    /* Clean up and go home. */
    cdwAssemblyFree(&assembly);
    }
cdwValidFileFree(&vf);
}
void readCloneNames(struct lineFile *clf)
/* read internal BAC clone names and Sanger sts names */
{
struct alias *a = NULL;
struct sanger *s = NULL;
char *words[4], *name = NULL, *sanger = NULL, *extName = NULL;
int i, rel;
char sep = '|';
boolean found = FALSE, posFound = FALSE;

/* alias hash is keyed by Sanger sts name */
aliasHash = newHash(16);
/* hash of Sanger names keyed by external name */
sangerByExtNameHash = newHash(16);

/* Read in all rows */
while (lineFileChopCharNext(clf, sep, words, 5))
    {
    name = cloneString(words[0]);
    sanger = cloneString(words[1]);
    if (!sameString(words[2], ""))
        rel = sqlUnsigned(words[2]);
    else
        rel = 3;
    /* find external name for this internal name from the extNameHash */
    if ((extName = hashFindVal(extNameHash, name)) == NULL)
        {
        /* if not found in BAC hash, then need to use internal name to make extName */
        extName = translateName(name, FALSE);
        }
    if ((a = hashFindVal(aliasHash, sanger)) == NULL)
        {
        /* allocate memory for alias struct */
        AllocVar(a); 
        /* allocate memory for UniSTS IDs, aliases, internal and external names and relations */
        /* and initialize the arrays */
        AllocArray(a->uniStsId, (sizeof(char *) * NUMSANGER));
        AllocArray(a->aliases, (sizeof(char *) * NUMALIASES));
        AllocArray(a->extName, (sizeof(char *) * MAXSANGER));
        AllocArray(a->intName, (sizeof(char *) * MAXSANGER));
        AllocArray(a->relation, (sizeof(int) * MAXSANGER));

        for (i = 0; i < NUMSANGER; i++)
            {
            a->uniStsId[i] = NULL;
            }
        for (i = 0; i < MAXSANGER; i++)
            {
            a->extName[i] = NULL;
            a->intName[i] = NULL;
            a->relation[i] = -1;
            }
        for (i = 0; i < NUMALIASES; i++)
            {
            a->aliases[i] = NULL;
            }
        }
    /* find empty slot in arrays to add external and internal names */
    posFound = FALSE;
    for (i = 0; i < NUMALIASES && (!posFound); i++)
        {
        if (a->extName[i] == NULL)
            {
            posFound = TRUE;
            a->extName[i] = cloneString(extName);
            if (a->intName[i] == NULL)
                a->intName[i] = cloneString(name);
            else
                errAbort("For marker %s, the empty slot in the intName array is not the same as that for the extName array in the alias struct\n", extName);
            if (a->relation[i] == -1)
                a->relation[i] = rel;
            else 
                errAbort("For marker %s, the empty slot in the relation array is not the same as that for the extName array in the alias struct\n", extName);
            }
        }
   
    a->sangerName = cloneString(sanger);
    a->primer1 = NULL;
    a->primer2 = NULL;
    /* add this alias struct to the hash keyed by sanger name */
    hashAdd(aliasHash, sanger, a);
    /* add sanger name to hash keyed by external name */
    if ((s = hashFindVal(sangerByExtNameHash, extName)) == NULL)
        {
        /* allocate memory for struct with array of Sanger names */
        AllocVar(s);
        /* initialize the array */
        for (i = 0; i < MAXSANGER; i++)
            {
            s->sangerName[i] = NULL;
            }
        }
    found = FALSE;
    for (i = 0; i < MAXSANGER && (!found); i++)
        {
        if (s->sangerName[i] == NULL)
            {
            found = TRUE;
            s->sangerName[i] = cloneString(sanger);
            }
        }
  /* add this list of sanger names to a hash keyed by external name, extName */
    hashAdd(sangerByExtNameHash, extName, s);
    }
}
Ejemplo n.º 11
0
int main(int argc, char *argv[])
{
struct hash *bacHash;
char line[1024];
int lineCount;
char *words[256];
int wordCount;
int fileIx;
char *fileName;
FILE *f;

if (argc < 2)
    usage();
bacHash = newHash(16);

for (fileIx = 1; fileIx < argc; ++fileIx)
    {
    fileName = argv[fileIx];
    uglyf("Processing %s\n", fileName);
    f = mustOpen(fileName, "r");
    lineCount = 0;
    while (fgets(line, sizeof(line), f))
        {
        ++lineCount;
        wordCount = chopLine(line, words);
        if (wordCount == ArraySize(words))
            errAbort("Too many words line %d of %s\n", lineCount, fileName);
        if (wordCount != 0)
            {
            char *bacName;
            int cIx;
            struct contigTrack *ctList = NULL, *ct;
            struct bacTrack *bt;
            struct hashEl *hel;

            /* Check line syntax and parse it. */
            if (!sameString(words[1], "glues"))
                errAbort("Bad format line %d of %s\n", lineCount, fileName);
            bacName = words[2];
            for (cIx = 4; cIx < wordCount; cIx += 5)
                {
                char *parts[3];
                int partCount;

                AllocVar(ct);
                ct->ix = atoi(words[cIx]);
                ct->strand = words[cIx+1][0];
                ct->dir = words[cIx+2][0];
                partCount = chopString(words[cIx+3], "(-)", parts, ArraySize(parts));
                if (partCount != 2)
                    errAbort("Bad format line %d of %s\n", lineCount, fileName);
                ct->start = atoi(parts[0]);
                ct->end = atoi(parts[1]);
                ct->cookedScore = atof(words[cIx+4]);
                slAddHead(&ctList, ct);                
                }
            slSort(&ctList, cmpContigTrack);
        
            /* Lookup bacTrack and make it if new. */
            hel = hashLookup(bacHash, bacName);
            if (hel == NULL)
                {
                AllocVar(bt);
                hel = hashAdd(bacHash, bacName, bt);
                bt->name = hel->name;
                slAddHead(&bacList, bt);
                }
            else
                {
                bt = hel->val;
                }
            
            /* Process pairs into bacTrack. */
            addPairs(bt, ctList);
            slFreeList(&ctList);
            }
        }
    fclose(f);
    }
slSort(&bacList, cmpBacTrack);

printStats();
return 0;
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
       {
       /* Fill in the relatively straightforward fields. */
       refseq = pick->refSeq;
       if (info->orfSize > 0)
	    {
	    protAcc = pick->refProt;
	    spID = proteinId;
	    if (sameString(protAcc, spID))
		spID = pick->uniProt;
	    if (spID[0] != 0)
	       spDisplayID = spAnyAccToId(uConn, spID);
	    }

       /* Fill in gene symbol and description from refseq if possible. */
       if (refseq[0] != 0)
           {
	   struct sqlResult *sr;
	   safef(query, sizeof(query), "select name,product from refLink where mrnaAcc='%s'",
	   	refseq);
	   sr = sqlGetResult(gConn, query);
	   char **row = sqlNextRow(sr);
	   if (row != NULL)
	       {
	       geneSymbol = cloneString(row[0]);
	       if (!sameWord("unknown protein", row[1]))
		   description = cloneString(row[1]);
	       }
	    sqlFreeResult(&sr);
	   }

       /* If need be try uniProt for gene symbol and description. */
       if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
           {
	   char *acc = spLookupPrimaryAcc(uConn, spID);
	   if (description == NULL)
	       description = spDescription(uConn, acc);
	   if (geneSymbol == NULL)
	       {
	       struct slName *nameList = spGenes(uConn, acc);
	       if (nameList != NULL)
		   geneSymbol = cloneString(nameList->name);
	       slFreeList(&nameList);
	       }
	   }

       }

    /* If it's an antibody fragment use that as name. */
    if (isAb)
        {
	geneSymbol = cloneString("abParts");
	description = cloneString("Parts of antibodies, mostly variable regions.");
	isAb = TRUE;
	}

    if (ev == NULL)
	{
	mRNA = cloneString("");
	if (!isAb)
	    {
	    errAbort("%s is %s but not %s\n", info->name, infoFile, evFile);
	    }
	}
    else
	{
	mRNA = cloneString(ev->primary);
	chopSuffix(mRNA);
	}

    /* Still no joy? Try genbank RNA records. */
    if (geneSymbol == NULL || description == NULL)
	{
	if (ev != NULL)
	    {
	    int i;
	    for (i=0; i<ev->accCount; ++i)
		{
		char *acc = ev->accs[i];
		chopSuffix(acc);
		if (geneSymbol == NULL)
		    {
		    safef(query, sizeof(query), 
			"select geneName.name from gbCdnaInfo,geneName "
			"where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
		    geneSymbol = sqlQuickString(gConn, query);
		    if (geneSymbol != NULL)
			{
			if (sameString(geneSymbol, "n/a"))
			   geneSymbol = NULL;
			}
		    }
		if (description == NULL)
		    {
		    safef(query, sizeof(query), 
			"select description.name from gbCdnaInfo,description "
			"where description.id=gbCdnaInfo.description "
			"and gbCdnaInfo.acc = '%s'", acc);
		    description = sqlQuickString(gConn, query);
		    if (description != NULL)
			{
			if (sameString(description, "n/a"))
			   description = NULL;
			}
		    }
		}
	    }
	}
    if (geneSymbol == NULL)
        geneSymbol = mRNA;
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
}
Ejemplo n.º 13
0
void rcvs(char *codingTable, char *clusterTable)
/* rcvs - Compare riken noncoding vs. nonspliced. */
{
struct hash *idHash = newHash(16); // Key id1, val id2
struct hash *nonCodingHash = newHash(16);  // Key clusterId, value 
struct hash *splicedHash = newHash(16);  // Key id2, present if spliced
struct sqlConnection *conn = sqlConnect("mgsc");
struct sqlResult *sr;
char **row;
char *words[16];
int wordCount;
struct lineFile *lf;
int codingSpliced = 0;
int noncodingSpliced = 0;
int codingNonspliced = 0;
int noncodingNonspliced = 0;

/* Read id's into hash */
sr = sqlGetResult(conn, NOSQLINJ "select id1,id2 from rikenIds");
while ((row = sqlNextRow(sr)) != NULL)
    hashAdd(idHash, row[0], cloneString(row[1]));
sqlFreeResult(&sr);

/* Read spliced into hash */
sr = sqlGetResult(conn,
	NOSQLINJ "select name from rikenOrientInfo where intronOrientation != 0");
while ((row = sqlNextRow(sr)) != NULL)
    hashAdd(splicedHash, row[0], NULL);
sqlFreeResult(&sr);

/* Read noncoding clusters into hash */
lf = lineFileOpen(codingTable, TRUE);
while (lineFileNextRow(lf, words, 2))
    {
    if (sameString(words[1], "NoPProt"))
        hashAdd(nonCodingHash, words[0], NULL);
    }
lineFileClose(&lf);

/* Stream through cluster table counting and correlating. */
lf = lineFileOpen(clusterTable, TRUE);
while (lineFileNextRow(lf, words, 2))
    {
    char *cluster = words[0];
    char *id1 = words[1];
    char *id2 = hashMustFindVal(idHash, id1);
    if (hashLookup(nonCodingHash, cluster))
        {
	if (hashLookup(splicedHash, id2))
	    ++noncodingSpliced;
	else
	    ++noncodingNonspliced;
	}
    else
        {
	if (hashLookup(splicedHash, id2))
	    ++codingSpliced;
	else
	    ++codingNonspliced;
	}
    }
printf("noncodingNonspliced %d\n", noncodingNonspliced);
printf("noncodingSpliced %d\n", noncodingSpliced);
printf("codingNonspliced %d\n", codingNonspliced);
printf("codingSpliced %d\n", codingSpliced);
printf("total %d\n", noncodingNonspliced + noncodingSpliced + codingNonspliced + codingSpliced);
}
Ejemplo n.º 14
0
void dupeFoo(char *pslName, char *faName, char *regionFile)
/* dupeFoo - Do some duplication analysis. */
{
struct lineFile *lf;
struct frag *fragList = NULL, *frag;
struct hash *fragHash = newHash(16);
struct psl *psl;
int fragCount=0,missCount=0,dupeCount=0,kSub=0,
   k1=0, k10=0,k100=0,k1000=0,k10000=0,diffChrom=0,distance;

/* Read in fragment list and put it in hash. */
fragList = readFragList(faName);
for (frag = fragList; frag != NULL; frag = frag->next)
    hashAdd(fragHash, frag->name, frag);

/* Read psl's and store under the fragment the belong to. */
lf = pslFileOpen(pslName);
while ((psl = pslNext(lf)) != NULL)
    {
    if ((frag = hashFindVal(fragHash, psl->qName)) == NULL)
        errAbort("Couldn't find %s in %s line %d of %s", 
		psl->qName, faName, lf->lineIx, lf->fileName);
    slAddHead(&frag->pslList, psl);
    }
lineFileClose(&lf);

/* Look through fragments and report missing and dupes. */
for (frag = fragList; frag != NULL; frag = frag->next)
    {
    ++fragCount;
    if ((psl = frag->pslList) == NULL)
        {
	++missCount;
	printf("missing %s\n", frag->name);
	}
    else
        {
	for (psl = frag->pslList; psl != NULL; psl = psl->next)
	    {
	    if (sameString(psl->tName, frag->chrom))
	        {
		distance = frag->start - psl->tStart;
		if (distance != 0)
		    {
		    if (distance < 0) distance = -distance;
		    if (distance >= 10000000) ++k10000;
		    else if (distance >= 1000000) ++k1000;
		    else if (distance >= 100000) ++k100;
		    else if (distance >= 10000) ++k10;
		    else if (distance >= 1000) ++k1;
		    else ++kSub;
		    }
		}
	    else
	        {
		++diffChrom;
		}
	    }
	}
    }
printPercent("Total", fragCount, fragCount);
printPercent("Unaligned", missCount, fragCount);
printPercent("Other Chrom", diffChrom, fragCount);
printPercent("Same Chrom >10M", k10000, fragCount);
printPercent("Same Chrom >1M", k1000, fragCount);
printPercent("Same Chrom >10Ok", k100, fragCount);
printPercent("Same Chrom >1Ok", k10, fragCount);
printPercent("Same Chrom >1k", k1, fragCount);
printPercent("Self-overlap", kSub, fragCount);
writeRegions(fragList, regionFile);
}
void analyse(int start, int stop)
{
    struct hash *hash;
    char line[512];
    int lineCount = 0;
    char *words[32];
    int wordCount;
    struct cdnaInfo *cdnaList = NULL;
    struct cdnaInfo *ci = NULL;
    int cdnaCount;
    int maxCdnaCount = stop - start;

    cdnaCount = 1;
    if (start > 1)
    {
        for (;;)
        {
            if (!fgets(line, sizeof(line), inFile))
                errAbort("Not %d cDNAs in file, only %d\n", start, cdnaCount);
            ++lineCount;
            if (line[0] == '#') /* Skip comments. */
                continue;
            wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words));
            if (wordCount <= 0) /* Skip empty lines. */
                continue;
            if (!differentWord(words[1], "alignments"))
            {
                ++cdnaCount;
                if (cdnaCount >= start)
                    break;
            }
        }
    }
    cdnaCount = 0;
    hash = newHash(14); /* Hash table with 16k entries. */
    for (;;)
    {
        if (!fgets(line, sizeof(line), inFile))
            break;
        ++lineCount;
        if (line[0] == '#') /* Skip comments. */
            continue;
        wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words));
        if (wordCount <= 0) /* Skip empty lines. */
            continue;
        if (wordCount < 4)  /* Everyone else has at least four words. */
        {
            errAbort("Short line %d:\n", lineCount);
        }
        if (sameWord(words[1], "Blasting"))
        {
            char *cdnaName = words[2];
            if ((ci = lookupInfo(hash, cdnaName)) == NULL)
            {
                struct hashEl *hel;
                ci = needMem(sizeof(*ci));
                hel = hashAdd(hash, cdnaName, ci);
                ci->next = cdnaList;
                cdnaList = ci;
                ci->ix = atoi(words[0]);
                ci->name = hel->name;
            }
        }
        else if (sameWord(words[2], "hits"))
        {
            /* Newer style - includes cDNA matching range. */
            if (ci == NULL)
                continue;
            hitLine(ci, lineCount, words[0], words[1], words[3], words[4], words[5], words[9]);
        }
        else if (sameWord(words[1], "hits"))
            /* Older style - no cDNA matching range. */
        {
            if (ci == NULL)
                continue;
            hitLine(ci, lineCount, words[0],     NULL, words[2], words[3], words[4], words[8]);
        }
        else if (sameWord(words[1], "alignments"))
        {
            struct dnaSeq *cdnaSeq;
            struct wormCdnaInfo info;
            if (ci == NULL)
                continue;
            if (differentWord(ci->name, words[3]))
                errAbort("Line %d - %s is not %s", lineCount, words[3], ci->name);
            if (!ci->finished)
            {
                if (!anyCdnaSeq(ci->name, &cdnaSeq, &info))
                {
                    warn("Can't find cDNA %s", ci->name);
                    ci->isDupe = TRUE;
                }
                else
                {
                    ci->baseCount = cdnaSeq->size;
                    ci->baseCrc = dnaCrc(cdnaSeq->dna, cdnaSeq->size);
                    slReverse(&ci->roughAli);
                    ci->roughScore = bestRoughScore(ci->roughAli);
                    filterDupeCdna(ci, cdnaSeq);
                    ci->isBackwards = (info.orientation == '-');
                    refineAlis(ci, cdnaSeq);
                    ci->fineScore = bestFineScore(ci->fineAli);
                    ci->isEmbryonic = info.isEmbryonic;
                    ci->finished = TRUE;
                    freeDnaSeq(&cdnaSeq);
                    ++cdnaCount;
                    if (cdnaCount >= maxCdnaCount)
                        break;
                }
            }
        }
        else
        {
            errAbort("Can't deal with line %d\n", lineCount);
        }
    }

    slReverse(&cdnaList);

    doGoodBad(cdnaList);
    doUnusual(cdnaList);
//makeCdnaToGene(cdnaList);

    /* Clean up. */

    /* These two are slow and not really necessary. */
#ifdef FASTIDIOUS
    slFreeList(&cdnaList);
    freeHash(&hash);
#endif

    uglyf("Done analyse\n");
}
Ejemplo n.º 16
0
static void clusterClone(int argc, char *argv[])
{
int i;

for (i=1; i < argc; ++i)
    {
    struct lineFile *lf;
    struct psl *psl;
    unsigned tSize;
    char *prevAccPart = (char *)NULL;
    char *prevAccName = (char *)NULL;
    char *prevTargetName = (char *)NULL;
    struct hashEl *el;
    struct hash *chrHash = newHash(0);
    struct hash *coordHash = newHash(0);
    struct coordEl *coord;
    struct coordEl **coordListPt = (struct coordEl **) NULL;
    unsigned querySize = 0;
    int partCount = 0;
    int partsConsidered = 0;

    verbose(2,"#\tprocess: %s\n", argv[i]);
    lf=pslFileOpen(argv[i]);
    while ((struct psl *)NULL != (psl = pslNext(lf)) )
	{
	char *accName = (char *)NULL;
	char *targetName = (char *)NULL;
	int chrCount = 0;
	double percentCoverage;

	accName = cloneString(psl->qName);
	if ((char *)NULL == prevAccPart)
	    {
	    prevAccPart = cloneString(psl->qName);  /* first time */
	    querySize = psl->qSize;
	    ++partsConsidered;
	    }
	chopSuffixAt(accName,'_');

	if ((char *)NULL == prevAccName)
		prevAccName = cloneString(accName);  /* first time */
	if ((char *)NULL == prevTargetName)
		prevTargetName = cloneString(psl->tName);  /* first time */

	/*	encountered a new accession name, process the one we
 	 *	were working on
	 */
	if (differentWord(accName, prevAccName))
	    {
	    if (partCount > 0)
		processResult(chrHash, coordHash, prevAccName, querySize,
		    partsConsidered);
	    else
		verbose(1,"# ERROR %s %s - no coordinates found in %d parts considered\n",
		    prevTargetName, prevAccName, partsConsidered);
	    freeMem(prevAccName);
	    prevAccName = cloneString(accName);
	    freeHash(&chrHash);
	    freeHash(&coordHash);
	    chrHash = newHash(0);
	    coordHash = newHash(0);
	    querySize = 0;
	    partCount = 0;
	    partsConsidered = 0;
	    }

	tSize = psl->tEnd - psl->tStart;
	percentCoverage = 100.0*((double)(tSize+1)/(psl->qSize + 1));
	if (differentWord(psl->qName, prevAccPart))
	    {
	    ++partsConsidered;
	    querySize += psl->qSize;
	    freeMem(prevAccPart);
	    prevAccPart = cloneString(psl->qName);
	    }

	targetName = cloneString(psl->tName);
	if (differentWord(targetName, prevTargetName))
	    {
	    freeMem(prevTargetName);
	    prevTargetName = cloneString(targetName);
	    }
	/*	keep a hash of chrom names encountered	*/
	el = hashLookup(chrHash, targetName);
	if (el == NULL)
	    {
	    if (percentCoverage > minCover)
		{
		hashAddInt(chrHash, targetName, 1);
		chrCount = 1;
		}
	    else
		{
		hashAddInt(chrHash, targetName, 0);
		chrCount = 0;
		}
	    }
	else
	    {
	    if (percentCoverage > minCover)
		{
		chrCount = ptToInt(el->val) + 1;
		el->val=intToPt(chrCount);
		}
	    }

	AllocVar(coord);
	coord->start = psl->tStart;
	coord->end = psl->tEnd;
	coord->qSize = psl->qSize;
	coord->strand = sameWord(psl->strand,"+") ? 1 : 0;
	/*	when coverage is sufficient	*/
	if (percentCoverage > minCover)
	    {
	    ++partCount;
	    coord->name = cloneString(psl->qName);
	    /*	for each chrom name, accumulate a list of coordinates */
	    el = hashLookup(coordHash, targetName);
	    if (el == NULL)
		{
		AllocVar(coordListPt);
		hashAdd(coordHash, targetName, coordListPt);
		}
	    else
		{
		coordListPt = el->val;
		}
	    slAddHead(coordListPt,coord);
	verbose(2,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
	    psl->qName, psl->qSize, tSize, tSize - psl->qSize,
	    percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
	    psl->strand);
	    }
	else
	    {
	verbose(3,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
	    psl->qName, psl->qSize, tSize, tSize - psl->qSize,
	    percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
	    psl->strand);
	    }


	freeMem(accName);
	freeMem(targetName);
	pslFree(&psl);
	}
    if (partCount > 0)
	processResult(chrHash, coordHash, prevAccName, querySize,
	    partsConsidered);
    else
	verbose(1,"# ERROR %s %s - no coordinates found\n",
	    prevTargetName, prevAccName);
    freeMem(prevAccName);
    freeHash(&chrHash);
    freeHash(&coordHash);
    lineFileClose(&lf);
    }
}	/*	static void clusterClone()	*/
void addRedoHash(struct cdnaInfo *ci, char *reason)
{
    struct hashEl *hel;
    if ((hel = hashLookup(redoHash, ci->name)) == NULL)
        hashAdd(redoHash, ci->name, reason);
}
Ejemplo n.º 18
0
/*	convolve() - perform the task on the input data
 *	I would like to rearrange this business here, and instead of
 *	reading in the data and leaving it in the hash for all other
 *	routines to work with, it would be best to get it immediately
 *	into an array.  That makes the work of the other routines much
 *	easier.
 */
static void convolve(int argc, char *argv[])
{
int i;
struct lineFile *lf;			/* for line file utilities	*/

for (i = 1; i < argc; ++i)
    {
    int lineCount = 0;			/* counting input lines	*/
    char *line = (char *)NULL;		/* to receive data input line	*/
    char *words[128];			/* to split data input line	*/
    int wordCount = 0;			/* result of split	*/
    struct hash *histo0;	/*	first histogram	*/
    struct hash *histo1;	/*	second histogram	*/
    int medianBin0 = 0;		/*	bin at median for histo0	*/
    double medianLog_2 = -500.0;	/*	log at median	*/
    int bin = 0;		/*	0 to N-1 for N bins	*/
    int convolutions = 0;	/*	loop counter for # of convolutions */

    histo0 = newHash(0);

    lf = lineFileOpen(argv[i], TRUE);	/*	input file	*/
    verbose(1, "Processing %s\n", argv[1]);
    while (lineFileNext(lf, &line, NULL))
	{
	int j;			/*	loop counter over words	*/
	int inputValuesCount = 0;
	struct histoGram *hg;	/*	an allocated hash element	*/

	++lineCount;
	chopPrefixAt(line, '#'); /* ignore any comments starting with # */
	if (strlen(line) < 3)	/*	anything left on this line ? */
	    continue;		/*	no, go to next line	*/
	wordCount = chopByWhite(line, words, 128);
	if (wordCount < 1)
warn("Expecting at least a word at line %d, file: %s, found %d words",
	lineCount, argv[i], wordCount);
	if (wordCount == 128)
warn("May have more than 128 values at line %d, file: %s", lineCount, argv[i]);

	verbose(2, "Input data read from file: %s\n", argv[i]);
	for (j = 0; j < wordCount; ++j)
	    {
	    char binName[128];
	    double dataValue;
	    double probInput;
	    double log_2;
	    dataValue = strtod(words[j], NULL);
	    ++inputValuesCount;
	    if (logs)
		{
		log_2 = dataValue;
		probInput = pow(2.0,log_2);
		} else {
		if (dataValue > 0.0)
		    {
		    log_2 = log2(dataValue);
		    probInput = dataValue;
		    } else {
		    log_2 = -500.0;	/*	arbitrary limit	*/
		    probInput = pow(2.0,log_2);
		    }
		}
	    if (log_2 > medianLog_2)
		{
		medianLog_2 = log_2;
		medianBin0 = bin;
		}
	    verbose(2, "bin %d: %g %0.5g\n",
		    inputValuesCount-1, probInput, log_2);

	    AllocVar(hg);	/*	the histogram element	*/
	    hg->bin = bin;
	    hg->prob = probInput;
	    hg->log_2 = log_2;
	    snprintf(binName, sizeof(binName), "%d", hg->bin);
	    hashAdd(histo0, binName, hg);

	    ++bin;
	    }	/*	for each word on an input line	*/
	}	/*	for each line in a file	*/

	/*	file read complete, echo input	*/
	if (verboseLevel() >= 2)
	    printHistogram(histo0, medianBin0);

	/*	perform convolutions to specified count
	 *	the iteration does histo0 with itself to produce histo1
	 *	Then histo0 is freed and histo1 copied to it for the
	 *	next loop.
	 */
	for (convolutions = 0; convolutions < convolve_count; ++convolutions)
	    {
	    int medianBin;
	    histo1 = newHash(0);
	    medianBin = iteration(histo0, histo1);
	    if (verboseLevel() >= 2)
		printHistogram(histo1, medianBin);
	    freeHashAndVals(&histo0);
	    histo0 = histo1;
	    }

    }		/*	for each input file	*/
}	/*	convolve()	*/
Ejemplo n.º 19
0
static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, 
    char *fileName, int lineIx)
/* Read the semi-colon separated end bits of a GTF line into gl and
 * hashes. */
{
char *type, *val;
struct hashEl *hel;
bool gotSemi;

for (;;)
   {
   gotSemi = FALSE;
   if ((type = nextWord(&s)) == NULL)
       break;
   s = skipLeadingSpaces(s);
   if (NULL == s || s[0] == 0)
       errAbort("Unpaired type(%s)/val on end of gtf line %d of %s", type, lineIx, fileName);
   if (s[0] == '"' || s[0] == '\'')
       {
       val = s;
       readQuotedString(fileName, lineIx, s, val, &s);
       }
   else
       {
       int len;
       val = nextWord(&s);
       len = strlen(val) - 1;
       if (val[len] == ';')
	   {
	   val[len] = 0;
	   len -= 1;
           gotSemi = TRUE;
	   }
       if (len < 0)
           errAbort("Empty value for %s line %d of %s", type, lineIx, fileName);
       }
   if (s != NULL && !gotSemi)
      {
      s = strchr(s, ';');
      if (s != NULL)
         ++s;
      }
   /* only use the first occurance of gene_id and transcript_id */
   if (sameString("gene_id", type) && (gl->geneId == NULL))
       {
       struct gffGeneId *gg;
       if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
	   {
	   AllocVar(gg);
           hel = hashAdd(gff->geneIdHash, val, gg);
	   gg->name = hel->name;
	   slAddHead(&gff->geneIdList, gg);
	   }
	else
	   {
	   gg = hel->val;
	   }
       gl->geneId = gg->name;
       }
   else if (sameString("transcript_id", type) && (gl->group == NULL))
       {
       struct gffGroup *gg;
       if ((hel = hashLookup(gff->groupHash, val)) == NULL)
	   {
	   AllocVar(gg);
           hel = hashAdd(gff->groupHash, val, gg);
	   gg->name = hel->name;
	   gg->seq = gl->seq;
	   gg->source = gl->source;
	   slAddHead(&gff->groupList, gg);
	   }
	else
	   {
	   gg = hel->val;
	   }
       gl->group = gg->name;
       }
   else if (sameString("exon_id", type))
       gl->exonId = gffFileGetStr(gff, val);
   else if (sameString("exon_number", type))
       {
       if (!isdigit(val[0]))
           errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
       gl->exonNumber = atoi(val);
       }
   else if (sameString("intron_id", type))
       gl->intronId = gffFileGetStr(gff, val);
   else if (sameString("intron_status", type))
       gl->intronStatus = gffFileGetStr(gff, val);
   else if (sameString("protein_id", type))
       gl->proteinId = gffFileGetStr(gff, val);
   else if (sameString("gene_name", type))
       gl->geneName = gffFileGetStr(gff, val);
   else if (sameString("transcript_name", type))
       gl->transcriptName = gffFileGetStr(gff, val);
   }
if (gl->group == NULL)
    {
    if (gl->geneId == NULL)
        warn("No gene_id or transcript_id line %d of %s", lineIx, fileName);
    }
}
void flagReported(struct hash* reportedHash, char* key)
/* flag an error as having been reported */
{
hashAdd(reportedHash, key, NULL);
}
Ejemplo n.º 21
0
void filereader(FILE* in, int bflag, char* sym, int n){
	FILE* iq = fopen("iq.txt", "w");
	FILE* ob = fopen("ob.txt", "w");
	int count = 0;
	if(!bflag){
		char ch;
		int id;
		while(!feof(in)){
			fscanf(in,"%c",&ch);
			if(ch == 'A'){
				char side;
				char* symbol = (char*)malloc(sizeof(char*));
				int quantity;
				double price; 
				fscanf(in,"%d %c %s %d %lf\n",&id, &side, symbol, &quantity, &price);
				if(!strcmp(sym,symbol)){
					hashAdd(id, side, symbol, quantity, price);
					//printf("%ld %c %s %d %lf\n", id, side, symbol, quantity, price);
					//count++;
				}
			}else if(ch == 'X'){
				char* symbol = (char*)malloc(sizeof(char*));
				fscanf(in,"%d %s\n",&id, symbol);
				hashDel(id);
			}else if(ch == 'T'){
				int id;
				char* symbol = (char*)malloc(sizeof(char*));
				int quantity;
				fscanf(in, "%d %s %d\n", &id, symbol, & quantity);
				if(!strcmp(sym,symbol)){
					changeNode(id, quantity);
				}
			}else if(ch == 'R'){
				int id;
				char* symbol = (char*)malloc(sizeof(char*));
				int quantity;
				double price;
				fscanf(in, "%d %s %d %lf\n", &id, symbol, &quantity, &price);
				changePnode(id, quantity, price);
			}else if(ch == 'C'){
				int id;
				char* symbol = (char*)malloc(sizeof(char*));
				int quantity;
				fscanf(in, "%d %s %d\n", &id, symbol, & quantity);
				if(!strcmp(sym,symbol)){
					changeNode(id, quantity);
				}

			}
			count++;
			//printf("count %d\n", count);
			//printf("n in file %d\n", n);
			if(count == n){
				count = 0;
				double sellPrice = 0.0;
				double buyPrice = 0.0;
				if(sell != NULL)
					sellPrice = sell->price;
				if(buy != NULL)
					buyPrice = buy->price;
				fprintf(iq, "%lf %lf\n", sellPrice, buyPrice);
			}

		}

	}else{
		char c;
		  while(!feof(in)){
			fread(&c, sizeof(char), 1,in);
			if(c == 'A'){
				int id;
				char side;
				char *symbol = (char *) malloc(sizeof(char *));
				int quantity;
				double price;
				fread(&id,sizeof(int),1,in);
				fread(&side,sizeof(char),1,in);
				fread(symbol,sizeof(char),4,in);
				symbol[strlen(symbol)] = '\0';
				fread(&quantity,sizeof(int),1,in);
				fread(&price,sizeof(double),1,in);
				if(!strcmp(sym,symbol)){
					hashAdd(id, side, symbol, quantity, price);
					//printf("%ld %c %s %d %lf\n", id, side, symbol, quantity, price);
				}
			}
			if(c == 'X'){
				int id;
				char *symbol = (char *) malloc(sizeof(char *));
				fread(&id,sizeof(int),1,in);
				fread(symbol,sizeof(char),4,in);
				symbol[strlen(symbol)] = '\0';
				hashDel(id);
			}
			
			if(c == 'T'){
				int id;
				char *symbol = (char *) malloc(sizeof(char *));
				int quantity;
				fread(&id,sizeof(int),1,in);
				fread(symbol,sizeof(char),4,in);
				symbol[strlen(symbol)] = '\0';
				fread(&quantity,sizeof(int),1,in);
				changeNode(id, quantity);				
			}
			if(c == 'C'){
				int id;
				char *symbol = (char *) malloc(sizeof(char *));
				int quantity;
				fread(&id,sizeof(int),1,in);
				fread(symbol,sizeof(char),4,in);
				symbol[strlen(symbol)] = '\0';
				fread(&quantity,sizeof(int),1,in);
				changeNode(id,quantity);
			}
			if(c == 'R'){
				int id;
				char *symbol = (char *) malloc(sizeof(char *));
				int quantity;
				double price;
				fread(&id,sizeof(int),1,in);
				fread(symbol,sizeof(char),4,in);
				symbol[strlen(symbol)] = '\0';
				fread(&quantity,sizeof(int),1,in);
				fread(&price,sizeof(double),1,in);
				changePnode(id, quantity, price);
			}
		}
	
	}
	printhash(ob);
}
void doExpRatio(struct trackDb *tdb, char *item, struct customTrack *ct)
/* Generic expression ratio deatils using microarrayGroups.ra file */
/* and not the expRecord tables. */
{
char *expScale = trackDbRequiredSetting(tdb, "expScale");
char *expStep = trackDbRequiredSetting(tdb, "expStep");
double maxScore = atof(expScale);
double stepSize = atof(expStep);
struct bed *bedList;
char *itemName = cgiUsualString("i2","none");
char *expName = (item == NULL) ? itemName : item;
char *tdbSetting = trackDbSettingOrDefault(tdb, "expColor", "redGreen");
char *colorVal = NULL;
enum expColorType colorScheme;
char colorVarName[256];
safef(colorVarName, sizeof(colorVarName), "%s.color", tdb->track);
colorVal = cartUsualString(cart, colorVarName, tdbSetting);
colorScheme = getExpColorType(colorVal);

if (sameWord(tdb->grp, "cancerGenomics"))
    {
    /* set global flag */
    isCancerGenomicsTrack = TRUE;
    }

if (!ct)
    {
    genericHeader(tdb, itemName);
    bedList = loadMsBed(tdb, tdb->table, seqName, winStart, winEnd);
    }
else if (ct->dbTrack)
    {
    genericHeader(tdb, itemName);
    printCustomUrl(tdb, itemName, TRUE);
    bedList = ctLoadMultScoresBedDb(ct, seqName, winStart, winEnd);
    }
else
    bedList = bedFilterListInRange(ct->bedList, NULL, seqName, winStart, winEnd);
if (bedList == NULL)
    printf("<b>No Expression Data in this Range.</b>\n");
else if (expName && sameString(expName, "zoomInMore"))
    printf("<b>Too much data to display in detail in this range.</b>\n");
else
    {
    struct microarrayGroups *groupings = NULL;
    struct maGrouping *combineGroup;
    struct hash *erHash = newHash(6);
    int i;
    if (!ct)
	{
	groupings = maGetTrackGroupings(database, tdb);
	combineGroup = maCombineGroupingFromCart(groupings, cart, tdb->track);
	}
    else
	combineGroup = maGetGroupingFromCt(ct);
    maBedClumpGivenGrouping(bedList, combineGroup);
    for (i = 0; i < combineGroup->numGroups; i++)
	{
	/* make stupid exprecord hash.perhaps eventually this won't be needed */
	char id[16];
	struct expRecord *er = basicExpRecord(combineGroup->names[i], i, 2);
	safef(id, sizeof(id), "%d", i);
	hashAdd(erHash, id, er);
	}
    puts("<h2></h2><p>\n");
    msBedPrintTable(bedList, erHash, itemName, expName, -1*maxScore, maxScore,
	stepSize, 2, msBedDefaultPrintHeader, msBedExpressionPrintRow,
	printExprssnColorKey, getColorForExprBed, colorScheme);
    hashTraverseEls(erHash, erHashElFree);
    hashFree(&erHash);
    microarrayGroupsFree(&groupings);
    }
puts("<h2></h2><p>\n");
bedFreeList(&bedList);
}
Ejemplo n.º 23
0
Archivo: seqOut.c Proyecto: bowhan/kent
void doGenePredNongenomic(struct sqlConnection *conn, int typeIx)
/* Get mrna or protein associated with selected genes. */
{
/* Note this does do the whole genome at once rather than one
 * chromosome at a time, but that's ok because the gene prediction
 * tracks this serves are on the small side. */
char *typeWords[3];
char *table;
struct lm *lm = lmInit(64*1024);
int fieldCount;
struct bed *bed, *bedList = cookedBedsOnRegions(conn, curTable, getRegions(),
	lm, &fieldCount);
int typeWordCount;

textOpen();

/* Figure out which table to use. */
if (isRefGeneTrack(curTable))
    {
    if (typeIx == 1) /* Protein */
        doRefGeneProteinSequence(conn, bedList);
    else
        doRefGeneMrnaSequence(conn, bedList);
    }
else
    {
    char *dupType = cloneString(findTypeForTable(database, curTrack, curTable, ctLookupName));
    typeWordCount = chopLine(dupType, typeWords);
    if (typeIx >= typeWordCount)
	internalErr();
    table = typeWords[typeIx];
    if (sqlTableExists(conn, table))
	{
	struct sqlResult *sr;
	char **row;
	char query[256];
	struct hash *hash = newHash(18);
	boolean gotResults = FALSE;

	/* Make hash of all id's passing filters. */
	for (bed = bedList; bed != NULL; bed = bed->next)
	    hashAdd(hash, bed->name, NULL);

	/* Scan through table, outputting ones that match. */
	sqlSafef(query, sizeof(query), "select name, seq from %s", table);
	sr = sqlGetResult(conn, query);
	while ((row = sqlNextRow(sr)) != NULL)
	    {
	    if (hashLookup(hash, row[0]))
		{
		hPrintf(">%s\n", row[0]);
		writeSeqWithBreaks(stdout, row[1], strlen(row[1]), 60);
		gotResults = TRUE;
		}
	    }
	sqlFreeResult(&sr);
	hashFree(&hash);
	if (!gotResults)
	    hPrintf(NO_RESULTS);
	}
    else
	{
	internalErr();
	}
    freez(&dupType);
    }
lmCleanup(&lm);
}
Ejemplo n.º 24
0
void bedItemOverlapCount(struct hash *chromHash, char *infile, char *outfile){
unsigned maxChromSize = 0;
unitSize *counts = (unitSize *)NULL;
FILE *f = mustOpen(outfile, "w");
struct hashCookie hc = hashFirst(chromHash);
struct hashEl *hel;
while( (hel = hashNext(&hc)) != NULL) {
    unsigned num = (unsigned) ptToInt(hel->val);
    maxChromSize = max(num, maxChromSize);
}
verbose(2,"#\tmaxChromSize: %u\n", maxChromSize);
if (maxChromSize < 1)
    errAbort("maxChromSize is zero ?");

/*	Allocate just once for the largest chrom and reuse this array */
counts = needHugeMem(sizeof(unitSize) * maxChromSize);

/*	Reset the array to be zero to be reused */
memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize);

unsigned chromSize = 0;
char *prevChrom = (char *)NULL;
boolean outputToDo = FALSE;
struct hash *seenHash = newHash(5);

    struct lineFile *bf = lineFileOpen(infile , TRUE);
    struct bed *bed = (struct bed *)NULL;
    char *row[12];
    int numFields = doBed12 ? 12 : 3;

    while (lineFileNextRow(bf,row, numFields))
	{
	int i;
	bed = bedLoadN(row, numFields);

	verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);

	if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
	    {
	    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
	    if (outputToDo)
		outputCounts(counts, prevChrom, chromSize, f);
	    outputToDo = FALSE;
	    memset((void *)counts, 0,
		sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */
	    freez(&prevChrom); 
	    // prevChrom is now NULL so it will be caught by next if!
	    }
	if ((char *)NULL == prevChrom)  // begin a chr
	    {
	    if (hashLookup(seenHash, bed->chrom))
		errAbort("ERROR:input file not sorted. %s seen before on line %d\n",
		    bed->chrom, bf->lineIx);

	    hashAdd(seenHash, bed->chrom, NULL);
	    prevChrom = cloneString(bed->chrom);
	    chromSize = hashIntVal(chromHash, prevChrom);
	    verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize);
	    }
	if (bed->chromEnd > chromSize)
	    {
	    // check for circular chrM
	    if (doBed12 || bed->chromStart>=chromSize 
		|| differentWord(bed->chrom,"chrM")) 
		{
		warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
		bed->chromEnd);
		errAbort("chromEnd > chromSize ?  %d > %d", 
		    bed->chromEnd,chromSize);
		}

	    for (i = bed->chromStart; i < chromSize; ++i)
		INCWOVERFLOW(counts,i);
	    for (i = 0; i < (bed->chromEnd - chromSize); ++i)
		INCWOVERFLOW(counts,i);
	    }
	else if (doBed12)
	    {
	    int *starts = bed->chromStarts;
	    int *sizes = bed->blockSizes;
	    int *endStarts = &bed->chromStarts[bed->blockCount];

	    for(; starts < endStarts; starts++, sizes++)
		{
		unsigned int end = *starts + *sizes + bed->chromStart;
		for (i = *starts + bed->chromStart; i < end; ++i)
		    INCWOVERFLOW(counts,i);
		}
	    }
	else
	    {
	    for (i = bed->chromStart; i < bed->chromEnd; ++i)
		INCWOVERFLOW(counts, i);
	    }
	outputToDo = TRUE;
	bedFree(&bed); // plug the memory leak
	}

    lineFileClose(&bf);
    // Note, next file could be on same chr!

if (outputToDo)
    outputCounts(counts, prevChrom, chromSize, f);

if (doOutBounds)
    fprintf(stderr, "min %lu max %lu\n", (unsigned long)overMin, (unsigned long)overMax);

verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
carefulClose(&f);
freeMem(counts);
freez(&prevChrom);
// hashFreeWithVals(&chromHash, freez);
freeHash(&seenHash);
}
Ejemplo n.º 25
0
int xmlParse(void* Pool,PXMLNode x,wchar_t *src,int flags)
{
  int sz=xstrlen(src);
  int i,st,n;
  wchar_t str[256];
  PXMLNode stack[256];
  int stackpos=0;
  PXMLNode p=x,c;
  PHashLink l;
  PXMLNode chlds=NULL;
  if(!sz)return 0;

  for(i=0;i<sz;i++)
  {
    if(xmlIsSpace(src[i]))continue;
    if(src[i]==L'<' && src[i+1]==L'!' && src[i+2]==L'-' && src[i+3]==L'-' && sz-i>4)
    {
      i+=4;
      st=i;
      while((src[i]!=L'-' || src[i+1]!=L'-' || src[i+2]!=L'>') && sz-i>3)i++;
      i+=2;
      if(sz-i<3)return 0;
      c=xmlNewChild(Pool,p,xmlComment);
      c->szCont=xstrndup(Pool,src+st,i-st-3);
      if(chlds)chlds->pNext=c;
      else p->pChildren=c;
      chlds=c;
      continue;
    }
    if(src[i]==L'<')
    {
      i++;
      if(src[i]==L'/')
      {
        i++;
        st=i;
        i=xmlGetWordEnd(src,i);
        if(i==-1)return 0;
        i=xmlSkipSpace(src,i);CHK;
        if(src[i]!=L'>')return 0;
        n=i-st;
        if(!xstrncmp(p->szName,src+st,n))return 0;
        //i++;
        p=p->pParent;
        if(p->eType==xmlRoot)return 1;
        stackpos--;
        chlds=stack[stackpos];
        continue;
      }
      if(WC(src[i]))
      {
        st=i;
        i=xmlGetWordEnd(src,i);CHK;
        c=xmlNewChild(Pool,p,xmlLeaf);
        if(p->eType!=xmlRoot)p->eType=xmlNode;
        n=i-st;
        if(n>255)n=255;
        xstrncpy(str,src+st,n);
        if(p->hChildren==NULL)p->hChildren=hashNew(Pool);
        l=hashAdd(p->hChildren,str,c);
        c->szName=l->szKey;
        if(chlds)chlds->pNext=c;
        else p->pChildren=c;
        chlds=c;
        i=xmlParseTag(Pool,c,&p,src,i);
        if(i==0)return 0;
        if(p==c)
        {
          stack[stackpos]=chlds;
          stackpos++;
          chlds=NULL;
        }
        i=xmlSkipSpace(src,i);CHK;
        st=i;
        i=xmlSkipTill(src,i,L'<');CHK;
        if(i>st)
        {
          p->szCont=xmlSubst(xstrndup(Pool,src+st,i-st));
        }
        i--;
        continue;
      }
      if(src[i]==L'!' || src[i]==L'?')
      {
        st=i+1;
        if(src[i+1]==L'[')
        {
          if(xstrncmp(src+i,L"![CDATA[",8))
          {
            const wchar_t *p=src+i+8;
            while((p=xstrchr(p,']')))
            {
              if(p[1]==L']' && p[2]==L'>')break;
            }
            if(p)i=(int)(p-src+2);
          }else i=xmlSkipTill(src,i,L'>');
        }else
        {
          i=xmlSkipTill(src,i,L'>');
        }
        if(i==-1)return 0;
        c=xmlNewChild(Pool,p,src[st-1]==L'!'?xmlExcl:xmlQuest);
        c->szCont=xstrndup(Pool,src+st,i-st);
        if(chlds)chlds->pNext=c;
        else p->pChildren=c;
        chlds=c;
        continue;
      }
    }else
    {
      st=i;
      i=xmlSkipTill(src,i,L'<');CHK;
      c=xmlNewChild(Pool,p,xmlContent);
      c->szCont=xmlSubst(xstrndup(Pool,src+st,i-st));
      if(chlds)chlds->pNext=c;
      else p->pChildren=c;
      chlds=c;
      i--;
    }
  }
  return 1;
}
Ejemplo n.º 26
0
struct joinerDtf *filteringTables()
/* Get list of tables we're filtering on as joinerDtf list (with
 * the field entry NULL). */
{
if (!anyFilter())
    return NULL;
else
    {
    struct joinerDtf *dtfList = NULL, *dtf;
    struct hashEl *varList, *var;
    struct hash *uniqHash = hashNew(0);
    int prefixSize = strlen(hgtaFilterVarPrefix);
    varList = cartFindPrefix(cart, hgtaFilterVarPrefix);
    for (var = varList; var != NULL; var = var->next)
        {
	char *dupe = cloneString(var->name + prefixSize);
	char *parts[5];
	int partCount;
	char dbTable[256];
	char *db, *table, *field, *type;
	partCount = chopByChar(dupe, '.', parts, ArraySize(parts));
	if (partCount != 4)
	    {
	    warn("Part count != expected 4 line %d of %s", __LINE__, __FILE__);
	    continue;
	    }
	db = parts[0];
	table = parts[1];
	field = parts[2];
	type = parts[3];
	safef(dbTable, sizeof(dbTable), "%s.%s", db, table);
	if (! filteredOrLinked(db, table))
	    continue;
	if (!hashLookup(uniqHash, dbTable))
	    {
	    boolean gotFilter = FALSE;

	    if (sameString(type, filterPatternVar))
	        {
		char *pat = trimSpaces(var->val);
		gotFilter = wildReal(pat);
		}
	    else if (sameString(type, filterCmpVar))
	        {
		char *patVar = filterPatternVarName(db, table, field);
		char *pat = trimSpaces(cartOptionalString(cart, patVar));
		gotFilter = cmpReal(pat, var->val);
		}
	    else if (sameString(type, filterRawQueryVar))
	        {
		char *pat = trimSpaces(var->val);
		gotFilter = (pat != NULL && pat[0] != 0);
		}
	    if (gotFilter)
		{
		hashAdd(uniqHash, dbTable, NULL);
		AllocVar(dtf);
		dtf->database = cloneString(db);
		dtf->table = cloneString(table);
		slAddHead(&dtfList, dtf);
		}
	    }
	freeMem(dupe);
	}
    hashFree(&uniqHash);
    return dtfList;
    }
}
Ejemplo n.º 27
0
static struct grp *makeGroupList(char *db, struct trackDb *trackList, struct grp **pHubGrpList,
                                 boolean allTablesOk)
/* Get list of groups that actually have something in them. */
{
struct grp *groupsAll, *groupList = NULL, *group;
struct hash *groupsInTrackList = newHash(0);
struct hash *groupsInDatabase = newHash(0);
struct trackDb *track;

/* Stream through track list building up hash of active groups. */
for (track = trackList; track != NULL; track = track->next)
    {
    if (!hashLookup(groupsInTrackList,track->grp))
        hashAdd(groupsInTrackList, track->grp, NULL);
    }

/* Scan through group table, putting in ones where we have data. */
groupsAll = hLoadGrps(db);
for (group = slPopHead(&groupsAll); group != NULL; group = slPopHead(&groupsAll))
    {
    if (hashLookup(groupsInTrackList, group->name))
	{
	slAddTail(&groupList, group);
	hashAdd(groupsInDatabase, group->name, group);
	}
    else
        grpFree(&group);
    }

/* if we have custom tracks, we want to add the track hubs
 * after that group */
struct grp *addAfter = NULL;
if ((groupList != NULL) && sameString(groupList->name, "user"))
    addAfter = groupList;

/* Add in groups from hubs. */
for (group = slPopHead(pHubGrpList); group != NULL; group = slPopHead(pHubGrpList))
    {
    // if the group isn't represented in any track, don't add it to list
    if (!hashLookup(groupsInTrackList,group->name))
	continue;
    /* check to see if we're inserting hubs rather than
     * adding them to the front of the list */
    struct grp *newGrp = grpDup(group);
    if (addAfter != NULL)
	{
	newGrp->next = addAfter->next;
	addAfter->next = newGrp;
	}
    else
	slAddHead(&groupList, newGrp);
    hashAdd(groupsInDatabase, newGrp->name, newGrp);
    }

/* Do some error checking for tracks with group names that are
 * not in database.  Just warn about them. */
if (!trackHubDatabase(db))
    for (track = trackList; track != NULL; track = track->next)
    {
    if (!hashLookup(groupsInDatabase, track->grp))
         warn("Track %s has group %s, which isn't in grp table",
	 	track->table, track->grp);
    }

/* Create dummy group for all tracks. */
AllocVar(group);
group->name = cloneString("allTracks");
group->label = cloneString("All Tracks");
slAddTail(&groupList, group);

/* Create another dummy group for all tables. */
if (allTablesOk)
    {
    AllocVar(group);
    group->name = cloneString("allTables");
    group->label = cloneString("All Tables");
    slAddTail(&groupList, group);
    }

hashFree(&groupsInTrackList);
hashFree(&groupsInDatabase);
return groupList;
}
Ejemplo n.º 28
0
static void showLinkedTables(struct joiner *joiner, struct dbTable *inList,
	char *varPrefix, char *buttonName, char *buttonText)
/* Print section with list of linked tables and check boxes to turn them
 * on. */
{
struct dbTable *outList = NULL, *out, *in;
char dtName[256];
struct hash *uniqHash = newHash(0);
struct hash *inHash = newHash(8);

/* Build up list of tables we link to in outList. */
for (in = inList; in != NULL; in = in->next)
    {
    struct sqlConnection *conn = NULL;
    if (!trackHubDatabase(database))
	conn = hAllocConn(in->db);
    struct joinerPair *jpList, *jp;

    /* Keep track of tables in inList. */
    safef(dtName, sizeof(dtName), "%s.%s", inList->db, inList->table);
    hashAdd(inHash, dtName, NULL);

    /* First table in input is not allowed in output. */
    if (in == inList)
        hashAdd(uniqHash, dtName, NULL);

    /* Scan through joining information and add tables,
     * avoiding duplicate additions. */
    jpList = joinerRelate(joiner, in->db, in->table);
    for (jp = jpList; jp != NULL; jp = jp->next)
        {
	safef(dtName, sizeof(dtName), "%s.%s",
		jp->b->database, jp->b->table);
	if (!hashLookup(uniqHash, dtName) &&
	   !cartTrackDbIsAccessDenied(jp->b->database, jp->b->table))
	    {
	    hashAdd(uniqHash, dtName, NULL);
	    out = dbTableNew(jp->b->database, jp->b->table);
	    slAddHead(&outList, out);
	    }
	}
    joinerPairFreeList(&jpList);
    hFreeConn(&conn);
    }
slSort(&outList, dbTableCmp);

/* Print html. */
if (outList != NULL)
    {
    webNewSection("Linked Tables");
    hTableStart();
    for (out = outList; out != NULL; out = out->next)
	{
	struct sqlConnection *conn = hAllocConn(out->db);
	struct asObject *asObj = asForTable(conn, out->table);
	char *var = dbTableVar(varPrefix, out->db, out->table);
	hPrintf("<TR>");
	hPrintf("<TD>");
	cgiMakeCheckBox(var, varOn(var));
	hPrintf("</TD>");
	hPrintf("<TD>%s</TD>", out->db);
	hPrintf("<TD>%s</TD>", out->table);
	hPrintf("<TD>");
	if (asObj != NULL)
	    hPrintf("%s", asObj->comment);
	else
	    hPrintf("&nbsp;");
	hPrintf("</TD>");
	hPrintf("</TR>");
	hFreeConn(&conn);
	}
    hTableEnd();
    hPrintf("<BR>");

    cgiMakeButton(buttonName, buttonText);
    }
}
Ejemplo n.º 29
0
void submitRefToFiles(struct sqlConnection *conn, struct sqlConnection *conn2, struct sqlConnection *connSp,
    char *ref, char *fileRoot, char *inJax)
/* Create a .ra and a .tab file for given reference. */
{
/* Initially the tab file will have some duplicate lines, so
 * write to temp file, and then filter. */
char raName[PATH_LEN], tabName[PATH_LEN], capName[PATH_LEN];
FILE *ra = NULL, *tab = NULL, *cap = NULL;
struct dyString *query = dyStringNew(0);
struct sqlResult *sr;
char **row;
char *pubMed;
struct slName *list, *el;
boolean gotAny = FALSE;
struct hash *uniqImageHash = newHash(0);
struct hash *captionHash = newHash(0);
int imageWidth = 0, imageHeight = 0;
char path[PATH_LEN];
struct dyString *caption = dyStringNew(0);
struct dyString *copyright = dyStringNew(0);
struct dyString *probeNotes = dyStringNew(0);
boolean lookedForCopyright = FALSE;
	
safef(raName, sizeof(raName), "%s.ra", fileRoot);
safef(tabName, sizeof(tabName), "%s.tab", fileRoot);
safef(capName, sizeof(capName), "%s.txt", fileRoot);
tab = mustOpen(tabName, "w");
cap = mustOpen(capName, "w");


sqlDyStringPrintf(query, "select authors,journal,title,year from BIB_Refs where ");
sqlDyStringPrintf(query, "_Refs_key = '%s'", ref);
sr = sqlGetResultVerbose(conn, query->string);
row = sqlNextRow(sr);
if (row == NULL)
    errAbort("Can't find _Refs_key %s in BIB_Refs", ref);

/* Make ra file with stuff common to whole submission set. */
ra = mustOpen(raName, "w");
fprintf(ra, "submissionSource MGI\n");
fprintf(ra, "acknowledgement Thanks to the Gene Expression Database group at "
            "Mouse Genome Informatics (MGI) for collecting, annotating and sharing "
	    "this image. The MGI images were last updated in VisiGene on March 28, 2006. "
	    "Additional and more up to date annotations and images may be available "
	    "directly at <A HREF='http://www.informatics.jax.org' target='_blank'>MGI.</A>\n");
fprintf(ra, "submitSet jax%s\n", ref);
fprintf(ra, "taxon 10090\n");	/* Mus musculus taxon */
fprintf(ra, "fullDir http://hgwdev.gi.ucsc.edu/visiGene/full/inSitu/Mouse/jax\n");
fprintf(ra, "thumbDir http://hgwdev.gi.ucsc.edu/visiGene/200/inSitu/Mouse/jax\n");
fprintf(ra, "setUrl http://www.informatics.jax.org/\n");
fprintf(ra, "itemUrl http://www.informatics.jax.org/searches/image.cgi?%%s\n");
fprintf(ra, "abUrl http://www.informatics.jax.org/searches/antibody.cgi?%%s\n");
fprintf(ra, "journal %s\n", row[1]);
fprintf(ra, "publication %s\n", row[2]);
fprintf(ra, "year %s\n", row[3]);

/* The contributor (author) list is in format Kent WJ; Haussler DH; format in
 * Jackson.  We convert it to Kent W.J.,Haussler D.H., format for visiGene. */
fprintf(ra, "contributor ");
list = charSepToSlNames(row[0], ';');
for (el = list; el != NULL; el = el->next)
    {
    char *lastName = skipLeadingSpaces(el->name);
    char *initials = strrchr(lastName, ' ');
    if (initials == NULL)
	initials = "";
    else
	*initials++ = 0;
    fprintf(ra, "%s", lastName);
    if (initials[0] != 0)
	{
	char c;
	fprintf(ra, " ");
	while ((c = *initials++) != 0)
	    fprintf(ra, "%c.", c);
	}
    fprintf(ra, ",");
    }
fprintf(ra, "\n");
slNameFreeList(&list);
sqlFreeResult(&sr);

/* Add in link to PubMed record on publication. */
dyStringClear(query);
sqlDyStringPrintf(query,
   "select ACC_Accession.accID from ACC_Accession,ACC_LogicalDB "
   "where ACC_Accession._Object_key = %s "
   "and ACC_Accession._LogicalDB_key = ACC_LogicalDB._LogicalDB_key "
   "and ACC_LogicalDB.name = 'PubMed'", ref);
pubMed = sqlQuickStringVerbose(conn, query->string);
if (pubMed != NULL)
    fprintf(ra, "pubUrl https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=%s\n", pubMed);
freez(&pubMed);

dyStringClear(query);
sqlDyStringPrintf(query, 
	"select distinct MRK_Marker.symbol as gene,"
               "GXD_Specimen.sex as sex,"
	       "GXD_Specimen.age as age,"
	       "GXD_Specimen.ageMin as ageMin,"
	       "GXD_Specimen.ageMax as ageMax,"
	       "IMG_ImagePane.paneLabel as paneLabel,"
	       "ACC_Accession.numericPart as fileKey,"
	       "IMG_Image._Image_key as imageKey,"
	       "GXD_Assay._ProbePrep_key as probePrepKey,"
	       "GXD_Assay._AntibodyPrep_key as antibodyPrepKey,"
	       "GXD_Assay._ReporterGene_key as reporterGeneKey,"
	       "GXD_FixationMethod.fixation as fixation,"
	       "GXD_EmbeddingMethod.embeddingMethod as embedding,"
	       "GXD_Assay._Assay_key as assayKey,"
	       "GXD_Specimen.hybridization as sliceType,"
	       "GXD_Specimen._Genotype_key as genotypeKey,"
	       "IMG_ImagePane._ImagePane_key as imagePaneKey\n"
	"from MRK_Marker,"
	     "GXD_Assay,"
	     "GXD_Specimen,"
	     "GXD_InSituResult,"
	     "GXD_InSituResultImage,"
	     "GXD_FixationMethod,"
	     "GXD_EmbeddingMethod,"
	     "IMG_ImagePane,"
	     "IMG_Image,"
	     "ACC_Accession\n"
	"where MRK_Marker._Marker_key = GXD_Assay._Marker_key "
	  "and GXD_Assay._Assay_key = GXD_Specimen._Assay_key "
	  "and GXD_Specimen._Specimen_key = GXD_InSituResult._Specimen_key "
	  "and GXD_InSituResult._Result_key = GXD_InSituResultImage._Result_key "
	  "and GXD_InSituResultImage._ImagePane_key = IMG_ImagePane._ImagePane_key "
	  "and GXD_FixationMethod._Fixation_key = GXD_Specimen._Fixation_key "
	  "and GXD_EmbeddingMethod._Embedding_key = GXD_Specimen._Embedding_key "
	  "and IMG_ImagePane._Image_key = IMG_Image._Image_key "
	  "and IMG_Image._Image_key = ACC_Accession._Object_key "
	  "and ACC_Accession.prefixPart = 'PIX:' "
	  "and GXD_Assay._ImagePane_key is NULL "
	);
sqlDyStringPrintf(query, "and GXD_Assay._Refs_key = '%s'", ref);
sr = sqlGetResultVerbose(conn, query->string);

fprintf(tab, "#");
fprintf(tab, "gene\t");
fprintf(tab, "probeColor\t");
fprintf(tab, "sex\t");
fprintf(tab, "age\t");
fprintf(tab, "ageMin\t");
fprintf(tab, "ageMax\t");
fprintf(tab, "paneLabel\t");
fprintf(tab, "fileName\t");
fprintf(tab, "submitId\t");
fprintf(tab, "fPrimer\t");
fprintf(tab, "rPrimer\t");
fprintf(tab, "abName\t");
fprintf(tab, "abTaxon\t");
fprintf(tab, "abSubmitId\t");
fprintf(tab, "fixation\t");
fprintf(tab, "embedding\t");
fprintf(tab, "bodyPart\t");
fprintf(tab, "sliceType\t");
fprintf(tab, "genotype\t");
fprintf(tab, "strain\t");
fprintf(tab, "priority\t");
fprintf(tab, "captionId\t");
fprintf(tab, "imageWidth\t");
fprintf(tab, "imageHeight\n");

while ((row = sqlNextRow(sr)) != NULL)
    {
    char *gene = row[0];
    char *sex = row[1];
    char *age = row[2];
    char *ageMin = row[3];
    char *ageMax = row[4];
    char *paneLabel = row[5];
    char *fileKey = row[6];
    char *imageKey = row[7];
    char *probePrepKey = row[8];
    char *antibodyPrepKey = row[9];
    char *reporterGeneKey = row[10];
    char *fixation = row[11];
    char *embedding = row[12];
    char *assayKey = row[13];
    char *sliceType = row[14];
    char *genotypeKey = row[15];
    char *imagePaneKey = row[16];
    double calcAge = -1;
    char *probeColor = "";
    char *bodyPart = "";
    char *abName = NULL;
    char *rPrimer = NULL, *fPrimer = NULL;
    char *genotype = NULL;
    char *strain = NULL;
    char *priority = NULL;
    char abTaxon[32];
    char *captionId = "";
    char *abSubmitId = NULL;

    verbose(3, "   "); dumpRow(row, 16);

    if (age == NULL)
        continue;

    if (!lookedForCopyright)
	{
	struct sqlResult *sr = NULL;
	char **row;
	lookedForCopyright = TRUE;

	dyStringClear(query);
	sqlDyStringPrintf(query, 
	     "select note from MGI_NoteChunk,MGI_Note,MGI_NoteType,ACC_MGIType "
	     "where MGI_Note._Object_key = %s "
	     "and ACC_MGIType.name = 'Image' "
	     "and ACC_MGIType._MGIType_key = MGI_Note._MGIType_key "
	     "and MGI_NoteType.noteType='Copyright' "
	     "and MGI_Note._NoteType_key = MGI_NoteType._NoteType_key "
	     "and MGI_Note._Note_key = MGI_NoteChunk._Note_key "
	     "order by sequenceNum"
	     , imageKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	while ((row = sqlNextRow(sr)) != NULL)
	   dyStringAppend(copyright, row[0]);
	sqlFreeResult(&sr);

	verbose(2,"imageKey=%s\n",imageKey);

	if (copyright->stringSize != 0)
	    {
	    fprintf(ra, "copyright %s\n", copyright->string);
	    }
	}

    /* Massage sex */
        {
	if (sameString(sex, "Male"))
	    sex = "male";
	else if (sameString(sex, "Female"))
	    sex = "female";
	else
	    sex = "";
	}

    /* Massage age */
	{
	char *embryoPat = "embryonic day ";
	char *newbornPat = "postnatal newborn";
	char *dayPat = "postnatal day ";
	char *weekPat = "postnatal week ";
	char *adultPat = "postnatal adult";
	double calcMinAge = atof(ageMin);
	double calcMaxAge = atof(ageMax);
	double mouseBirthAge = 21.0;
	//double mouseAdultAge = 63.0;	/* Relative to conception, not birth */

	if (age[0] == 0)
	    {
	    warn("age null, ageMin %s, ageMax %s\n", ageMin, ageMax);
	    calcAge = (calcMinAge + calcMaxAge) * 0.5;
	    }
	else if (startsWith(embryoPat, age))
	    calcAge = atof(age+strlen(embryoPat));
	else if (sameString(newbornPat, age))
	    calcAge = mouseBirthAge;
	else if (startsWith(dayPat, age))
	    calcAge = atof(age+strlen(dayPat)) + mouseBirthAge;
        else if (startsWith(weekPat, age))
	    calcAge = 7.0 * atof(age+strlen(weekPat)) + mouseBirthAge;
	else if (sameString(adultPat, age) && calcMaxAge - calcMinAge > 1000 
		&& calcMinAge < 365)
	    calcAge = 365;	/* Most adult mice are relatively young */
	else
	    {
	    warn("Calculating age from %s", age);
	    calcAge = (calcMinAge + calcMaxAge) * 0.5;
	    }
	if (calcAge < calcMinAge)
	    calcAge = calcMinAge;
	if (calcAge > calcMaxAge)
	    calcAge = calcMaxAge;
	}
    
    /* Massage probeColor */
        {
	if (!isStrNull(reporterGeneKey))
	    {
	    /* Fixme: make sure that reporterGene's end up in probeType table. */
	    char *name = NULL;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, 
	    	"select term from VOC_Term where _Term_key = %s", 
	    	reporterGeneKey);
	    name = sqlQuickStringVerbose(conn2, query->string);
	    if (name == NULL)
	        warn("Can't find _ReporterGene_key %s in VOC_Term", 
			reporterGeneKey);
	    else if (sameString(name, "GFP"))
	        probeColor = "green";
	    else if (sameString(name, "lacZ"))
	        probeColor = "blue";
	    else 
	        warn("Don't know color of reporter gene %s", name);
	    freez(&name);
	    }
	if (!isStrNull(probePrepKey))
	    {
	    char *name = NULL;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, 
	      "select GXD_VisualizationMethod.visualization "
	      "from GXD_VisualizationMethod,GXD_ProbePrep "
	      "where GXD_ProbePrep._ProbePrep_key = %s "
	      "and GXD_ProbePrep._Visualization_key = GXD_VisualizationMethod._Visualization_key"
	      , probePrepKey);
	    name = sqlQuickStringVerbose(conn2, query->string);
	    if (name == NULL)
	        warn("Can't find visualization from _ProbePrep_key %s", probePrepKey);
	    probeColor = colorFromLabel(name, gene);
	    freez(&name);
	    if (probeColor[0] == 0)
	        {
		dyStringClear(query);
		sqlDyStringPrintf(query, 
			"select GXD_Label.label from GXD_Label,GXD_ProbePrep "
		        "where GXD_ProbePrep._ProbePrep_key = %s " 
			"and GXD_ProbePrep._Label_key = GXD_Label._Label_key"
		        , probePrepKey);
		name = sqlQuickStringVerbose(conn2, query->string);
		if (name == NULL)
		    warn("Can't find label from _ProbePrep_key %s", 
		    	probePrepKey);
		probeColor = colorFromLabel(name, gene);
		}
	    freez(&name);
	    }
	if (!isStrNull(antibodyPrepKey) && probeColor[0] == 0 )
	    {
	    char *name = NULL;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, 
		  "select GXD_Label.label from GXD_Label,GXD_AntibodyPrep "
		  "where GXD_AntibodyPrep._AntibodyPrep_key = %s "
		  "and GXD_AntibodyPrep._Label_key = GXD_Label._Label_key"
		  , antibodyPrepKey);
	    name = sqlQuickStringVerbose(conn2, query->string);
	    if (name == NULL)
		warn("Can't find label from _AntibodyPrep_key %s", antibodyPrepKey);
	    probeColor = colorFromLabel(name, gene);
	    freez(&name);
	    }
	}

    /* Get abName, abTaxon, abSubmitId */
    abTaxon[0] = 0;
    if (!isStrNull(antibodyPrepKey))
        {
	struct sqlResult *sr = NULL;
	int orgKey = 0;
	char **row;
	dyStringClear(query);
	sqlDyStringPrintf(query, 
		"select antibodyName,_Organism_key,GXD_Antibody._Antibody_key "
		"from GXD_AntibodyPrep,GXD_Antibody "
		"where GXD_AntibodyPrep._AntibodyPrep_key = %s "
		"and GXD_AntibodyPrep._Antibody_key = GXD_Antibody._Antibody_key"
		, antibodyPrepKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	row = sqlNextRow(sr);
	if (row != NULL)
	    {
	    abName = cloneString(row[0]);
	    orgKey = atoi(row[1]);
	    abSubmitId = cloneString(row[2]);
	    }
	sqlFreeResult(&sr);

	if (orgKey > 0)
	    {
	    char *latinName = NULL, *commonName = NULL;
	    int spTaxon = 0;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, "select latinName from MGI_Organism "
	                          "where _Organism_key = %d", orgKey);
	    latinName = sqlQuickStringVerbose(conn2, query->string);
	    if (latinName != NULL 
		&& !sameString(latinName, "Not Specified")
		&& !sameString(latinName, "Not Applicable"))
		{
		char *e = strchr(latinName, '/');
		if (e != NULL) 
		   *e = 0;	/* Chop off / and after. */
		spTaxon = spBinomialToTaxon(connSp, latinName);
		}
	    else
	        {
		dyStringClear(query);
		sqlDyStringPrintf(query, "select commonName from MGI_Organism "
	                          "where _Organism_key = %d", orgKey);
		commonName = sqlQuickStringVerbose(conn2, query->string);
		if (commonName != NULL 
		    && !sameString(commonName, "Not Applicable")
		    && !sameString(commonName, "Not Specified"))
		    {
		    spTaxon = spCommonToTaxon(connSp, commonName);
		    }
		}
	    if (spTaxon != 0)
	        safef(abTaxon, sizeof(abTaxon), "%d", spTaxon);
	    freez(&latinName);
	    freez(&commonName);
	    }
	}
    if (abName == NULL)
        abName = cloneString("");
    if (abSubmitId == NULL)
        abSubmitId = cloneString("");

    /* Get rPrimer, lPrimer */
    if (!isStrNull(probePrepKey))
        {
	struct sqlResult *sr = NULL;
	char **row;
	dyStringClear(query);
	sqlDyStringPrintf(query,
	    "select primer1sequence,primer2sequence "
	    "from PRB_Probe,GXD_ProbePrep "
	    "where PRB_Probe._Probe_key = GXD_ProbePrep._Probe_key "
	    "and GXD_ProbePrep._ProbePrep_key = %s"
	    , probePrepKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	row = sqlNextRow(sr);
	if (row != NULL)
	    {
	    fPrimer = cloneString(row[0]);
	    rPrimer = cloneString(row[1]);
	    }
	sqlFreeResult(&sr);
	}

    /* Note Jackson database actually stores the primers very
     * erratically.  In all the cases I can find for in situs
     * the primers are actually stored in free text in the PRB_Notes
     * e.g.  ... primers CGCGGATCCAGGGGAAACAGAAGGGCTGCG and CCCAAGCTTAGACTGTACAGGCTGAGCC ...
     */
    if (fPrimer == NULL || fPrimer[0]==0)
        {
	struct sqlResult *sr = NULL;
	char **row;
	dyStringClear(query);
	sqlDyStringPrintf(query,
	    "select PRB_Notes.note from GXD_ProbePrep, PRB_Notes"
	    " where GXD_ProbePrep._ProbePrep_key = %s"
	    "  and GXD_ProbePrep._Probe_key = PRB_Notes._Probe_key"
	    " order by PRB_Notes.sequenceNum"
	    , probePrepKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	dyStringClear(probeNotes);
	while ((row = sqlNextRow(sr)) != NULL)
	   dyStringAppend(probeNotes, row[0]);
	sqlFreeResult(&sr);

	if (probeNotes->stringSize > 0)
	    {
	    char f[256];
	    char r[256];
	    int i = 0;
	    char *s = strstr(probeNotes->string," primers ");
	    if (s)
		{
		s += strlen(" primers ");
		i = 0;
		while (strchr("ACGT",*s) && (i<sizeof(f)))
		    f[i++] = *s++;
		f[i]=0;
		if (strstr(s," and ")==s)
		    {
		    s += strlen(" and ");
		    i = 0;
    		    while (strchr("ACGT",*s) && (i<sizeof(r)))
    			r[i++] = *s++;
    		    r[i]=0;
		    if (strlen(f) >= 10 && strlen(r) >= 10)
			{
			fPrimer = cloneString(f);
			rPrimer = cloneString(r);
			}
		    else
			{
			verbose(1, "bad primer parse:_ProbePrep_key=%s fPrimer=[%s], rPrimer=[%s]\n",
			    probePrepKey,f,r);
			}
		    }
		}
	    }
	}
	
    if (fPrimer == NULL)
        fPrimer = cloneString("");
    if (rPrimer == NULL)
        rPrimer = cloneString("");

    fixation = blankOutUnknown(fixation);
    embedding = blankOutUnknown(embedding);

    /* Massage body part and slice type.  We only handle whole mounts. */
    if (sameString(sliceType, "whole mount"))
	{
	bodyPart = "whole";
	priority = "100";
	}
    else
	{
        sliceType = "";
	priority = "1000";
	}

    genotypeAndStrainFromKey(genotypeKey, conn2, &genotype, &strain);

    if (isStrNull(paneLabel))
	paneLabel = cloneString("");	  /* trying to suppress nulls in output */
    stripChar(paneLabel, '"');	/* Get rid of a difficult quote to process. */
    
    /* Fetch image dimensions from file. */
    imageWidth=0;
    imageHeight=0;
    safef(path, sizeof(path), "%s/%s.jpg", inJax, fileKey);
    if (fileExists(path))
    	jpegSize(path,&imageWidth,&imageHeight);  /* will errAbort if no valid .jpeg exists */
    else
	warn("Picture Missing! %s ",path);
    
    /* Deal caption if any.  Most of the work only happens the
     * first time see the image. */
    if (!hashLookup(uniqImageHash, imageKey))
        {
	struct sqlResult *sr = NULL;
	char **row;
	hashAdd(uniqImageHash, imageKey, NULL);
	dyStringClear(caption);
	dyStringClear(query);
	sqlDyStringPrintf(query, 
	     "select note from MGI_NoteChunk,MGI_Note,MGI_NoteType,ACC_MGIType "
	     "where MGI_Note._Object_key = %s "
	     "and ACC_MGIType.name = 'Image' "
	     "and ACC_MGIType._MGIType_key = MGI_Note._MGIType_key "
	     "and MGI_NoteType.noteType='Caption' "
	     "and MGI_Note._NoteType_key = MGI_NoteType._NoteType_key "
	     "and MGI_Note._Note_key = MGI_NoteChunk._Note_key "
	     "order by sequenceNum"
	     , imageKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	while ((row = sqlNextRow(sr)) != NULL)
	   dyStringAppend(caption, row[0]);
	sqlFreeResult(&sr);

	if (caption->stringSize > 0)
	    {
	    subChar(caption->string, '\t', ' ');
	    subChar(caption->string, '\n', ' ');
	    fprintf(cap, "%s\t%s\n", imageKey, caption->string);
	    hashAdd(captionHash, imageKey, imageKey);
	    }
	}
    if (hashLookup(captionHash, imageKey))
        captionId = imageKey;
    else
        captionId = "";

    fprintf(tab, "%s\t", gene);
    fprintf(tab, "%s\t", probeColor);
    fprintf(tab, "%s\t", sex);
    fprintf(tab, "%3.2f\t", calcAge);
    fprintf(tab, "%s\t", ageMin);
    fprintf(tab, "%s\t", ageMax);
    fprintf(tab, "%s\t", paneLabel);   /* may have to change NULL to empty string or "0" ? */
    fprintf(tab, "%s.jpg\t", fileKey);
    fprintf(tab, "%s\t", imageKey);
    fprintf(tab, "%s\t", fPrimer);
    fprintf(tab, "%s\t", rPrimer);
    fprintf(tab, "%s\t", abName);
    fprintf(tab, "%s\t", abTaxon);
    fprintf(tab, "%s\t", abSubmitId);
    fprintf(tab, "%s\t", fixation);
    fprintf(tab, "%s\t", embedding);
    fprintf(tab, "%s\t", bodyPart);
    fprintf(tab, "%s\t", sliceType);
    fprintf(tab, "%s\t", genotype);
    fprintf(tab, "%s\t", strain);
    fprintf(tab, "%s\t", priority);
    fprintf(tab, "%s\t", captionId);
    fprintf(tab, "%d\t", imageWidth);
    fprintf(tab, "%d\n", imageHeight);

    printExpression(tab,  conn2,  imagePaneKey, assayKey);
    gotAny = TRUE;
    freez(&genotype);
    freez(&abName);
    freez(&abSubmitId);
    freez(&rPrimer);
    freez(&fPrimer);
    }
sqlFreeResult(&sr);

carefulClose(&ra);
carefulClose(&tab);
carefulClose(&cap);

if (!gotAny)
    {
    remove(raName);
    remove(capName);
    remove(tabName);
    }
dyStringFree(&probeNotes);
dyStringFree(&copyright);
dyStringFree(&caption);
dyStringFree(&query);
hashFree(&uniqImageHash);
hashFree(&captionHash);

}
Ejemplo n.º 30
0
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, 
	char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile)
/* txGeneAlias - Make kgAlias and kgProtAlias tables.. */
{
/* Read and hash oldToNew */
struct hash *newToOldHash = loadNewToOldHash(oldToNew);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);
struct sqlResult *sr;
char **row;
char query[256];

/* Open files. */
struct lineFile *lf = lineFileOpen(xrefFile, TRUE);
FILE *fAlias = mustOpen(aliasFile, "w");
FILE *fProt = mustOpen(protAliasFile, "w");

/* Stream through xref file, which has much of the info we need,
 * and which contains a line for each gene. */
char *words[KGXREF_NUM_COLS];
while (lineFileRowTab(lf, words))
    {
    /* Load the xref, and output most of it's fields as aliases. */
    struct kgXref *x = kgXrefLoad(words);
    char *id = x->kgID;
    outAlias(fAlias, id, x->kgID);
    outAlias(fAlias, id, x->mRNA);
    outAlias(fAlias, id, x->spID);
    outAlias(fAlias, id, x->spDisplayID);
    outAlias(fAlias, id, x->geneSymbol);
    outAlias(fAlias, id, x->refseq);
    outAlias(fAlias, id, x->protAcc);
    char *old = hashFindVal(newToOldHash, id);
    if (old != NULL)
        outAlias(fAlias, id, old);

    /* If we've got a uniProt ID, use that to get more info from uniProt. */
    char *acc = x->spID;
    if (acc[0] != 0)
        {
	/* Get current accession and output a bunch of easy protein aliases. */
	acc = spLookupPrimaryAcc(uConn, acc);
	outProt(fProt, id, acc, acc);
	outProt(fProt, id, acc, x->spDisplayID);
	outProt(fProt, id, acc, x->geneSymbol);
	outProt(fProt, id, acc, x->protAcc);
	if (old != NULL)
	    outProt(fProt, id, acc, old);

	/* Throw in old swissProt accessions. */
	sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc);
	sr = sqlGetResult(uConn, query);
	while ((row = sqlNextRow(sr)) != NULL)
	    {
	    outAlias(fAlias, id, row[0]);
	    outProt(fProt, id, acc, row[0]);
	    }

	/* Throw in gene names that SwissProt knows about */
	struct slName *gene, *geneList = spGenes(uConn, acc);
	for (gene = geneList; gene != NULL; gene = gene->next)
	    {
	    outAlias(fAlias, id, gene->name);
	    outProt(fProt, id, acc, gene->name);
	    }
	slFreeList(&geneList);
	}
    /* Throw in gene names from genbank. */
    /* At some point we may want to restrict this to the primary transcript in a cluster. */
    ev = hashFindVal(evHash,  id);
    if (ev != NULL)
	{
	int i;
	for (i=0; i<ev->accCount; ++i)
	    {
	    sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", acc);
	    int nameId = sqlQuickNum(gConn, query);
	    if (nameId != 0)
		{
		char name[64];
		sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId);
		if (sqlQuickQuery(gConn, query, name, sizeof(name)))
		    outAlias(fAlias, id, name);
		}
	    }
	}

    kgXrefFree(&x);
    }

carefulClose(&fAlias);
carefulClose(&fProt);
}