Ejemplo n.º 1
0
void doTransRegCodeProbe(struct trackDb *tdb, char *item,
	char *codeTable, char *motifTable,
	char *tfToConditionTable, char *conditionTable)
/* Write the details page for one ChIP-chip probe of the transRegCode
 * experiments.  The row named 'item' is read from tdb->table, its
 * factor_condition entries are grouped per transcription factor, motif
 * hits from codeTable overlapping the probe are attached to the matching
 * factor, and the binding-level and growth-condition sections are printed.
 * The track description is printed whether or not the probe was found. */
{
char query[256];
struct sqlResult *sr;
char **row;
int rowOffset = hOffsetPastBin(database, seqName, tdb->table);
struct sqlConnection *conn = hAllocConn(database);
struct transRegCodeProbe *probe = NULL;

cartWebStart(cart, database, "ChIP-chip Probe Info");

/* Only the first row matching the item name is loaded. */
sqlSafef(query, sizeof(query), "select * from %s where name = '%s'",
	tdb->table, item);
sr = sqlGetResult(conn, query);
row = sqlNextRow(sr);
if (row != NULL)
    probe = transRegCodeProbeLoad(row+rowOffset);
sqlFreeResult(&sr);

if (probe != NULL)
    {
    struct tfData *factorList = NULL;
    struct hash *factorHash = newHash(0);
    int ix;

    /* Header: probe name and genomic position. */
    printf("<B>Name:</B> %s<BR>\n", probe->name);
    printPosOnChrom(probe->chrom, probe->chromStart, probe->chromEnd,
	    NULL, TRUE, probe->name);

    /* Group the probe's entries by transcription factor.  Each entry is
     * "factor_condition"; the string is split in place at the first '_'. */
    for (ix = 0; ix < probe->tfCount; ++ix)
	{
	char *factorName = probe->tfList[ix];
	char *underscore = strchr(factorName, '_');
	char *conditionName = "n/a";
	struct tfData *factor;
	struct tfCond *cond;

	if (underscore != NULL)
	    {
	    *underscore = 0;
	    conditionName = underscore + 1;
	    }

	factor = hashFindVal(factorHash, factorName);
	if (factor == NULL)
	    {
	    /* First time this factor is seen: create and register it. */
	    AllocVar(factor);
	    hashAddSaveName(factorHash, factorName, factor, &factor->name);
	    slAddHead(&factorList, factor);
	    }

	AllocVar(cond);
	cond->name = cloneString(conditionName);
	cond->binding = probe->bindVals[ix];
	slAddHead(&factor->conditionList, cond);
	}
    slSort(&factorList, tfDataCmpName);

    /* Attach motif hits that fall inside the probe to their factor.
     * Hits for factors not on this probe are ignored. */
    if (sqlTableExists(conn, codeTable))
	{
	sr = hRangeQuery(conn, codeTable,
		probe->chrom, probe->chromStart, probe->chromEnd,
		"chipEvidence != 'none'", &rowOffset);
	for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
	    {
	    struct transRegCode *hit = transRegCodeLoad(row+rowOffset);
	    struct tfData *factor = hashFindVal(factorHash, hit->name);
	    if (factor != NULL)
		slAddTail(&factor->trcList, hit);
	    }
	sqlFreeResult(&sr);
	}

    if (factorList != NULL)
	tfBindLevelSection(factorList, conn, motifTable, tfToConditionTable);
    else
	printf("No significant immunoprecipitation.");

    transRegCodeProbeFree(&probe);
    growthConditionSection(conn, conditionTable);
    }

printf("\n<HR>\n");
printTrackHtml(tdb);
hFreeConn(&conn);
}
Ejemplo n.º 2
0
void bedItemOverlapCount(struct hash *chromHash, char *infile, char *outfile){
/* Count, for every base, how many items of the bed file 'infile' cover it
 * and write the counts chromosome by chromosome to 'outfile' through
 * outputCounts().
 *   chromHash - maps chromosome name to chromosome size.
 *   infile    - bed file, read as 12 columns when doBed12 is set, else 3.
 *   outfile   - file to create.
 * The input must be grouped by chromosome: meeting a chromosome a second
 * time after another one is a fatal error.  An item running past the end of
 * chrM is wrapped around to the start (circular genome) unless bed12 mode is
 * on; any other item past the chromosome end aborts. */
unsigned maxChromSize = 0;
unitSize *counts = (unitSize *)NULL;
FILE *f = mustOpen(outfile, "w");
struct hashCookie hc = hashFirst(chromHash);
struct hashEl *hel;
while( (hel = hashNext(&hc)) != NULL) {
    unsigned num = (unsigned) ptToInt(hel->val);
    maxChromSize = max(num, maxChromSize);
}
verbose(2,"#\tmaxChromSize: %u\n", maxChromSize);
if (maxChromSize < 1)
    errAbort("maxChromSize is zero ?");

/*	Allocate just once for the largest chrom and reuse this array */
counts = needHugeMem(sizeof(unitSize) * maxChromSize);

/*	Reset the array to be zero to be reused */
memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize);

unsigned chromSize = 0;
char *prevChrom = (char *)NULL;
boolean outputToDo = FALSE;
struct hash *seenHash = newHash(5);

struct lineFile *bf = lineFileOpen(infile , TRUE);
struct bed *bed = (struct bed *)NULL;
char *row[12];
int numFields = doBed12 ? 12 : 3;

while (lineFileNextRow(bf,row, numFields))
    {
    int i;
    bed = bedLoadN(row, numFields);

    verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);

    if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
	{
	verbose(2,"#\tchrom %s done, size %u\n", prevChrom, chromSize);
	if (outputToDo)
	    outputCounts(counts, prevChrom, chromSize, f);
	outputToDo = FALSE;
	memset((void *)counts, 0,
	    sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */
	freez(&prevChrom);
	// prevChrom is now NULL so it will be caught by next if!
	}
    if ((char *)NULL == prevChrom)  // begin a chr
	{
	if (hashLookup(seenHash, bed->chrom))
	    errAbort("ERROR:input file not sorted. %s seen before on line %d\n",
		bed->chrom, bf->lineIx);

	hashAdd(seenHash, bed->chrom, NULL);
	prevChrom = cloneString(bed->chrom);
	chromSize = hashIntVal(chromHash, prevChrom);
	verbose(2,"#\tchrom %s starting, size %u\n", prevChrom,chromSize);
	}
    if (bed->chromEnd > chromSize)
	{
	// check for circular chrM
	if (doBed12 || bed->chromStart>=chromSize
	    || differentWord(bed->chrom,"chrM"))
	    {
	    warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
		bed->chromEnd);
	    errAbort("chromEnd > chromSize ?  %d > %u",
		bed->chromEnd,chromSize);
	    }

	/* Wrap around: count the tail of the chromosome, then the part
	 * that spills over from position 0. */
	for (i = bed->chromStart; i < chromSize; ++i)
	    INCWOVERFLOW(counts,i);
	for (i = 0; i < (bed->chromEnd - chromSize); ++i)
	    INCWOVERFLOW(counts,i);
	}
    else if (doBed12)
	{
	/* Count only the bases covered by blocks, not the gaps between. */
	int *starts = bed->chromStarts;
	int *sizes = bed->blockSizes;
	int *endStarts = &bed->chromStarts[bed->blockCount];

	for(; starts < endStarts; starts++, sizes++)
	    {
	    unsigned int end = *starts + *sizes + bed->chromStart;
	    for (i = *starts + bed->chromStart; i < end; ++i)
		INCWOVERFLOW(counts,i);
	    }
	}
    else
	{
	for (i = bed->chromStart; i < bed->chromEnd; ++i)
	    INCWOVERFLOW(counts, i);
	}
    outputToDo = TRUE;
    bedFree(&bed); // plug the memory leak
    }

lineFileClose(&bf);
// Note, next file could be on same chr!

if (outputToDo)
    outputCounts(counts, prevChrom, chromSize, f);

if (doOutBounds)
    fprintf(stderr, "min %lu max %lu\n", (unsigned long)overMin, (unsigned long)overMax);

/* prevChrom is still NULL when the input had no rows; passing NULL to %s
 * is undefined, so only report a chromosome that was actually processed. */
if (prevChrom != NULL)
    verbose(2,"#\tchrom %s done, size %u\n", prevChrom, chromSize);
carefulClose(&f);
freeMem(counts);
freez(&prevChrom);
// hashFreeWithVals(&chromHash, freez);
freeHash(&seenHash);
}
void analyse(int start, int stop)
/* Read the alignment log from the global inFile and analyse the cDNAs in it.
 * When start > 1, lines are skipped until the 'alignments' line that brings
 * the running cDNA count up to start.  After that, at most (stop - start)
 * cDNAs are refined; each one is described by a 'Blasting' line, a series of
 * 'hits' lines and a closing 'alignments' line.  The resulting list is passed
 * to doGoodBad() and doUnusual().  Malformed lines abort with the line
 * number. */
{
    struct hash *hash;
    char line[512];
    int lineCount = 0;
    char *words[32];
    int wordCount;
    struct cdnaInfo *cdnaList = NULL;
    struct cdnaInfo *ci = NULL;
    int cdnaCount;
    int maxCdnaCount = stop - start;

    cdnaCount = 1;
    if (start > 1)
    {
        for (;;)
        {
            if (!fgets(line, sizeof(line), inFile))
                errAbort("Not %d cDNAs in file, only %d\n", start, cdnaCount);
            ++lineCount;
            if (line[0] == '#') /* Skip comments. */
                continue;
            wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words));
            if (wordCount <= 0) /* Skip empty lines. */
                continue;
            /* words[1] is only valid when at least two words were parsed. */
            if (wordCount >= 2 && !differentWord(words[1], "alignments"))
            {
                ++cdnaCount;
                if (cdnaCount >= start)
                    break;
            }
        }
    }
    cdnaCount = 0;
    hash = newHash(14); /* Hash table with 16k entries. */
    for (;;)
    {
        if (!fgets(line, sizeof(line), inFile))
            break;
        ++lineCount;
        if (line[0] == '#') /* Skip comments. */
            continue;
        wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words));
        if (wordCount <= 0) /* Skip empty lines. */
            continue;
        if (wordCount < 4)  /* Everyone else has at least four words. */
        {
            errAbort("Short line %d:\n", lineCount);
        }
        if (sameWord(words[1], "Blasting"))
        {
            char *cdnaName = words[2];
            if ((ci = lookupInfo(hash, cdnaName)) == NULL)
            {
                struct hashEl *hel;
                ci = needMem(sizeof(*ci));
                hel = hashAdd(hash, cdnaName, ci);
                ci->next = cdnaList;
                cdnaList = ci;
                ci->ix = atoi(words[0]);
                /* Share the hash's copy of the name; 'line' is reused. */
                ci->name = hel->name;
            }
        }
        else if (sameWord(words[2], "hits"))
        {
            /* Newer style - includes cDNA matching range. */
            if (ci == NULL)
                continue;
            /* words[9] is read below, so ten words are required. */
            if (wordCount < 10)
                errAbort("Short line %d:\n", lineCount);
            hitLine(ci, lineCount, words[0], words[1], words[3], words[4], words[5], words[9]);
        }
        else if (sameWord(words[1], "hits"))
            /* Older style - no cDNA matching range. */
        {
            if (ci == NULL)
                continue;
            /* words[8] is read below, so nine words are required. */
            if (wordCount < 9)
                errAbort("Short line %d:\n", lineCount);
            hitLine(ci, lineCount, words[0],     NULL, words[2], words[3], words[4], words[8]);
        }
        else if (sameWord(words[1], "alignments"))
        {
            struct dnaSeq *cdnaSeq;
            struct wormCdnaInfo info;
            if (ci == NULL)
                continue;
            if (differentWord(ci->name, words[3]))
                errAbort("Line %d - %s is not %s", lineCount, words[3], ci->name);
            if (!ci->finished)
            {
                if (!anyCdnaSeq(ci->name, &cdnaSeq, &info))
                {
                    warn("Can't find cDNA %s", ci->name);
                    ci->isDupe = TRUE;
                }
                else
                {
                    ci->baseCount = cdnaSeq->size;
                    ci->baseCrc = dnaCrc(cdnaSeq->dna, cdnaSeq->size);
                    /* Rough alignments were pushed on the head of the list
                     * by hitLine(); restore file order before scoring. */
                    slReverse(&ci->roughAli);
                    ci->roughScore = bestRoughScore(ci->roughAli);
                    filterDupeCdna(ci, cdnaSeq);
                    ci->isBackwards = (info.orientation == '-');
                    refineAlis(ci, cdnaSeq);
                    ci->fineScore = bestFineScore(ci->fineAli);
                    ci->isEmbryonic = info.isEmbryonic;
                    ci->finished = TRUE;
                    freeDnaSeq(&cdnaSeq);
                    ++cdnaCount;
                    if (cdnaCount >= maxCdnaCount)
                        break;
                }
            }
        }
        else
        {
            errAbort("Can't deal with line %d\n", lineCount);
        }
    }

    /* cDNAs were pushed on the head of the list; put them in file order. */
    slReverse(&cdnaList);

    doGoodBad(cdnaList);
    doUnusual(cdnaList);
//makeCdnaToGene(cdnaList);

    /* Clean up. */

    /* These two are slow and not really necessary. */
#ifdef FASTIDIOUS
    slFreeList(&cdnaList);
    freeHash(&hash);
#endif

    uglyf("Done analyse\n");
}
struct mafAli *hgMafFrag(
	char *database,     /* Database, must already have hSetDb to this */
	char *track, 	    /* Name of MAF track */
	char *chrom, 	    /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand, 	    /* Chromosome strand. */
	char *outName, 	    /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* mafFrag- Extract maf sequences for a region from database.
 * This creates a somewhat unusual MAF that extends from start
 * to end whether or not there are actually alignments.  Where
 * there are no alignments (or alignments missing a species)
 * a . character fills in.   The score is always zero, and
 * the sources just indicate the species.  You can mafFree this
 * as normal.
 *
 * The result is a single newly allocated mafAli with one component
 * per organism.  When orderList is given, an organism found in the
 * alignments but missing from orderList is a fatal error. */
{
int chromSize = hChromSize(database, chrom);
struct sqlConnection *conn = hAllocConn(database);
/* NOTE(review): native is not freed anywhere in this function. */
struct dnaSeq *native = hChromSeq(database, chrom, start, end);
struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end);
char masterSrc[128];
struct hash *orgHash = newHash(10);
struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL;
/* curPos: next master position still to be emitted.
 * symCount: number of alignment columns emitted so far. */
int curPos = start, symCount = 0;
struct slName *name;
int order = 0;

/* Check that the mafs are really copacetic, the particular
 * subtype we think is in the database that this (relatively)
 * simple code can handle. */
safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom);
mafCheckFirstComponentSrc(mafList, masterSrc);
mafCheckFirstComponentStrand(mafList, '+');
slSort(&mafList, mafCmp);

/* Prebuild organisms if possible from input orderList.
 * The first name in orderList becomes the native organism. */
for (name = orderList; name != NULL; name = name->next)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, name->name, org, &org->name);
    /* Initial capacity only; presumably 1.5x leaves room for gap columns. */
    org->dy = dyStringNew(native->size*1.5);
    org->order = order++;
    if (nativeOrg == NULL)
        nativeOrg = org;
    }
/* Without an order list the database itself is the native organism. */
if (orderList == NULL)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, database, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    if (nativeOrg == NULL)
        nativeOrg = org;
    }

/* Go through all mafs in window, mostly building up
 * org->dy strings. */
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    struct mafComp *mc, *mcMaster = maf->components;
    struct mafAli *subMaf = NULL;
    order = 0;
    /* Region between the last emitted position and this alignment has no
     * alignment: let fillInMissing() extend every organism's string. */
    if (curPos < mcMaster->start)
	{
	fillInMissing(nativeOrg, orgList, native, start,
		curPos, mcMaster->start);
	symCount += mcMaster->start - curPos;
	}
    if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst
    						    * backtracking */
	{
	/* Trim the alignment to [curPos,end) when it sticks out. */
	if (mafNeedSubset(maf, masterSrc, curPos, end))
	    {
	    subMaf = mafSubset(maf, masterSrc, curPos, end);
	    /* Nothing left after trimming: skip this maf.  curPos and
	     * symCount are left unchanged in that case. */
	    if (subMaf == NULL)
	        continue;
	    }
	else
	    subMaf = maf;
	for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order)
	    {
	    /* Extract name up to dot into 'orgName' */
	    char buf[128], *e, *orgName;

	    if ((mc->size == 0) || (mc->srcSize == 0)) /* skip over components without sequence */
		continue;

	    mc->leftStatus = mc->rightStatus = 0; /* squash annotation */

	    e = strchr(mc->src, '.');
	    if (e == NULL)
		orgName = mc->src;
	    else
		{
		int len = e - mc->src;
		if (len >= sizeof(buf))
		    errAbort("organism/database name %s too long", mc->src);
		memcpy(buf, mc->src, len);
		buf[len] = 0;
		orgName = buf;
		}

	    /* Look up dyString corresponding to  org, and create a
	     * new one if necessary. */
	    org = hashFindVal(orgHash, orgName);
	    if (org == NULL)
		{
		if (orderList != NULL)
		   errAbort("%s is not in orderList", orgName);
		AllocVar(org);
		slAddHead(&orgList, org);
		hashAddSaveName(orgHash, orgName, org, &org->name);
		org->dy = dyStringNew(native->size*1.5);
		/* Pad the newcomer with '.' for every column already
		 * emitted so all strings stay the same length. */
		dyStringAppendMultiC(org->dy, '.', symCount);
		if (nativeOrg == NULL)
		    nativeOrg = org;
		}
	    /* Without an order list, an organism's order is the largest
	     * component index at which it has been seen. */
	    if (orderList == NULL && order > org->order)
		org->order = order;
	    org->hit = TRUE;

	    /* Fill it up with alignment. */
	    dyStringAppendN(org->dy, mc->text, subMaf->textSize);
	    }
	/* Organisms absent from this alignment get '.' for its columns;
	 * the hit flag is cleared for the next alignment. */
	for (org = orgList; org != NULL; org = org->next)
	    {
	    if (!org->hit)
		dyStringAppendMultiC(org->dy, '.', subMaf->textSize);
	    org->hit = FALSE;
	    }
	symCount += subMaf->textSize;
	curPos = mcMaster->start + mcMaster->size;
	/* Only free the trimmed copy; maf itself belongs to mafList. */
	if (subMaf != maf)
	    mafAliFree(&subMaf);
	}
    }
/* Cover whatever remains between the last alignment and end. */
if (curPos < end)
    {
    fillInMissing(nativeOrg, orgList, native, start, curPos, end);
    symCount += end - curPos;
    }
mafAliFreeList(&mafList);

slSort(&orgList, oneOrgCmp);
if (strand == '-')
    {
    for (org = orgList; org != NULL; org = org->next)
	reverseComplement(org->dy->string, org->dy->stringSize);
    }

/* Construct our maf */
AllocVar(maf);
maf->textSize = symCount;
for (org = orgList; org != NULL; org = org->next)
    {
    struct mafComp *mc;
    AllocVar(mc);
    /* The first organism after sorting is written as the master component. */
    if (org == orgList)
        {
	if (outName != NULL)
	    {
	    /* Caller-supplied name: coordinates are relative to the fragment. */
	    mc->src = cloneString(outName);
	    mc->srcSize = native->size;
	    mc->strand = '+';
	    mc->start = 0;
	    mc->size = native->size;
	    }
	else
	    {
	    /* Real chromosome coordinates, on the requested strand. */
	    mc->src = cloneString(masterSrc);
	    mc->srcSize = chromSize;
	    mc->strand = strand;
	    if (strand == '-')
	       reverseIntRange(&start, &end, chromSize);
	    mc->start = start;
	    mc->size = end-start;
	    }
	}
    else
        {
	/* Other organisms carry only a species name; size comes from
	 * countAlpha() on the text (presumably the number of bases). */
	int size = countAlpha(org->dy->string);
	mc->src = cloneString(org->name);
	mc->srcSize = size;
	mc->strand = '+';
	mc->start = 0;
	mc->size = size;
	}
    mc->text = cloneString(org->dy->string);
    dyStringFree(&org->dy);
    slAddHead(&maf->components, mc);
    }
/* Components were pushed on the head; restore organism order. */
slReverse(&maf->components);

slFreeList(&orgList);
freeHash(&orgHash);
hFreeConn(&conn);
return maf;
}
Ejemplo n.º 5
0
/* Entry point of the bismark sub-command.
 *
 * Usage: [options] <chrom sizes file> <CpG bed file> <sam/bam file[,file...]>
 *   -S  passed to the parser as optSam      -C  passed as optaddChr
 *   -s  report stats only                   -b  per-C (bismark like) output
 *   -F  full mode (implies -b, cancels -s)  -B  keep the density bed files
 *   -c  read coverage threshold (default 5) -I  insert size cutoff (default 500)
 *   -o  output prefix (default: first input file name without extension)
 *
 * Returns 0 on success, 1 on an unknown option, or the value of
 * bismark_usage() when help is requested or arguments are missing. */
int main_bismark (int argc, char *argv[]) {
    
    char *output, *outReportfile, *outCpGfile, *outbedGraphfile, *row[100], *samfilecopy;
    char *forwardcg, *forwardchg, *forwardchh, *forwardread, *forwardread1;
    char *reversecg, *reversechg, *reversechh, *reverseread, *reverseread1;
    unsigned long long int *cnt;
    unsigned long long int *cnt2 = NULL;
    int optSam = 0, c, optaddChr = 0, optStats = 0, optBis = 0, optFull = 0, optKeep = 0;
    unsigned int optisize = 500;
    int optcov = 5;
    char *optoutput = NULL;
    /* cpgHash is always built from the CpG bed file below, so it is not
     * pre-allocated here (an initial empty hash would simply be leaked). */
    struct hash *cpgHash = NULL;
    struct hash *chgHash = newHash(0);
    struct hash *chhHash = newHash(0);
    time_t start_time, end_time;
    start_time = time(NULL);
    
    while ((c = getopt(argc, argv, "SCsbFBo:c:I:h?")) >= 0) {
        switch (c) {
            case 'S': optSam = 1; break;
            case 'C': optaddChr = 1; break;
            case 's': optStats = 1; break;
            case 'b': optBis = 1; break;
            case 'F': optFull = 1; break;
            case 'B': optKeep = 1; break;
            case 'c': optcov = (int)strtol(optarg, 0, 0); break;
            case 'I': optisize = (unsigned int)strtol(optarg, 0, 0); break;
            case 'o': optoutput = strdup(optarg); break;
            case 'h':
            case '?': return bismark_usage();
            default: return 1;
        }
    }
    if (optind + 3 > argc)
        return bismark_usage();

    char *chr_size_file = argv[optind];
    char *cpg_bed_file = argv[optind+1];
    char *sam_file = argv[optind+2];

    fprintf(stderr, "* CpG file provided: %s\n", cpg_bed_file);
    fprintf(stderr, "* Insert size cutoff: %u\n", optisize);
    fprintf(stderr, "* Read coverage threshold: %i\n", optcov);
   
    struct hash *chrHash = hashNameIntFile(chr_size_file);
    
    /* Split the comma separated input list on a copy; row[] points into it. */
    samfilecopy = cloneString(sam_file);
    int numFields = chopByChar(samfilecopy, ',', row, ArraySize(row));
    fprintf(stderr, "* Provided %i BAM/SAM file(s)\n", numFields);


    /* Full mode overrides -s and forces per-C output. */
    if(optFull) {
        fprintf(stderr, "* Warning: will run in Full mode, 8 track files and 1 report file will be generated\n");
        fprintf(stderr, "* Warning: will output stats over each C (in CHG)\n");
        fprintf(stderr, "* Warning: will output stats over each C (in CHH)\n");
        optStats = 0;
        optBis = 1;
    }
    
    if(optStats) {
        fprintf(stderr, "* Warning: will report stats only as -s specified\n");
    }
    // if use select bismark like output, read cpgHash at each C stats
    if(optBis) {
        fprintf(stderr, "* Warning: will output stats over each C (in CpG)\n");
        cpgHash = cpgBed2BinKeeperHashBismark(chrHash, cpg_bed_file);
    }else{
        fprintf(stderr, "* Warning: will output stats over each CpG\n");
        cpgHash = cpgBed2BinKeeperHash(chrHash, cpg_bed_file);
    }

    if(optoutput) {
        output = optoutput;
    } else {
        output = cloneString(get_filename_without_ext(basename(row[0])));
    }
    

    /* Derive every output file name from the prefix. */
    if(asprintf(&outCpGfile, "%s.CpG.bedGraph", output) < 0)
        errAbort("Mem Error.\n");
    if(asprintf(&outbedGraphfile, "%s.density.bedGraph", output) < 0)
        errAbort("Mem Error.\n");
    if (asprintf(&outReportfile, "%s.report", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardcg, "%s.forward.CG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardchg, "%s.forward.CHG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardchh, "%s.forward.CHH.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardread, "%s.forward.Density.bed", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&forwardread1, "%s.forward.Density.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reversecg, "%s.reverse.CG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reversechg, "%s.reverse.CHG.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reversechh, "%s.reverse.CHH.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reverseread, "%s.reverse.Density.bed", output) < 0)
        errAbort("Preparing output wrong");
    if (asprintf(&reverseread1, "%s.reverse.Density.bedGraph", output) < 0)
        errAbort("Preparing output wrong");
    

    //sam file to bed file
    //fprintf(stderr, "* Parsing the SAM/BAM file\n");
    cnt = bismarkBamParse(sam_file, chrHash, cpgHash, chgHash, chhHash, forwardread, reverseread, optSam, optaddChr, optFull, optisize);
    
    //write to file
    if (optFull){
        fprintf(stderr, "* Output CpG methylation calls\n");
        writecpgBismarkLite(cpgHash, forwardcg, reversecg, optcov);
        fprintf(stderr, "* Output CHG methylation calls\n");
        writecpgBismarkLiteHash(chgHash, forwardchg, reversechg, optcov);
        fprintf(stderr, "* Output CHH methylation calls\n");
        writecpgBismarkLiteHash(chhHash, forwardchh, reversechh, optcov);
        fprintf(stderr, "* Sorting methylation calls\n");
        sortBedfile(forwardcg);
        sortBedfile(reversecg);
        sortBedfile(forwardchg);
        sortBedfile(reversechg);
        sortBedfile(forwardchh);
        sortBedfile(reversechh);
        fprintf(stderr, "* Sorting density bed\n");
        sortBedfile(forwardread);
        sortBedfile(reverseread);
        fprintf(stderr, "* Generating density bedGraph\n");
        bedItemOverlapCount(chrHash, forwardread, forwardread1);
        bedItemOverlapCount(chrHash, reverseread, reverseread1);
    }else{
        cnt2 = writecpgBismark(cpgHash, outbedGraphfile, outCpGfile, optStats, optcov);
        //sort both outputs, unless only stats were requested
        if(!optStats) {
            fprintf(stderr, "* Sorting output density\n");
            sortBedfile(outbedGraphfile);
            fprintf(stderr, "* Sorting output CpG methylation call\n");
            sortBedfile(outCpGfile);
        }
    }

    //generate bigWig
    //fprintf(stderr, "* Generating bigWig\n");
    //bigWigFileCreate(outbedGraphfile, chr_size_file, 256, 1024, 0, 1, outbigWigfile);
    //bedGraphToBigWig(outbedGraphfile, chr_size_file, outbigWigfile);
    
    //write report file (cnt2 stays NULL in full mode)
    fprintf(stderr, "* Preparing report file\n");
    writeReportBismark(outReportfile, cnt, cnt2, numFields, row, optBis, hashIntSum(chrHash));

    if(!optKeep){
        fprintf(stderr, "* Deleting (huge) density bed files\n");
        unlink(forwardread);
        unlink(reverseread);
    }
    
    //cleaning
    hashFree(&chrHash);
    hashFree(&cpgHash);
    hashFree(&chgHash);
    hashFree(&chhHash);
    free(outCpGfile);
    free(outbedGraphfile);
    //free(outbigWigfile);
    free(outReportfile);
    free(samfilecopy);
    free(forwardcg);
    free(forwardchg);
    free(forwardchh);
    free(forwardread);
    free(forwardread1);
    free(reversecg);
    free(reversechg);
    free(reversechh);
    free(reverseread);
    free(reverseread1);
    /* output is either the strdup'ed -o value or a clone of the input name. */
    free(output);
    end_time = time(NULL);
    fprintf(stderr, "* Done, time used %.0f seconds.\n", difftime(end_time, start_time));
    return 0;
}
void readCloneNames(struct lineFile *clf)
/* read internal BAC clone names and Sanger sts names */
{
struct alias *a = NULL;
struct sanger *s = NULL;
char *words[4], *name = NULL, *sanger = NULL, *extName = NULL;
int i, rel;
char sep = '|';
boolean found = FALSE, posFound = FALSE;

/* alias hash is keyed by Sanger sts name */
aliasHash = newHash(16);
/* hash of Sanger names keyed by external name */
sangerByExtNameHash = newHash(16);

/* Read in all rows */
while (lineFileChopCharNext(clf, sep, words, 5))
    {
    name = cloneString(words[0]);
    sanger = cloneString(words[1]);
    if (!sameString(words[2], ""))
        rel = sqlUnsigned(words[2]);
    else
        rel = 3;
    /* find external name for this internal name from the extNameHash */
    if ((extName = hashFindVal(extNameHash, name)) == NULL)
        {
        /* if not found in BAC hash, then need to use internal name to make extName */
        extName = translateName(name, FALSE);
        }
    if ((a = hashFindVal(aliasHash, sanger)) == NULL)
        {
        /* allocate memory for alias struct */
        AllocVar(a); 
        /* allocate memory for UniSTS IDs, aliases, internal and external names and relations */
        /* and initialize the arrays */
        AllocArray(a->uniStsId, (sizeof(char *) * NUMSANGER));
        AllocArray(a->aliases, (sizeof(char *) * NUMALIASES));
        AllocArray(a->extName, (sizeof(char *) * MAXSANGER));
        AllocArray(a->intName, (sizeof(char *) * MAXSANGER));
        AllocArray(a->relation, (sizeof(int) * MAXSANGER));

        for (i = 0; i < NUMSANGER; i++)
            {
            a->uniStsId[i] = NULL;
            }
        for (i = 0; i < MAXSANGER; i++)
            {
            a->extName[i] = NULL;
            a->intName[i] = NULL;
            a->relation[i] = -1;
            }
        for (i = 0; i < NUMALIASES; i++)
            {
            a->aliases[i] = NULL;
            }
        }
    /* find empty slot in arrays to add external and internal names */
    posFound = FALSE;
    for (i = 0; i < NUMALIASES && (!posFound); i++)
        {
        if (a->extName[i] == NULL)
            {
            posFound = TRUE;
            a->extName[i] = cloneString(extName);
            if (a->intName[i] == NULL)
                a->intName[i] = cloneString(name);
            else
                errAbort("For marker %s, the empty slot in the intName array is not the same as that for the extName array in the alias struct\n", extName);
            if (a->relation[i] == -1)
                a->relation[i] = rel;
            else 
                errAbort("For marker %s, the empty slot in the relation array is not the same as that for the extName array in the alias struct\n", extName);
            }
        }
   
    a->sangerName = cloneString(sanger);
    a->primer1 = NULL;
    a->primer2 = NULL;
    /* add this alias struct to the hash keyed by sanger name */
    hashAdd(aliasHash, sanger, a);
    /* add sanger name to hash keyed by external name */
    if ((s = hashFindVal(sangerByExtNameHash, extName)) == NULL)
        {
        /* allocate memory for struct with array of Sanger names */
        AllocVar(s);
        /* initialize the array */
        for (i = 0; i < MAXSANGER; i++)
            {
            s->sangerName[i] = NULL;
            }
        }
    found = FALSE;
    for (i = 0; i < MAXSANGER && (!found); i++)
        {
        if (s->sangerName[i] == NULL)
            {
            found = TRUE;
            s->sangerName[i] = cloneString(sanger);
            }
        }
  /* add this list of sanger names to a hash keyed by external name, extName */
    hashAdd(sangerByExtNameHash, extName, s);
    }
}
Ejemplo n.º 7
0
static void clusterClone(int argc, char *argv[])
/* Process each PSL file named in argv[1..argc-1].
 * Query names have the form accession_part; consecutive alignments of one
 * accession are gathered and handed to processResult() when the accession
 * changes or the file ends.  For each accession this keeps, per target
 * name, a count of well covered alignments (chrHash) and a list of their
 * coordinates (coordHash).  Only alignments whose coverage percentage
 * exceeds the global minCover are counted and listed. */
{
int i;

for (i=1; i < argc; ++i)
    {
    struct lineFile *lf;
    struct psl *psl;
    unsigned tSize;
    char *prevAccPart = (char *)NULL;
    char *prevAccName = (char *)NULL;
    char *prevTargetName = (char *)NULL;
    struct hashEl *el;
    struct hash *chrHash = newHash(0);
    struct hash *coordHash = newHash(0);
    struct coordEl *coord;
    struct coordEl **coordListPt = (struct coordEl **) NULL;
    unsigned querySize = 0;
    int partCount = 0;
    int partsConsidered = 0;

    verbose(2,"#\tprocess: %s\n", argv[i]);
    lf=pslFileOpen(argv[i]);
    while ((struct psl *)NULL != (psl = pslNext(lf)) )
	{
	char *accName = (char *)NULL;
	char *targetName = (char *)NULL;
	int chrCount = 0;
	double percentCoverage;

	accName = cloneString(psl->qName);
	if ((char *)NULL == prevAccPart)
	    {
	    prevAccPart = cloneString(psl->qName);  /* first time */
	    querySize = psl->qSize;
	    ++partsConsidered;
	    }
	/* accession name is the query name up to the first '_' */
	chopSuffixAt(accName,'_');

	if ((char *)NULL == prevAccName)
		prevAccName = cloneString(accName);  /* first time */
	if ((char *)NULL == prevTargetName)
		prevTargetName = cloneString(psl->tName);  /* first time */

	/*	encountered a new accession name, process the one we
 	 *	were working on
	 */
	if (differentWord(accName, prevAccName))
	    {
	    if (partCount > 0)
		processResult(chrHash, coordHash, prevAccName, querySize,
		    partsConsidered);
	    else
		verbose(1,"# ERROR %s %s - no coordinates found in %d parts considered\n",
		    prevTargetName, prevAccName, partsConsidered);
	    freeMem(prevAccName);
	    prevAccName = cloneString(accName);
	    freeHash(&chrHash);
	    freeHash(&coordHash);
	    chrHash = newHash(0);
	    coordHash = newHash(0);
	    querySize = 0;
	    partCount = 0;
	    partsConsidered = 0;
	    }

	tSize = psl->tEnd - psl->tStart;
	percentCoverage = 100.0*((double)(tSize+1)/(psl->qSize + 1));
	/*	a new part of the accession adds to the total query size */
	if (differentWord(psl->qName, prevAccPart))
	    {
	    ++partsConsidered;
	    querySize += psl->qSize;
	    freeMem(prevAccPart);
	    prevAccPart = cloneString(psl->qName);
	    }

	targetName = cloneString(psl->tName);
	if (differentWord(targetName, prevTargetName))
	    {
	    freeMem(prevTargetName);
	    prevTargetName = cloneString(targetName);
	    }
	/*	keep a hash of chrom names encountered	*/
	el = hashLookup(chrHash, targetName);
	if (el == NULL)
	    {
	    if (percentCoverage > minCover)
		{
		hashAddInt(chrHash, targetName, 1);
		chrCount = 1;
		}
	    else
		{
		hashAddInt(chrHash, targetName, 0);
		chrCount = 0;
		}
	    }
	else
	    {
	    if (percentCoverage > minCover)
		{
		chrCount = ptToInt(el->val) + 1;
		el->val=intToPt(chrCount);
		}
	    }

	/*	when coverage is sufficient	*/
	if (percentCoverage > minCover)
	    {
	    ++partCount;
	    /*	the coordinate record is only allocated when it is kept,
	     *	so nothing is leaked for poorly covered alignments
	     */
	    AllocVar(coord);
	    coord->start = psl->tStart;
	    coord->end = psl->tEnd;
	    coord->qSize = psl->qSize;
	    coord->strand = sameWord(psl->strand,"+") ? 1 : 0;
	    coord->name = cloneString(psl->qName);
	    /*	for each chrom name, accumulate a list of coordinates */
	    el = hashLookup(coordHash, targetName);
	    if (el == NULL)
		{
		AllocVar(coordListPt);
		hashAdd(coordHash, targetName, coordListPt);
		}
	    else
		{
		coordListPt = el->val;
		}
	    slAddHead(coordListPt,coord);
	verbose(2,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
	    psl->qName, psl->qSize, tSize, tSize - psl->qSize,
	    percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
	    psl->strand);
	    }
	else
	    {
	verbose(3,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
	    psl->qName, psl->qSize, tSize, tSize - psl->qSize,
	    percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
	    psl->strand);
	    }


	freeMem(accName);
	freeMem(targetName);
	pslFree(&psl);
	}
    if (partCount > 0)
	processResult(chrHash, coordHash, prevAccName, querySize,
	    partsConsidered);
    else
	/*	both names are still NULL when the file had no alignments;
	 *	NULL must not be passed to %s
	 */
	verbose(1,"# ERROR %s %s - no coordinates found\n",
	    prevTargetName ? prevTargetName : "(null)",
	    prevAccName ? prevAccName : "(null)");
    freeMem(prevAccName);
    freeMem(prevAccPart);
    freeMem(prevTargetName);
    freeHash(&chrHash);
    freeHash(&coordHash);
    lineFileClose(&lf);
    }
}	/*	static void clusterClone()	*/
Ejemplo n.º 8
0
void hgFindSpec(char *org, char *database, char *hgFindSpecName, char *sqlFile,
                char *hgRoot, boolean strict)
/* hgFindSpec - Create hgFindSpec table from text files.
 * Layers the spec directories hgRoot/org/database, hgRoot/org and hgRoot
 * (in that order) into one sorted list, writes that list to
 * "<hgFindSpecName>.tab", recreates table hgFindSpecName in database from the
 * create statement found in sqlFile, bulk-loads the tab file and finally
 * stores each spec's settings through updateBigTextField().
 * strict is handed unchanged to layerOn(). */
{
    struct hash *uniqHash = newHash(8);
    struct hash *htmlHash = newHash(8);
    struct hgFindSpec *hfsList = NULL, *hfs;
    char rootDir[512], orgDir[512], asmDir[512];
    char tab[512];
    snprintf(tab, sizeof(tab), "%s.tab", hgFindSpecName);

    /* Create track list from hgRoot and hgRoot/org and hgRoot/org/assembly
     * ra format database.
     * The paths are formatted with a size bound so an over-long
     * hgRoot/org/database cannot write past the end of the stack buffers.
     * NOTE(review): snprintf truncates silently; switch to an aborting
     * formatter if the project provides one in this file's scope. */
    snprintf(rootDir, sizeof(rootDir), "%s", hgRoot);
    snprintf(orgDir, sizeof(orgDir), "%s/%s", hgRoot, org);
    snprintf(asmDir, sizeof(asmDir), "%s/%s/%s", hgRoot, org, database);
    layerOn(strict, database, asmDir, uniqHash, htmlHash, FALSE, &hfsList);
    layerOn(strict, database, orgDir, uniqHash, htmlHash, FALSE, &hfsList);
    layerOn(strict, database, rootDir, uniqHash, htmlHash, TRUE, &hfsList);
    slSort(&hfsList, hgFindSpecCmp);
    if (verboseLevel() > 0)
        printf("Loaded %d search specs total\n", slCount(hfsList));

    /* Write to tab-separated file. */
    {
        FILE *f = mustOpen(tab, "w");
        for (hfs = hfsList; hfs != NULL; hfs = hfs->next)
            hgFindSpecTabOut(hfs, f);
        carefulClose(&f);
    }

    /* Update database */
    {
        char *create, *end;
        char query[256];
        struct sqlConnection *conn = sqlConnect(database);

        /* Load in table definition. */
        readInGulp(sqlFile, &create, NULL);
        create = trimSpaces(create);
        create = subTrackName(create, hgFindSpecName);
        /* Strip one trailing ';'.  Guard against an empty statement, where
         * "last character" would lie one byte before the buffer. */
        if (create[0] != 0)
        {
            end = create + strlen(create)-1;
            if (*end == ';') *end = 0;
        }
        sqlRemakeTable(conn, hgFindSpecName, create);

        /* Load in regular fields. */
        sqlSafef(query, sizeof query, "load data local infile '%s' into table %s", tab,
                 hgFindSpecName);
        sqlUpdate(conn, query);

        /* Load in settings fields. */
        for (hfs = hfsList; hfs != NULL; hfs = hfs->next)
        {
            if (hfs->settingsHash != NULL)
            {
                char *settings = settingsFromHash(hfs->settingsHash);
                updateBigTextField(conn, hgFindSpecName, "searchName",
                                   hfs->searchName,
                                   "searchSettings", settings);
                freeMem(settings);
            }
        }

        sqlDisconnect(&conn);
        if (verboseLevel() > 0)
            printf("Loaded database %s\n", database);
    }
}
struct g2cFile *loadG2cFile(char *fileName)
/* Read a gene-to-cDNA file: each non-blank line is a gene name followed by
 * the white-space separated names of the cDNAs that hit it.
 * Returns a newly allocated g2cFile holding
 *   - geneList: one gene per line, each with its hit list in file order,
 *   - cdnaList / cdnaHash: one cdnaVal per distinct cDNA name, with useCount
 *     counting how many hits refer to it.
 * Both lists are sorted with cmpName before returning.  gf->name points at
 * the caller's fileName (not copied).  Aborts on a line that fills the line
 * buffer.  Prints hash statistics and a summary line to stdout.
 * NOTE(review): the counters in freshly alloc()'d structs are incremented
 * without being set first, so this relies on alloc() zeroing memory - confirm. */
{
char lineBuf[1024*8];
int lineLen;
char *words[256*8];
int wordCount;
FILE *f;
int lineCount = 0;
struct g2cFile *gf = alloc(sizeof(*gf));
int hitCount = 0;
int cdnaCount = 0;
int geneCount = 0;
int maxLineLen = (int)sizeof(lineBuf) - 1;   /* Most chars fgets can deliver. */

gf->name = fileName;
f = mustOpen(fileName, "r");
gf->cdnaHash = newHash(14);
while (fgets(lineBuf, sizeof(lineBuf), f) != NULL)
    {
    ++lineCount;
    lineLen = strlen(lineBuf);
    if (lineLen >= maxLineLen)
        {
        /* The limit is passed as int so that it matches the %d conversion;
         * handing a size_t to %d is undefined where size_t is wider. */
        errAbort("%s\nLine %d of %s too long, can only handle %d chars\n",
            lineBuf, lineCount, fileName, maxLineLen);
        }
    wordCount = chopString(lineBuf, whiteSpaceChopper, words, ArraySize(words));
    if (wordCount > 0)
        {
        struct gene *gene = alloc(sizeof(*gene));
        char *geneName = words[0];
        int i;
        
        /* Create new gene struct and put it on list. */
        gene->name = cloneString(geneName);
        slAddHead(&gf->geneList, gene);
        ++geneCount;

        /* Put all cdna hits on gene. */
        for (i=1; i<wordCount; ++i)
            {
            struct cdnaHit *hit;
            struct cdnaVal *cdnaVal;
            struct hashEl *hel;
            char *cdnaName = words[i];

            /* Get cdna, or if it's the first time we've seen it
             * make up a data structure for it and hang it on
             * hash list and cdna list. */
            if ((hel = hashLookup(gf->cdnaHash, cdnaName)) == NULL)
                {
                cdnaVal = alloc(sizeof(*cdnaVal));
                hel = hashAdd(gf->cdnaHash, cdnaName, cdnaVal);
                /* Share the hash element's name rather than words[i],
                 * which points into lineBuf and is overwritten by the
                 * next fgets. */
                cdnaVal->name = hel->name;
                slAddHead(&gf->cdnaList, cdnaVal);
                ++cdnaCount;
                }
            else
                {
                cdnaVal = hel->val;
                }
            ++cdnaVal->useCount;

            /* Make up new cdna hit and hang it on the gene. */
            hit = alloc(sizeof(*hit));
            hit->hel = hel;
            hit->name = hel->name;
            slAddHead(&gene->hitList, hit);
            ++hitCount;
            }
        /* Hits were pushed on the head; restore file order. */
        slReverse(&gene->hitList);
        }    
    }
slReverse(&gf->geneList);
slSort(&gf->geneList, cmpName);
slSort(&gf->cdnaList, cmpName);
fclose(f);
reportHashStats(gf->cdnaHash);
printf("Loaded %s.  %d genes %d cdnas %d hits\n", fileName,
    geneCount, cdnaCount, hitCount);
return gf;
}
Ejemplo n.º 10
0
void axtChain(char *axtIn, char *tNibDir, char *qNibDir, char *chainOut)
/* axtChain - Chain together axt alignments.
 *   axtIn    - alignment input; read as psl blocks when option "psl" is set,
 *              otherwise as axt blocks.
 *   tNibDir  - target sequence source; read as one fasta file when option
 *              "faT" is set, otherwise handed to loadIfNewSeq().
 *   qNibDir  - query sequence source; same rules with option "faQ".
 *   chainOut - output file; receives the score scheme header followed by all
 *              chains sorted with chainCmpScore.
 * When the global detailsName is set, a details file of that name is written
 * by chainPair() as well. */
{
struct hash *pairHash = newHash(0);  /* Hash keyed by qSeq<strand>tSeq */
struct seqPair *spList = NULL, *sp;
FILE *f = mustOpen(chainOut, "w");
char *qName = "",  *tName = "";
struct dnaSeq *qSeq = NULL, *tSeq = NULL;
char qStrand = 0, tStrand = 0;
struct chain *chainList = NULL, *chain;
FILE *details = NULL;
struct dnaSeq *seq = NULL;
struct hash *qFaHash = newHash(0);
struct hash *tFaHash = newHash(0);
FILE *faF;
boolean qIsTwoBit = twoBitIsFile(qNibDir);
boolean tIsTwoBit = twoBitIsFile(tNibDir);
/* The command line options do not change while we run: look them up once
 * instead of once per sequence pair. */
boolean qIsFa = optionExists("faQ");
boolean tIsFa = optionExists("faT");

axtScoreSchemeDnaWrite(scoreScheme, f, "axtChain");

if (detailsName != NULL)
    details = mustOpen(detailsName, "w");
/* Read input file and divide alignments into various parts. */
if (optionExists("psl"))
    spList = readPslBlocks(axtIn, pairHash, f);
else
    spList = readAxtBlocks(axtIn, pairHash, f);

/* Preload whole fasta files into name-keyed hashes when requested. */
if (qIsFa)
    {
    faF = mustOpen(qNibDir, "r");
    verbose(1, "reading query fasta sequence from '%s'\n", qNibDir);
    while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq))
        hashAdd(qFaHash, seq->name, seq);
    fclose(faF);
    }
if (tIsFa)
    {
    faF = mustOpen(tNibDir, "r");
    verbose(1, "reading target fasta sequence from '%s'\n", tNibDir);
    while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq))
        hashAdd(tFaHash, seq->name, seq);
    fclose(faF);
    }

/* Chain each query/target pair in turn. */
for (sp = spList; sp != NULL; sp = sp->next)
    {
    slReverse(&sp->blockList);
    removeExactOverlaps(&sp->blockList);
    verbose(1, "%d blocks after duplicate removal\n", slCount(sp->blockList));
    if (qIsFa)
        loadFaSeq(qFaHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand, qNibDir);
    else
        loadIfNewSeq(qNibDir, qIsTwoBit, sp->qName, sp->qStrand,
                &qName, &qSeq, &qStrand);
    /* The target is always taken on the plus strand. */
    if (tIsFa)
        loadFaSeq(tFaHash, sp->tName, '+', &tName, &tSeq, &tStrand, tNibDir);
    else
        loadIfNewSeq(tNibDir, tIsTwoBit, sp->tName, '+',
                &tName, &tSeq, &tStrand);
    chainPair(sp, qSeq, tSeq, &chainList, details);
    }
slSort(&chainList, chainCmpScore);
for (chain = chainList; chain != NULL; chain = chain->next)
    {
    assert(chain->qStart == chain->blockList->qStart
	&& chain->tStart == chain->blockList->tStart);
    chainWrite(chain, f);
    }

/* The details stream used to be left open; close it so its contents are
 * flushed and the handle released here rather than at process exit. */
if (details != NULL)
    carefulClose(&details);
carefulClose(&f);
}
void bioImageLoad(char *setRaFile, char *itemTabFile)
/* bioImageLoad - Load data into bioImage database.
 *   setRaFile   - .ra file with settings shared by the whole submission set.
 *   itemTabFile - tab-separated file; its first line starts with '#' and
 *                 names the columns, every further line describes one image.
 * A field value is looked up through getVal()/cachedId(), which are given
 * both the set-level hash and the per-row columns.
 * For every row an 'image' record is inserted; an existing record with the
 * same fileName and fullLocation is deleted first when the global 'replace'
 * is set, otherwise it is a fatal error.
 * NOTE(review): field values are interpolated into the SQL text below with
 * no escaping visible in this function - confirm the inputs are trusted.
 * NOTE(review): conn, lf, dy and the hashes are not released before
 * returning. */
{
struct hash *raHash = raReadSingle(setRaFile);
struct hash *rowHash;
struct lineFile *lf = lineFileOpen(itemTabFile, TRUE);
char *line, *words[256];
struct sqlConnection *conn = sqlConnect(database);
int rowSize;
int submissionSetId;
/* One cache per lookup table, each handed to cachedId() below. */
struct hash *fullDirHash = newHash(0);
struct hash *screenDirHash = newHash(0);
struct hash *thumbDirHash = newHash(0);
struct hash *treatmentHash = newHash(0);
struct hash *bodyPartHash = newHash(0);
struct hash *sliceTypeHash = newHash(0);
struct hash *imageTypeHash = newHash(0);
/* Maps a sectionSet value from the input to the id created for it. */
struct hash *sectionSetHash = newHash(0);
struct dyString *dy = dyStringNew(0);

/* Read first line of tab file, and from it get all the field names. */
if (!lineFileNext(lf, &line, NULL))
    errAbort("%s appears to be empty", lf->fileName);
if (line[0] != '#')
    errAbort("First line of %s needs to start with #, and then contain field names",
    	lf->fileName);
rowHash = hashRowOffsets(line+1);
rowSize = rowHash->elCount;
if (rowSize >= ArraySize(words))
    errAbort("Too many fields in %s", lf->fileName);

/* Check that have all required fields */
    {
    char *fieldName;
    int i;

    /* Fields that must come from the .ra file. */
    for (i=0; i<ArraySize(requiredSetFields); ++i)
        {
	fieldName = requiredSetFields[i];
	if (!hashLookup(raHash, fieldName))
	    errAbort("Field %s is not in %s", fieldName, setRaFile);
	}

    /* Fields that must be columns of the tab file. */
    for (i=0; i<ArraySize(requiredItemFields); ++i)
        {
	fieldName = requiredItemFields[i];
	if (!hashLookup(rowHash, fieldName))
	    errAbort("Field %s is not in %s", fieldName, itemTabFile);
	}

    /* Fields that may come from either place. */
    for (i=0; i<ArraySize(requiredFields); ++i)
        {
	fieldName = requiredFields[i];
	if (!hashLookup(rowHash, fieldName) && !hashLookup(raHash, fieldName))
	    errAbort("Field %s is not in %s or %s", fieldName, setRaFile, itemTabFile);
	}
    }

/* Create/find submission record. */
submissionSetId = saveSubmissionSet(conn, raHash);

/* Process rest of tab file. */
while (lineFileNextRowTab(lf, words, rowSize))
    {
    int fullDir = cachedId(conn, "location", "name", 
    	fullDirHash, "fullDir", raHash, rowHash, words);
    int screenDir = cachedId(conn, "location", "name", 
    	screenDirHash, "screenDir", raHash, rowHash, words);
    int thumbDir = cachedId(conn, "location", 
    	"name", thumbDirHash, "thumbDir", raHash, rowHash, words);
    int bodyPart = cachedId(conn, "bodyPart", 
    	"name", bodyPartHash, "bodyPart", raHash, rowHash, words);
    int sliceType = cachedId(conn, "sliceType", 
    	"name", sliceTypeHash, "sliceType", raHash, rowHash, words);
    int imageType = cachedId(conn, "imageType", 
    	"name", imageTypeHash, "imageType", raHash, rowHash, words);
    int treatment = cachedId(conn, "treatment", 
    	"conditions", treatmentHash, "treatment", raHash, rowHash, words);
    /* The last getVal() argument is the default; presumably NULL marks the
     * field as mandatory - confirm against getVal(). */
    char *fileName = getVal("fileName", raHash, rowHash, words, NULL);
    char *submitId = getVal("submitId", raHash, rowHash, words, NULL);
    char *taxon = getVal("taxon", raHash, rowHash, words, NULL);
    char *isEmbryo = getVal("isEmbryo", raHash, rowHash, words, NULL);
    char *age = getVal("age", raHash, rowHash, words, NULL);
    char *sectionSet = getVal("sectionSet", raHash, rowHash, words, "");
    char *sectionIx = getVal("sectionIx", raHash, rowHash, words, "0");
    char *gene = getVal("gene", raHash, rowHash, words, "");
    char *locusLink = getVal("locusLink", raHash, rowHash, words, "");
    char *refSeq = getVal("refSeq", raHash, rowHash, words, "");
    char *genbank = getVal("genbank", raHash, rowHash, words, "");
    char *priority = getVal("priority", raHash, rowHash, words, "200");
    int sectionId = 0;
    int oldId;
    // char *xzy = getVal("xzy", raHash, rowHash, words, xzy);

    /* An empty or "0" sectionSet leaves sectionId at 0.  Otherwise the first
     * row carrying a given sectionSet value inserts a new sectionSet record
     * and later rows with the same value reuse its id. */
    if (sectionSet[0] != 0 && !sameString(sectionSet, "0"))
        {
	struct hashEl *hel = hashLookup(sectionSetHash, sectionSet);
	if (hel != NULL)
	    sectionId = ptToInt(hel->val);
	else
	    {
	    sqlUpdate(conn, "insert into sectionSet values(default)");
	    sectionId = sqlLastAutoId(conn);
	    hashAdd(sectionSetHash, sectionSet, intToPt(sectionId));
	    }
	}

    /* Look for an image already stored under this file name and location. */
    dyStringClear(dy);
    dyStringAppend(dy, "select id from image ");
    dyStringPrintf(dy, "where fileName = '%s' ", fileName);
    dyStringPrintf(dy, "and fullLocation = %d",  fullDir);
    oldId = sqlQuickNum(conn, dy->string);
    if (oldId != 0)
        {
	if (replace)
	    {
	    dyStringClear(dy);
	    dyStringPrintf(dy, "delete from image where id = %d", oldId);
	    sqlUpdate(conn, dy->string);
	    }
	else
	    errAbort("%s is already in database line %d of %s", 
	    	fileName, lf->lineIx, lf->fileName);
	}

    /* Build and run the insert.  String-valued columns are quoted here;
     * sectionIx, priority, taxon, isEmbryo and age go in unquoted. */
    dyStringClear(dy);
    dyStringAppend(dy, "insert into image set\n");
    dyStringPrintf(dy, " id = default,\n");
    dyStringPrintf(dy, " fileName = '%s',\n", fileName);
    dyStringPrintf(dy, " fullLocation = %d,\n", fullDir);
    dyStringPrintf(dy, " screenLocation = %d,\n", screenDir);
    dyStringPrintf(dy, " thumbLocation = %d,\n", thumbDir);
    dyStringPrintf(dy, " submissionSet = %d,\n", submissionSetId);
    dyStringPrintf(dy, " sectionSet = %d,\n", sectionId);
    dyStringPrintf(dy, " sectionIx = %s,\n", sectionIx);
    dyStringPrintf(dy, " submitId = '%s',\n", submitId);
    dyStringPrintf(dy, " gene = '%s',\n", gene);
    dyStringPrintf(dy, " locusLink = '%s',\n", locusLink);
    dyStringPrintf(dy, " refSeq = '%s',\n", refSeq);
    dyStringPrintf(dy, " genbank = '%s',\n", genbank);
    dyStringPrintf(dy, " priority = %s,\n", priority);
    dyStringPrintf(dy, " taxon = %s,\n", taxon);
    dyStringPrintf(dy, " isEmbryo = %s,\n", isEmbryo);
    dyStringPrintf(dy, " age = %s,\n", age);
    dyStringPrintf(dy, " bodyPart = %d,\n", bodyPart);
    dyStringPrintf(dy, " sliceType = %d,\n", sliceType);
    dyStringPrintf(dy, " imageType = %d,\n", imageType);
    dyStringPrintf(dy, " treatment = %d\n", treatment);

    sqlUpdate(conn, dy->string);
    }
}
void doMiddle(struct cart *theCart)
/* Set up globals and make web page.
 * Stores theCart in the global cart, draws the liftOver form and, when the
 * request carries data and is not a refresh-only request, converts that data
 * and prints a results section with links to the converted and failed
 * records.  When the error-help CGI variable is present only the error help
 * text is printed. */
{
char *userData;
char *dataFormat;
char *organism;
char *db;
float minBlocks, minMatch;
boolean multiple, fudgeThick;
int minSizeQ, minSizeT;
boolean refreshOnly = FALSE;
struct liftOverChain *chainList = NULL, *choice;

cart = theCart;

if (cgiOptionalString(HGLFT_ERRORHELP_VAR))
    {
    puts("<PRE>");
    puts(liftOverErrHelp());
    puts("</PRE>");
    return;
    }

/* Get data to convert - from userData variable, or if 
 * that is empty from a file. */

if (cartOptionalString(cart, "SubmitFile"))
    userData = cartOptionalString(cart, HGLFT_DATAFILE_VAR);
else
    userData = cartOptionalString(cart, HGLFT_USERDATA_VAR);
dataFormat = cartCgiUsualString(cart, HGLFT_DATAFORMAT_VAR, DEFAULT_FORMAT);
cartWebStart(cart, NULL, "Lift Genome Annotations");

getDbAndGenome(cart, &db, &organism, oldVars);

chainList = liftOverChainListFiltered();

choice = defaultChoices(chainList, db);
if (choice == NULL)
    errAbort("Sorry, no conversions available from this assembly\n");

/* Conversion parameters: cart/CGI value if present, else the default of the
 * chosen chain. */
minSizeQ = cartCgiUsualInt(cart, HGLFT_MINSIZEQ, choice->minSizeQ);
minSizeT = cartCgiUsualInt(cart, HGLFT_MINSIZET, choice->minSizeT);
minBlocks = cartCgiUsualDouble(cart, HGLFT_MINBLOCKS, choice->minBlocks);
minMatch = cartCgiUsualDouble(cart, HGLFT_MINMATCH, choice->minMatch);
fudgeThick = cartCgiUsualBoolean(cart, HGLFT_FUDGETHICK, (choice->fudgeThick[0]=='Y') ? TRUE : FALSE);
multiple = cartCgiUsualBoolean(cart, HGLFT_MULTIPLE, (choice->multiple[0]=='Y') ? TRUE : FALSE);
refreshOnly = cartCgiUsualInt(cart, HGLFT_REFRESHONLY_VAR, 0);

webMain(choice, dataFormat, multiple);
liftOverChainFreeList(&chainList);

if (!refreshOnly && userData != NULL && userData[0] != '\0')
    {
    struct hash *chainHash = newHash(0);
    char *chainFile;
    struct tempName oldTn, mappedTn, unmappedTn;
    FILE *old, *mapped, *unmapped;
    char *line;
    int lineSize;
    char *fromDb, *toDb;
    int ct = 0, errCt = 0;

    /* read in user data and save to file */
    makeTempName(&oldTn, HGLFT, ".user");
    old = mustOpen(oldTn.forCgi, "w");
    fputs(userData, old);
    fputs("\n", old);           /* in case user doesn't end last line */
    carefulClose(&old);
    chmod(oldTn.forCgi, 0666);

    /* setup output files -- one for converted lines, the other
     * for lines that could not be mapped */
    makeTempName(&mappedTn, HGLFT, ".bed");
    makeTempName(&unmappedTn, HGLFT, ".err");
    mapped = mustOpen(mappedTn.forCgi, "w");
    chmod(mappedTn.forCgi, 0666);
    unmapped = mustOpen(unmappedTn.forCgi, "w");
    chmod(unmappedTn.forCgi, 0666);

    fromDb = cgiString(HGLFT_FROMDB_VAR);
    toDb = cgiString(HGLFT_TODB_VAR);
    chainFile = liftOverChainFile(fromDb, toDb);
    if (chainFile == NULL)
        errAbort("ERROR: Can't convert from %s to %s: no chain file loaded",
                                fromDb, toDb);
    readLiftOverMap(chainFile, chainHash);
    if (sameString(dataFormat, WIGGLE_FORMAT))
        /* TODO: implement Wiggle */
        {}
    else if (sameString(dataFormat, POSITION_FORMAT))
        {
        /* minSizeT here and in liftOverChain.c/h has been renamed minChainT in liftOver.c */
        /* ignore multiple, it must be false when position is used */
        ct = liftOverPositions(oldTn.forCgi, chainHash, 
                        minMatch, minBlocks, 0, minSizeQ,
                        minSizeT, 0, 
                        fudgeThick, mapped, unmapped, FALSE, NULL, &errCt);
        }
    else if (sameString(dataFormat, BED_FORMAT))
        {
        /* minSizeT here and in liftOverChain.c/h has been renamed minChainT in liftOver.c */
        ct = liftOverBed(oldTn.forCgi, chainHash, 
                        minMatch, minBlocks, 0, minSizeQ,
                        minSizeT, 0,
                        fudgeThick, mapped, unmapped, multiple, NULL, &errCt);
        }
    else
        /* programming error */
        errAbort("ERROR: Unsupported data format: %s\n", dataFormat);

    /* Close both result files before anything reads them.  The failure file
     * is re-opened and echoed below, and buffered output that has not been
     * flushed would be missing from it; the mapped file was previously never
     * closed at all.
     * NOTE(review): assumes the liftOver routines leave both streams open
     * for the caller to close - confirm. */
    carefulClose(&mapped);
    carefulClose(&unmapped);

    webNewSection("Results");
    if (ct)
        {
        /* some records succesfully converted */
        cgiParagraph("");
        printf("Successfully converted %d record", ct);
        printf("%s: ", ct > 1 ? "s" : "");
        printf("<A HREF=%s TARGET=_blank>View Conversions</A>\n", mappedTn.forCgi);
        }
    if (errCt)
        {
        /* some records not converted */
        cgiParagraph("");
        printf("Conversion failed on %d record", errCt);
        printf("%s. &nbsp;&nbsp;&nbsp;", errCt > 1 ? "s" : "");
        printf("<A HREF=%s TARGET=_blank>Display failure file</A>&nbsp; &nbsp;\n",
                         unmappedTn.forCgi);
        printf("<A HREF=\"../cgi-bin/hgLiftOver?%s=1\" TARGET=_blank>Explain failure messages</A>\n", HGLFT_ERRORHELP_VAR);
        puts("<P>Failed input regions:\n");
        /* NOTE(review): the lines echoed here are written to the page
         * without HTML escaping - confirm they cannot carry user markup. */
        struct lineFile *errFile = lineFileOpen(unmappedTn.forCgi, TRUE);
        puts("<BLOCKQUOTE><PRE>\n");
        while (lineFileNext(errFile, &line, &lineSize))
            puts(line);
        lineFileClose(&errFile);
        puts("</PRE></BLOCKQUOTE>\n");
        }
    if (sameString(dataFormat, POSITION_FORMAT) && multiple)
        {
        puts("<BLOCKQUOTE><PRE>\n");
        puts("Note: multiple checkbox ignored since it is not supported for position format.");
        puts("</PRE></BLOCKQUOTE>\n");
        }
    }
webDataFormats();
webDownloads();
cartWebEnd();
}
Ejemplo n.º 13
0
void faToTwoBit(char **RinFiles, char **RoutFile)
/* Convert fasta input to a single 2 bit output file.
 *   RinFiles[0] - '@'-separated list of fasta file names; the string is
 *                 split in place by strtok.
 *   RoutFile[0] - name of the 2 bit file to write.
 * Records without sequence are skipped with a warning.  The globals
 * stripVersion, ignoreDups and noMaskFT control name trimming, duplicate
 * handling and masking. */
{
struct twoBit *twoBitList = NULL, *twoBit;
struct hash *uniqHash = newHash(18);
FILE *f;
char *inFiles = RinFiles[0];
char *outFile = RoutFile[0];
char *delim = "@";
char *fileName;

for (fileName = strtok(inFiles, delim); fileName != NULL;
        fileName = strtok(NULL, delim))
    {
    struct lineFile *lf = lineFileOpen(fileName, TRUE);
    struct dnaSeq seq;
    ZeroVar(&seq);
    while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
        {
        if (seq.size == 0)
            {
            warn("Skipping item %s which has no sequence.\n",seq.name);
            continue;
            }

        /* Cut the name at its first '.' to drop a version suffix. */
        if (stripVersion)
            {
            char *dot = strchr(seq.name, '.');
            if (dot != NULL)
                *dot = '\0';
            }

        /* A repeated name is either skipped or fatal. */
        if (hashLookup(uniqHash, seq.name))
            {
            if (ignoreDups)
                continue;
            errAbort("Duplicate sequence name %s", seq.name);
            }
        hashAdd(uniqHash, seq.name, NULL);

        if (noMaskFT)
            faToDna(seq.dna, seq.size);
        else
            unknownToN(seq.dna, seq.size);
        twoBit = twoBitFromDnaSeq(&seq, !noMaskFT);
        slAddHead(&twoBitList, twoBit);
        }
    lineFileClose(&lf);
    }

/* Records were pushed on the head; put them back in input order. */
slReverse(&twoBitList);
f = mustOpen(outFile, "wb");
twoBitWriteHeader(twoBitList, f);
twoBit = twoBitList;
while (twoBit != NULL)
    {
    twoBitWriteOne(twoBit, f);
    twoBit = twoBit->next;
    }
carefulClose(&f);
}
Ejemplo n.º 14
0
void testOneTable(struct htmlPage *trackPage, char *org, char *db,
	char *group, char *track, char *table)
/* Run the per-table tests on db.table.  Each distinct "db.table" is handled
 * at most once; later calls for the same name return immediately. */
{
/* Names already handled; kept across calls. */
static struct hash *seenTables = NULL;
char key[256];
struct htmlPage *page;

if (seenTables == NULL)
    seenTables = newHash(0);
safef(key, sizeof(key), "%s.%s", db, table);
if (hashLookup(seenTables, key))
    return;

hashAdd(seenTables, key, NULL);
verbose(1, "Testing %s %s %s %s %s\n", naForNull(org), db, group, track, table);
page = quickSubmit(trackPage, org, db, group,
	track, table, "selectTable", hgtaTable, table);
if (!isObsolete(table) && page != NULL)
    {
    struct htmlForm *form = htmlFormGet(page, "mainForm");
    if (form == NULL)
        {
        qaStatusSoftError(tablesTestList->status,
                "Couldn't get main form on tablePage for %s %s %s %s", db, group, track, table);
        }
    else
        {
        int hasBed, hasPrimary;

        testSchema(page, form, org, db, group, track, table);
        testSummaryStats(page, form, org, db, group, track, table);
        hasBed = outTypeAvailable(form, "bed");
        hasPrimary = outTypeAvailable(form, "primaryTable");

        /* If BED type is not available then the region will be ignored, and
         * we'll end up scanning whole table.  In that case only go on when
         * the table is not huge. */
        if (hasPrimary && (hasBed || tableSize(db, table) < 500000))
            {
            int rows = testAllFields(page, form, org, db, group, track, table);
            testOneField(page, form, org, db, group, track, table, rows);
            if (hasBed)
                {
                /* The remaining output types are only exercised for
                 * tables that offer BED output. */
                testOutSequence(page, form, org, db, group, track, table, rows);
                testOutBed(page, form, org, db, group, track, table, rows);
                testOutHyperlink(page, form, org, db, group, track, table, rows);
                testOutGff(page, form, org, db, group, track, table);
                if (rows > 0)
                    testOutCustomTrack(page, form, org, db, group, track, table);
                }
            }
        }
    htmlPageFree(&page);
    }
carefulCheckHeap();
}
Ejemplo n.º 15
0
static int bedToGffLines(struct bed *bedList, struct slName *exonFramesList, struct hTableInfo *hti,
			 int fieldCount, char *source, boolean gtf2StopCodons)
/* Translate a (list of) bed into gff and print out.
 * Note that field count (perhaps reduced by bitwise intersection)
 * can in effect override hti.
 *   exonFramesList - optional list walked in step with bedList; the name of
 *                    the element matching a bed is handed to computeFrames().
 *                    May be NULL, and may be shorter than bedList.
 *   source         - passed through to every gff line that is added.
 *   gtf2StopCodons - passed through to addCdsStartStop().
 * Returns the number of beds processed (0 for an empty list). */
{
if (! bedList)
    return 0;
struct hash *nameHash = newHash(20);   /* name -> times seen so far. */
struct bed *bed;
struct slName *exonFrames = exonFramesList;
int i, exonStart, exonEnd;
char txName[256];
int itemCount = 0;
static int namelessIx = 0;   /* Numbers unnamed items across calls. */

for (bed = bedList;  bed != NULL;  bed = bed->next)
    {
    /* Enforce unique transcript_ids: the first use of a name is kept as is,
     * repeats get a _dupN suffix. */
    if (bed->name != NULL)
	{
	struct hashEl *hel = hashLookup(nameHash, bed->name);
	int dupCount = (hel != NULL ? ptToInt(hel->val) : 0);
	if (dupCount > 0)
	    {
	    safef(txName, sizeof(txName), "%s_dup%d", bed->name, dupCount);
	    hel->val = intToPt(dupCount + 1);
	    }
	else
	    {
	    safef(txName, sizeof(txName), "%s", bed->name);
	    hashAddInt(nameHash, bed->name, 1);
	    }
	}
    else
	safef(txName, sizeof(txName), "tx%d", ++namelessIx);
    if (hti->hasBlocks && hti->hasCDS && fieldCount > 4)
	{
	/* first pass: compute frames, in order dictated by strand. */
	int startIndx = 0, stopIndx = 0;
	char *frames = NULL;
	char *ef = NULL;
	/* Test the cursor, not the list head: the cursor becomes NULL once
	 * the frames list runs out before the bed list does, and
	 * dereferencing it then would crash. */
	if (exonFrames != NULL)
	    ef = exonFrames->name;
	frames = computeFrames(bed, ef, &startIndx, &stopIndx);

	/* second pass: one exon (possibly CDS, start/stop_codon) per block. */
	for (i=0;  i < bed->blockCount;  i++)
	    {
	    exonStart = bed->chromStart + bed->chromStarts[i];
	    exonEnd = exonStart + bed->blockSizes[i];
	    if ((exonStart < bed->thickEnd) && (exonEnd > bed->thickStart))
		{
		int exonCdsStart = max(exonStart, bed->thickStart);
		int exonCdsEnd = min(exonEnd, bed->thickEnd);
		addCdsStartStop(bed, source, exonCdsStart, exonCdsEnd,
				frames, i, startIndx, stopIndx, gtf2StopCodons, txName);
		}
	    addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
	    }
	freeMem(frames);
	}
    else if (hti->hasBlocks && fieldCount > 4)
	{
	/* Blocks but no CDS: one exon line per block. */
	for (i=0;  i < bed->blockCount;  i++)
	    {
	    exonStart = bed->chromStart + bed->chromStarts[i];
	    exonEnd = exonStart + bed->blockSizes[i];
	    addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
	    }
	}
    else if (hti->hasCDS && fieldCount > 4)
	{
	/* CDS but no blocks: up to three lines, exon / CDS / exon.
	 * A thick range of 0,0 is collapsed onto chromStart, which makes the
	 * CDS empty.  Note that this writes to the caller's bed. */
	if (bed->thickStart == 0 && bed->thickEnd == 0)
	    bed->thickStart = bed->thickEnd = bed->chromStart;
	if (bed->thickStart > bed->chromStart)
	    {
	    addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->thickStart, '.', txName);
	    }
	if (bed->thickEnd > bed->thickStart)
	    addGffLineFromBed(bed, source, "CDS", bed->thickStart, bed->thickEnd, '0', txName);
	if (bed->thickEnd < bed->chromEnd)
	    {
	    addGffLineFromBed(bed, source, "exon", bed->thickEnd, bed->chromEnd, '.', txName);
	    }
	}
    else
	{
	/* Neither blocks nor CDS: the whole item is a single exon. */
	addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->chromEnd, '.', txName);
	}
    itemCount++;
    if (exonFrames)
    	exonFrames = exonFrames->next;
    }
hashFree(&nameHash);
return itemCount;
}
void update(struct g2cFile *old, struct g2cFile *up)
/* Merge the gene/cDNA hits of 'up' into 'old'.
 * For every hit of every gene in up: if the hit is already present in the
 * gene's family in old it is only counted; otherwise it is added to the gene
 * of the same name in old, creating that gene first when it does not exist.
 * old's gene list is re-sorted with cmpName at the end and a summary line is
 * printed to stdout.  Genes and hits added to old share their name strings
 * with up (the names are not copied). */
{
struct gene *oldGene, *upGene;
struct cdnaHit *oldHit, *upHit;
struct hash *geneHash;
/* Initialised because it is copied into new hits below even when the
 * alt-spliced branch, which never assigns it, was the only one taken so far;
 * it used to be read uninitialised in that case.
 * NOTE(review): this is a gene-hash element (or a leftover from an earlier
 * gene), while loadG2cFile() stores the cdna-hash element in hit->hel -
 * confirm what readers of hit->hel expect. */
struct hashEl *hel = NULL;
int sameHitCount = 0;
int newHitCount = 0;
int newGeneCount = 0;
int updatedGeneCount = 0;
int altCount = 0;
struct geneFamily smallFamily;   /* One-member family for ordinary genes. */
struct geneFamily *family;

printf("Updating %s with %s\n", old->name, up->name);

/* Hash the existing gene names for faster lookup. */
geneHash = newHash(12);
for (oldGene = old->geneList; oldGene != NULL; oldGene = oldGene->next)
    hashAdd(geneHash, oldGene->name, oldGene);

for (upGene = up->geneList; upGene != NULL; upGene = upGene->next)
    {
    boolean changedGene = FALSE;
    /* Work out which existing genes count as "the same" as upGene. */
    if (isAltSplicedName(upGene->name))
        {
        family = getAltFamily(geneHash, upGene->name);
        ++altCount;
        }
    else
        {
        hel = hashLookup(geneHash, upGene->name);
        if (hel != NULL)
            {
            smallFamily.gene = hel->val;
            smallFamily.next = NULL;
            family = &smallFamily;
            }
        else
            family = NULL;
        }

    /* Set corresponding gene in old file to NULL until we
     * need to find it. */
    oldGene = NULL;
    for (upHit = upGene->hitList; upHit != NULL; upHit = upHit->next)
        {
        if ((oldHit = findHitInFamily(family, upHit->name)) != NULL)
            ++sameHitCount;
        else
            {
            if (oldGene == NULL)
                {
                /* We haven't found corresponding gene yet.  First
                 * look for it in the family. */
                struct geneFamily *member;
                for (member = family; member != NULL; member = member->next)
                    {
                    if (strcmp(member->gene->name, upGene->name) == 0)
                        {
                        oldGene = member->gene;
                        break;
                        }
                    }
                /* The corresponding gene doesn't exist yet. We
                 * have to make it up and hang it on the genelist
                 * for the file, the hash list, and the family list. */
                if (oldGene == NULL)
                    {
                    oldGene = alloc(sizeof(*oldGene));
                    oldGene->name = upGene->name;
                    slAddHead(&old->geneList, oldGene);
                    hashAdd(geneHash, oldGene->name, oldGene);
                    member = alloc(sizeof(*member));
                    member->gene = oldGene;
                    /* Only the local 'family' pointer sees the new head. */
                    slAddHead(&family, member);
                    ++newGeneCount;
                    }
                }
            oldHit = alloc(sizeof(*oldHit));
            oldHit->name = upHit->name;
            oldHit->hel = hel;
            slAddHead(&oldGene->hitList, oldHit);
            ++newHitCount;
            changedGene = TRUE;
            }
        }
    if (changedGene)
        ++updatedGeneCount;
    }
slSort(&old->geneList, cmpName);
printf("Updated %d genes (including %d alt spliced ones) with %d cdna hits (%d hits unchanged) %d new genes\n",
    updatedGeneCount, altCount, newHitCount, sameHitCount, newGeneCount);
}
void txGeneXref(char *genomeDb, char *uniProtDb, char *genePredFile, char *infoFile, char *pickFile, 
	char *evFile, char *outFile)
/* txGeneXref - Make kgXref type table for genes.
 *   genomeDb     - database queried for refLink, gbCdnaInfo, geneName and
 *                  description.
 *   uniProtDb    - database handed to the sp* lookup helpers.
 *   genePredFile - passed to makeGeneToProtHash to map gene name to protein ID.
 *   infoFile     - txInfo records; one output line is written per record.
 *   pickFile     - tab-separated cdsPick records, hashed by name.
 *   evFile       - txRnaAccs evidence records, hashed by name.
 *   outFile      - output file, eight tab-separated columns per gene:
 *                  kgID, mRNA, spID, spDisplayID, geneSymbol, refseq,
 *                  protAcc, description.
 * Aborts if a gene outside the "antibodyParts" category has no evidence. */
{
/* Load picks into hash.  We don't use cdsPicksLoadAll because empty fields
 * cause that autoSql-generated routine problems. */
struct hash *pickHash = newHash(18);
struct hash *geneToProtHash = makeGeneToProtHash(genePredFile);
struct cdsPick *pick;
struct lineFile *lf = lineFileOpen(pickFile, TRUE);
char *row[CDSPICK_NUM_COLS];
while (lineFileRowTab(lf, row))
    {
    pick = cdsPickLoad(row);
    removePickVersions(pick);
    hashAdd(pickHash, pick->name, pick);
    }
lineFileClose(&lf);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);

/* Read in info file, and loop through it to make out file. */
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
FILE *f = mustOpen(outFile, "w");
for (info = infoList; info != NULL; info = info->next)
    {
    char *kgID = info->name;
    char *mRNA = "";
    char *spID = "";
    char *spDisplayID = "";
    char *geneSymbol = NULL;
    char *refseq = "";
    char *protAcc = "";
    char *description = NULL;
    char query[256];
    char *proteinId = hashMustFindVal(geneToProtHash, info->name);
    boolean isAb = sameString(info->category, "antibodyParts");
    pick = hashFindVal(pickHash, info->name);
    ev = hashFindVal(evHash, info->name);
    if (pick != NULL)
        {
        /* Fill in the relatively straightforward fields. */
        refseq = pick->refSeq;
        if (info->orfSize > 0)
            {
            protAcc = pick->refProt;
            spID = proteinId;
            /* When the protein ID is just the RefSeq protein, fall back to
             * the UniProt accession recorded in the pick. */
            if (sameString(protAcc, spID))
                spID = pick->uniProt;
            if (spID[0] != 0)
                spDisplayID = spAnyAccToId(uConn, spID);
            }

        /* Fill in gene symbol and description from refseq if possible. */
        if (refseq[0] != 0)
            {
            struct sqlResult *sr;
            char **refRow;
            /* sqlSafef escapes the accession, which comes from an input file. */
            sqlSafef(query, sizeof(query),
                "select name,product from refLink where mrnaAcc='%s'", refseq);
            sr = sqlGetResult(gConn, query);
            refRow = sqlNextRow(sr);
            if (refRow != NULL)
                {
                geneSymbol = cloneString(refRow[0]);
                if (!sameWord("unknown protein", refRow[1]))
                    description = cloneString(refRow[1]);
                }
            sqlFreeResult(&sr);
            }

        /* If need be try uniProt for gene symbol and description. */
        if (spID[0] != 0 && (geneSymbol == NULL || description == NULL))
            {
            char *acc = spLookupPrimaryAcc(uConn, spID);
            if (description == NULL)
                description = spDescription(uConn, acc);
            if (geneSymbol == NULL)
                {
                struct slName *nameList = spGenes(uConn, acc);
                if (nameList != NULL)
                    geneSymbol = cloneString(nameList->name);
                slFreeList(&nameList);
                }
            }
        }

    /* If it's an antibody fragment use that as name. */
    if (isAb)
        {
        geneSymbol = cloneString("abParts");
        description = cloneString("Parts of antibodies, mostly variable regions.");
        }

    /* Primary mRNA accession comes from the evidence; only antibody
     * fragments are allowed to lack evidence. */
    if (ev == NULL)
        {
        mRNA = cloneString("");
        if (!isAb)
            errAbort("%s is in %s but not %s\n", info->name, infoFile, evFile);
        }
    else
        {
        mRNA = cloneString(ev->primary);
        chopSuffix(mRNA);
        }

    /* Still no joy? Try genbank RNA records. */
    if ((geneSymbol == NULL || description == NULL) && ev != NULL)
        {
        int i;
        for (i=0; i<ev->accCount; ++i)
            {
            char *acc = ev->accs[i];
            chopSuffix(acc);	/* modifies the evidence record in place */
            if (geneSymbol == NULL)
                {
                sqlSafef(query, sizeof(query), 
                    "select geneName.name from gbCdnaInfo,geneName "
                    "where geneName.id=gbCdnaInfo.geneName and gbCdnaInfo.acc = '%s'", acc);
                geneSymbol = sqlQuickString(gConn, query);
                /* "n/a" is a placeholder, not a usable symbol. */
                if (geneSymbol != NULL && sameString(geneSymbol, "n/a"))
                    geneSymbol = NULL;
                }
            if (description == NULL)
                {
                sqlSafef(query, sizeof(query), 
                    "select description.name from gbCdnaInfo,description "
                    "where description.id=gbCdnaInfo.description "
                    "and gbCdnaInfo.acc = '%s'", acc);
                description = sqlQuickString(gConn, query);
                if (description != NULL && sameString(description, "n/a"))
                    description = NULL;
                }
            }
        }

    /* Last resort is the mRNA accession.  The symbol gets its own copy
     * because it is edited below and must not alter the mRNA or
     * description columns. */
    if (geneSymbol == NULL)
        geneSymbol = cloneString(mRNA);
    if (description == NULL)
        description = mRNA;

    /* Get rid of some characters that will cause havoc downstream. */
    stripChar(geneSymbol, '\'');
    subChar(geneSymbol, '<', '[');
    subChar(geneSymbol, '>', ']');

    /* Abbreviate geneSymbol if too long */
    if (strlen(geneSymbol) > 40)
        strcpy(geneSymbol+37, "...");

    fprintf(f, "%s\t", kgID);
    fprintf(f, "%s\t", mRNA);
    fprintf(f, "%s\t", spID);
    fprintf(f, "%s\t", spDisplayID);
    fprintf(f, "%s\t", geneSymbol);
    fprintf(f, "%s\t", refseq);
    fprintf(f, "%s\t", protAcc);
    fprintf(f, "%s\n", description);
    }
carefulClose(&f);
sqlDisconnect(&uConn);
sqlDisconnect(&gConn);
}
Ejemplo n.º 18
0
struct hash *agpLoadAll(char *agpFile)
/* load AGP entries into a hash of AGP lists, one per chromosome */
{
struct hash *agpHash = newHash(0);
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *words[9];
int lastPos = 0;
int wordCount;
struct agpFrag *agpFrag;
struct agpGap *agpGap;
char *chrom;
struct agp *agp;
struct hashEl *hel;

while ((wordCount = lineFileChopNext(lf, words, ArraySize(words))) != 0)
    {
    lineFileExpectAtLeast(lf, 8, wordCount);
    chrom = words[0];
    if (!hashFindVal(agpHash, chrom))
        lastPos = 1;
    AllocVar(agp);
    if (words[4][0] != 'N' && words[4][0] != 'U')
        {
        /* not a gap */
        lineFileExpectWords(lf, 9, wordCount);
        agpFrag = agpFragLoad(words);
        if (agpFrag->chromStart != lastPos)
            errAbort(
               "Frag start (%d, %d) doesn't match previous end line %d of %s\n",
                     agpFrag->chromStart, lastPos, lf->lineIx, lf->fileName);
        if (agpFrag->chromEnd - agpFrag->chromStart != 
                        agpFrag->fragEnd - agpFrag->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n",
                    agpFrag->chrom, agpFrag->frag, lf->lineIx, lf->fileName);
        lastPos = agpFrag->chromEnd + 1;
        agp->entry = agpFrag;
        agp->isFrag = TRUE;
        }
    else
        {
        /* gap */
        lineFileExpectWords(lf, 8, wordCount);
        agpGap = agpGapLoad(words);
        if (agpGap->chromStart != lastPos)
            errAbort("Gap start (%d, %d) doesn't match previous end line %d of %s\n",
                     agpGap->chromStart, lastPos, lf->lineIx, lf->fileName);
        lastPos = agpGap->chromEnd + 1;
        agp->entry = agpGap;
        agp->isFrag = FALSE;
        }
    if ((hel = hashLookup(agpHash, chrom)) == NULL)
        hashAdd(agpHash, chrom, agp);
    else
        slAddHead(&(hel->val), agp);
    }
#ifndef DEBUG
    {
struct hashCookie cookie;
struct hashEl *hel;
cookie = hashFirst(agpHash);
while ((hel = hashNext(&cookie)) != NULL)
    {
    struct agp *agpList;
    agpList = (struct agp *)hel->val;
    /*
    for (agp = agpList; agp != NULL; agp = agp->next)
        printf("isFrag: %d\n", agp->isFrag);
        */
    }
    }
#endif
/* reverse AGP lists */
//hashTraverseVals(agpHash, slReverse);
#ifndef DEBUG
    {
struct hashCookie cookie;
struct hashEl *hel;
cookie = hashFirst(agpHash);
while ((hel = hashNext(&cookie)) != NULL)
    {
    struct agp *agpList;
    slReverse(&hel->val);
    agpList = hel->val;
    /*
    agpList = (struct agp *)hel->val;
    slReverse(&agpList);
    hashRemove(agpHash, hel->name);
    hashAdd(agpHash, hel->name, agpList);
    */
    /*
    for (agp = agpList; agp != NULL; agp = agp->next)
        printf("isFrag: %d\n", agp->isFrag);
        */
    }
    }
#endif
return agpHash;
}
Ejemplo n.º 19
0
int main(int argc, char *argv[])
/* Read each file named on the command line and fold its contig pairs into
 * per-BAC tracks, then sort the BAC list and print statistics.
 * Each non-blank input line must have "glues" as its second word and the BAC
 * name as its third.  Starting at the fifth word come groups of five words:
 * contig index, strand, direction, a "(start-end)" range and a score.
 * Lines are read into a 1024-byte buffer, so longer lines are split by
 * fgets.  Aborts on malformed lines. */
{
struct hash *bacHash;
char line[1024];
int lineCount;
char *words[256];
int wordCount;
int fileIx;
char *fileName;
FILE *f;

if (argc < 2)
    usage();
bacHash = newHash(16);

for (fileIx = 1; fileIx < argc; ++fileIx)
    {
    fileName = argv[fileIx];
    uglyf("Processing %s\n", fileName);
    f = mustOpen(fileName, "r");
    lineCount = 0;
    while (fgets(line, sizeof(line), f))
        {
        ++lineCount;
        wordCount = chopLine(line, words);
        if (wordCount == ArraySize(words))
            errAbort("Too many words line %d of %s\n", lineCount, fileName);
        if (wordCount != 0)
            {
            char *bacName;
            int cIx;
            struct contigTrack *ctList = NULL, *ct;
            struct bacTrack *bt;
            struct hashEl *hel;

            /* Check line syntax and parse it.  The word count is checked
             * first so words[1] and words[2] are known to be valid. */
            if (wordCount < 3 || !sameString(words[1], "glues"))
                errAbort("Bad format line %d of %s\n", lineCount, fileName);
            bacName = words[2];
            for (cIx = 4; cIx < wordCount; cIx += 5)
                {
                char *parts[3];
                int partCount;

                /* A contig record is five words; refuse a truncated one
                 * rather than reading past the words that were parsed. */
                if (cIx + 4 >= wordCount)
                    errAbort("Bad format line %d of %s\n", lineCount, fileName);
                AllocVar(ct);
                ct->ix = atoi(words[cIx]);
                ct->strand = words[cIx+1][0];
                ct->dir = words[cIx+2][0];
                partCount = chopString(words[cIx+3], "(-)", parts, ArraySize(parts));
                if (partCount != 2)
                    errAbort("Bad format line %d of %s\n", lineCount, fileName);
                ct->start = atoi(parts[0]);
                ct->end = atoi(parts[1]);
                ct->cookedScore = atof(words[cIx+4]);
                slAddHead(&ctList, ct);
                }
            slSort(&ctList, cmpContigTrack);

            /* Lookup bacTrack and make it if new. */
            hel = hashLookup(bacHash, bacName);
            if (hel == NULL)
                {
                AllocVar(bt);
                hel = hashAdd(bacHash, bacName, bt);
                /* Point at the hash's copy of the name; bacName lives in
                 * the line buffer and is overwritten by the next read. */
                bt->name = hel->name;
                slAddHead(&bacList, bt);
                }
            else
                {
                bt = hel->val;
                }

            /* Process pairs into bacTrack. */
            addPairs(bt, ctList);
            slFreeList(&ctList);
            }
        }
    fclose(f);
    }
slSort(&bacList, cmpBacTrack);

printStats();
return 0;
}
Ejemplo n.º 20
0
void gensatImageDownload(char *gensatXml, char *outDir, char *outLog)
/* gensatImageDownload - Download images from gensat guided by xml file.
 *   gensatXml - XML file read as a list of GensatImage elements.
 *   outDir    - directory under which each image is stored as a .jpg.
 *   outLog    - log file, opened in append mode.
 * Images whose target file already exists are skipped.  File names ending
 * in ".jpg" are fetched from the ftp site, everything else through the jpeg
 * cgi using the image id. */
{
struct xap *xap;
struct gsGensatImage *image;
char *ftpUri = "ftp://ftp.ncbi.nih.gov/pub/gensat";
char *jpgCgiUri = "http://www.ncbi.nlm.nih.gov/projects/gensat/gensat_img.cgi?action=image&mode=full&fmt=jpeg&id=";
char finalJpg[PATH_LEN];
char finalDir[PATH_LEN];
char wgetSource[PATH_LEN];
struct hash *dirHash = newHash(16);	/* directories already created */
struct dyString *mkdir = dyStringNew(0);
int imageIx = 0;

fLog = mustOpen(outLog, "a");
fprintf(fLog, "starting gensatImageDownload from %s to %s\n", gensatXml, outDir);
xap = xapListOpen(gensatXml, "GensatImageSet", gsStartHandler, gsEndHandler);

while ((image = xapListNext(xap, "GensatImage")) != NULL)
    {
    int id = image->gsGensatImageId->text;
    char *imageFile = image->gsGensatImageImageInfo->gsGensatImageImageInfoFullImg
    			->gsGensatImageInfo->gsGensatImageInfoFilename->text;

    /* Mangle file name a little */
    subChar(imageFile, '(', '_');
    stripChar(imageFile, ')');

    /* Figure out name of jpeg file in outDir. */
    verbose(1, "image %d, id %d\n", ++imageIx, id);
    safef(finalJpg, sizeof(finalJpg), "%s/%s", outDir, imageFile);
    stripString(finalJpg, ".full"); /* Image magick can't handle two suffixes */
    chopSuffix(finalJpg);
    /* Bounded append: a path with no suffix to chop could already fill the
     * buffer, and a plain strcat would then write past its end. */
    safecat(finalJpg, sizeof(finalJpg), ".jpg");

    /* Create directory that it goes in if necessary */
    splitPath(finalJpg, finalDir, NULL, NULL);
    if (!hashLookup(dirHash, finalDir))
        {
        hashAdd(dirHash, finalDir, NULL);
        dyStringClear(mkdir);
        /* NOTE(review): the path reaches the shell unquoted, so spaces or
         * shell metacharacters in outDir or the XML file name would break
         * or alter this command. */
        dyStringPrintf(mkdir, "mkdir -p %s", finalDir);
        if (system(mkdir->string) != 0)
            errAbort("Couldn't %s", mkdir->string);
        }

    /* Download it - either directly via ftp, or indirectly via cgi. */
    if (fileExists(finalJpg))
        {
        verbose(1, "already have %s\n", imageFile);
        fprintf(fLog, "%s already downloaded\n", finalJpg);
        }
    else if (endsWith(imageFile, ".jpg"))
        {
        safef(wgetSource, sizeof(wgetSource), "%s/%s", ftpUri, imageFile);
        if (safeGetOne(wgetSource, finalJpg))
            fprintf(fLog, "Got via ftp %s\n", finalJpg);
        }
    else
        {
        safef(wgetSource, sizeof(wgetSource), "%s%d", jpgCgiUri, id);
        if (safeGetOne(wgetSource, finalJpg))
            fprintf(fLog, "Got via cgi %s\n", finalJpg);
        }
    }
xapFree(&xap);
dyStringFree(&mkdir);
hashFree(&dirHash);
carefulClose(&fLog);
}
Ejemplo n.º 21
0
/*	convolve() - perform the task on the input data
 *	Each file named in argv[1..argc-1] is read into a histogram hash:
 *	every whitespace-separated value (text after '#' is dropped) becomes
 *	one bin, numbered in the order read across the whole file.  When the
 *	global "logs" is set the values are taken as log2 probabilities,
 *	otherwise as probabilities.  The histogram is then convolved with
 *	itself convolve_count times.
 *
 *	I would like to rearrange this business here, and instead of
 *	reading in the data and leaving it in the hash for all other
 *	routines to work with, it would be best to get it immediately
 *	into an array.  That makes the work of the other routines much
 *	easier.
 */
static void convolve(int argc, char *argv[])
{
int i;
struct lineFile *lf;			/* for line file utilities	*/

for (i = 1; i < argc; ++i)
    {
    int lineCount = 0;			/* counting input lines	*/
    char *line = (char *)NULL;		/* to receive data input line	*/
    char *words[128];			/* to split data input line	*/
    int wordCount = 0;			/* result of split	*/
    struct hash *histo0;	/*	first histogram	*/
    struct hash *histo1;	/*	second histogram	*/
    int medianBin0 = 0;		/*	bin holding the largest log2 seen */
    double medianLog_2 = -500.0;	/*	largest log2 seen so far	*/
    int bin = 0;		/*	0 to N-1 for N bins	*/
    int convolutions = 0;	/*	loop counter for # of convolutions */

    histo0 = newHash(0);

    lf = lineFileOpen(argv[i], TRUE);	/*	input file	*/
    verbose(1, "Processing %s\n", argv[i]);
    while (lineFileNext(lf, &line, NULL))
	{
	int j;			/*	loop counter over words	*/
	int inputValuesCount = 0;
	struct histoGram *hg;	/*	an allocated hash element	*/

	++lineCount;
	chopPrefixAt(line, '#'); /* ignore any comments starting with # */
	if (strlen(line) < 3)	/*	anything left on this line ? */
	    continue;		/*	no, go to next line	*/
	wordCount = chopByWhite(line, words, 128);
	if (wordCount < 1)
	    warn("Expecting at least a word at line %d, file: %s, found %d words",
		lineCount, argv[i], wordCount);
	if (wordCount == 128)
	    warn("May have more than 128 values at line %d, file: %s", lineCount, argv[i]);

	verbose(2, "Input data read from file: %s\n", argv[i]);
	for (j = 0; j < wordCount; ++j)
	    {
	    char binName[128];
	    double dataValue;
	    double probInput;
	    double log_2;
	    dataValue = strtod(words[j], NULL);
	    ++inputValuesCount;
	    if (logs)
		{
		log_2 = dataValue;
		probInput = pow(2.0,log_2);
		}
	    else if (dataValue > 0.0)
		{
		log_2 = log2(dataValue);
		probInput = dataValue;
		}
	    else
		{
		log_2 = -500.0;	/*	arbitrary limit	*/
		probInput = pow(2.0,log_2);
		}
	    if (log_2 > medianLog_2)
		{
		medianLog_2 = log_2;
		medianBin0 = bin;
		}
	    verbose(2, "bin %d: %g %0.5g\n",
		    inputValuesCount-1, probInput, log_2);

	    AllocVar(hg);	/*	the histogram element	*/
	    hg->bin = bin;
	    hg->prob = probInput;
	    hg->log_2 = log_2;
	    snprintf(binName, sizeof(binName), "%d", hg->bin);
	    hashAdd(histo0, binName, hg);

	    ++bin;
	    }	/*	for each word on an input line	*/
	}	/*	for each line in a file	*/
    lineFileClose(&lf);

    /*	file read complete, echo input	*/
    if (verboseLevel() >= 2)
	printHistogram(histo0, medianBin0);

    /*	perform convolutions to specified count
     *	the iteration does histo0 with itself to produce histo1
     *	Then histo0 is freed and histo1 copied to it for the
     *	next loop.
     */
    for (convolutions = 0; convolutions < convolve_count; ++convolutions)
	{
	int medianBin;
	histo1 = newHash(0);
	medianBin = iteration(histo0, histo1);
	if (verboseLevel() >= 2)
	    printHistogram(histo1, medianBin);
	freeHashAndVals(&histo0);
	histo0 = histo1;
	}

    /*	done with this file, release the last histogram	*/
    freeHashAndVals(&histo0);
    }		/*	for each input file	*/
}	/*	convolve()	*/
Ejemplo n.º 22
0
void txGeneAlias(char *genomeDb, char *uniProtDb, char *xrefFile, 
	char *evFile, char *oldToNew, char *aliasFile, char *protAliasFile)
/* txGeneAlias - Make kgAlias and kgProtAlias tables.
 *   genomeDb      - database queried for gbCdnaInfo and geneName.
 *   uniProtDb     - database queried for otherAcc and through the sp* helpers.
 *   xrefFile      - tab-separated kgXref records, one line per gene.
 *   evFile        - txRnaAccs evidence records, hashed by name.
 *   oldToNew      - passed to loadNewToOldHash to map new gene IDs to old ones.
 *   aliasFile     - output written through outAlias.
 *   protAliasFile - output written through outProt. */
{
/* Read and hash oldToNew */
struct hash *newToOldHash = loadNewToOldHash(oldToNew);

/* Load evidence into hash */
struct hash *evHash = newHash(18);
struct txRnaAccs *ev, *evList = txRnaAccsLoadAll(evFile);
for (ev = evList; ev != NULL; ev = ev->next)
    hashAdd(evHash, ev->name, ev);

/* Open connections to our databases */
struct sqlConnection *gConn = sqlConnect(genomeDb);
struct sqlConnection *uConn = sqlConnect(uniProtDb);
struct sqlResult *sr;
char **row;
char query[256];

/* Open files. */
struct lineFile *lf = lineFileOpen(xrefFile, TRUE);
FILE *fAlias = mustOpen(aliasFile, "w");
FILE *fProt = mustOpen(protAliasFile, "w");

/* Stream through xref file, which has much of the info we need,
 * and which contains a line for each gene. */
char *words[KGXREF_NUM_COLS];
while (lineFileRowTab(lf, words))
    {
    /* Load the xref, and output most of its fields as aliases. */
    struct kgXref *x = kgXrefLoad(words);
    char *id = x->kgID;
    outAlias(fAlias, id, x->kgID);
    outAlias(fAlias, id, x->mRNA);
    outAlias(fAlias, id, x->spID);
    outAlias(fAlias, id, x->spDisplayID);
    outAlias(fAlias, id, x->geneSymbol);
    outAlias(fAlias, id, x->refseq);
    outAlias(fAlias, id, x->protAcc);
    char *old = hashFindVal(newToOldHash, id);
    if (old != NULL)
        outAlias(fAlias, id, old);

    /* If we've got a uniProt ID, use that to get more info from uniProt. */
    char *acc = x->spID;
    if (acc[0] != 0)
        {
        /* Get current accession and output a bunch of easy protein aliases. */
        acc = spLookupPrimaryAcc(uConn, acc);
        outProt(fProt, id, acc, acc);
        outProt(fProt, id, acc, x->spDisplayID);
        outProt(fProt, id, acc, x->geneSymbol);
        outProt(fProt, id, acc, x->protAcc);
        if (old != NULL)
            outProt(fProt, id, acc, old);

        /* Throw in old swissProt accessions. */
        sqlSafef(query, sizeof(query), "select val from otherAcc where acc = '%s'", acc);
        sr = sqlGetResult(uConn, query);
        while ((row = sqlNextRow(sr)) != NULL)
            {
            outAlias(fAlias, id, row[0]);
            outProt(fProt, id, acc, row[0]);
            }
        /* Release the result before uConn is used for the next query. */
        sqlFreeResult(&sr);

        /* Throw in gene names that SwissProt knows about */
        struct slName *gene, *geneList = spGenes(uConn, acc);
        for (gene = geneList; gene != NULL; gene = gene->next)
            {
            outAlias(fAlias, id, gene->name);
            outProt(fProt, id, acc, gene->name);
            }
        slFreeList(&geneList);
        }

    /* Throw in gene names from genbank. */
    /* At some point we may want to restrict this to the primary transcript in a cluster. */
    ev = hashFindVal(evHash,  id);
    if (ev != NULL)
        {
        int i;
        for (i=0; i<ev->accCount; ++i)
            {
            /* Look up each RNA accession from the evidence, not the
             * UniProt accession held in acc.
             * NOTE(review): assumes gbCdnaInfo.acc is stored without the
             * version suffix, so it is chopped here - confirm. */
            char *rnaAcc = ev->accs[i];
            chopSuffix(rnaAcc);
            sqlSafef(query, sizeof(query), "select geneName from gbCdnaInfo where acc='%s'", rnaAcc);
            int nameId = sqlQuickNum(gConn, query);
            if (nameId != 0)
                {
                char name[64];
                sqlSafef(query, sizeof(query), "select name from geneName where id=%d", nameId);
                if (sqlQuickQuery(gConn, query, name, sizeof(name)))
                    outAlias(fAlias, id, name);
                }
            }
        }

    kgXrefFree(&x);
    }

lineFileClose(&lf);
carefulClose(&fAlias);
carefulClose(&fProt);
sqlDisconnect(&uConn);
sqlDisconnect(&gConn);
}
Ejemplo n.º 23
0
void readPatch(char *fileName, struct hash *cloneHash, 
	struct ntContig **retNtList, struct hash **retNtHash)
/* Read nt.agp file into clone/hash.  */
{
struct ntContig *ntList = NULL, *nt = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[9];
struct agpFrag frag;
struct clone *clone, *ntClone, *lastClone = NULL;
struct cloneRef *ref;
struct hash *ntHash = newHash(0);
char cloneName[128];
char fragName[128];
char c;
int ntOrder = 0;

while (lineFileRow(lf, row))
    {
    agpFragStaticLoad(row, &frag);
    // file is 1-based but agpFragLoad() now assumes 0-based:
    frag.chromStart -= 1;
    frag.fragStart  -= 1;
    if (nt == NULL || !sameString(frag.chrom, nt->name))
        {
	AllocVar(nt);
	slAddHead(&ntList, nt);
	if (hashLookup(ntHash, frag.chrom) != NULL)
	    errAbort("NT contig %s repeated line %d of %s", row[0], lf->lineIx, lf->fileName);
	hashAddSaveName(ntHash, frag.chrom, nt, &nt->name);
	lastClone = NULL;
	ntOrder = 0;
	}
    strcpy(cloneName, frag.frag);
    chopSuffix(cloneName);
    clone = hashMustFindVal(cloneHash, cloneName);
    clone->ntStart = frag.chromStart;
    clone->ntEnd = frag.chromEnd;
    if (clone->nt != NULL)
	{
        warn("Clone %s trying to be in two NT contigs (%s and %s) line %d of %s",
		clone->name, clone->nt->name, nt->name, lf->lineIx, lf->fileName);
	nt->problem = TRUE;
	}
    clone->nt = nt;
    c = frag.strand[0];
    if (c == '-')
	clone->ntOrientation = -1;
    else if (c == '+')
	clone->ntOrientation = +1;
    else
	errAbort("Expecting +1 or -1 field 5, line %d, file %s", lf->lineIx, lf->fileName);
    c = frag.type[0];
    if (c == 'F' || c == 'D' || c == 'P')
	clone->seqType =  c;
    else
	errAbort("Expecting F, D, or P  field 6, line %d, file %s", lf->lineIx, lf->fileName);
    sprintf(fragName, "%s_1", frag.frag);
    clone->fragName = cloneString(fragName);
    clone->goldStart = frag.fragStart;
    clone->goldEnd = frag.fragEnd;
    clone->ntOrder = ntOrder++;

    /* Add ref to NT. */
    AllocVar(ref);
    ref->ref = clone;
    slAddTail(&nt->cloneList, ref);

    /* Do a few tests. */
    if (clone->goldStart >= clone->goldEnd)
	{
	warn("Clone %s end before start (%d before %d) line %d of %s", 
		clone->name, clone->goldStart, clone->goldEnd, lf->lineIx, lf->fileName);
	nt->problem = TRUE;
	}
    if (clone->ntStart >= clone->ntEnd)
	{
	warn("Clone %s NT end before NT start line %d of %s", 
		clone->name, lf->lineIx, lf->fileName);
	nt->problem = TRUE;
	}
    if (clone->goldEnd > clone->size)
	{
	if (sameString(clone->startFrag, clone->endFrag))
	    {
	    warn("Clone %s end position %d, clone size %d, line %d of %s", 
		clone->name, clone->goldEnd, clone->size, lf->lineIx, lf->fileName);
	    nt->problem = TRUE;
	    }
	}
    if (clone->ntEnd - clone->ntStart != clone->goldEnd - clone->goldStart)
        {
	warn("Size not the same in NT contig as in clone %s (%d vs %d) line %d of %s",
		clone->name,
		clone->ntEnd - clone->ntStart, clone->goldEnd-clone->goldStart,
		lf->lineIx, lf->fileName);
	nt->problem = TRUE;
	}
    nt->sumSize += clone->goldEnd - clone->goldStart;
    ntClone = hashFindVal(cloneHash, nt->name);
    if (ntClone != NULL && clone->ntEnd > ntClone->size)
	{
	warn("Clone %s NT end position %d, NT size %d, line %d of %s", 
	    clone->name, clone->ntEnd, ntClone->size, lf->lineIx, lf->fileName);
	nt->problem = TRUE;
	}
    if (ntClone != NULL)
	nt->size = ntClone->size;
    else
        nt->size = clone->size;		/* This happens for single-clone NT contigs only. */
    if (lastClone != NULL)
        {
	if (lastClone->ntEnd != clone->ntStart)
	    {
	    warn("last clone (%s)'s end doesn't match with current clone (%s)'s start line %d of %s",
	    	lastClone->name, clone->name, lf->lineIx, lf->fileName);
	    }
	}
    lastClone = clone;
    }

lineFileClose(&lf);
slReverse(&ntList);

for (nt = ntList; nt != NULL; nt = nt->next)
    {
    if (nt->sumSize != nt->size)
        {
	warn("Sum of fragments of %s is %d, but size is supposed to be %d",
		nt->name, nt->sumSize, nt->size);
	nt->problem = TRUE;
	}
    }
*retNtList = ntList;
*retNtHash = ntHash;
}
static struct joinedTables *joinedTablesCreate( struct joiner *joiner,
        char *primaryDb, char *primaryTable,
        struct joinerDtf *fieldList, struct joinerDtf *filterTables,
        int maxRowCount, struct region *regionList)
/* Create joinedTables structure from fields.
 * The fields and filter tables are bundled by table, the tables are ordered
 * with primaryDb.primaryTable first, and a route through all of them is
 * looked up in the joiner.  The first table is loaded with tjLoadFirst and
 * every other table on the route is loaded with tjLoadSome, keyed on the
 * values already loaded.  Aborts if no route is found. */
{
    struct tableJoiner *tj, *tjList = bundleFieldsIntoTables(fieldList, filterTables);
    struct joinerPair *routeList = NULL, *route;
    struct joinedTables *joined = NULL;
    struct hash *tableHash = newHash(8);
    int totalKeyCount = 0, totalFieldCount = 0;
    int curKeyCount = 0, curFieldCount = 0;
    struct joinerDtf *tableDtfs;

    /* Index the table joiners by "db.table". */
    for (tj = tjList; tj != NULL; tj = tj->next)
    {
        char buf[256];
        safef(buf, sizeof(buf), "%s.%s", tj->database, tj->table);
        hashAdd(tableHash, buf, tj);
    }
    orderTables(&tjList, primaryDb, primaryTable);
    tableDtfs = tableToDtfs(tjList);
    routeList = joinerFindRouteThroughAll(joiner, tableDtfs);
    if (routeList == NULL)
    {
        /* With a single table there is no second entry to name, so guard
         * against dereferencing a NULL next pointer while reporting. */
        char *otherTable = (tjList->next != NULL) ? tjList->next->table : "(none)";
        errAbort("Can't find route from %s to %s via all.joiner", primaryTable, otherTable);
    }
    addOutKeys(tableHash, routeList, &tjList);

    /* If first table is non-positional then it will lead to a lot
     * of n/a's in later fields unless we treat the genome-wide. */
    if (!isPositional(tjList->database, tjList->table))
        regionList = getRegionsFullGenome();
    /* Count up total fields and keys. */
    for (tj = tjList; tj != NULL; tj = tj->next)
    {
        totalKeyCount += slCount(tj->keysOut);
        totalFieldCount += slCount(tj->fieldList);
    }

    /* Do first table.  This one uses identifier hash if any. */
    joined = tjLoadFirst(regionList,
                         tjList, totalFieldCount, totalKeyCount, maxRowCount);
    curKeyCount = slCount(tjList->keysOut);
    curFieldCount = slCount(tjList->fieldList);

    /* Follow routing list for rest.  The route must start at the first table. */
    if (!sameString(tjList->database, routeList->a->database))
        internalErr();
    if (!sameString(tjList->table, routeList->a->table))
        internalErr();
    for (route = routeList; route != NULL; route = route->next)
    {
        struct tableJoiner *target = findTableJoiner(tjList,
                                     route->b->database, route->b->table);
        struct joinerField *jfA = NULL, *jfB = NULL;
        if (target == NULL)
            internalErr();
        jfA = findJoinerField(route->identifier, route->a);
        if (jfA == NULL)
            internalErr();
        jfB = findJoinerField(route->identifier, route->b);
        if (jfB == NULL)
            internalErr();
        if (!target->loaded)
        {
            int keyIx;
            struct hash *keyHash = NULL;
            /* Hash the already-loaded key column of side a, then load the
             * side b table keyed on it. */
            keyIx = findDtfIndex(joined->keyList, route->a);
            if (keyIx < 0)
                internalErr();
            keyHash = hashKeyField(joined, keyIx, jfA);
            tjLoadSome(regionList, joined, curFieldCount, curKeyCount,
                       route->b->field, keyHash,
                       jfB->chopBefore, jfB->chopAfter,
                       target, isPositional(target->database, target->table),  FALSE);
            curKeyCount += slCount(target->keysOut);
            curFieldCount += slCount(target->fieldList);
            hashFree(&keyHash);
        }
    }
    joinerDtfFreeList(&tableDtfs);
    hashFree(&tableHash);
    tableJoinerFreeList(&tjList);
    return joined;
}
void doExpRatio(struct trackDb *tdb, char *item, struct customTrack *ct)
/* Details page for expression-ratio tracks driven by the microarrayGroups.ra
 * file instead of the expRecord tables.  Works for native tracks (ct is
 * NULL) as well as custom tracks, loading the beds in the current window
 * and printing them as a colored table. */
{
char *scaleSetting = trackDbRequiredSetting(tdb, "expScale");
char *stepSetting = trackDbRequiredSetting(tdb, "expStep");
double maxScore = atof(scaleSetting);
double stepSize = atof(stepSetting);
struct bed *beds;
char *itemName = cgiUsualString("i2","none");
char *expName = item;
char *defaultColor = trackDbSettingOrDefault(tdb, "expColor", "redGreen");
char *chosenColor = NULL;
enum expColorType colorScheme;
char colorVarName[256];

/* Fall back on the cgi item when no explicit item was passed in. */
if (expName == NULL)
    expName = itemName;

/* The cart setting, when present, overrides the trackDb color scheme. */
safef(colorVarName, sizeof(colorVarName), "%s.color", tdb->track);
chosenColor = cartUsualString(cart, colorVarName, defaultColor);
colorScheme = getExpColorType(chosenColor);

/* set global flag */
if (sameWord(tdb->grp, "cancerGenomics"))
    isCancerGenomicsTrack = TRUE;

/* Load the beds in the window from whichever place holds them. */
if (ct == NULL)
    {
    genericHeader(tdb, itemName);
    beds = loadMsBed(tdb, tdb->table, seqName, winStart, winEnd);
    }
else if (ct->dbTrack)
    {
    genericHeader(tdb, itemName);
    printCustomUrl(tdb, itemName, TRUE);
    beds = ctLoadMultScoresBedDb(ct, seqName, winStart, winEnd);
    }
else
    {
    beds = bedFilterListInRange(ct->bedList, NULL, seqName, winStart, winEnd);
    }

if (beds == NULL)
    {
    printf("<b>No Expression Data in this Range.</b>\n");
    }
else if (expName && sameString(expName, "zoomInMore"))
    {
    printf("<b>Too much data to display in detail in this range.</b>\n");
    }
else
    {
    struct microarrayGroups *allGroups = NULL;
    struct maGrouping *grouping;
    struct hash *erHash = newHash(6);
    int groupIx = 0;

    if (ct != NULL)
        grouping = maGetGroupingFromCt(ct);
    else
        {
        allGroups = maGetTrackGroupings(database, tdb);
        grouping = maCombineGroupingFromCart(allGroups, cart, tdb->track);
        }
    maBedClumpGivenGrouping(beds, grouping);

    /* The table printer wants expRecords hashed by index, so build a
     * bare-bones one per group. */
    while (groupIx < grouping->numGroups)
        {
        char key[16];
        struct expRecord *er = basicExpRecord(grouping->names[groupIx], groupIx, 2);
        safef(key, sizeof(key), "%d", groupIx);
        hashAdd(erHash, key, er);
        ++groupIx;
        }
    puts("<h2></h2><p>\n");
    msBedPrintTable(beds, erHash, itemName, expName, -maxScore, maxScore,
        stepSize, 2, msBedDefaultPrintHeader, msBedExpressionPrintRow,
        printExprssnColorKey, getColorForExprBed, colorScheme);
    hashTraverseEls(erHash, erHashElFree);
    hashFree(&erHash);
    microarrayGroupsFree(&allGroups);
    }
puts("<h2></h2><p>\n");
bedFreeList(&beds);
}
Ejemplo n.º 26
0
void dupeFoo(char *pslName, char *faName, char *regionFile)
/* dupeFoo - Do some duplication analysis.
 *   pslName    - psl alignments whose query names are fragment names.
 *   faName     - file the fragment list is read from.
 *   regionFile - handed to writeRegions together with the fragment list.
 * Prints each fragment that has no alignment, then a summary of how far
 * alignments on the fragment's own chromosome lie from the fragment start,
 * bucketed by distance, plus the count of alignments on other chromosomes.
 * Alignments at distance zero are not counted. */
{
struct lineFile *lf;
struct frag *fragList = NULL, *frag;
struct hash *fragHash = newHash(16);
struct psl *psl;
int fragCount=0,missCount=0,kSub=0,
   k1=0, k10=0,k100=0,k1000=0,k10000=0,diffChrom=0,distance;

/* Read in fragment list and put it in hash. */
fragList = readFragList(faName);
for (frag = fragList; frag != NULL; frag = frag->next)
    hashAdd(fragHash, frag->name, frag);

/* Read psl's and store under the fragment they belong to. */
lf = pslFileOpen(pslName);
while ((psl = pslNext(lf)) != NULL)
    {
    if ((frag = hashFindVal(fragHash, psl->qName)) == NULL)
        errAbort("Couldn't find %s in %s line %d of %s", 
                psl->qName, faName, lf->lineIx, lf->fileName);
    slAddHead(&frag->pslList, psl);
    }
lineFileClose(&lf);

/* Look through fragments and report missing and dupes. */
for (frag = fragList; frag != NULL; frag = frag->next)
    {
    ++fragCount;
    if (frag->pslList == NULL)
        {
        ++missCount;
        printf("missing %s\n", frag->name);
        continue;
        }
    for (psl = frag->pslList; psl != NULL; psl = psl->next)
        {
        if (!sameString(psl->tName, frag->chrom))
            {
            ++diffChrom;
            continue;
            }
        distance = frag->start - psl->tStart;
        if (distance == 0)
            continue;
        if (distance < 0) distance = -distance;
        if (distance >= 10000000) ++k10000;
        else if (distance >= 1000000) ++k1000;
        else if (distance >= 100000) ++k100;
        else if (distance >= 10000) ++k10;
        else if (distance >= 1000) ++k1;
        else ++kSub;
        }
    }
printPercent("Total", fragCount, fragCount);
printPercent("Unaligned", missCount, fragCount);
printPercent("Other Chrom", diffChrom, fragCount);
printPercent("Same Chrom >10M", k10000, fragCount);
printPercent("Same Chrom >1M", k1000, fragCount);
printPercent("Same Chrom >100k", k100, fragCount);
printPercent("Same Chrom >10k", k10, fragCount);
printPercent("Same Chrom >1k", k1, fragCount);
printPercent("Self-overlap", kSub, fragCount);
writeRegions(fragList, regionFile);
}
Ejemplo n.º 27
0
static void showLinkedTables(struct joiner *joiner, struct dbTable *inList,
	char *varPrefix, char *buttonName, char *buttonText)
/* Print section with list of linked tables and check boxes to turn them
 * on.
 *   joiner     - joiner used to find the tables related to each input table.
 *   inList     - input tables; the first one never appears in the output.
 *   varPrefix  - prefix handed to dbTableVar to name each check box.
 *   buttonName - name of the button printed under the table.
 *   buttonText - label of that button.
 * Nothing is printed when no linked table is found. */
{
struct dbTable *outList = NULL, *out, *in;
char dtName[256];
struct hash *uniqHash = newHash(0);	/* tables that must not be added (again) */
struct hash *inHash = newHash(8);	/* tables present in inList */

/* Build up list of tables we link to in outList. */
for (in = inList; in != NULL; in = in->next)
    {
    struct sqlConnection *conn = NULL;
    if (!trackHubDatabase(database))
        conn = hAllocConn(in->db);
    struct joinerPair *jpList, *jp;

    /* Keep track of tables in inList.  Use the current element, not the
     * head of the list, so every input table is recorded. */
    safef(dtName, sizeof(dtName), "%s.%s", in->db, in->table);
    hashAdd(inHash, dtName, NULL);

    /* First table in input is not allowed in output. */
    if (in == inList)
        hashAdd(uniqHash, dtName, NULL);

    /* Scan through joining information and add tables,
     * avoiding duplicate additions. */
    jpList = joinerRelate(joiner, in->db, in->table);
    for (jp = jpList; jp != NULL; jp = jp->next)
        {
        safef(dtName, sizeof(dtName), "%s.%s",
                jp->b->database, jp->b->table);
        if (!hashLookup(uniqHash, dtName) &&
           !cartTrackDbIsAccessDenied(jp->b->database, jp->b->table))
            {
            hashAdd(uniqHash, dtName, NULL);
            out = dbTableNew(jp->b->database, jp->b->table);
            slAddHead(&outList, out);
            }
        }
    joinerPairFreeList(&jpList);
    hFreeConn(&conn);
    }
slSort(&outList, dbTableCmp);
hashFree(&inHash);
hashFree(&uniqHash);

/* Print html. */
if (outList != NULL)
    {
    webNewSection("Linked Tables");
    hTableStart();
    for (out = outList; out != NULL; out = out->next)
        {
        struct sqlConnection *conn = hAllocConn(out->db);
        struct asObject *asObj = asForTable(conn, out->table);
        char *var = dbTableVar(varPrefix, out->db, out->table);
        hPrintf("<TR>");
        hPrintf("<TD>");
        cgiMakeCheckBox(var, varOn(var));
        hPrintf("</TD>");
        hPrintf("<TD>%s</TD>", out->db);
        hPrintf("<TD>%s</TD>", out->table);
        hPrintf("<TD>");
        /* Show the table's autoSql comment when there is one. */
        if (asObj != NULL)
            hPrintf("%s", asObj->comment);
        else
            hPrintf("&nbsp;");
        hPrintf("</TD>");
        hPrintf("</TR>");
        hFreeConn(&conn);
        }
    hTableEnd();
    hPrintf("<BR>");

    cgiMakeButton(buttonName, buttonText);
    }
}
Ejemplo n.º 28
0
void rcvs(char *codingTable, char *clusterTable)
/* rcvs - Compare riken noncoding vs. nonspliced.
 *   codingTable  - file read two words per row: cluster id and a coding
 *                  label; rows labelled "NoPProt" mark noncoding clusters.
 *   clusterTable - file read two words per row: cluster id and id1.
 * Ids are mapped id1 -> id2 via the rikenIds table, and id2 counts as
 * spliced when rikenOrientInfo has a nonzero intronOrientation for it.
 * Prints the four coding/spliced combination counts and their total.
 * Aborts (hashMustFindVal) if an id1 in clusterTable is not in rikenIds. */
{
struct hash *idHash = newHash(16); // Key id1, val id2
struct hash *nonCodingHash = newHash(16);  // Key clusterId, value NULL (presence only)
struct hash *splicedHash = newHash(16);  // Key id2, present if spliced
struct sqlConnection *conn = sqlConnect("mgsc");
struct sqlResult *sr;
char **row;
char *words[16];
struct lineFile *lf;
int codingSpliced = 0;
int noncodingSpliced = 0;
int codingNonspliced = 0;
int noncodingNonspliced = 0;

/* Read id's into hash */
sr = sqlGetResult(conn, NOSQLINJ "select id1,id2 from rikenIds");
while ((row = sqlNextRow(sr)) != NULL)
    hashAdd(idHash, row[0], cloneString(row[1]));
sqlFreeResult(&sr);

/* Read spliced into hash */
sr = sqlGetResult(conn,
	NOSQLINJ "select name from rikenOrientInfo where intronOrientation != 0");
while ((row = sqlNextRow(sr)) != NULL)
    hashAdd(splicedHash, row[0], NULL);
sqlFreeResult(&sr);

/* Both queries are done; release the database connection. */
sqlDisconnect(&conn);

/* Read noncoding clusters into hash */
lf = lineFileOpen(codingTable, TRUE);
while (lineFileNextRow(lf, words, 2))
    {
    if (sameString(words[1], "NoPProt"))
        hashAdd(nonCodingHash, words[0], NULL);
    }
lineFileClose(&lf);

/* Stream through cluster table counting and correlating. */
lf = lineFileOpen(clusterTable, TRUE);
while (lineFileNextRow(lf, words, 2))
    {
    char *cluster = words[0];
    char *id1 = words[1];
    char *id2 = hashMustFindVal(idHash, id1);
    if (hashLookup(nonCodingHash, cluster))
        {
	if (hashLookup(splicedHash, id2))
	    ++noncodingSpliced;
	else
	    ++noncodingNonspliced;
	}
    else
        {
	if (hashLookup(splicedHash, id2))
	    ++codingSpliced;
	else
	    ++codingNonspliced;
	}
    }
/* Close the cluster table too, matching the handling of codingTable above. */
lineFileClose(&lf);

printf("noncodingNonspliced %d\n", noncodingNonspliced);
printf("noncodingSpliced %d\n", noncodingSpliced);
printf("codingNonspliced %d\n", codingNonspliced);
printf("codingSpliced %d\n", codingSpliced);
printf("total %d\n", noncodingNonspliced + noncodingSpliced + codingNonspliced + codingSpliced);
}
void startRedoHash()
/* Point redoHash at a freshly created hash, made with newHash(12). */
{
redoHash = newHash(12);
}
Ejemplo n.º 30
0
static void tfBindLevelSection(struct tfData *tfList, struct sqlConnection *conn,
	char *motifTable, char *tfToConditionTable)
/* Print a table section with one row per transcription factor in tfList.
 * Each row holds the factor name, whatever the two ipPrintInRange calls
 * write for the ranges 0.0-0.002 and 0.002-0.006, the growth conditions
 * listed in tfToConditionTable that are absent from the hash given to
 * those calls, and the count, scores and conservation of the factor's
 * motif hits.  Scores come from motifScoreHere using motifTable.
 * Each factor's conditionList is sorted in place as a side effect. */
{
struct tfData *factor;

webNewSection("Transcription Factors Showing IP Over this Probe ");
hTableStart();

/* Upper header row: column groups. */
printf("<TR>");
colLabel("Transcription", 1);
colLabel("Growth Condition", 3);
colLabel("Motif Information", 3);
printf("</TR>\n");

/* Lower header row: one label per column. */
printf("<TR>");
colLabel("Factor", 1);
colLabel("Good IP (P<0.001)", 1);
colLabel("Weak IP (P<0.005)", 1);
colLabel("No IP (P>0.005)", 1);
colLabel("Hits", 1);
colLabel("Scores", 1);
colLabel("Conservation (2 max)", 1);
printf("</TR>\n");

for (factor = tfList; factor != NULL; factor = factor->next)
    {
    struct hash *shownHash = newHash(8);
    char sql[256];
    char **fields;
    struct sqlResult *result;
    int unlistedCount = 0;	/* Conditions printed in the third cell so far. */

    slSort(&factor->conditionList, tfCondCmpName);
    printf("<TR>");

    /* Factor name cell. */
    printf("<TD>");
    sacCerHgGeneLinkName(conn, factor->name);
    printf("</TD>");

    /* Strong range first, then weak range.  Both calls receive shownHash,
     * which is consulted below to decide what is left over. */
    ipPrintInRange(factor->conditionList, 0.0, 0.002, shownHash);
    ipPrintInRange(factor->conditionList, 0.002, 0.006, shownHash);

    /* Fetch every condition recorded for this factor and list, comma
     * separated, the ones that are not in shownHash. */
    sqlSafef(sql, sizeof(sql),
	    "select growthCondition from %s where name='%s'",
	    tfToConditionTable, factor->name);
    result = sqlGetResult(conn, sql);
    printf("<TD>");
    while ((fields = sqlNextRow(result)) != NULL)
	{
	if (hashLookup(shownHash, fields[0]))
	    continue;
	if (unlistedCount++ > 0)
	    printf(", ");
	printf("%s", fields[0]);
	}
    sqlFreeResult(&result);
    if (unlistedCount == 0)
	printf("&nbsp;");	/* Keep the cell from being empty. */
    printf("</TD>");

    /* Motif cells: hit count, scores, conservation. */
    if (factor->trcList != NULL)
	{
	struct transRegCode *hit;
	char *sep;

	printf("<TD>%d</TD>", slCount(factor->trcList));

	/* Scores, comma separated.  transRegCodeAnchor runs just before
	 * each score, and the score text is followed by a closing </A>. */
	printf("<TD>");
	sep = "";
	for (hit = factor->trcList; hit != NULL; hit = hit->next)
	    {
	    double score;
	    printf("%s", sep);
	    sep = ", ";
	    score = motifScoreHere(
		    hit->chrom, hit->chromStart, hit->chromEnd,
		    hit->name, motifTable);
	    transRegCodeAnchor(hit);
	    printf("%3.1f</A>", score);
	    }

	/* consSpecies of each hit, comma separated. */
	printf("</TD><TD>");
	sep = "";
	for (hit = factor->trcList; hit != NULL; hit = hit->next)
	    {
	    printf("%s", sep);
	    sep = ", ";
	    printf("%d", hit->consSpecies);
	    }
	printf("</TD>");
	}
    else
	printf("<TD>0</TD><TD>n/a</TD><TD>n/a</TD>\n");

    printf("</TR>\n");
    hashFree(&shownHash);
    }
hTableEnd();
}