int compress(FILE* inputFile, FILE* outputFile, int compressionLevel)
{
    struct BitwiseBufferedFile* w = openBitwiseBufferedFile(NULL, 1, -1, outputFile);
    uint8_t readByte[LOCAL_BYTE_BUFFER_LENGTH];
    size_t indexLength = INITIAL_INDEX_LENGTH;
    size_t bufferedBytes;
    int byteIndex = 0;
    struct LZ78HashTableEntry* hashTable;
    uint32_t childIndex = 257;
    uint32_t lookupIndex = ROOT_INDEX;
    uint32_t indexLengthMask = (1 << INITIAL_INDEX_LENGTH) - 1;
    uint32_t child;
    if((compressionLevel < MIN_COMPRESSION_LEVEL || compressionLevel > MAX_COMPRESSION_LEVEL) || inputFile == NULL || w == NULL)
    {
        errno = EINVAL;
        if(w != NULL) closeBitwiseBufferedFile(w);
        return -1;
    }
    uint32_t hashTableEntries = compressionParameters[compressionLevel - MIN_COMPRESSION_LEVEL].hashTableEntries;
    uint32_t moduloMask = hashTableEntries - 1;
    uint32_t maxChild = compressionParameters[compressionLevel - MIN_COMPRESSION_LEVEL].maxChild;
    hashTable = hashCreate(hashTableEntries, moduloMask);
    if(hashTable == NULL)
    {
        closeBitwiseBufferedFile(w);
        return -1;
    }
    if(writeBitBuffer(w, (uint32_t)compressionLevel, 3) == -1)
        goto exceptionHandler;
    while(!feof(inputFile) && !ferror(inputFile))
    {
        bufferedBytes = fread(readByte, 1, LOCAL_BYTE_BUFFER_LENGTH, inputFile);
        for(byteIndex = 0; byteIndex < bufferedBytes; byteIndex++)
        {
            child = hashLookup(hashTable, lookupIndex, readByte[byteIndex], moduloMask);
            if(child != ROOT_INDEX) lookupIndex = child; //ROOT_INDEX means NOT FOUND
            else
            {
                if(writeBitBuffer(w, lookupIndex, indexLength) == -1 || hashInsert(hashTable, lookupIndex, readByte[byteIndex], childIndex, moduloMask) == -1)
                    goto exceptionHandler;
                childIndex++;
                if((childIndex & indexLengthMask) == 0) //A power of 2 is reached
                {
                    //The length of the transmitted index is incremented
                    indexLength++;
                    //The next power of 2 mask is set
                    indexLengthMask = (indexLengthMask << 1) | 1;
                }
                //readByte value is also the right index to start with next time
                //because you have to start from the last character recognized
                lookupIndex = readByte[byteIndex] + 1; //ROOT_INDEX = 0 so ASCII characters are indexed in [1, 257]
                if (childIndex == maxChild) //hash table is full
                {
                    if(hashReset(hashTable, hashTableEntries, moduloMask) == NULL)
                        goto exceptionHandler; //hash table was not successfully created
                    childIndex = FIRST_CHILD; //starts from the beginning
                    indexLength = INITIAL_INDEX_LENGTH;
                    indexLengthMask = (1 << INITIAL_INDEX_LENGTH) - 1;
                }
            }
        }
    }
    if(ferror(inputFile))
    {
        errno = EBADFD;
        goto exceptionHandler;
    }
    if(writeBitBuffer(w, lookupIndex, indexLength) == -1 || writeBitBuffer(w, ROOT_INDEX, indexLength) == -1)
        goto exceptionHandler;
    hashDestroy(hashTable, hashTableEntries);
    return closeBitwiseBufferedFile(w);

    exceptionHandler:
        hashDestroy(hashTable, hashTableEntries);
        closeBitwiseBufferedFile(w);
        return -1;
}
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
     hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    info.sourceAcc = txAccFromTempName(bed->name);
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc))
        {
	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
	info.sourceSize = bedTotalBlockSize(bed);
	info.aliCoverage = 1.0;
	info.aliIdRatio = 1.0;
	info. genoMapCount = 1;
	}
    else
	{
	/* Loop through all psl's associated with our RNA.  Figure out
	 * our overlap with each, and pick best one. */
	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
	if (firstPslHel == NULL)
	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
	int mapCount = 0;
	struct psl *psl, *bestPsl = NULL;
	int coverage, bestCoverage = 0;
	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
	    {
	    psl = hel->val;
	    mapCount += 1;
	    coverage = pslBedOverlap(psl, bed);
	    if (coverage > bestCoverage)
		{
		bestCoverage = coverage;
		bestPsl = psl;
		}
	    /* If we flipped it, try it on the opposite strand too. */
	    if (isFlipped)
		{
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		coverage = pslBedOverlap(psl, bed);
		if (coverage > bestCoverage)
		    {
		    bestCoverage = coverage;
		    bestPsl = psl;
		    }
		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
		}
	    }
	if (bestPsl == NULL)
	    errAbort("%s has no overlapping alignments with %s in %s", 
		    bed->name, info.sourceAcc, pslFile);

	/* Figure out and save alignment statistics. */
	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
	info.sourceSize = bestPsl->qSize - polyA;
	info.aliCoverage = (double)bestCoverage / info.sourceSize;
	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
	info. genoMapCount = mapCount;
	}


    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
	{
	cdsEv = hashFindVal(cdsEvHash, bed->name);
	if (cdsEv != NULL)
	    {
	    info.orfSize = cdsEv->end - cdsEv->start;
	    info.startComplete = cdsEv->startComplete;
	    info.endComplete = cdsEv->endComplete;
	    }
	}

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
	int gapEnd = bed->chromStarts[i+1];
	int gapSize = gapEnd - gapStart;
	switch (gapSize)
	    {
	    case 1:
	    case 2:
	        info.genomicFrameShift = TRUE;
		break;
	    case 3:
	        info.genomicStop = TRUE;
		break;
	    default:
	        exonCount += 1;
		break;
	    }
	}
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
Beispiel #3
0
static void gapToLift(char *db, char *outFile)
/* gapToLift - create lift file from gap table(s). */
{
FILE *out = mustOpen(outFile, "w");
struct sqlConnection *conn = sqlConnect(db);
struct chromInfo *cInfoList = loadChromInfo(conn);
struct agpGap *gapList = loadAllGaps(conn, db, cInfoList);
struct agpGap *gap;
int start = 0;
int end = 0;
char *prevChr = NULL;
int liftCount = 0;
int chrSize = 0;
static struct hash *chrDone = NULL;

chrDone = newHash(0);

if (isNotEmpty(bedFileName))
    {
    bedFile = mustOpen(bedFileName, "w");
    verbose(2,"#\tbed output requested to %s\n", bedFileName);
    }

for (gap = gapList; gap; gap = gap->next)
    {
    verbose(3,"#\t%s\t%d\t%d\t%s\n", gap->chrom, gap->chromStart,
	gap->chromEnd, gap->bridge);

    if (prevChr && sameWord(prevChr, gap->chrom))
	{	/* continuing same segment, check for gap break,
		 *	or gap at end of chrom */
	if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize))
	    {
	    end = gap->chromStart;
	    liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize);
	    start = gap->chromEnd;
	    end = start;
	    }
	else
	    end = gap->chromEnd;
	}
    else	/* new chrom encountered */
	{ /* output last segment of previous chrom when necessary */
	if (prevChr && differentWord(prevChr, gap->chrom))
	    {
	    if (end < chrSize)
		liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize);
	    }
	liftCount = 0;
	chrSize = hashIntVal(cInfoHash, gap->chrom);
	hashAddInt(chrDone, gap->chrom, 1);
	if (gap->chromStart > 0)
	    {	/* starting first segment at position 0 */
	    start = 0;
	    end = gap->chromStart;
	    /* does the first gap break it ?  Or gap goes to end of chrom. */
	    if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize))
		{
		liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize);
		start = gap->chromEnd;
		end = start;
		}
	    }
	else	/* first gap is actually the beginning of the chrom */
	    {	/* thus, first segment starts after this first gap */
	    start = gap->chromEnd;
	    end = start;
	    }
	}
    prevChr = gap->chrom;	/* remember prev chrom to detect next chrom */
    }
/* potentially a last one */
if (end < chrSize)
    liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize);
/* check that all chroms have been used */
struct hashCookie cookie = hashFirst(cInfoHash);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
    {
    if (NULL == hashLookup(chrDone, hel->name))
	{
	chrSize = hashIntVal(cInfoHash, hel->name);
	verbose(2, "#\tno gaps on chrom: %s, size: %d\n", hel->name, chrSize);
	liftCount = liftOutLine(out, hel->name, 0, chrSize, 0, chrSize);
	}
    }
carefulClose(&out);
sqlDisconnect(&conn);
}
struct hash *readKeyHash(char *db, struct joiner *joiner,
                         struct joinerField *keyField, struct keyHitInfo **retList)
/* Read key-field into hash.  Check for dupes if need be. */
{
    struct sqlConnection *conn = sqlWarnConnect(db);
    struct hash *keyHash = NULL;
    struct keyHitInfo *khiList = NULL, *khi;
    if (conn == NULL)
    {
        return NULL;
    }
    else
    {
        struct slName *table;
        struct slName *tableList = getTablesForField(conn,keyField->splitPrefix,
                                   keyField->table,
                                   keyField->splitSuffix);
        int rowCount = totalTableRows(conn, tableList);
        int hashSize = digitsBaseTwo(rowCount)+1;
        char query[256], **row;
        struct sqlResult *sr;
        int itemCount = 0;
        int dupeCount = 0;
        char *dupe = NULL;

        if (rowCount > 0)
        {
            if (hashSize > hashMaxSize)
                hashSize = hashMaxSize;
            keyHash = hashNew(hashSize);
            for (table = tableList; table != NULL; table = table->next)
            {
                safef(query, sizeof(query), "select %s from %s",
                      keyField->field, table->name);
                sr = sqlGetResult(conn, query);
                while ((row = sqlNextRow(sr)) != NULL)
                {
                    char *id = doChopsAndUpper(keyField, row[0]);
                    if (hashLookup(keyHash, id))
                    {
                        if (keyField->unique)
                        {
                            if (keyField->exclude == NULL ||
                                    !slNameInList(keyField->exclude, id))
                            {
                                if (dupeCount == 0)
                                    dupe = cloneString(id);
                                ++dupeCount;
                            }
                        }
                    }
                    else
                    {
                        AllocVar(khi);
                        hashAddSaveName(keyHash, id, khi, &khi->name);
                        slAddHead(&khiList, khi);
                        ++itemCount;
                    }
                }
                sqlFreeResult(&sr);
            }
            if (dupe != NULL)
            {
                warn("Error: %d duplicates in %s.%s.%s including '%s'",
                     dupeCount, db, keyField->table, keyField->field, dupe);
                freez(&dupe);
            }
            verbose(2, " %s.%s.%s - %d unique identifiers\n",
                    db, keyField->table, keyField->field, itemCount);
        }
        slFreeList(&tableList);
    }
    sqlDisconnect(&conn);
    *retList = khiList;
    return keyHash;
}
Beispiel #5
0
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable)
/* hgExpDistance - Create table that measures expression distance between pairs. */
{
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char query[256];
char **row;
struct hash *expHash = hashNew(16);
int realExpCount = -1;
struct microData *geneList = NULL, *curGene, *gene;
int geneIx, geneCount = 0;
struct microData **geneArray = NULL;
float *weights = NULL;
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);
long time1, time2;

time1 = clock1000();

/* Get list/hash of all items with expression values. */

/* uglyf("warning: temporarily limited to 1000 records\n"); */

sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *name = row[0];
    if (!hashLookup(expHash, name))
	{
	int expCount = sqlUnsigned(row[1]);
	int commaCount;
	float *expScores = NULL;

	sqlFloatDynamicArray(row[2], &expScores, &commaCount);
	if (expCount != commaCount)
	    errAbort("expCount and expScores don't match on %s in %s", name, posTable);
	if (realExpCount == -1)
	    realExpCount = expCount;
	if (expCount != realExpCount)
	    errAbort("In %s some rows have %d experiments others %d", 
	    	name, expCount, realExpCount);
	AllocVar(gene);
	gene->expCount = expCount;
	gene->expScores = expScores;
	hashAddSaveName(expHash, name, gene, &gene->name);
	slAddHead(&geneList, gene);
	}
    }
sqlFreeResult(&sr);
conn = sqlConnect(database);
slReverse(&geneList);
geneCount = slCount(geneList);
printf("Have %d elements in %s\n", geneCount, posTable);

weights = getWeights(realExpCount);

if (optionExists("lookup"))
    geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList);
geneCount = slCount(geneList);
printf("Got %d unique elements in %s\n", geneCount, posTable);

sqlDisconnect(&conn);	/* Disconnect because next step is slow. */


if (geneCount < 1)
    errAbort("ERROR: unique gene count less than one ?");

time2 = clock1000();
verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0);

/* Get an array for sorting. */
AllocArray(geneArray, geneCount);
for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx)
    geneArray[geneIx] = gene;

/* Print out closest 1000 in tab file. */
for (curGene = geneList; curGene != NULL; curGene = curGene->next)
    {
    calcDistances(curGene, geneList, weights);
    qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance);
    for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx)
        {
	gene = geneArray[geneIx];
	fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance);
	}
    dotOut();
    }

printf("Made %s.tab\n", outTable);

time1 = time2;
time2 = clock1000();
verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0);

/* Create and load table. */
conn = sqlConnect(database);
distanceTableCreate(conn, outTable);
hgLoadTabFile(conn, tempDir, outTable, &f);
printf("Loaded %s\n", outTable);

/* Add indices. */
sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable);
sqlUpdate(conn, query);
printf("Made query index\n");
if (optionExists("targetIndex"))
    {
    sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable);
    sqlUpdate(conn, query);
    printf("Made target index\n");
    }

hgRemoveTabFile(tempDir, outTable);

time1 = time2;
time2 = clock1000();
verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0);

}
void processSnps(char *chromName)
/* read through all rows in snpTmp */
/* look up class and observed */
/* write to output file */
{
char query[512];
struct sqlConnection *conn = hAllocConn();
struct sqlResult *sr;
char **row;
char tableName[64];
char fileName[64];
FILE *f;
struct hashEl *univarElement = NULL;
struct snpData *dataElement = NULL;
int classInt = 0;
char *classString = NULL;
int loc_type = 0;
int skipCount = 0;

safef(tableName, ArraySize(tableName), "%s_snpTmp", chromName);
safef(fileName, ArraySize(fileName), "%s_snpTmp.tab", chromName);
f = mustOpen(fileName, "w");

safef(query, sizeof(query), 
     "select snp_id, chromStart, chromEnd, loc_type, orientation, allele, refUCSC, refUCSCReverseComp, weight from %s ", tableName);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    loc_type = sqlUnsigned(row[3]);
    /* get univarElement from snpHash */
    univarElement = hashLookup(snpHash, row[0]);
    if (univarElement == NULL)
       {
        {
	fprintf(errorFileHandle, "no data for %s (dropping)\n", row[0]);
	skipCount++;
	continue;
	}
       }
    dataElement = (struct snpData *)univarElement->val;
    classInt = dataElement->classInt;
    // verbose(1, "classInt = %d\n", classInt);
    assert(classInt >= 1 && classInt <= classCount);
    /* lookup classString */
    classString = classStrings[classInt-1];
    /* special handling for class = in-del; split into classes of our own construction */
    if (sameString(classString, "in-del"))
        {
        if (loc_type == 3) 
	    classString = cloneString("insertion");
	if (loc_type == 1 || loc_type == 2) 
	    classString = cloneString("deletion");
	}
    fprintf(f, "%s\t%s\t%s\t%d\t%s\t", row[0], row[1], row[2], loc_type, classString);
    fprintf(f, "%s\t%s\t%s\t%s\t%s\t%s\n", row[4], row[5], row[6], row[7], dataElement->observed, row[8]);
    }

sqlFreeResult(&sr);
hFreeConn(&conn);
carefulClose(&f);
if (skipCount > 0)
    verbose(1, "%d rows dropped\n", skipCount);
}
INLINE boolean fieldOk(char *field, struct hash *fieldHash)
/* Return TRUE if fieldHash is NULL or field exists in fieldHash. */
{
return (fieldHash == NULL || hashLookup(fieldHash, field));
}
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable)
/* hgExpDistance - Create table that measures expression distance between pairs. */
{
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char query[256];
char **row;
struct hash *expHash = hashNew(16);
int realExpCount = -1;
struct microData *gene;
int rc, t;
pthread_t *threads = NULL;
pthread_attr_t attr;
int *threadID = NULL;
void *status;
char *tempDir = ".";
int arrayNum; 
struct microDataDistance *geneDistPtr = NULL;	
struct microDataDistance *geneDistArray = NULL;	
int geneIx;
FILE *f = NULL;

/* Get list/hash of all items with expression values. */
safef(query, sizeof(query), "select name,expCount,expScores from %s", posTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *name = row[0];
    if (!hashLookup(expHash, name))
	{
	int expCount = sqlUnsigned(row[1]);
	int commaCount;
	float *expScores = NULL;

	sqlFloatDynamicArray(row[2], &expScores, &commaCount);
	if (expCount != commaCount)
	    errAbort("expCount and expScores don't match on %s in %s", name, posTable);
	if (realExpCount == -1)
	    realExpCount = expCount;
	if (expCount != realExpCount)
	    errAbort("In %s some rows have %d experiments others %d", 
	    	name, expCount, realExpCount);
	AllocVar(gene);
	gene->expCount = expCount;
	gene->expScores = expScores;
	hashAddSaveName(expHash, name, gene, &gene->name);
	slAddHead(&geneList, gene);
	}
    }
sqlFreeResult(&sr);
conn = sqlConnect(database);
slReverse(&geneList);
geneCount = slCount(geneList);
printf("Have %d elements in %s\n", geneCount, posTable);

weights = getWeights(realExpCount);

if (optionExists("lookup"))
    geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList);
geneCount = slCount(geneList);
printf("Got %d unique elements in %s\n", geneCount, posTable);

sqlDisconnect(&conn);	/* Disconnect because next step is slow. */


if (geneCount < 1)
    errAbort("ERROR: unique gene count less than one ?");

f = hgCreateTabFile(tempDir, outTable);
synQ = synQueueNew();

/* instantiate threads */
AllocArray( threadID, numThreads );
AllocArray( threads, numThreads );
pthread_attr_init( &attr );
pthread_mutex_init( &mutexDotOut, NULL );
pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE );

for (t = 0; t < numThreads; t++) {
	threadID[t] = t;
	rc = pthread_create( &threads[t], &attr, computeDistance, 
						(void *) &threadID[t]);
	if (rc)
		errAbort("ERROR: in pthread_create() %d\n", rc );
} 

/* this thread will write to the file from the queue */
for (arrayNum = 0; arrayNum < geneCount; arrayNum++) {
	geneDistArray = (struct microDataDistance *)synQueueGet( synQ );
	geneDistPtr = geneDistArray;
    	/* Print out closest GENEDISTS distances in tab file. */
    	for (geneIx=0; geneIx < GENEDISTS && geneIx < geneCount; 
						++geneIx, geneDistPtr++)
		if (geneDistPtr != NULL)
			fprintf(f, "%s\t%s\t%f\n", geneDistPtr->name1, 
				geneDistPtr->name2, geneDistPtr->distance);
		else
			errAbort("ERROR: writing distance %d to file\n", 
							geneIx);
	freeMem( geneDistArray );
}

/* synchronize all threads */
for (t = 0; t < numThreads; t++) {
	rc = pthread_join( threads[t], &status);
	if (rc)
		errAbort("ERROR: in pthread_join() %d\n", rc );
} 

printf("Made %s.tab\n", outTable);

slFreeList( &geneList );

pthread_mutex_destroy( &mutexDotOut );
pthread_attr_destroy( &attr );

/* Create and load table. */
conn = sqlConnect(database);
distanceTableCreate(conn, outTable);
hgLoadTabFile(conn, tempDir, outTable, &f);
printf("Loaded %s\n", outTable);

/* Add indices. */
safef(query, sizeof(query), "alter table %s add index(query(12))", outTable);
sqlUpdate(conn, query);
printf("Made query index\n");
if (optionExists("targetIndex"))
    {
    safef(query, sizeof(query), "alter table %s add index(target(12))", outTable);
    sqlUpdate(conn, query);
    printf("Made target index\n");
    }

hgRemoveTabFile(tempDir, outTable);
}
Beispiel #9
0
static struct grp *makeGroupList(char *db, struct trackDb *trackList, struct grp **pHubGrpList,
                                 boolean allTablesOk)
/* Get list of groups that actually have something in them. */
{
struct grp *groupsAll, *groupList = NULL, *group;
struct hash *groupsInTrackList = newHash(0);
struct hash *groupsInDatabase = newHash(0);
struct trackDb *track;

/* Stream through track list building up hash of active groups. */
for (track = trackList; track != NULL; track = track->next)
    {
    if (!hashLookup(groupsInTrackList,track->grp))
        hashAdd(groupsInTrackList, track->grp, NULL);
    }

/* Scan through group table, putting in ones where we have data. */
groupsAll = hLoadGrps(db);
for (group = slPopHead(&groupsAll); group != NULL; group = slPopHead(&groupsAll))
    {
    if (hashLookup(groupsInTrackList, group->name))
	{
	slAddTail(&groupList, group);
	hashAdd(groupsInDatabase, group->name, group);
	}
    else
        grpFree(&group);
    }

/* if we have custom tracks, we want to add the track hubs
 * after that group */
struct grp *addAfter = NULL;
if ((groupList != NULL) && sameString(groupList->name, "user"))
    addAfter = groupList;

/* Add in groups from hubs. */
for (group = slPopHead(pHubGrpList); group != NULL; group = slPopHead(pHubGrpList))
    {
    // if the group isn't represented in any track, don't add it to list
    if (!hashLookup(groupsInTrackList,group->name))
	continue;
    /* check to see if we're inserting hubs rather than
     * adding them to the front of the list */
    struct grp *newGrp = grpDup(group);
    if (addAfter != NULL)
	{
	newGrp->next = addAfter->next;
	addAfter->next = newGrp;
	}
    else
	slAddHead(&groupList, newGrp);
    hashAdd(groupsInDatabase, newGrp->name, newGrp);
    }

/* Do some error checking for tracks with group names that are
 * not in database.  Just warn about them. */
if (!trackHubDatabase(db))
    for (track = trackList; track != NULL; track = track->next)
    {
    if (!hashLookup(groupsInDatabase, track->grp))
         warn("Track %s has group %s, which isn't in grp table",
	 	track->table, track->grp);
    }

/* Create dummy group for all tracks. */
AllocVar(group);
group->name = cloneString("allTracks");
group->label = cloneString("All Tracks");
slAddTail(&groupList, group);

/* Create another dummy group for all tables. */
if (allTablesOk)
    {
    AllocVar(group);
    group->name = cloneString("allTables");
    group->label = cloneString("All Tables");
    slAddTail(&groupList, group);
    }

hashFree(&groupsInTrackList);
hashFree(&groupsInDatabase);
return groupList;
}
static void agpMergeChromScaf(char *agpFile, char *agpOut, boolean filtering)
/* Create a combined agp file from the chrom.agp and scaffold.agp, 
 *  merging in only scaffolds from scaffold.agp
 *  that are not already in chroms. */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp;
struct agpGap *gap;
FILE *f;
char *lastObj = NULL;
f = mustOpen(agpOut, filtering ? "a" : "w");
char *newChrom = NULL;
static struct hash *hash = NULL;
boolean skipping = FALSE;

if (!hash)
    hash = hashNew(0);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    //verbose(2,"#\tline: %d\n", lf->lineIx);
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
		 lf->lineIx, lf->fileName, wordCount);

    if (!lastObj || !sameString(words[0],lastObj))
	{
	freez(&newChrom);
	newChrom = cloneString(words[0]);
	lastPos = 0;
	}

    	
    skipping = FALSE;
    if (filtering)
	{
	if (hashLookup(hash, words[0]))
	    skipping = TRUE;
	}
		 
    if (words[4][0] != 'N')
	{
	lineFileExpectAtLeast(lf, 9, wordCount);
	agp = agpFragLoad(words);
	/* agp is 1-based but agp loaders do not adjust for 0-based: */
    	agp->chromStart -= 1;
	agp->fragStart  -= 1;
	if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
	    errAbort("Sizes don't match in %s and %s line %d of %s\n",
		agp->chrom, agp->frag, lf->lineIx, lf->fileName);
        if (!filtering)
	    {
	    char *root = cloneString(agp->frag);
	    chopSuffixAt(root, '.');
	    hashStore(hash, root);
	    freeMem(root);
	    }
	}
    else
        {
	lineFileExpectAtLeast(lf, 8, wordCount);
	gap = agpGapLoad(words);
	/* to be consistent with agpFrag */
	gap->chromStart -= 1;
	agp = (struct agpFrag*)gap;
	}

    if (agp->chromStart != lastPos)
	errAbort("Start doesn't match previous end line %d of %s\n"
	    "agp->chromStart: %u\n" 
	    "agp->chromEnd: %u\n" 
	    "lastPos: %u\n" 
	    ,lf->lineIx, lf->fileName
	    ,agp->chromStart
	    ,agp->chromEnd
	    ,lastPos
	    );

    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */
	
    if (words[4][0] != 'N')
	{
	/* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
	if (!skipping)
    	    agpFragOutput(agp, f, '\t', '\n');
	agpFragFree(&agp);
	}
    else
        {
	/* restore back to 1-based for agp 
	 * because agpGapOutput doesn't compensate */
	gap->chromStart += 1;
	if (!skipping)
	    agpGapOutput(gap, f, '\t', '\n');
	agpGapFree(&gap);
	}
	
    }

carefulClose(&f);
}
Beispiel #11
0
Datei: bam.c Projekt: bowhan/kent
void bamTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f)
/* Print out selected fields from BAM.  If fields is NULL, then print out all fields. */
{
struct hTableInfo *hti = NULL;
hti = getHti(db, table, conn);
struct hash *idHash = NULL;
char *idField = getIdField(db, curTrack, table, hti);
int idFieldNum = 0;

/* if we know what field to use for the identifiers, get the hash of names */
if (idField != NULL)
    idHash = identifierHash(db, table);

if (f == NULL)
    f = stdout;

/* Convert comma separated list of fields to array. */
int fieldCount = chopByChar(fields, ',', NULL, 0);
char **fieldArray;
AllocArray(fieldArray, fieldCount);
chopByChar(fields, ',', fieldArray, fieldCount);

/* Get list of all fields in big bed and turn it into a hash of column indexes keyed by
 * column name. */
struct hash *fieldHash = hashNew(0);
struct slName *bb, *bbList = bamGetFields(table);
int i;
for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i)
    {
    /* if we know the field for identifiers, save it away */
    if ((idField != NULL) && sameString(idField, bb->name))
	idFieldNum = i;
    hashAddInt(fieldHash, bb->name, i);
    }

/* Create an array of column indexes corresponding to the selected field list. */
int *columnArray;
AllocArray(columnArray, fieldCount);
for (i=0; i<fieldCount; ++i)
    {
    columnArray[i] = hashIntVal(fieldHash, fieldArray[i]);
    }

/* Output row of labels */
fprintf(f, "#%s", fieldArray[0]);
for (i=1; i<fieldCount; ++i)
    fprintf(f, "\t%s", fieldArray[i]);
fprintf(f, "\n");

struct asObject *as = bamAsObj();
struct asFilter *filter = NULL;

if (anyFilter())
    {
    filter = asFilterFromCart(cart, db, table, as);
    if (filter)
        {
	fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
	}
    }

/* Loop through outputting each region */
struct region *region, *regionList = getRegions();

int maxOut = bigFileMaxOutput();
for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
    {
    struct lm *lm = lmInit(0);
    char *fileName = bamFileName(table, conn, region->chrom);
    struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom,
    	region->start, region->end, lm);
    char *row[SAMALIGNMENT_NUM_COLS];
    char numBuf[BAM_NUM_BUF_SIZE];
    for (sam = samList; sam != NULL && (maxOut > 0); sam = sam->next)
        {
	samAlignmentToRow(sam, numBuf, row);
	if (asFilterOnRow(filter, row))
	    {
	    /* if we're looking for identifiers, check if this matches */
	    if ((idHash != NULL)&&(hashLookup(idHash, row[idFieldNum]) == NULL))
		continue;

	    int i;
	    fprintf(f, "%s", row[columnArray[0]]);
	    for (i=1; i<fieldCount; ++i)
		fprintf(f, "\t%s", row[columnArray[i]]);
	    fprintf(f, "\n");
	    maxOut --;
	    }
	}
    freeMem(fileName);
    lmCleanup(&lm);
    }

if (maxOut == 0)
    warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
/* Clean up and exit. */
hashFree(&fieldHash);
freeMem(fieldArray);
freeMem(columnArray);
}
void ffaToFa(char *inFile, char *outDir, char *outTabName)
/* convert Greg Schulers .ffa fasta files to our .fa files */
{
struct lineFile *in;
FILE *out = NULL, *tab;
int lineSize;
char *line;
char ucscName[128];
char path[512];
static char lastPath[512];
int outFileCount = 0;
struct hash *uniqClone = newHash(16);
struct hash *uniqFrag = newHash(19);
boolean ignore = FALSE;

makeDir(outDir);
errLog = mustOpen("ffaToFa.err", "w");
tab = mustOpen(outTabName, "w");
printf("Converting %s", inFile);
fflush(stdout);
if (sameString(inFile, "stdin"))
    in = lineFileStdin(TRUE);
else
    in = lineFileOpen(inFile, TRUE);
while (lineFileNext(in, &line, &lineSize))
    {
    if (line[0] == '>')
	{
	ignore = FALSE;
	gsToUcsc(line+1, ucscName);
	faRecNameToFaFileName(outDir, ucscName, path);
	if (hashLookup(uniqFrag, ucscName))
	    {
	    ignore = TRUE;
	    warn("Duplicate %s in %s, ignoring all but first",
	    	ucscName, inFile);
	    }
	else
	    {
	    hashAdd(uniqFrag, ucscName, NULL);
	    }
	if (!sameString(path, lastPath))
	    {
	    strcpy(lastPath, path);
	    carefulClose(&out);
	    if (hashLookup(uniqClone, path))
		{
		warn("Duplicate %s in %s ignoring all but first", 
		    ucscName, inFile);
		}
	    else
		{
		hashAdd(uniqClone, path, NULL);
		out = mustOpen(path, "w");
		++outFileCount;
		if ((outFileCount&7) == 0)
		    {
		    putc('.', stdout);
		    fflush(stdout);
		    }
		}
	    }
	if (out != NULL && !ignore)
	    {
	    fprintf(out, ">%s\n", ucscName);
	    fprintf(tab, "%s\t%s\n", ucscName, line+1);
	    }
	}
    else
	{
	if (out != NULL && !ignore)
	    {
	    fputs(line, out);
	    fputc('\n', out);
	    }
	}
    }
carefulClose(&out);
fclose(tab);
lineFileClose(&in);
printf("Made %d .fa files in %s\n", outFileCount, outDir);
}
Beispiel #13
0
void doGenePredNongenomic(struct sqlConnection *conn, int typeIx)
/* Get mrna or protein associated with selected genes. */
{
/* Note this does do the whole genome at once rather than one
 * chromosome at a time, but that's ok because the gene prediction
 * tracks this serves are on the small side. */
char *typeWords[3];
char *table;
struct lm *lm = lmInit(64*1024);
int fieldCount;
struct bed *bed, *bedList = cookedBedsOnRegions(conn, curTable, getRegions(),
	lm, &fieldCount);
int typeWordCount;

textOpen();

/* Figure out which table to use. */
if (isRefGeneTrack(curTable))
    {
    if (typeIx == 1) /* Protein */
        doRefGeneProteinSequence(conn, bedList);
    else
        doRefGeneMrnaSequence(conn, bedList);
    }
else
    {
    char *dupType = cloneString(findTypeForTable(database, curTrack, curTable, ctLookupName));
    typeWordCount = chopLine(dupType, typeWords);
    if (typeIx >= typeWordCount)
	internalErr();
    table = typeWords[typeIx];
    if (sqlTableExists(conn, table))
	{
	struct sqlResult *sr;
	char **row;
	char query[256];
	struct hash *hash = newHash(18);
	boolean gotResults = FALSE;

	/* Make hash of all id's passing filters. */
	for (bed = bedList; bed != NULL; bed = bed->next)
	    hashAdd(hash, bed->name, NULL);

	/* Scan through table, outputting ones that match. */
	sqlSafef(query, sizeof(query), "select name, seq from %s", table);
	sr = sqlGetResult(conn, query);
	while ((row = sqlNextRow(sr)) != NULL)
	    {
	    if (hashLookup(hash, row[0]))
		{
		hPrintf(">%s\n", row[0]);
		writeSeqWithBreaks(stdout, row[1], strlen(row[1]), 60);
		gotResults = TRUE;
		}
	    }
	sqlFreeResult(&sr);
	hashFree(&hash);
	if (!gotResults)
	    hPrintf(NO_RESULTS);
	}
    else
	{
	internalErr();
	}
    freez(&dupType);
    }
lmCleanup(&lm);
}
Beispiel #14
0
void knownToVisiGene(char *database)
/* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */
{
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);
struct sqlConnection *hConn = sqlConnect(database);
struct sqlConnection *iConn = sqlConnect(visiDb);
struct sqlResult *sr;
char **row;
struct hash *geneImageHash = newHash(18);
struct hash *locusLinkImageHash = newHash(18);
struct hash *refSeqImageHash = newHash(18);
struct hash *genbankImageHash = newHash(18);
struct hash *probeImageHash = newHash(18);
struct hash *knownToLocusLinkHash = newHash(18);
struct hash *knownToRefSeqHash = newHash(18);
struct hash *knownToGeneHash = newHash(18);
struct hash *favorHugoHash = newHash(18);
struct hash *knownToProbeHash = newHash(18);
struct hash *knownToAllProbeHash = newHash(18);
struct genePred *knownList = NULL, *known;
struct hash *dupeHash = newHash(17);


probesDb  = optionVal("probesDb", database);
struct sqlConnection *probesConn = sqlConnect(probesDb);
vgProbes = sqlTableExists(probesConn,"vgProbes");
vgAllProbes = sqlTableExists(probesConn,"vgAllProbes");

/* Go through and make up hashes of images keyed by various fields. */
sr = sqlGetResult(iConn,
        "NOSQLINJ select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank"
	",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id"
	" from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap"
	" where image.imageFile = imageFile.id"
	" and image.id = imageProbe.image"
	" and imageProbe.probe = probe.id"
	" and probe.gene = gene.id"
	" and image.submissionSet=submissionSet.id"
	" and vgPrbMap.probe = probe.id");

while ((row = sqlNextRow(sr)) != NULL)
    {
    int id = sqlUnsigned(row[0]);
    float priority = atof(row[1]);
    int privateUser = sqlSigned(row[7]);
    char vgPrb_Id[256];
    safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]);
    int geneId = sqlUnsigned(row[9]);
    if (privateUser == 0)
	{
	addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id);
	addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]);
	addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]);
	addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]);
	addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]);
	}
    }
verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d"
           ", genbankImageHash %d probeImageHash %d\n", 
            geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, 
	    genbankImageHash->elCount, probeImageHash->elCount);
sqlFreeResult(&sr);

/* Build up list of known genes. */
sr = sqlGetResult(hConn, "NOSQLINJ select * from knownGene");
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *known = genePredLoad(row);
    if (!hashLookup(dupeHash, known->name))
        {
	hashAdd(dupeHash, known->name, NULL);
	slAddHead(&knownList, known);
	}
    }
slReverse(&knownList);
sqlFreeResult(&sr);
verbose(2, "Got %d known genes\n", slCount(knownList));

/* Build up hashes from knownGene to other things. */
if (vgProbes)
    bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash);
if (vgAllProbes)
    bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash);

foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE);
foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE);
foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE);
foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);
foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);

verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", 
   knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount,
   knownToProbeHash->elCount, knownToAllProbeHash->elCount);

/* Try and find an image for each gene. */
for (known = knownList; known != NULL; known = known->next)
    {
    char *name = known->name;
    struct prioritizedImage *best = NULL;
    {
    best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash);
    if (!best)
	best = bestImage(name, knownToRefSeqHash, refSeqImageHash);
    if (!best)
	{
	best = hashFindVal(genbankImageHash, name);
	}
    if (!best)
	best = bestImage(name, knownToGeneHash, geneImageHash);
    if (vgProbes && !best)
	best = bestImage(name, knownToProbeHash, probeImageHash);
    if (vgAllProbes && !best)
	best = bestImage(name, knownToAllProbeHash, probeImageHash);
    }	    
    if (best)
        {
	fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId);
	}
    }

createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}
Beispiel #15
0
void checkMetaFiles(struct mdbObj *mdbObj, char *downDir, struct hash *allNames)
{
verbose(1, "----------------------------------------------\n");
verbose(1, "Checking that files specified in metaDb exist in download dir\n");
verbose(1, "----------------------------------------------\n");
for(; mdbObj != NULL; mdbObj=mdbObj->next)
    {
    struct mdbVar *mdbVar = hashFindVal(mdbObj->varHash, "objType");
    if (mdbVar == NULL)
        {
        warn("objType not found in object %s", mdbObj->obj);
        continue;
        }
    if (sameString(mdbVar->val, "composite"))  // skip objType composite
        continue;

    mdbObj->deleteThis = FALSE;
    mdbVar = hashFindVal(mdbObj->varHash, "composite");

    if (mdbVar == NULL)
        {
        warn("composite not found in object %s", mdbObj->obj);
        continue;
        }
    // char *composite = mdbVar->val;

    mdbVar = hashFindVal(mdbObj->varHash, "fileName");

    if (mdbVar == NULL)
        {
        mdbObj->deleteThis = TRUE;
        warn("fileName not found in object %s", mdbObj->obj);
        continue;
        }

    char *fileName = mdbVar->val;
    char buffer[10 * 1024];

    struct hash *bamNames = hashNew(8);
    struct slName *list = slNameListFromString(fileName, ','), *el;
    for(el=list; el; el=el->next)
	{
	if (hashLookup(allNames, el->name))
	    {
	    warn("duplicate fileName entry: %s", el->name);
	    }
	else
	    {
	    hashAdd(allNames, el->name, NULL);
	    }
        if (endsWith(el->name,".bam"))
	    {
	    hashAdd(bamNames, el->name, NULL);
	    }
        if (endsWith(el->name,".bam.bai"))
	    {
	    el->name[strlen(el->name)-4] = 0;
	    struct hashEl *hel = hashLookup(bamNames, el->name);
	    el->name[strlen(el->name)] = '.';
	    if (hel == NULL)
		{
		warn(".bam.bai without corresponding .bam: %s", el->name);
		}
	    else
		{
		hel->val = (void *)1;
		}
	    }
	}


    // see if we have to add any .bam.bai to the list 
    for(el=list; el; el=el->next)
	{
        if (endsWith(el->name,".bam"))
	    {
	    struct hashEl *hel = hashLookup(bamNames, el->name);
	    if (hel->val == NULL)
		{  // we have to add a .bam.bai to the list 
		char *bambai = addSuffix(el->name, ".bai");
		warn(".bam.bai not found for corresponding .bam in meta.fileName: %s", el->name);
		slNameAddTail(&list, bambai);		
		if (hashLookup(allNames, bambai))
		    {
		    warn("duplicate fileName entry: %s", bambai);
		    }
		else
		    hashAdd(allNames, bambai, NULL);		
		}
	    }
	}

    // make sure the files are there
    for(el=list; el; el=el->next)
	{
	if (!startsWith(mdbObj->obj, el->name))
	    {
	    warn("fileName %s does not start with object name %s", el->name, mdbObj->obj);
	    }

	safef(buffer, sizeof buffer, "%s/%s", downDir, el->name);

	verbose(2, "checking for fileExists %s\n", buffer);
	if (!fileExists(buffer))
	    {
	    mdbObj->deleteThis = TRUE;
	    warn("metaDb file %s not found in download dir %s",buffer, downDir);
	    }
	}
    }
}
Beispiel #16
0
void submitRefToFiles(struct sqlConnection *conn, struct sqlConnection *conn2, struct sqlConnection *connSp,
    char *ref, char *fileRoot, char *inJax)
/* Create a .ra and a .tab file for given reference. */
{
/* Initially the tab file will have some duplicate lines, so
 * write to temp file, and then filter. */
char raName[PATH_LEN], tabName[PATH_LEN], capName[PATH_LEN];
FILE *ra = NULL, *tab = NULL, *cap = NULL;
struct dyString *query = dyStringNew(0);
struct sqlResult *sr;
char **row;
char *pubMed;
struct slName *list, *el;
boolean gotAny = FALSE;
struct hash *uniqImageHash = newHash(0);
struct hash *captionHash = newHash(0);
int imageWidth = 0, imageHeight = 0;
char path[PATH_LEN];
struct dyString *caption = dyStringNew(0);
struct dyString *copyright = dyStringNew(0);
struct dyString *probeNotes = dyStringNew(0);
boolean lookedForCopyright = FALSE;
	
safef(raName, sizeof(raName), "%s.ra", fileRoot);
safef(tabName, sizeof(tabName), "%s.tab", fileRoot);
safef(capName, sizeof(capName), "%s.txt", fileRoot);
tab = mustOpen(tabName, "w");
cap = mustOpen(capName, "w");


sqlDyStringPrintf(query, "select authors,journal,title,year from BIB_Refs where ");
sqlDyStringPrintf(query, "_Refs_key = '%s'", ref);
sr = sqlGetResultVerbose(conn, query->string);
row = sqlNextRow(sr);
if (row == NULL)
    errAbort("Can't find _Refs_key %s in BIB_Refs", ref);

/* Make ra file with stuff common to whole submission set. */
ra = mustOpen(raName, "w");
fprintf(ra, "submissionSource MGI\n");
fprintf(ra, "acknowledgement Thanks to the Gene Expression Database group at "
            "Mouse Genome Informatics (MGI) for collecting, annotating and sharing "
	    "this image. The MGI images were last updated in VisiGene on March 28, 2006. "
	    "Additional and more up to date annotations and images may be available "
	    "directly at <A HREF='http://www.informatics.jax.org' target='_blank'>MGI.</A>\n");
fprintf(ra, "submitSet jax%s\n", ref);
fprintf(ra, "taxon 10090\n");	/* Mus musculus taxon */
fprintf(ra, "fullDir http://hgwdev.gi.ucsc.edu/visiGene/full/inSitu/Mouse/jax\n");
fprintf(ra, "thumbDir http://hgwdev.gi.ucsc.edu/visiGene/200/inSitu/Mouse/jax\n");
fprintf(ra, "setUrl http://www.informatics.jax.org/\n");
fprintf(ra, "itemUrl http://www.informatics.jax.org/searches/image.cgi?%%s\n");
fprintf(ra, "abUrl http://www.informatics.jax.org/searches/antibody.cgi?%%s\n");
fprintf(ra, "journal %s\n", row[1]);
fprintf(ra, "publication %s\n", row[2]);
fprintf(ra, "year %s\n", row[3]);

/* The contributor (author) list is in format Kent WJ; Haussler DH; format in
 * Jackson.  We convert it to Kent W.J.,Haussler D.H., format for visiGene. */
fprintf(ra, "contributor ");
list = charSepToSlNames(row[0], ';');
for (el = list; el != NULL; el = el->next)
    {
    char *lastName = skipLeadingSpaces(el->name);
    char *initials = strrchr(lastName, ' ');
    if (initials == NULL)
	initials = "";
    else
	*initials++ = 0;
    fprintf(ra, "%s", lastName);
    if (initials[0] != 0)
	{
	char c;
	fprintf(ra, " ");
	while ((c = *initials++) != 0)
	    fprintf(ra, "%c.", c);
	}
    fprintf(ra, ",");
    }
fprintf(ra, "\n");
slNameFreeList(&list);
sqlFreeResult(&sr);

/* Add in link to PubMed record on publication. */
dyStringClear(query);
sqlDyStringPrintf(query,
   "select ACC_Accession.accID from ACC_Accession,ACC_LogicalDB "
   "where ACC_Accession._Object_key = %s "
   "and ACC_Accession._LogicalDB_key = ACC_LogicalDB._LogicalDB_key "
   "and ACC_LogicalDB.name = 'PubMed'", ref);
pubMed = sqlQuickStringVerbose(conn, query->string);
if (pubMed != NULL)
    fprintf(ra, "pubUrl https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=%s\n", pubMed);
freez(&pubMed);

dyStringClear(query);
sqlDyStringPrintf(query, 
	"select distinct MRK_Marker.symbol as gene,"
               "GXD_Specimen.sex as sex,"
	       "GXD_Specimen.age as age,"
	       "GXD_Specimen.ageMin as ageMin,"
	       "GXD_Specimen.ageMax as ageMax,"
	       "IMG_ImagePane.paneLabel as paneLabel,"
	       "ACC_Accession.numericPart as fileKey,"
	       "IMG_Image._Image_key as imageKey,"
	       "GXD_Assay._ProbePrep_key as probePrepKey,"
	       "GXD_Assay._AntibodyPrep_key as antibodyPrepKey,"
	       "GXD_Assay._ReporterGene_key as reporterGeneKey,"
	       "GXD_FixationMethod.fixation as fixation,"
	       "GXD_EmbeddingMethod.embeddingMethod as embedding,"
	       "GXD_Assay._Assay_key as assayKey,"
	       "GXD_Specimen.hybridization as sliceType,"
	       "GXD_Specimen._Genotype_key as genotypeKey,"
	       "IMG_ImagePane._ImagePane_key as imagePaneKey\n"
	"from MRK_Marker,"
	     "GXD_Assay,"
	     "GXD_Specimen,"
	     "GXD_InSituResult,"
	     "GXD_InSituResultImage,"
	     "GXD_FixationMethod,"
	     "GXD_EmbeddingMethod,"
	     "IMG_ImagePane,"
	     "IMG_Image,"
	     "ACC_Accession\n"
	"where MRK_Marker._Marker_key = GXD_Assay._Marker_key "
	  "and GXD_Assay._Assay_key = GXD_Specimen._Assay_key "
	  "and GXD_Specimen._Specimen_key = GXD_InSituResult._Specimen_key "
	  "and GXD_InSituResult._Result_key = GXD_InSituResultImage._Result_key "
	  "and GXD_InSituResultImage._ImagePane_key = IMG_ImagePane._ImagePane_key "
	  "and GXD_FixationMethod._Fixation_key = GXD_Specimen._Fixation_key "
	  "and GXD_EmbeddingMethod._Embedding_key = GXD_Specimen._Embedding_key "
	  "and IMG_ImagePane._Image_key = IMG_Image._Image_key "
	  "and IMG_Image._Image_key = ACC_Accession._Object_key "
	  "and ACC_Accession.prefixPart = 'PIX:' "
	  "and GXD_Assay._ImagePane_key is NULL "
	);
sqlDyStringPrintf(query, "and GXD_Assay._Refs_key = '%s'", ref);
sr = sqlGetResultVerbose(conn, query->string);

fprintf(tab, "#");
fprintf(tab, "gene\t");
fprintf(tab, "probeColor\t");
fprintf(tab, "sex\t");
fprintf(tab, "age\t");
fprintf(tab, "ageMin\t");
fprintf(tab, "ageMax\t");
fprintf(tab, "paneLabel\t");
fprintf(tab, "fileName\t");
fprintf(tab, "submitId\t");
fprintf(tab, "fPrimer\t");
fprintf(tab, "rPrimer\t");
fprintf(tab, "abName\t");
fprintf(tab, "abTaxon\t");
fprintf(tab, "abSubmitId\t");
fprintf(tab, "fixation\t");
fprintf(tab, "embedding\t");
fprintf(tab, "bodyPart\t");
fprintf(tab, "sliceType\t");
fprintf(tab, "genotype\t");
fprintf(tab, "strain\t");
fprintf(tab, "priority\t");
fprintf(tab, "captionId\t");
fprintf(tab, "imageWidth\t");
fprintf(tab, "imageHeight\n");

while ((row = sqlNextRow(sr)) != NULL)
    {
    char *gene = row[0];
    char *sex = row[1];
    char *age = row[2];
    char *ageMin = row[3];
    char *ageMax = row[4];
    char *paneLabel = row[5];
    char *fileKey = row[6];
    char *imageKey = row[7];
    char *probePrepKey = row[8];
    char *antibodyPrepKey = row[9];
    char *reporterGeneKey = row[10];
    char *fixation = row[11];
    char *embedding = row[12];
    char *assayKey = row[13];
    char *sliceType = row[14];
    char *genotypeKey = row[15];
    char *imagePaneKey = row[16];
    double calcAge = -1;
    char *probeColor = "";
    char *bodyPart = "";
    char *abName = NULL;
    char *rPrimer = NULL, *fPrimer = NULL;
    char *genotype = NULL;
    char *strain = NULL;
    char *priority = NULL;
    char abTaxon[32];
    char *captionId = "";
    char *abSubmitId = NULL;

    verbose(3, "   "); dumpRow(row, 16);

    if (age == NULL)
        continue;

    if (!lookedForCopyright)
	{
	struct sqlResult *sr = NULL;
	char **row;
	lookedForCopyright = TRUE;

	dyStringClear(query);
	sqlDyStringPrintf(query, 
	     "select note from MGI_NoteChunk,MGI_Note,MGI_NoteType,ACC_MGIType "
	     "where MGI_Note._Object_key = %s "
	     "and ACC_MGIType.name = 'Image' "
	     "and ACC_MGIType._MGIType_key = MGI_Note._MGIType_key "
	     "and MGI_NoteType.noteType='Copyright' "
	     "and MGI_Note._NoteType_key = MGI_NoteType._NoteType_key "
	     "and MGI_Note._Note_key = MGI_NoteChunk._Note_key "
	     "order by sequenceNum"
	     , imageKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	while ((row = sqlNextRow(sr)) != NULL)
	   dyStringAppend(copyright, row[0]);
	sqlFreeResult(&sr);

	verbose(2,"imageKey=%s\n",imageKey);

	if (copyright->stringSize != 0)
	    {
	    fprintf(ra, "copyright %s\n", copyright->string);
	    }
	}

    /* Massage sex */
        {
	if (sameString(sex, "Male"))
	    sex = "male";
	else if (sameString(sex, "Female"))
	    sex = "female";
	else
	    sex = "";
	}

    /* Massage age */
	{
	char *embryoPat = "embryonic day ";
	char *newbornPat = "postnatal newborn";
	char *dayPat = "postnatal day ";
	char *weekPat = "postnatal week ";
	char *adultPat = "postnatal adult";
	double calcMinAge = atof(ageMin);
	double calcMaxAge = atof(ageMax);
	double mouseBirthAge = 21.0;
	//double mouseAdultAge = 63.0;	/* Relative to conception, not birth */

	if (age[0] == 0)
	    {
	    warn("age null, ageMin %s, ageMax %s\n", ageMin, ageMax);
	    calcAge = (calcMinAge + calcMaxAge) * 0.5;
	    }
	else if (startsWith(embryoPat, age))
	    calcAge = atof(age+strlen(embryoPat));
	else if (sameString(newbornPat, age))
	    calcAge = mouseBirthAge;
	else if (startsWith(dayPat, age))
	    calcAge = atof(age+strlen(dayPat)) + mouseBirthAge;
        else if (startsWith(weekPat, age))
	    calcAge = 7.0 * atof(age+strlen(weekPat)) + mouseBirthAge;
	else if (sameString(adultPat, age) && calcMaxAge - calcMinAge > 1000 
		&& calcMinAge < 365)
	    calcAge = 365;	/* Most adult mice are relatively young */
	else
	    {
	    warn("Calculating age from %s", age);
	    calcAge = (calcMinAge + calcMaxAge) * 0.5;
	    }
	if (calcAge < calcMinAge)
	    calcAge = calcMinAge;
	if (calcAge > calcMaxAge)
	    calcAge = calcMaxAge;
	}
    
    /* Massage probeColor */
        {
	if (!isStrNull(reporterGeneKey))
	    {
	    /* Fixme: make sure that reporterGene's end up in probeType table. */
	    char *name = NULL;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, 
	    	"select term from VOC_Term where _Term_key = %s", 
	    	reporterGeneKey);
	    name = sqlQuickStringVerbose(conn2, query->string);
	    if (name == NULL)
	        warn("Can't find _ReporterGene_key %s in VOC_Term", 
			reporterGeneKey);
	    else if (sameString(name, "GFP"))
	        probeColor = "green";
	    else if (sameString(name, "lacZ"))
	        probeColor = "blue";
	    else 
	        warn("Don't know color of reporter gene %s", name);
	    freez(&name);
	    }
	if (!isStrNull(probePrepKey))
	    {
	    char *name = NULL;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, 
	      "select GXD_VisualizationMethod.visualization "
	      "from GXD_VisualizationMethod,GXD_ProbePrep "
	      "where GXD_ProbePrep._ProbePrep_key = %s "
	      "and GXD_ProbePrep._Visualization_key = GXD_VisualizationMethod._Visualization_key"
	      , probePrepKey);
	    name = sqlQuickStringVerbose(conn2, query->string);
	    if (name == NULL)
	        warn("Can't find visualization from _ProbePrep_key %s", probePrepKey);
	    probeColor = colorFromLabel(name, gene);
	    freez(&name);
	    if (probeColor[0] == 0)
	        {
		dyStringClear(query);
		sqlDyStringPrintf(query, 
			"select GXD_Label.label from GXD_Label,GXD_ProbePrep "
		        "where GXD_ProbePrep._ProbePrep_key = %s " 
			"and GXD_ProbePrep._Label_key = GXD_Label._Label_key"
		        , probePrepKey);
		name = sqlQuickStringVerbose(conn2, query->string);
		if (name == NULL)
		    warn("Can't find label from _ProbePrep_key %s", 
		    	probePrepKey);
		probeColor = colorFromLabel(name, gene);
		}
	    freez(&name);
	    }
	if (!isStrNull(antibodyPrepKey) && probeColor[0] == 0 )
	    {
	    char *name = NULL;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, 
		  "select GXD_Label.label from GXD_Label,GXD_AntibodyPrep "
		  "where GXD_AntibodyPrep._AntibodyPrep_key = %s "
		  "and GXD_AntibodyPrep._Label_key = GXD_Label._Label_key"
		  , antibodyPrepKey);
	    name = sqlQuickStringVerbose(conn2, query->string);
	    if (name == NULL)
		warn("Can't find label from _AntibodyPrep_key %s", antibodyPrepKey);
	    probeColor = colorFromLabel(name, gene);
	    freez(&name);
	    }
	}

    /* Get abName, abTaxon, abSubmitId */
    abTaxon[0] = 0;
    if (!isStrNull(antibodyPrepKey))
        {
	struct sqlResult *sr = NULL;
	int orgKey = 0;
	char **row;
	dyStringClear(query);
	sqlDyStringPrintf(query, 
		"select antibodyName,_Organism_key,GXD_Antibody._Antibody_key "
		"from GXD_AntibodyPrep,GXD_Antibody "
		"where GXD_AntibodyPrep._AntibodyPrep_key = %s "
		"and GXD_AntibodyPrep._Antibody_key = GXD_Antibody._Antibody_key"
		, antibodyPrepKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	row = sqlNextRow(sr);
	if (row != NULL)
	    {
	    abName = cloneString(row[0]);
	    orgKey = atoi(row[1]);
	    abSubmitId = cloneString(row[2]);
	    }
	sqlFreeResult(&sr);

	if (orgKey > 0)
	    {
	    char *latinName = NULL, *commonName = NULL;
	    int spTaxon = 0;
	    dyStringClear(query);
	    sqlDyStringPrintf(query, "select latinName from MGI_Organism "
	                          "where _Organism_key = %d", orgKey);
	    latinName = sqlQuickStringVerbose(conn2, query->string);
	    if (latinName != NULL 
		&& !sameString(latinName, "Not Specified")
		&& !sameString(latinName, "Not Applicable"))
		{
		char *e = strchr(latinName, '/');
		if (e != NULL) 
		   *e = 0;	/* Chop off / and after. */
		spTaxon = spBinomialToTaxon(connSp, latinName);
		}
	    else
	        {
		dyStringClear(query);
		sqlDyStringPrintf(query, "select commonName from MGI_Organism "
	                          "where _Organism_key = %d", orgKey);
		commonName = sqlQuickStringVerbose(conn2, query->string);
		if (commonName != NULL 
		    && !sameString(commonName, "Not Applicable")
		    && !sameString(commonName, "Not Specified"))
		    {
		    spTaxon = spCommonToTaxon(connSp, commonName);
		    }
		}
	    if (spTaxon != 0)
	        safef(abTaxon, sizeof(abTaxon), "%d", spTaxon);
	    freez(&latinName);
	    freez(&commonName);
	    }
	}
    if (abName == NULL)
        abName = cloneString("");
    if (abSubmitId == NULL)
        abSubmitId = cloneString("");

    /* Get rPrimer, lPrimer */
    if (!isStrNull(probePrepKey))
        {
	struct sqlResult *sr = NULL;
	char **row;
	dyStringClear(query);
	sqlDyStringPrintf(query,
	    "select primer1sequence,primer2sequence "
	    "from PRB_Probe,GXD_ProbePrep "
	    "where PRB_Probe._Probe_key = GXD_ProbePrep._Probe_key "
	    "and GXD_ProbePrep._ProbePrep_key = %s"
	    , probePrepKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	row = sqlNextRow(sr);
	if (row != NULL)
	    {
	    fPrimer = cloneString(row[0]);
	    rPrimer = cloneString(row[1]);
	    }
	sqlFreeResult(&sr);
	}

    /* Note Jackson database actually stores the primers very
     * erratically.  In all the cases I can find for in situs
     * the primers are actually stored in free text in the PRB_Notes
     * e.g.  ... primers CGCGGATCCAGGGGAAACAGAAGGGCTGCG and CCCAAGCTTAGACTGTACAGGCTGAGCC ...
     */
    if (fPrimer == NULL || fPrimer[0]==0)
        {
	struct sqlResult *sr = NULL;
	char **row;
	dyStringClear(query);
	sqlDyStringPrintf(query,
	    "select PRB_Notes.note from GXD_ProbePrep, PRB_Notes"
	    " where GXD_ProbePrep._ProbePrep_key = %s"
	    "  and GXD_ProbePrep._Probe_key = PRB_Notes._Probe_key"
	    " order by PRB_Notes.sequenceNum"
	    , probePrepKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	dyStringClear(probeNotes);
	while ((row = sqlNextRow(sr)) != NULL)
	   dyStringAppend(probeNotes, row[0]);
	sqlFreeResult(&sr);

	if (probeNotes->stringSize > 0)
	    {
	    char f[256];
	    char r[256];
	    int i = 0;
	    char *s = strstr(probeNotes->string," primers ");
	    if (s)
		{
		s += strlen(" primers ");
		i = 0;
		while (strchr("ACGT",*s) && (i<sizeof(f)))
		    f[i++] = *s++;
		f[i]=0;
		if (strstr(s," and ")==s)
		    {
		    s += strlen(" and ");
		    i = 0;
    		    while (strchr("ACGT",*s) && (i<sizeof(r)))
    			r[i++] = *s++;
    		    r[i]=0;
		    if (strlen(f) >= 10 && strlen(r) >= 10)
			{
			fPrimer = cloneString(f);
			rPrimer = cloneString(r);
			}
		    else
			{
			verbose(1, "bad primer parse:_ProbePrep_key=%s fPrimer=[%s], rPrimer=[%s]\n",
			    probePrepKey,f,r);
			}
		    }
		}
	    }
	}
	
    if (fPrimer == NULL)
        fPrimer = cloneString("");
    if (rPrimer == NULL)
        rPrimer = cloneString("");

    fixation = blankOutUnknown(fixation);
    embedding = blankOutUnknown(embedding);

    /* Massage body part and slice type.  We only handle whole mounts. */
    if (sameString(sliceType, "whole mount"))
	{
	bodyPart = "whole";
	priority = "100";
	}
    else
	{
        sliceType = "";
	priority = "1000";
	}

    genotypeAndStrainFromKey(genotypeKey, conn2, &genotype, &strain);

    if (isStrNull(paneLabel))
	paneLabel = cloneString("");	  /* trying to suppress nulls in output */
    stripChar(paneLabel, '"');	/* Get rid of a difficult quote to process. */
    
    /* Fetch image dimensions from file. */
    imageWidth=0;
    imageHeight=0;
    safef(path, sizeof(path), "%s/%s.jpg", inJax, fileKey);
    if (fileExists(path))
    	jpegSize(path,&imageWidth,&imageHeight);  /* will errAbort if no valid .jpeg exists */
    else
	warn("Picture Missing! %s ",path);
    
    /* Deal caption if any.  Most of the work only happens the
     * first time see the image. */
    if (!hashLookup(uniqImageHash, imageKey))
        {
	struct sqlResult *sr = NULL;
	char **row;
	hashAdd(uniqImageHash, imageKey, NULL);
	dyStringClear(caption);
	dyStringClear(query);
	sqlDyStringPrintf(query, 
	     "select note from MGI_NoteChunk,MGI_Note,MGI_NoteType,ACC_MGIType "
	     "where MGI_Note._Object_key = %s "
	     "and ACC_MGIType.name = 'Image' "
	     "and ACC_MGIType._MGIType_key = MGI_Note._MGIType_key "
	     "and MGI_NoteType.noteType='Caption' "
	     "and MGI_Note._NoteType_key = MGI_NoteType._NoteType_key "
	     "and MGI_Note._Note_key = MGI_NoteChunk._Note_key "
	     "order by sequenceNum"
	     , imageKey);
	sr = sqlGetResultVerbose(conn2, query->string);
	while ((row = sqlNextRow(sr)) != NULL)
	   dyStringAppend(caption, row[0]);
	sqlFreeResult(&sr);

	if (caption->stringSize > 0)
	    {
	    subChar(caption->string, '\t', ' ');
	    subChar(caption->string, '\n', ' ');
	    fprintf(cap, "%s\t%s\n", imageKey, caption->string);
	    hashAdd(captionHash, imageKey, imageKey);
	    }
	}
    if (hashLookup(captionHash, imageKey))
        captionId = imageKey;
    else
        captionId = "";

    fprintf(tab, "%s\t", gene);
    fprintf(tab, "%s\t", probeColor);
    fprintf(tab, "%s\t", sex);
    fprintf(tab, "%3.2f\t", calcAge);
    fprintf(tab, "%s\t", ageMin);
    fprintf(tab, "%s\t", ageMax);
    fprintf(tab, "%s\t", paneLabel);   /* may have to change NULL to empty string or "0" ? */
    fprintf(tab, "%s.jpg\t", fileKey);
    fprintf(tab, "%s\t", imageKey);
    fprintf(tab, "%s\t", fPrimer);
    fprintf(tab, "%s\t", rPrimer);
    fprintf(tab, "%s\t", abName);
    fprintf(tab, "%s\t", abTaxon);
    fprintf(tab, "%s\t", abSubmitId);
    fprintf(tab, "%s\t", fixation);
    fprintf(tab, "%s\t", embedding);
    fprintf(tab, "%s\t", bodyPart);
    fprintf(tab, "%s\t", sliceType);
    fprintf(tab, "%s\t", genotype);
    fprintf(tab, "%s\t", strain);
    fprintf(tab, "%s\t", priority);
    fprintf(tab, "%s\t", captionId);
    fprintf(tab, "%d\t", imageWidth);
    fprintf(tab, "%d\n", imageHeight);

    printExpression(tab,  conn2,  imagePaneKey, assayKey);
    gotAny = TRUE;
    freez(&genotype);
    freez(&abName);
    freez(&abSubmitId);
    freez(&rPrimer);
    freez(&fPrimer);
    }
sqlFreeResult(&sr);

carefulClose(&ra);
carefulClose(&tab);
carefulClose(&cap);

if (!gotAny)
    {
    remove(raName);
    remove(capName);
    remove(tabName);
    }
dyStringFree(&probeNotes);
dyStringFree(&copyright);
dyStringFree(&caption);
dyStringFree(&query);
hashFree(&uniqImageHash);
hashFree(&captionHash);

}
Beispiel #17
0
void checkMetaTrackDb(struct mdbObj *mdbObj, struct hash *trackHash)
{
verbose(1, "----------------------------------------------\n");
verbose(1, "Checking that tables specified in metaDb exist in trackDb\n");
verbose(1, "----------------------------------------------\n");
for(; mdbObj != NULL; mdbObj=mdbObj->next)
    {
    struct mdbVar *mdbVar = hashFindVal(mdbObj->varHash, "objType");
    if (mdbVar == NULL)
        {
        warn("objType not found in object %s", mdbObj->obj);
        continue;
        }
    if (differentString(mdbVar->val, "table"))
        continue;

    if (mdbObj->deleteThis)
        continue;

    mdbVar = hashFindVal(mdbObj->varHash, "tableName");
    if (mdbVar == NULL)
        {
        warn("tableName not found in object %s", mdbObj->obj);
        continue;
        }

    char *tableName = mdbVar->val;

    struct mdbVar *atticVar = hashFindVal(mdbObj->varHash, "attic");
    struct mdbVar *statusVar = hashFindVal(mdbObj->varHash, "objStatus");
    char *reason = NULL;
    if (atticVar)
	reason = "attic";
    if (statusVar)
	{
	if (startsWith("renamed", statusVar->val))
	    reason = "renamed";
	if (startsWith("replaced", statusVar->val))
	    reason = "replaced";
	if (startsWith("revoked", statusVar->val))
	    reason = "revoked";
	}

    if (hashLookup(trackHash, tableName))
        {
	if (reason)
	    {
	    warn("%s table %s: should NOT be found in trackDb", reason, tableName);
	    continue;
	    }
        }
    else
        {
	if (reason)
	    { // ok because attic, replaced, revoked, renamed should not be in trackDb
	    continue;
	    }
        warn("table %s: not found in trackDb",tableName);
        }
    }
}
void trimUniq(bioSeq *seqList)
/* Check that all seq's in list have a unique name.  Try and
 * abbreviate longer sequence names. */
{
struct hash *hash = newHash(0);
bioSeq *seq;

for (seq = seqList; seq != NULL; seq = seq->next)
    {
    char *saferString = needMem(strlen(seq->name)+1);
    char *c, *s;

    /*	Some chars are safe to allow through, other chars cause
     *	problems.  It isn't necessarily a URL safe string that is
     *	being calculated here.  The original problem was a user had
     *	the fasta header line of:
     *	chr8|59823648:59825047|+
     *	The plus sign was being taken as the query name and this
     *	created problems as that name was passed on to hgc via
     *	the ss cart variable.  The + sign became part of a URL
     *	eventually.  This loop allows only isalnum and =_/.:;_|
     *	to get through as part of the header name.  These characters
     *	all proved to be safe as single character names, or all
     *	together.
     */
    s = saferString;
    for (c = seq->name; *c != '\0'; ++c)
	{
	if (c && (*c != '\0'))
	    {
	    if ( isalnum(*c) || (*c == '=') || (*c == '-') || (*c == '/') ||
		(*c == '.') || (*c == ':') || (*c == ';') || (*c == '_') ||
		    (*c == '|') )
		*s++ = *c;
	    }
	}
    *s = '\0';
    freeMem(seq->name);
    if (*saferString == '\0')
	{
	freeMem(saferString);
	saferString = cloneString("YourSeq");
	}
    seq->name = saferString;

    if (strlen(seq->name) > 14)	/* Try and get rid of long NCBI .fa cruft. */
        {
	char *nameClone = NULL;
	char *abbrv = NULL;
	char *words[32];
	int wordCount;
	boolean isEns = (stringIn("ENSEMBL:", seq->name) != NULL);

	nameClone = cloneString(seq->name);
	wordCount = chopString(nameClone, "|", words, ArraySize(words));
	if (wordCount > 1)	/* Looks like it's an Ensembl/NCBI 
		                 * long name alright. */
	    {
	    if (isEns)
		{
	        abbrv = words[0];
		if (abbrv[0] == 0) abbrv = words[1];
		}
	    else if (sameString(words[1], "dbSNP"))
	        {
		if (wordCount > 2)
		    abbrv = words[2];
		else
		    abbrv = nameClone;
		}
	    else
		{
		abbrv = words[wordCount-1];
		if (abbrv[0] == 0) abbrv = words[wordCount-2];
		}
	    if (hashLookup(hash, abbrv) == NULL)
	        {
		freeMem(seq->name);
		seq->name = cloneString(abbrv);
		}
	    freez(&nameClone);
	    }
	}
    hashAddUnique(hash, seq->name, hash);
    }
freeHash(&hash);
}
Beispiel #19
0
boolean outputProtein(struct cdsEvidence *cds, struct dnaSeq *txSeq, FILE *f)
/* Translate txSeq to protein guided by cds, and output to file.
 * The implementation is a little complicated by checking for internal
 * stop codons and other error conditions. Return True if a sequence was
 * generated and was not impacted by error conditions, False otherwise */
{
boolean selenocysteine = FALSE;
if (selenocysteineHash != NULL)
    {
    if (hashLookup(selenocysteineHash, txSeq->name))
	selenocysteine = TRUE;
    }
struct dyString *dy = dyStringNew(4*1024);
int blockIx;
for (blockIx=0; blockIx<cds->cdsCount; ++blockIx)
    {
    DNA *dna = txSeq->dna + cds->cdsStarts[blockIx];
    int rnaSize = cds->cdsSizes[blockIx];
    if (rnaSize%3 != 0)
        {
	warn("size of block (%d) #%d not multiple of 3 in %s (source %s %s)",
		 rnaSize, blockIx, cds->name, cds->source, cds->accession);
	return FALSE;
	}
    int aaSize = rnaSize/3;
    int i;
    for (i=0; i<aaSize; ++i)
        {
	AA aa = lookupCodon(dna);
	if (aa == 0) 
	    {
	    aa = '*';
	    if (selenocysteine)
	        {
		if (!isReallyStopCodon(dna, TRUE))
		    aa = 'U';
		}
	    }
	dyStringAppendC(dy, aa);
	dna += 3;
	}
    }
int lastCharIx = dy->stringSize-1;
if (dy->string[lastCharIx] == '*')
    {
    dy->string[lastCharIx] = 0;
    dy->stringSize = lastCharIx;
    }
char *prematureStop = strchr(dy->string, '*');
if (prematureStop != NULL)
    {
    warn("Stop codons in CDS at position %d for %s, (source %s %s)", 
	 (int)(prematureStop - dy->string), cds->name, cds->source, cds->accession);
    return(FALSE);
    }
else
    {
    faWriteNext(f, cds->name, dy->string, dy->stringSize);
    dyStringFree(&dy);
    return(TRUE);
    }
}
Beispiel #20
0
static void getRepeatsUnsplit(struct sqlConnection *conn,
	struct hash *chromHash, struct hash *arHash)
/* Return a tree of ranges for sequence gaps all chromosomes, 
 * assuming an unsplit table -- when the table is unsplit, it's 
 * probably for a scaffold assembly where we *really* don't want 
 * to do one query per scaffold! */
{
struct sqlResult *sr;
char **row;
struct rbTreeNode **stack = lmAlloc(qLm, 256 * sizeof(stack[0]));
struct rbTree *allTree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack);
struct rbTreeNode **newstack = lmAlloc(qLm, 256 * sizeof(newstack[0]));
struct rbTree *newTree = rbTreeNewDetailed(simpleRangeCmp, qLm, newstack);
char *prevChrom = NULL;
struct simpleRange *prevRange = NULL, *prevNewRange = NULL;

sr = sqlGetResult(conn,
    "NOSQLINJ select genoName,genoStart,genoEnd,repName,repClass,repFamily from rmsk "
    "order by genoName,genoStart");
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct simpleRange *range;
    char arKey[512];
    if (prevChrom == NULL)
	prevChrom = cloneString(row[0]);
    else if (! sameString(prevChrom, row[0]))
	{
	rbTreeAdd(allTree, prevRange);
	if (prevNewRange != NULL)
	    rbTreeAdd(newTree, prevNewRange);
	setRepeats(prevChrom, chromHash, allTree, newTree);
	freeMem(prevChrom);
	prevRange = prevNewRange = NULL;
	stack = lmAlloc(qLm, 256 * sizeof(stack[0]));
	allTree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack);
	if (arHash != NULL)
	    {
	    stack = lmAlloc(qLm, 256 * sizeof(stack[0]));
	    newTree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack);
	    }
	prevChrom = cloneString(row[0]);
	}
    lmAllocVar(allTree->lm, range);
    range->start = sqlUnsigned(row[1]);
    range->end = sqlUnsigned(row[2]);
    if (prevRange == NULL)
	prevRange = range;
    else if (overlap(range, prevRange))
	{
	/* merge r into prevR & discard; prevR gets passed forward. */
	if (range->end > prevRange->end)
	    prevRange->end = range->end;
	if (range->start < prevRange->start)
	    prevRange->start = range->start;
	}
    else
	{
	rbTreeAdd(allTree, prevRange);
	prevRange = range;
	}
    sprintf(arKey, "%s.%s.%s", row[3], row[4], row[5]);
    if (arHash != NULL && hashLookup(arHash, arKey))
        {
	lmAllocVar(newTree->lm, range);
	range->start = sqlUnsigned(row[1]);
	range->end = sqlUnsigned(row[2]);
	if (prevNewRange == NULL)
	    prevNewRange = range;
	else if (overlap(range, prevNewRange))
	    {
	    /* merge r into prevR & discard; prevR gets passed forward. */
	    if (range->end > prevNewRange->end)
		prevNewRange->end = range->end;
	    if (range->start < prevNewRange->start)
		prevNewRange->start = range->start;
	    }
	else
	    {
	    rbTreeAdd(newTree, prevNewRange);
	    prevNewRange = range;
	    }
	}
    }
if (prevChrom != NULL)
    {
    rbTreeAdd(allTree, prevRange);
    if (prevNewRange != NULL)
	rbTreeAdd(newTree, prevNewRange);
    setRepeats(prevChrom, chromHash, allTree, newTree);
    freeMem(prevChrom);
    }
sqlFreeResult(&sr);
}
static struct jsonWrite *rTdbToJw(struct trackDb *tdb, struct hash *fieldHash,
                                  struct hash *excludeTypesHash, int depth, int maxDepth)
/* Recursively build and return a new jsonWrite object with JSON for tdb and its children,
 * or NULL if tdb or all children have been filtered out by excludeTypesHash.
 * If excludeTypesHash is non-NULL, omit any tracks/views/subtracks with type in excludeTypesHash.
 * If fieldHash is non-NULL, include only the field names indexed in fieldHash. */
{
if (maxDepth >= 0 && depth > maxDepth)
    return NULL;
boolean doSubtracks = (tdb->subtracks && fieldOk("subtracks", fieldHash));
// If excludeTypesHash is given and tdb is a leaf track/subtrack, look up the first word
// of tdb->type in excludeTypesHash; if found, return NULL.
if (excludeTypesHash && !doSubtracks)
    {
    char typeCopy[PATH_LEN];
    safecpy(typeCopy, sizeof(typeCopy), tdb->type);
    if (hashLookup(excludeTypesHash, firstWordInLine(typeCopy)))
        return NULL;
    }
boolean gotSomething = !doSubtracks;
struct jsonWrite *jwNew = jsonWriteNew();
jsonWriteObjectStart(jwNew, NULL);
writeTdbSimple(jwNew, tdb, fieldHash);
if (tdb->parent && fieldOk("parent", fieldHash))
    {
    // We can't link to an object in JSON and better not recurse here or else infinite loop.
    if (tdbIsSuperTrackChild(tdb))
        {
        // Supertracks have been omitted from fullTrackList, so add the supertrack object's
        // non-parent/child info here.
        jsonWriteObjectStart(jwNew, "parent");
        writeTdbSimple(jwNew, tdb->parent, fieldHash);
        jsonWriteObjectEnd(jwNew);
        }
    else
        // Just the name so we don't have infinite loops.
        jsonWriteString(jwNew, "parent", tdb->parent->track);
    }
if (doSubtracks)
    {
    jsonWriteListStart(jwNew, "subtracks");
    slSort(&tdb->subtracks, trackDbViewCmp);
    struct trackDb *subTdb;
    for (subTdb = tdb->subtracks;  subTdb != NULL;  subTdb = subTdb->next)
        {
        struct jsonWrite *jwSub = rTdbToJw(subTdb, fieldHash, excludeTypesHash, depth+1, maxDepth);
        if (jwSub)
            {
            gotSomething = TRUE;
            jsonWriteAppend(jwNew, NULL, jwSub);
            jsonWriteFree(&jwSub);
            }
        }
    jsonWriteListEnd(jwNew);
    }
jsonWriteObjectEnd(jwNew);
if (! gotSomething)
    // All children were excluded; clean up and null out jwNew.
    jsonWriteFree(&jwNew);
return jwNew;
}
Beispiel #22
0
static void getRepeats(struct sqlConnection *conn, struct hash *arHash,
    char *chrom, struct rbTree **retAllRepeats,
	struct rbTree **retNewRepeats)
/* Return a tree of ranges for sequence gaps in chromosome */
{
char *db = sqlGetDatabase(conn);
struct sqlResult *sr;
char **row;
struct rbTree *allTree = rbTreeNew(simpleRangeCmp);
struct rbTree *newTree = rbTreeNew(simpleRangeCmp);
char tableName[64];
char query[256];
boolean splitRmsk = TRUE;
struct simpleRange *prevRange = NULL, *prevNewRange = NULL;

safef(tableName, sizeof(tableName), "%s_rmsk", chrom);
if (! sqlTableExists(conn, tableName))
    {
    safef(tableName, sizeof(tableName), "rmsk");
    if (! sqlTableExists(conn, tableName))
	errAbort("Can't find rmsk table for %s (%s.%s_rmsk or %s.rmsk)\n",
		 chrom, db, chrom, db);
    splitRmsk = FALSE;
    }
if (splitRmsk)
    sqlSafef(query, sizeof query,
	    "select genoStart,genoEnd,repName,repClass,repFamily from %s",
	    tableName);
else
    sqlSafef(query, sizeof query,
	    "select genoStart,genoEnd,repName,repClass,repFamily from %s "
	    "where genoName = \"%s\"",
	    tableName, chrom);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct simpleRange *range;
    char arKey[512];
    lmAllocVar(allTree->lm, range);
    range->start = sqlUnsigned(row[0]);
    range->end = sqlUnsigned(row[1]);
    if (prevRange == NULL)
	prevRange = range;
    else if (overlap(range, prevRange))
	{
	/* merge r into prevR & discard; prevR gets passed forward. */
	if (range->end > prevRange->end)
	    prevRange->end = range->end;
	if (range->start < prevRange->start)
	    prevRange->start = range->start;
	}
    else
	{
	rbTreeAdd(allTree, prevRange);
	prevRange = range;
	}
    sprintf(arKey, "%s.%s.%s", row[2], row[3], row[4]);
    if (arHash != NULL && hashLookup(arHash, arKey))
        {
	lmAllocVar(newTree->lm, range);
	range->start = sqlUnsigned(row[0]);
	range->end = sqlUnsigned(row[1]);
	if (prevNewRange == NULL)
	    prevNewRange = range;
	else if (overlap(range, prevNewRange))
	    {
	    /* merge r into prevR & discard; prevR gets passed forward. */
	    if (range->end > prevNewRange->end)
		prevNewRange->end = range->end;
	    if (range->start < prevNewRange->start)
		prevNewRange->start = range->start;
	    }
	else
	    {
	    rbTreeAdd(allTree, prevNewRange);
	    prevNewRange = range;
	    }
	}
    }
if (prevRange != NULL)
    rbTreeAdd(allTree, prevRange);
if (prevNewRange != NULL)
    rbTreeAdd(newTree, prevNewRange);
sqlFreeResult(&sr);
*retAllRepeats = allTree;
*retNewRepeats = newTree;
}
/* Small FSM. States indicate what the machine is looking for *next*, 
 * so eg _cfgKEYSTART means "looking for the token that indicates the start
 * of a key"
*/
struct configFile *_cfgParseConfigFile (struct configFile *cfg)
{
	char *currentSectionString="DEFAULT";
	char *currentStringStart=NULL;
	char *currentKey=NULL;
	unsigned int filePos=0, state=_cfgKEYSTART;
	hash_table *tempHash;
	
	/* Create the default section. */
	tempHash=hashConstructTable (31);
	hashInsert (currentSectionString, tempHash, cfg->sections);
	
	while (filePos < cfg->bbdgSize) {
		switch (state) {
			case _cfgKEYSTART:
				if (cfg->bbdg[filePos]=='[') {
					filePos++;
					currentStringStart=(char *) &(cfg->bbdg[filePos]);
					state=_cfgSECTIONEND;
					break;
				}
				if (isCommentStart(cfg->bbdg[filePos])) {
					filePos++;
					state=_cfgCOMMENTEND;
					break;
				}
				if ( !isspace (cfg->bbdg[filePos]) ) {
					currentStringStart=(char *) &(cfg->bbdg[filePos]);
					state=_cfgKEYEND;
				} else {
					filePos ++;
				}
				break;
			case _cfgCOMMENTEND:
				if (cfg->bbdg[filePos]=='\n') {
					state=_cfgKEYSTART;
				}
				filePos++;
				break;
			case _cfgSECTIONEND:
				if (cfg->bbdg[filePos]==']') {
					cfg->bbdg[filePos]='\0';
					currentSectionString=currentStringStart;
					state=_cfgKEYSTART;
				}
				filePos++;
				break;
			case _cfgKEYEND:
				if (isspace (cfg->bbdg[filePos]) || isKeyValSep(cfg->bbdg[filePos])) {
					if (isKeyValSep(cfg->bbdg[filePos])) {
						cfg->bbdg[filePos]='\0';
					} else {
						cfg->bbdg[filePos]='\0';
						filePos++;
					}
					currentKey=currentStringStart;
					state=_cfgCOLON;
				} else {
					//Do this in search routine instead (with strcasecmp)
					//cfg->bbdg[filePos] = tolower(cfg->bbdg[filePos]);
					filePos++;
				}
				break;
			case _cfgCOLON:
				if (isKeyValSep(cfg->bbdg[filePos]) || cfg->bbdg[filePos]=='\0') {
					state=_cfgVALSTART;
				}
				filePos++;
				break;
			case _cfgVALSTART:
				if (!myisblank(cfg->bbdg[filePos])) {
					currentStringStart=(char *) &(cfg->bbdg[filePos]);
					state=_cfgVALEND;
				} else {
					filePos ++;
				}
				break;
			case _cfgVALEND:
				if (cfg->bbdg[filePos]=='\n' || isCommentStart(cfg->bbdg[filePos])) {
					/* First see if the current section exists. */
					tempHash=hashLookup (currentSectionString, cfg->sections);
					if (tempHash==NULL) {
						tempHash=hashConstructTable (31);
						hashInsert (currentSectionString, tempHash, cfg->sections);
					}
					/* Now stick it in the table. */
					if (isCommentStart(cfg->bbdg[filePos])) {
						cfg->bbdg[filePos]='\0';
						hashInsert (currentKey, currentStringStart, tempHash);
						state=_cfgCOMMENTEND;
					} else {
						cfg->bbdg[filePos]='\0';
						hashInsert (currentKey, currentStringStart, tempHash);
						state=_cfgKEYSTART;
					}
				}
				filePos++;
				break;
		}
		
	}
	return cfg;
}
static void filterBed(struct track *tg, struct linkedFeatures **pLfList)
/* Apply filters if any to mRNA linked features. */
{
struct linkedFeatures *lf, *next, *newList = NULL, *oldList = NULL;
struct mrnaUiData *mud = tg->extraUiData;
struct mrnaFilter *fil;
char *type;
boolean anyFilter = FALSE;
boolean colorIx = 0;
boolean isExclude = FALSE;
boolean andLogic = TRUE;

if (*pLfList == NULL || mud == NULL)
    return;

/* First make a quick pass through to see if we actually have
 * to do the filter. */
for (fil = mud->filterList; fil != NULL; fil = fil->next)
    {
    fil->pattern = cartUsualString(cart, fil->key, "");
    if (fil->pattern[0] != 0)
        anyFilter = TRUE;
    }
if (!anyFilter)
    return;

type = cartUsualString(cart, mud->filterTypeVar, "red");
if (sameString(type, "exclude"))
    isExclude = TRUE;
else if (sameString(type, "include"))
    isExclude = FALSE;
else
    colorIx = getFilterColor(type, MG_BLACK);
type = cartUsualString(cart, mud->logicTypeVar, "and");
andLogic = sameString(type, "and");

/* Make a pass though each filter, and start setting up search for
 * those that have some text. */
for (fil = mud->filterList; fil != NULL; fil = fil->next)
    {
    fil->pattern = cartUsualString(cart, fil->key, "");
    if (fil->pattern[0] != 0)
	{
	fil->hash = newHash(10);
	}
    }

/* Scan tables id/name tables to build up hash of matching id's. */
for (fil = mud->filterList; fil != NULL; fil = fil->next)
    {
    struct hash *hash = fil->hash;
    int wordIx, wordCount;
    char *words[128];

    if (hash != NULL)
	{
	boolean anyWild;
	char *dupPat = cloneString(fil->pattern);
	wordCount = chopLine(dupPat, words);
	for (wordIx=0; wordIx <wordCount; ++wordIx)
	    {
	    char *pattern = cloneString(words[wordIx]);
	    if (lastChar(pattern) != '*')
		{
		int len = strlen(pattern)+1;
		pattern = needMoreMem(pattern, len, len+1);
		pattern[len-1] = '*';
		}
	    anyWild = (strchr(pattern, '*') != NULL || strchr(pattern, '?') != NULL);
	    touppers(pattern);
	    for(lf = *pLfList; lf != NULL; lf=lf->next)
		{
		char copy[SMALLBUF];
		boolean gotMatch;
		safef(copy, sizeof(copy), "%s", lf->name);
		touppers(copy);
		if (anyWild)
		    gotMatch = wildMatch(pattern, copy);
		else
		    gotMatch = sameString(pattern, copy);
		if (gotMatch)
		    {
		    hashAdd(hash, lf->name, NULL);
		    }
		}
	    freez(&pattern);
	    }
	freez(&dupPat);
	}
    }

/* Scan through linked features coloring and or including/excluding ones that
 * match filter. */
for (lf = *pLfList; lf != NULL; lf = next)
    {
    boolean passed = andLogic;
    next = lf->next;
    for (fil = mud->filterList; fil != NULL; fil = fil->next)
	{
	if (fil->hash != NULL)
	    {
	    if (hashLookup(fil->hash, lf->name) == NULL)
		{
		if (andLogic)
		    passed = FALSE;
		}
	    else
		{
		if (!andLogic)
		    passed = TRUE;
		}
	    }
	}
    if (passed ^ isExclude)
	{
	slAddHead(&newList, lf);
	if (colorIx > 0)
	    lf->filterColor = colorIx;
	}
    else
        {
	slAddHead(&oldList, lf);
	}
    }

slReverse(&newList);
slReverse(&oldList);
if (colorIx > 0)
   {
   /* Draw stuff that passes filter first in full mode, last in dense. */
   if (tg->visibility == tvDense)
       {
       newList = slCat(oldList, newList);
       }
   else
       {
       newList = slCat(newList, oldList);
       }
   }
*pLfList = newList;
tg->limitedVisSet = FALSE;	/* Need to recalculate this after filtering. */

/* Free up hashes, etc. */
for (fil = mud->filterList; fil != NULL; fil = fil->next)
    {
    hashFree(&fil->hash);
    }
}
struct bed *searchStrand(struct hash *sixers, struct cutter *ACGTo[], struct dnaSeq *seq, int startOffset, char strand)
/* Cheesey function that checks a strand for the enzymes after they're put in the hash/array structures.  
   This used to be a part of the matchEnzymes function but I do it twice now. */
{
struct cutter *enz;
struct bed *bedList = NULL;
int seqPos;

if (ACGTo[0] || ACGTo[1] || ACGTo[2] || ACGTo[3] || (sixers->elCount > 0)) 
    {
    for (seqPos = 0; seqPos < seq->size; seqPos++)
	{
	struct cutter *enzList = NULL;
	char sixer[7];
	int bedPos = (strand == '-') ? (seq->size - seqPos) : seqPos;
	if (seq->size - seqPos >= 6)
	    {
	    struct hashEl *el = NULL;
	    sixer[6] = '\0';
	    memcpy(sixer, seq->dna+seqPos, 6);
	    el = hashLookup(sixers, sixer);
	    if (el)
		{	    
		struct bed *add;
		enz = el->val;
		add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand);
		slAddHead(&bedList, add);
		/* Just in case there's another one with the same sequence. */
		while ((el = hashLookupNext(el)))
		    {
		    enz = el->val;
		    add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand);
		    slAddHead(&bedList, add);
		    }
		}
	    }
	/* Use a certain list depending on which letter we're on in the sequence. */
	if (seq->dna[seqPos] == 'A')
	    enzList = ACGTo[0];
	else if (seq->dna[seqPos] == 'C')
	    enzList = ACGTo[1];
	else if (seq->dna[seqPos] == 'G')
	    enzList = ACGTo[2];
	else if (seq->dna[seqPos] == 'T')
	    enzList = ACGTo[3];
	for (enz = enzList; enz != NULL; enz = enz->next)
	    {
	    int enzPos = 0;
	    int seqCurPos = seqPos;	
	    while (enzPos < enz->size && seqCurPos < seq->size && matchingBase(enz->seq[enzPos],seq->dna[seqCurPos]))
		{
		enzPos++; 
		seqCurPos++;
		}
	    if (enzPos == enz->size)
		{
		struct bed *add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand);
		slAddHead(&bedList, add);
		}
	    }
	}
    }
return bedList;
}
Beispiel #26
0
void writeGap(struct gapInfo *gap, struct xaAli *xa, 
    int symStart, int symEnd, char geneStrand, FILE *f)
/* Write out info on one gap to file. */
{
char qStart[totSize+1], qEnd[totSize+1];
char tStart[totSize+1], tEnd[totSize+1];
char hStart[totSize+1], hEnd[totSize+1];
int s, e, size;
int midSize;
int i;
char *threePrime, *fivePrime;
boolean isQgap;

fprintf(f, "%s %s %s hom  %s:%d-%d %c %s:%d-%d %c slide %d\n",
    gapTypeStrings[gap->type], 
    (gap->hasIntronEnds ? " intron" : "!intron"),
    (gap->hasStrongHomology ? "heavy" : "light"),
    gap->query, gap->qStart, gap->qEnd, xa->qStrand,
    gap->target, gap->tStart, gap->tEnd, geneStrand, gap->slideCount);
s = symStart-exSize;
e = symStart + inSize;
if (s < 0)
    s = 0;
size = e-s;

uglyf("s %d size %d e %d totSize %d\n", s, size, e, totSize);

strncpy(qStart, xa->qSym+s, size);
strncpy(tStart, xa->tSym+s, size);
strncpy(hStart, xa->hSym+s, size);
qStart[size] = tStart[size] = hStart[size] = 0;

// uglyf - crashes by here

s = symEnd-inSize;
midSize = s - e;
e = symEnd+exSize;
if (e > xa->symCount)
    e = xa->symCount;
size = e-s;
strncpy(qEnd, xa->qSym+s, size);
strncpy(tEnd, xa->tSym+s, size);
strncpy(hEnd, xa->hSym+s, size);
qEnd[size] = tEnd[size] = hEnd[size] = 0;

if (gap->isRc)
    {
    swapBytes(qStart, qEnd, totSize);
    swapBytes(tStart, tEnd, totSize);
    swapBytes(hStart, hEnd, totSize);
    reverseComplement(qStart, totSize);
    reverseComplement(qEnd, totSize);
    reverseComplement(tStart, totSize);
    reverseComplement(tEnd, totSize);
    reverseBytes(hStart, totSize);
    reverseBytes(hEnd, totSize);
    }

/* Write out ends of gap to file. */
fprintf(f, "%s  ...%d... %s\n", qStart, midSize, qEnd);
fprintf(f, "%s  ...%d... %s\n", tStart, midSize, tEnd);
fprintf(f, "%s  ...%d... %s\n\n", hStart, midSize, hEnd);

/* Add intron ends to consensus sequence histogram. */
if (gap->hasIntronEnds && gap->type == cCodingGap)
    {
    isQgap = (qStart[exSize] == '-');
    if (isQgap)
        {
        fivePrime = tStart;
        threePrime = tEnd;
        }
    else
        {
        fivePrime = qStart;
        threePrime = qEnd;
        }
    if (noInserts(threePrime, totSize) && noInserts(fivePrime, totSize) )
        {
        int *homoCount;
        for (i=0; i<totSize; ++i)
            {
            hist5[i][histIx(fivePrime[i])] += 1;
            hist3[i][histIx(threePrime[i])] += 1;
            }
        ++histCount;
        if (isQgap)
            {
            ++ceOnlyCount;
            homoCount = ceOnlyHomoCount;
            }
        else
            {
            ++cbOnlyCount;
            homoCount = cbOnlyHomoCount;
            }
        ++bothCount;
        for (i=0; i<totSize; ++i)
            {
            if (fivePrime[i] == threePrime[i])
                {
                homoCount[i] += 1;
                bothHomoCount[i] += 1;
                }
            }
        /* Add introns to list. */
            {
            char idBuf[2*intronEndsSize+1];
            struct intronList *il;
            struct hashEl *hel;
            memcpy(idBuf, fivePrime+exSize, intronEndsSize);
            memcpy(idBuf+intronEndsSize, threePrime, intronEndsSize);
            idBuf[ sizeof(idBuf)-1 ] = 0;
            if ((hel = hashLookup(intronHash, idBuf)) != NULL)
                {
                il = hel->val;
                il->count += 1;
                fprintf(f, ">>>%d of set<<<\n", il->count);
                if (il->isQgap != isQgap)
                    {
                    il->onBoth = TRUE;
                    }
                }
            else
                {
                AllocVar(il);
                strcpy(il->ends, idBuf);
                il->count = 1;
                il->isQgap = isQgap;
                slAddHead(&intronList, il);
                hashAdd(intronHash, idBuf, il);
                }    
            }
        }
    else
        {
        static insertCount = 0;
        warn("Skipping intron with flanking inserts %d", ++insertCount);
        }
    }
}
int main(int argc, char *argv[])
{
char *genoListName;
char *cdnaListName;
char *oocFileName;
char *pairFileName;
struct patSpace *patSpace;
long startTime, endTime;
char **genoList;
int genoListSize;
char *genoListBuf;
char **cdnaList;
int cdnaListSize;
char *cdnaListBuf;
char *genoName;
int i;
int estIx = 0;
struct dnaSeq **seqListList = NULL, *seq;
static char hitFileName[512], mergerFileName[512], okFileName[512];
char *outRoot;
struct hash *pairHash;


if ((hostName = getenv("HOST")) == NULL)
    hostName = "";

if (argc != 6)
    usage();

pushWarnHandler(warnHandler);
startTime = clock1000();
dnaUtilOpen();
genoListName = argv[1];
cdnaListName = argv[2];
oocFileName = argv[3];
pairFileName = argv[4];
outRoot = argv[5];

sprintf(hitFileName, "%s.hit", outRoot);
sprintf(mergerFileName, "%s.glu", outRoot);
sprintf(okFileName, "%s.ok", outRoot);

readAllWords(genoListName, &genoList, &genoListSize, &genoListBuf);
readAllWords(cdnaListName, &cdnaList, &cdnaListSize, &cdnaListBuf);
pairHash = makePairHash(pairFileName);

hitOut = mustOpen(hitFileName, "w");
mergerOut = mustOpen(mergerFileName, "w");
seqListList = needMem(genoListSize*sizeof(seqListList[0]) );
fprintf(hitOut, "Pattern space 0.2 cDNA matcher\n");
fprintf(hitOut, "cDNA files: ");
for (i=0; i<cdnaListSize; ++i)
    fprintf(hitOut, " %s", cdnaList[i]);
fprintf(hitOut, "\n");
fprintf(hitOut, "%d genomic files\n", genoListSize);
for (i=0; i<genoListSize; ++i)
    {
    genoName = genoList[i];
    if (!startsWith("//", genoName)  )
        {
        seqListList[i] = seq = faReadAllDna(genoName);
        fprintf(hitOut, "%d els in %s ", slCount(seq), genoList[i]);
        for (; seq != NULL; seq = seq->next)
            fprintf(hitOut, "%d ", seq->size);
        fprintf(hitOut, "\n");
        }
    }

patSpace = makePatSpace(seqListList, genoListSize, 10, oocFileName, 4, 100000);

for (i=0; i<cdnaListSize; ++i)
    {
    FILE *f;
    char *estFileName;
    DNA *dna;
    char *estName;
    int size;
    int c;
    int maxSizeForFuzzyFind = 20000;
    int dotCount = 0;

    estFileName = cdnaList[i];
    if (startsWith("//", estFileName)  )
		continue;

    f = mustOpen(estFileName, "rb");
    while ((c = fgetc(f)) != EOF)
    if (c == '>')
        break;
    printf("%s", cdnaList[i]);
    fflush(stdout);
    while (fastFaReadNext(f, &dna, &size, &estName))
        {
	aliSeqName = estName;
        if (size < maxSizeForFuzzyFind)  /* Some day need to fix this somehow... */
            {
            struct hashEl *hel;
            struct cdnaAliList *calList = NULL;

            hel = hashLookup(pairHash, estName);
            if (hel != NULL)    /* Do pair processing. */
                {
                struct estPair *ep;
                struct seq *thisSeq, *otherSeq;

                ep = hel->val;
                if (hel->name == ep->name3)
                    {
                    thisSeq = &ep->seq3;
                    otherSeq = &ep->seq5;
                    }
                else
                    {
                    thisSeq = &ep->seq5;
                    otherSeq = &ep->seq3;
                    }
                if (otherSeq->dna == NULL)  /* First in pair - need to save sequence. */
                    {
                    thisSeq->size = size;
                    thisSeq->dna = needMem(size);
                    memcpy(thisSeq->dna, dna, size);
                    }
                else                        /* Second in pair - do gluing and free partner. */
                    {
                    char mergedName[64];
                    thisSeq->dna = dna;
                    thisSeq->size = size;
                    sprintf(mergedName, "%s_AND_%s", ep->name5, ep->name3);

                    glueFindOne(patSpace, ep->seq5.dna, ep->seq5.size,
                        '+', '5', ep->name5, &calList);
                    reverseComplement(ep->seq5.dna, ep->seq5.size);
                    glueFindOne(patSpace, ep->seq5.dna, ep->seq5.size,
                        '-', '5', ep->name5, &calList);
                    glueFindOne(patSpace, ep->seq3.dna, ep->seq3.size,
                        '+', '3', ep->name3, &calList);
                    reverseComplement(ep->seq3.dna, ep->seq3.size);
                    glueFindOne(patSpace, ep->seq3.dna, ep->seq3.size,
                        '-', '3', ep->name3, &calList);
                    slReverse(&calList);
                    writeMergers(calList, mergedName, genoList);

                    freez(&otherSeq->dna);
                    thisSeq->dna = NULL;
                    thisSeq->size =otherSeq->size = 0;
                    }
                }
            else
                {
                glueFindOne(patSpace, dna, size, '+', '5', estName, &calList);
                reverseComplement(dna, size);
                glueFindOne(patSpace, dna, size, '-', '5', estName, &calList);
                slReverse(&calList);
                writeMergers(calList, estName, genoList);
                }
            ++estIx;
            if ((estIx & 0xfff) == 0)
                {
                printf(".");
                ++dotCount;
                fflush(stdout);
                }
            }
        }
    printf("\n");
    }
aliSeqName = "";
printf("ffSubmitted %3d ffAccepted %3d ffOkScore %3d ffSolidMatch %2d\n",
    ffSubmitted, ffAccepted, ffOkScore, ffSolidMatch);

endTime = clock1000();

printf("Total time is %4.2f\n", 0.001*(endTime-startTime));

/* Write out file who's presence say's we succeeded */
    {
    FILE *f = mustOpen(okFileName, "w");
    fputs("ok", f);
    fclose(f);
    }

return 0;
}
Beispiel #28
0
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, 
	struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics. */
{
char *row[3];
struct hash *uniqHash = hashNew(0);
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, 3, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
        {
	    errAbort("end (%d) before start (%d) line %d of %s",
	    	end, start, lf->lineIx, lf->fileName);
	}
    ++bedCount;
    totalBases += (end - start);
    if (usage == NULL || differentString(usage->name, chrom))
        {
	if (hashLookup(uniqHash, chrom))
	    {
	    errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
	    	lf->fileName, lf->lineIx);
	    }
	hashAdd(uniqHash, chrom, NULL);
	struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
	if (chromHashEl == NULL)
	    errAbort("%s is not found in chromosome sizes file", chrom);
	int chromSize = ptToInt(chromHashEl->val);
	AllocVar(usage);
	usage->name = cloneString(chrom);
	usage->id = id++;
	usage->size = chromSize;
	slAddHead(&usageList, usage);
	lastStart = -1;
	}
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;
    if (lastStart >= 0)
        {
	int diff = start - lastStart;
	if (diff < minDiff)
	    {
	    if (diff < 0)
		errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
		    lf->fileName, lf->lineIx);
	    minDiff = diff;
	    }
	}
    lastStart = start;
    }
slReverse(&usageList);
*retMinDiff = minDiff;
*retAveSize = (double)totalBases/bedCount;
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}
struct edwQaWigSpot *edwQaWigSpotFromNextRa(struct lineFile *lf, struct raToStructReader *reader)
/* Return next stanza put into an edwQaWigSpot. */
{
enum fields
    {
    spotRatioField,
    enrichmentField,
    basesInGenomeField,
    basesInSpotsField,
    sumSignalField,
    spotSumSignalField,
    };
if (!raSkipLeadingEmptyLines(lf, NULL))
    return NULL;

struct edwQaWigSpot *el;
AllocVar(el);

bool *fieldsObserved = reader->fieldsObserved;
bzero(fieldsObserved, reader->fieldCount);

char *tag, *val;
while (raNextTagVal(lf, &tag, &val, NULL))
    {
    struct hashEl *hel = hashLookup(reader->fieldIds, tag);
    if (hel != NULL)
        {
	int id = ptToInt(hel->val);
	if (fieldsObserved[id])
	     errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName);
	fieldsObserved[id] = TRUE;
	switch (id)
	    {
	    case spotRatioField:
	        {
	        el->spotRatio = sqlDouble(val);
		break;
	        }
	    case enrichmentField:
	        {
	        el->enrichment = sqlDouble(val);
		break;
	        }
	    case basesInGenomeField:
	        {
	        el->basesInGenome = sqlLongLong(val);
		break;
	        }
	    case basesInSpotsField:
	        {
	        el->basesInSpots = sqlLongLong(val);
		break;
	        }
	    case sumSignalField:
	        {
	        el->sumSignal = sqlDouble(val);
		break;
	        }
	    case spotSumSignalField:
	        {
	        el->spotSumSignal = sqlDouble(val);
		break;
	        }
	    default:
	        internalErr();
		break;
	    }
	}
    }

raToStructReaderCheckRequiredFields(reader, lf);
return el;
}
Beispiel #30
0
void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
    char *fileName, int lineIx)
/* Process one row of GFF file (a non-comment line parsed by tabs normally). */
{
struct hashEl *hel;
struct gffLine *gl;

if (wordCount < 8)
    gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
AllocVar(gl);

if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
    {
    struct gffSeqName *el;
    AllocVar(el);
    hel = hashAdd(gff->seqHash, words[0], el);
    el->name = hel->name;
    slAddHead(&gff->seqList, el);
    }
gl->seq = hel->name;

if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
    {
    struct gffSource *el;
    AllocVar(el);
    hel = hashAdd(gff->sourceHash, words[1], el);
    el->name = hel->name;
    slAddHead(&gff->sourceList, el);
    }
gl->source = hel->name;

if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
    {
    struct gffFeature *el;
    AllocVar(el);
    hel = hashAdd(gff->featureHash, words[2], el);
    el->name = hel->name;
    slAddHead(&gff->featureList, el);
    }
gl->feature = hel->name;

if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
   gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");	
gl->start = atoi(words[3])-1 + baseOffset;
gl->end = atoi(words[4]) + baseOffset;
gl->score = atof(words[5]);
gl->strand = words[6][0];
gl->frame = words[7][0];

if (wordCount >= 9)
    {
    if (!gff->typeKnown)
	{
	gff->typeKnown = TRUE;
	gff->isGtf = isGtfGroup(words[8]);
	}
    if (gff->isGtf)
	{
	parseGtfEnd(words[8], gff, gl, fileName, lineIx);
	}
    else
	{
	char *tnName = gffTnName(gl->seq, trimSpaces(words[8]));
	if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
	    {
	    struct gffGroup *group;
	    AllocVar(group);
	    hel = hashAdd(gff->groupHash, tnName, group);
	    group->name = hel->name;
	    group->seq = gl->seq;
	    group->source = gl->source;
	    slAddHead(&gff->groupList, group);
	    }
	gl->group = hel->name;
	}
    }
slAddHead(&gff->lineList, gl);
}