Ejemplo n.º 1
0
struct cdwBamFile *cdwBamFileFromNextRa(struct lineFile *lf, struct raToStructReader *reader)
/* Read the next ra stanza from lf and return it as a newly allocated cdwBamFile.
 * Returns NULL when raSkipLeadingEmptyLines finds nothing more to read.
 * Tags that are not in reader->fieldIds are skipped; a tag seen twice in the
 * same stanza aborts.  Required fields are verified before returning. */
{
enum fields
    {
    isPairedField,
    isSortedByTargetField,
    readCountField,
    readBaseCountField,
    mappedCountField,
    uniqueMappedCountField,
    readSizeMeanField,
    readSizeStdField,
    readSizeMinField,
    readSizeMaxField,
    u4mReadCountField,
    u4mUniquePosField,
    u4mUniqueRatioField,
    targetBaseCountField,
    targetSeqCountField,
    };
struct cdwBamFile *bam;
bool *seen;
char *tag, *val;

if (!raSkipLeadingEmptyLines(lf, NULL))
    return NULL;

AllocVar(bam);

/* Start each stanza with a clean "which fields did we see" table. */
seen = reader->fieldsObserved;
bzero(seen, reader->fieldCount);

while (raNextTagVal(lf, &tag, &val, NULL))
    {
    struct hashEl *match = hashLookup(reader->fieldIds, tag);
    if (match == NULL)
        continue;       /* Tag is not one this reader knows about. */

    int fieldId = ptToInt(match->val);
    if (seen[fieldId])
        errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName);
    seen[fieldId] = TRUE;

    switch (fieldId)
        {
        case isPairedField:
            bam->isPaired = sqlSigned(val);
            break;
        case isSortedByTargetField:
            bam->isSortedByTarget = sqlSigned(val);
            break;
        case readCountField:
            bam->readCount = sqlLongLong(val);
            break;
        case readBaseCountField:
            bam->readBaseCount = sqlLongLong(val);
            break;
        case mappedCountField:
            bam->mappedCount = sqlLongLong(val);
            break;
        case uniqueMappedCountField:
            bam->uniqueMappedCount = sqlLongLong(val);
            break;
        case readSizeMeanField:
            bam->readSizeMean = sqlDouble(val);
            break;
        case readSizeStdField:
            bam->readSizeStd = sqlDouble(val);
            break;
        case readSizeMinField:
            bam->readSizeMin = sqlSigned(val);
            break;
        case readSizeMaxField:
            bam->readSizeMax = sqlSigned(val);
            break;
        case u4mReadCountField:
            bam->u4mReadCount = sqlSigned(val);
            break;
        case u4mUniquePosField:
            bam->u4mUniquePos = sqlSigned(val);
            break;
        case u4mUniqueRatioField:
            bam->u4mUniqueRatio = sqlDouble(val);
            break;
        case targetBaseCountField:
            bam->targetBaseCount = sqlLongLong(val);
            break;
        case targetSeqCountField:
            bam->targetSeqCount = sqlUnsigned(val);
            break;
        default:
            internalErr();
            break;
        }
    }

raToStructReaderCheckRequiredFields(reader, lf);
return bam;
}
Ejemplo n.º 2
0
struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, 
	double forceJoinScore, double weakLevel)
/* Convert a list of items to a list of clusters of items.  This may break up clusters that
 * have weakly linked parts. 
      [                ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   gets transformed into
       [    ]   [    ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   The strategy is to build a rangeTree of coverage, which might look something like so:
      123333211123333211 
   then define cluster ends that exceed the minimum limit, which is either weakLevel
   (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is 
   more than forceJoinScore.  This will go to something like so:
        [---]   [----]   
   Finally the items that are overlapping a cluster are assigned to it.  Note that this
   may mean that an item may be in multiple clusters.
        [ABC]   [ ADE]
   When there are fewer than round(1/weakLevel) items the whole list becomes a single
   cluster spanning all of them.  An empty itemList yields NULL.
 */
{
struct peakCluster *clusterList = NULL;

/* Nothing to cluster.  Without this guard the small-list path below would
 * dereference the NULL list head. */
if (itemList == NULL)
    return NULL;

int easyMax = round(1.0/weakLevel);
int itemCount = slCount(itemList);
if (itemCount < easyMax)
    {
    /* Too few items for the coverage threshold to split anything: one cluster
     * covering the union of all item extents. */
    struct peakItem *item = itemList;
    int chromStart = item->chromStart;
    int chromEnd = item->chromEnd;
    for (item = item->next; item != NULL; item = item->next)
        {
        if (item->chromStart < chromStart) chromStart = item->chromStart;
        if (item->chromEnd > chromEnd) chromEnd = item->chromEnd;
        }
    addCluster(lm, itemList, chromStart, chromEnd, &clusterList);
    }
else
    {
    /* Make up coverage tree. */
    /* NOTE(review): covTree is not released anywhere in this function - confirm
     * whether that is intentional. */
    struct rbTree *covTree = rangeTreeNew();
    struct peakItem *item;
    for (item = itemList; item != NULL; item = item->next)
        rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
    struct range *range, *rangeList = rangeTreeList(covTree);

    /* Figure out maximum coverage. */
    int maxCov = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
        int cov = ptToInt(range->val);
        if (cov > maxCov) maxCov = cov;
        }

    /* Figure coverage threshold. */
    int threshold = round(maxCov * weakLevel);
    if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;

    /* Loop through emitting sections over threshold as clusters.  start/end are
     * coordinates copied from the ranges, so they are ints rather than flags. */
    boolean inRange = FALSE;
    int start = 0, end = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
        int cov = ptToInt(range->val);
        if (cov > threshold)
            {
            if (inRange)
               end = range->end;
            else
               {
               inRange = TRUE;
               start = range->start;
               end = range->end;
               }
            }
        else
            {
            if (inRange)
                {
                addCluster(lm, itemList, start, end, &clusterList);
                inRange = FALSE;
                }
            }
        }
    /* Close a section that runs to the end of the range list. */
    if (inRange)
        addCluster(lm, itemList, start, end, &clusterList);
    }
slReverse(&clusterList);
return clusterList;
}
Ejemplo n.º 3
0
static int bedToGffLines(struct bed *bedList, struct slName *exonFramesList, struct hTableInfo *hti,
			 int fieldCount, char *source, boolean gtf2StopCodons)
/* Translate a (list of) bed into gff and print out.
 * Note that field count (perhaps reduced by bitwise intersection)
 * can in effect override hti.
 * exonFramesList, when given, is walked in step with bedList: the n-th element's
 * name is passed to computeFrames for the n-th bed.  If it runs out before bedList
 * does, the remaining beds are processed with no frames string.
 * Returns the number of bed items processed (0 for an empty list). */
{
if (! bedList)
    return 0;
struct hash *nameHash = newHash(20);
struct bed *bed;
struct slName *exonFrames = exonFramesList;
int i, exonStart, exonEnd;
char txName[256];
int itemCount = 0;
static int namelessIx = 0;   /* Persists across calls so "txN" names stay unique. */

for (bed = bedList;  bed != NULL;  bed = bed->next)
    {
    /* Enforce unique transcript_ids. */
    if (bed->name != NULL)
        {
        struct hashEl *hel = hashLookup(nameHash, bed->name);
        int dupCount = (hel != NULL ? ptToInt(hel->val) : 0);
        if (dupCount > 0)
            {
            /* Seen before: suffix with how many times, and bump the count. */
            safef(txName, sizeof(txName), "%s_dup%d", bed->name, dupCount);
            hel->val = intToPt(dupCount + 1);
            }
        else
            {
            safef(txName, sizeof(txName), "%s", bed->name);
            hashAddInt(nameHash, bed->name, 1);
            }
        }
    else
        safef(txName, sizeof(txName), "tx%d", ++namelessIx);
    if (hti->hasBlocks && hti->hasCDS && fieldCount > 4)
        {
        /* first pass: compute frames, in order dictated by strand. */
        int startIndx = 0, stopIndx = 0;
        char *frames = NULL;
        char *ef = NULL;
        /* Test the cursor, not the list head: the cursor becomes NULL once the
         * frames list is exhausted even though the head is still non-NULL. */
        if (exonFrames != NULL)
            ef = exonFrames->name;
        frames = computeFrames(bed, ef, &startIndx, &stopIndx);

        /* second pass: one exon (possibly CDS, start/stop_codon) per block. */
        for (i=0;  i < bed->blockCount;  i++)
            {
            exonStart = bed->chromStart + bed->chromStarts[i];
            exonEnd = exonStart + bed->blockSizes[i];
            if ((exonStart < bed->thickEnd) && (exonEnd > bed->thickStart))
                {
                int exonCdsStart = max(exonStart, bed->thickStart);
                int exonCdsEnd = min(exonEnd, bed->thickEnd);
                addCdsStartStop(bed, source, exonCdsStart, exonCdsEnd,
                                frames, i, startIndx, stopIndx, gtf2StopCodons, txName);
                }
            addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
            }
        freeMem(frames);
        }
    else if (hti->hasBlocks && fieldCount > 4)
        {
        /* Blocks but no CDS: one exon line per block. */
        for (i=0;  i < bed->blockCount;  i++)
            {
            exonStart = bed->chromStart + bed->chromStarts[i];
            exonEnd = exonStart + bed->blockSizes[i];
            addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
            }
        }
    else if (hti->hasCDS && fieldCount > 4)
        {
        /* CDS but no blocks: split the item into exon / CDS / exon around the
         * thick region.  An all-zero thick region is treated as empty at chromStart. */
        if (bed->thickStart == 0 && bed->thickEnd == 0)
            bed->thickStart = bed->thickEnd = bed->chromStart;
        if (bed->thickStart > bed->chromStart)
            {
            addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->thickStart, '.', txName);
            }
        if (bed->thickEnd > bed->thickStart)
            addGffLineFromBed(bed, source, "CDS", bed->thickStart, bed->thickEnd, '0', txName);
        if (bed->thickEnd < bed->chromEnd)
            {
            addGffLineFromBed(bed, source, "exon", bed->thickEnd, bed->chromEnd, '.', txName);
            }
        }
    else
        {
        addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->chromEnd, '.', txName);
        }
    itemCount++;
    if (exonFrames)
        exonFrames = exonFrames->next;
    }
hashFree(&nameHash);
return itemCount;
}
Ejemplo n.º 4
0
struct cdwQaPairedEndFastq *cdwQaPairedEndFastqFromNextRa(struct lineFile *lf, struct raToStructReader *reader)
/* Read the next ra stanza from lf and return it as a newly allocated
 * cdwQaPairedEndFastq.  Returns NULL when raSkipLeadingEmptyLines finds nothing
 * more to read.  Tags not in reader->fieldIds are skipped; a repeated tag aborts.
 * Required fields are verified before returning. */
{
enum fields
    {
    fileId1Field,
    concordanceField,
    distanceMeanField,
    distanceStdField,
    distanceMinField,
    distanceMaxField,
    };
struct cdwQaPairedEndFastq *pair;
bool *seen;
char *tag, *val;

if (!raSkipLeadingEmptyLines(lf, NULL))
    return NULL;

AllocVar(pair);

/* Start each stanza with a clean "which fields did we see" table. */
seen = reader->fieldsObserved;
bzero(seen, reader->fieldCount);

while (raNextTagVal(lf, &tag, &val, NULL))
    {
    struct hashEl *match = hashLookup(reader->fieldIds, tag);
    if (match == NULL)
        continue;       /* Tag is not one this reader knows about. */

    int fieldId = ptToInt(match->val);
    if (seen[fieldId])
        errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName);
    seen[fieldId] = TRUE;

    switch (fieldId)
        {
        case fileId1Field:
            pair->fileId1 = sqlUnsigned(val);
            break;
        case concordanceField:
            pair->concordance = sqlDouble(val);
            break;
        case distanceMeanField:
            pair->distanceMean = sqlDouble(val);
            break;
        case distanceStdField:
            pair->distanceStd = sqlDouble(val);
            break;
        case distanceMinField:
            pair->distanceMin = sqlDouble(val);
            break;
        case distanceMaxField:
            pair->distanceMax = sqlDouble(val);
            break;
        default:
            internalErr();
            break;
        }
    }

raToStructReaderCheckRequiredFields(reader, lf);
return pair;
}
void bioImageLoad(char *setRaFile, char *itemTabFile)
/* bioImageLoad - Load data into bioImage database.
 * setRaFile is read with raReadSingle into a hash of settings.  itemTabFile is a
 * tab-separated file whose first line must start with '#' and name the columns;
 * every following line produces one row inserted into the image table.
 * Aborts (errAbort) if the tab file is empty or lacks the '#' header, if it has
 * too many columns, if a required field is missing, or if an image with the same
 * fileName and fullLocation already exists and the externally defined 'replace'
 * flag is false. */
{
struct hash *raHash = raReadSingle(setRaFile);
struct hash *rowHash;
struct lineFile *lf = lineFileOpen(itemTabFile, TRUE);
char *line, *words[256];
struct sqlConnection *conn = sqlConnect(database);
int rowSize;
int submissionSetId;
/* One cache hash per lookup, passed to cachedId / hashLookup below. */
struct hash *fullDirHash = newHash(0);
struct hash *screenDirHash = newHash(0);
struct hash *thumbDirHash = newHash(0);
struct hash *treatmentHash = newHash(0);
struct hash *bodyPartHash = newHash(0);
struct hash *sliceTypeHash = newHash(0);
struct hash *imageTypeHash = newHash(0);
struct hash *sectionSetHash = newHash(0);
struct dyString *dy = dyStringNew(0);

/* Read first line of tab file, and from it get all the field names. */
if (!lineFileNext(lf, &line, NULL))
    errAbort("%s appears to be empty", lf->fileName);
if (line[0] != '#')
    errAbort("First line of %s needs to start with #, and then contain field names",
    	lf->fileName);
rowHash = hashRowOffsets(line+1);
rowSize = rowHash->elCount;
/* words[] is the row buffer used below; reject headers it cannot hold. */
if (rowSize >= ArraySize(words))
    errAbort("Too many fields in %s", lf->fileName);

/* Check that have all required fields */
    {
    char *fieldName;
    int i;

    /* Fields that must be in the ra file. */
    for (i=0; i<ArraySize(requiredSetFields); ++i)
        {
	fieldName = requiredSetFields[i];
	if (!hashLookup(raHash, fieldName))
	    errAbort("Field %s is not in %s", fieldName, setRaFile);
	}

    /* Fields that must be columns of the tab file. */
    for (i=0; i<ArraySize(requiredItemFields); ++i)
        {
	fieldName = requiredItemFields[i];
	if (!hashLookup(rowHash, fieldName))
	    errAbort("Field %s is not in %s", fieldName, itemTabFile);
	}

    /* Fields that may come from either file. */
    for (i=0; i<ArraySize(requiredFields); ++i)
        {
	fieldName = requiredFields[i];
	if (!hashLookup(rowHash, fieldName) && !hashLookup(raHash, fieldName))
	    errAbort("Field %s is not in %s or %s", fieldName, setRaFile, itemTabFile);
	}
    }

/* Create/find submission record. */
submissionSetId = saveSubmissionSet(conn, raHash);

/* Process rest of tab file. */
while (lineFileNextRowTab(lf, words, rowSize))
    {
    /* Resolve each named value to a numeric id through cachedId. */
    int fullDir = cachedId(conn, "location", "name", 
    	fullDirHash, "fullDir", raHash, rowHash, words);
    int screenDir = cachedId(conn, "location", "name", 
    	screenDirHash, "screenDir", raHash, rowHash, words);
    int thumbDir = cachedId(conn, "location", 
    	"name", thumbDirHash, "thumbDir", raHash, rowHash, words);
    int bodyPart = cachedId(conn, "bodyPart", 
    	"name", bodyPartHash, "bodyPart", raHash, rowHash, words);
    int sliceType = cachedId(conn, "sliceType", 
    	"name", sliceTypeHash, "sliceType", raHash, rowHash, words);
    int imageType = cachedId(conn, "imageType", 
    	"name", imageTypeHash, "imageType", raHash, rowHash, words);
    int treatment = cachedId(conn, "treatment", 
    	"conditions", treatmentHash, "treatment", raHash, rowHash, words);
    /* Plain string values; the last getVal argument is presumably the default
     * used when the field is absent (NULL for the required ones) - TODO confirm. */
    char *fileName = getVal("fileName", raHash, rowHash, words, NULL);
    char *submitId = getVal("submitId", raHash, rowHash, words, NULL);
    char *taxon = getVal("taxon", raHash, rowHash, words, NULL);
    char *isEmbryo = getVal("isEmbryo", raHash, rowHash, words, NULL);
    char *age = getVal("age", raHash, rowHash, words, NULL);
    char *sectionSet = getVal("sectionSet", raHash, rowHash, words, "");
    char *sectionIx = getVal("sectionIx", raHash, rowHash, words, "0");
    char *gene = getVal("gene", raHash, rowHash, words, "");
    char *locusLink = getVal("locusLink", raHash, rowHash, words, "");
    char *refSeq = getVal("refSeq", raHash, rowHash, words, "");
    char *genbank = getVal("genbank", raHash, rowHash, words, "");
    char *priority = getVal("priority", raHash, rowHash, words, "200");
    int sectionId = 0;
    int oldId;
    // char *xzy = getVal("xzy", raHash, rowHash, words, xzy);

    /* Rows with the same non-empty, non-"0" sectionSet string share one
     * sectionSet id; the first time a string is seen a new sectionSet row is
     * inserted and its auto id remembered.  Otherwise sectionId stays 0. */
    if (sectionSet[0] != 0 && !sameString(sectionSet, "0"))
        {
	struct hashEl *hel = hashLookup(sectionSetHash, sectionSet);
	if (hel != NULL)
	    sectionId = ptToInt(hel->val);
	else
	    {
	    sqlUpdate(conn, "insert into sectionSet values(default)");
	    sectionId = sqlLastAutoId(conn);
	    hashAdd(sectionSetHash, sectionSet, intToPt(sectionId));
	    }
	}

    /* See whether this file/location pair is already loaded. */
    /* NOTE(review): values from the input files are pasted straight into the SQL
     * text here and in the insert below, some unquoted - presumably the inputs
     * are trusted; confirm or escape. */
    dyStringClear(dy);
    dyStringAppend(dy, "select id from image ");
    dyStringPrintf(dy, "where fileName = '%s' ", fileName);
    dyStringPrintf(dy, "and fullLocation = %d",  fullDir);
    oldId = sqlQuickNum(conn, dy->string);
    if (oldId != 0)
        {
	/* Existing row: delete it when replacing, otherwise refuse to continue. */
	if (replace)
	    {
	    dyStringClear(dy);
	    dyStringPrintf(dy, "delete from image where id = %d", oldId);
	    sqlUpdate(conn, dy->string);
	    }
	else
	    errAbort("%s is already in database line %d of %s", 
	    	fileName, lf->lineIx, lf->fileName);
	}

    /* Build and run the insert for this image. */
    dyStringClear(dy);
    dyStringAppend(dy, "insert into image set\n");
    dyStringPrintf(dy, " id = default,\n");
    dyStringPrintf(dy, " fileName = '%s',\n", fileName);
    dyStringPrintf(dy, " fullLocation = %d,\n", fullDir);
    dyStringPrintf(dy, " screenLocation = %d,\n", screenDir);
    dyStringPrintf(dy, " thumbLocation = %d,\n", thumbDir);
    dyStringPrintf(dy, " submissionSet = %d,\n", submissionSetId);
    dyStringPrintf(dy, " sectionSet = %d,\n", sectionId);
    dyStringPrintf(dy, " sectionIx = %s,\n", sectionIx);
    dyStringPrintf(dy, " submitId = '%s',\n", submitId);
    dyStringPrintf(dy, " gene = '%s',\n", gene);
    dyStringPrintf(dy, " locusLink = '%s',\n", locusLink);
    dyStringPrintf(dy, " refSeq = '%s',\n", refSeq);
    dyStringPrintf(dy, " genbank = '%s',\n", genbank);
    dyStringPrintf(dy, " priority = %s,\n", priority);
    dyStringPrintf(dy, " taxon = %s,\n", taxon);
    dyStringPrintf(dy, " isEmbryo = %s,\n", isEmbryo);
    dyStringPrintf(dy, " age = %s,\n", age);
    dyStringPrintf(dy, " bodyPart = %d,\n", bodyPart);
    dyStringPrintf(dy, " sliceType = %d,\n", sliceType);
    dyStringPrintf(dy, " imageType = %d,\n", imageType);
    dyStringPrintf(dy, " treatment = %d\n", treatment);

    sqlUpdate(conn, dy->string);
    }
/* NOTE(review): lf, conn, dy and the hashes are not released in this function. */
}
Ejemplo n.º 6
0
void bedItemOverlapCount(struct hash *chromHash, char *infile, char *outfile)
/* Count, for every base, how many items of the bed file infile cover it, and
 * write the per-chromosome counts to outfile through outputCounts.
 * chromHash maps chromosome name to its size.  The input must be grouped by
 * chromosome: meeting a chromosome again after moving on from it aborts.
 * Items running past the chromosome end abort, except that a non-bed12 item on
 * chrM starting inside the chromosome is wrapped around (circular chrM). */
{
unsigned maxChromSize = 0;
unitSize *counts = (unitSize *)NULL;
FILE *f = mustOpen(outfile, "w");
struct hashCookie hc = hashFirst(chromHash);
struct hashEl *hel;

/* Find the largest chromosome so one count array fits them all. */
while ((hel = hashNext(&hc)) != NULL)
    {
    unsigned num = (unsigned) ptToInt(hel->val);
    maxChromSize = max(num, maxChromSize);
    }
verbose(2,"#\tmaxChromSize: %u\n", maxChromSize);
if (maxChromSize < 1)
    errAbort("maxChromSize is zero ?");

/*	Allocate just once for the largest chrom and reuse this array */
counts = needHugeMem(sizeof(unitSize) * maxChromSize);

/*	Reset the array to be zero to be reused */
memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize);

unsigned chromSize = 0;
char *prevChrom = (char *)NULL;   /* Chromosome currently being counted, or NULL. */
boolean outputToDo = FALSE;       /* TRUE when counts holds data not yet written. */
struct hash *seenHash = newHash(5);

struct lineFile *bf = lineFileOpen(infile , TRUE);
struct bed *bed = (struct bed *)NULL;
char *row[12];
int numFields = doBed12 ? 12 : 3;

while (lineFileNextRow(bf,row, numFields))
    {
    int i;
    bed = bedLoadN(row, numFields);

    verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);

    if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
        {
        verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
        if (outputToDo)
            outputCounts(counts, prevChrom, chromSize, f);
        outputToDo = FALSE;
        memset((void *)counts, 0,
            sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */
        freez(&prevChrom); 
        // prevChrom is now NULL so it will be caught by next if!
        }
    if ((char *)NULL == prevChrom)  // begin a chr
        {
        if (hashLookup(seenHash, bed->chrom))
            errAbort("ERROR:input file not sorted. %s seen before on line %d\n",
                bed->chrom, bf->lineIx);

        hashAdd(seenHash, bed->chrom, NULL);
        prevChrom = cloneString(bed->chrom);
        chromSize = hashIntVal(chromHash, prevChrom);
        verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize);
        }
    if (bed->chromEnd > chromSize)
        {
        // check for circular chrM
        if (doBed12 || bed->chromStart>=chromSize 
            || differentWord(bed->chrom,"chrM")) 
            {
            warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
            bed->chromEnd);
            errAbort("chromEnd > chromSize ?  %d > %d", 
                bed->chromEnd,chromSize);
            }

        /* Wrap around: count to the end, then the overhang from position 0. */
        for (i = bed->chromStart; i < chromSize; ++i)
            INCWOVERFLOW(counts,i);
        for (i = 0; i < (bed->chromEnd - chromSize); ++i)
            INCWOVERFLOW(counts,i);
        }
    else if (doBed12)
        {
        /* Count only the bases inside each block. */
        int *starts = bed->chromStarts;
        int *sizes = bed->blockSizes;
        int *endStarts = &bed->chromStarts[bed->blockCount];

        for(; starts < endStarts; starts++, sizes++)
            {
            unsigned int end = *starts + *sizes + bed->chromStart;
            for (i = *starts + bed->chromStart; i < end; ++i)
                INCWOVERFLOW(counts,i);
            }
        }
    else
        {
        for (i = bed->chromStart; i < bed->chromEnd; ++i)
            INCWOVERFLOW(counts, i);
        }
    outputToDo = TRUE;
    bedFree(&bed); // plug the memory leak
    }

lineFileClose(&bf);

/* Flush the last chromosome. */
if (outputToDo)
    outputCounts(counts, prevChrom, chromSize, f);

if (doOutBounds)
    fprintf(stderr, "min %lu max %lu\n", (unsigned long)overMin, (unsigned long)overMax);

/* prevChrom is still NULL when the input had no rows; do not hand NULL to %s. */
if (prevChrom != NULL)
    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
carefulClose(&f);
freeMem(counts);
freez(&prevChrom);
// hashFreeWithVals(&chromHash, freez);
freeHash(&seenHash);
}
Ejemplo n.º 7
0
struct edwFastqFile *edwFastqFileFromNextRa(struct lineFile *lf, struct raToStructReader *reader)
/* Read the next ra stanza from lf and return it as a newly allocated edwFastqFile.
 * Returns NULL when raSkipLeadingEmptyLines finds nothing more to read.
 * Tags not in reader->fieldIds are skipped; a repeated tag aborts.
 * readSizeMax and each per-position array are passed through
 * raToStructArraySignedSizer together with &fq->readSizeMax.
 * Required fields are verified before returning. */
{
enum fields
    {
    sampleCountField,
    basesInSampleField,
    readCountField,
    baseCountField,
    readSizeMeanField,
    readSizeStdField,
    readSizeMinField,
    readSizeMaxField,
    qualMeanField,
    qualStdField,
    qualMinField,
    qualMaxField,
    qualTypeField,
    qualZeroField,
    atRatioField,
    aRatioField,
    cRatioField,
    gRatioField,
    tRatioField,
    nRatioField,
    qualPosField,
    aAtPosField,
    cAtPosField,
    gAtPosField,
    tAtPosField,
    nAtPosField,
    };
struct edwFastqFile *fq;
bool *seen;
char *tag, *val;

if (!raSkipLeadingEmptyLines(lf, NULL))
    return NULL;

AllocVar(fq);

/* Start each stanza with a clean "which fields did we see" table. */
seen = reader->fieldsObserved;
bzero(seen, reader->fieldCount);

while (raNextTagVal(lf, &tag, &val, NULL))
    {
    struct hashEl *match = hashLookup(reader->fieldIds, tag);
    if (match == NULL)
        continue;       /* Tag is not one this reader knows about. */

    int fieldId = ptToInt(match->val);
    if (seen[fieldId])
        errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName);
    seen[fieldId] = TRUE;

    switch (fieldId)
        {
        /* Scalar fields. */
        case sampleCountField:
            fq->sampleCount = sqlLongLong(val);
            break;
        case basesInSampleField:
            fq->basesInSample = sqlLongLong(val);
            break;
        case readCountField:
            fq->readCount = sqlLongLong(val);
            break;
        case baseCountField:
            fq->baseCount = sqlLongLong(val);
            break;
        case readSizeMeanField:
            fq->readSizeMean = sqlDouble(val);
            break;
        case readSizeStdField:
            fq->readSizeStd = sqlDouble(val);
            break;
        case readSizeMinField:
            fq->readSizeMin = sqlSigned(val);
            break;
        case readSizeMaxField:
            {
            int count = sqlSigned(val);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "readSizeMax");
            break;
            }
        case qualMeanField:
            fq->qualMean = sqlDouble(val);
            break;
        case qualStdField:
            fq->qualStd = sqlDouble(val);
            break;
        case qualMinField:
            fq->qualMin = sqlDouble(val);
            break;
        case qualMaxField:
            fq->qualMax = sqlDouble(val);
            break;
        case qualTypeField:
            fq->qualType = cloneString(val);
            break;
        case qualZeroField:
            fq->qualZero = sqlSigned(val);
            break;
        case atRatioField:
            fq->atRatio = sqlDouble(val);
            break;
        case aRatioField:
            fq->aRatio = sqlDouble(val);
            break;
        case cRatioField:
            fq->cRatio = sqlDouble(val);
            break;
        case gRatioField:
            fq->gRatio = sqlDouble(val);
            break;
        case tRatioField:
            fq->tRatio = sqlDouble(val);
            break;
        case nRatioField:
            fq->nRatio = sqlDouble(val);
            break;

        /* Array fields: parse the list, then hand its length to the sizer. */
        case qualPosField:
            {
            int count;
            sqlDoubleDynamicArray(val, &fq->qualPos, &count);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "qualPos");
            break;
            }
        case aAtPosField:
            {
            int count;
            sqlDoubleDynamicArray(val, &fq->aAtPos, &count);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "aAtPos");
            break;
            }
        case cAtPosField:
            {
            int count;
            sqlDoubleDynamicArray(val, &fq->cAtPos, &count);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "cAtPos");
            break;
            }
        case gAtPosField:
            {
            int count;
            sqlDoubleDynamicArray(val, &fq->gAtPos, &count);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "gAtPos");
            break;
            }
        case tAtPosField:
            {
            int count;
            sqlDoubleDynamicArray(val, &fq->tAtPos, &count);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "tAtPos");
            break;
            }
        case nAtPosField:
            {
            int count;
            sqlDoubleDynamicArray(val, &fq->nAtPos, &count);
            raToStructArraySignedSizer(lf, count, &fq->readSizeMax, "nAtPos");
            break;
            }
        default:
            internalErr();
            break;
        }
    }

raToStructReaderCheckRequiredFields(reader, lf);
return fq;
}
Ejemplo n.º 8
0
void encodeExpToTab(char *outExp, char *outSeries, char *outResults)
/* encodeExpToCvDb - Convert encode experiments table to a table more suitable for cvDb.
 * Writes one tab-separated line per experiment that has an accession to outExp,
 * the series built from metaDb composites to outSeries, and the metaDb objects
 * as results to outResults. */
{
struct hash *optFieldHash = optionalFieldsHash();
struct mdbObj *mdbList = getMdbList(metaDbs, ArraySize(metaDbs));
struct hash *mdbHash = mdbHashKeyedByExpId(mdbList);
struct hash *seriesHash = hashNew(0);
struct series *seriesList = NULL;
verbose(1, "read %d mdb objects from %s in %d databases\n", mdbHash->elCount, metaTable, 
    (int)ArraySize(metaDbs));
struct sqlConnection *expDbConn = sqlConnect(expDb);
struct sqlConnection *cvDbConn = sqlConnect(cvDb);
char query[256];
sqlSafef(query, sizeof(query), "select * from %s", expTable);
struct sqlResult *expRows = sqlGetResult(expDbConn, query);
FILE *expOut = mustOpen(outExp, "w");
char **row;

while ((row = sqlNextRow(expRows)) != NULL)
    {
    struct encodeExp *ee = encodeExpLoad(row);

    /* One slot per optional column, holding the cvDb id of its term; 0 means
     * "nothing found yet". */
    int optColCount = ArraySize(expOptionalFields);
    int optCol[optColCount];
    int i;
    for (i=0; i<optColCount; ++i)
        optCol[i] = 0;

    /* Pass 1: the "a=aVal b=bVal" pairs stored in encodeExp.expVars. */
    struct slPair *varList = slPairListFromString(ee->expVars, TRUE);
    struct slPair *var;
    for (var = varList; var != NULL; var = var->next)
        {
        char *table = var->name;
        char *term = var->val;

        /* Antibody special case: these terms live in the control table. */
        if (sameString(table, "antibody")
            && (sameString(term, "Control") || sameString(term, "Input") 
                || sameString(term, "RevXlinkChromatin") || sameString(term, "ripInput")))
            table = "control";

        struct hashEl *slot = hashLookup(optFieldHash, table);
        if (slot == NULL)
            {
            verbose(2, "%s %s ?\n", table, term);
            continue;
            }
        int id = lookupId(cvDbConn, table, term);
        if (id == 0)
            {
            warn("No id in cvDb for %s=%s\n", table, term);
            continue;
            }
        optCol[ptToInt(slot->val)] = id;
        }

    /* Pass 2: metaDb, which has some info encodeExp does not. */
    char *composite = NULL;
    char ixAsString[16];
    safef(ixAsString, sizeof(ixAsString), "%d", ee->ix);
    struct mdbObj *mdb = hashFindVal(mdbHash, ixAsString);
    if (mdb != NULL)
        {
        struct mdbVar *v;
        for (v = mdb->vars; v != NULL; v = v->next)
            {
            /* Map metaDb variable names onto table names where they differ. */
            char *table = v->var;
            char *term = v->val;
            if (sameString(table, "antibody")) 
                table = "ab";
            else if (sameString(table, "grant"))
                table = "grantee";

            /* Remember the composite term; it drives the series record below. */
            if (sameString("composite", table))
                composite = term;

            struct hashEl *slot = hashLookup(optFieldHash, table);
            if (slot == NULL)
                continue;
            int optColIx = ptToInt(slot->val);
            if (optCol[optColIx] != 0)
                continue;       /* Only use mdb if encodeExp has no data. */
            optCol[optColIx] = lookupId(cvDbConn, table, term);
            }
        }

    /* First time a composite is seen, make up a series record for it. */
    if (composite != NULL)
        {
        assert(mdb != NULL);
        if (hashFindVal(seriesHash, composite) == NULL)
            {
            struct series *series = seriesFromMdb(mdb, composite);
            hashAdd(seriesHash, composite, series);
            slAddHead(&seriesList, series);
            }
        }

    if (ee->accession == NULL)
        continue;       /* Experiments without an accession are not written. */

    /* Required fields.  Order here needs to follow order in expRequiredFields. */
    fprintf(expOut, "%u", ee->ix);
    fprintf(expOut, "\t%s", ee->updateTime);
    fprintf(expOut, "\t%s", naForNull(composite));
    fprintf(expOut, "\t%s", ee->accession);
    fprintf(expOut, "\t%d", lookupId(cvDbConn, "organism", ee->organism));
    fprintf(expOut, "\t%d", lookupId(cvDbConn, "lab", ee->lab));
    fprintf(expOut, "\t%d", lookupId(cvDbConn, "dataType", ee->dataType));
    fprintf(expOut, "\t%d", lookupId(cvDbConn, "cellType", ee->cellType));

    /* Optional fields, then end of record. */
    for (i=0; i<optColCount; ++i)
        fprintf(expOut, "\t%d", optCol[i]);
    fprintf(expOut, "\n");
    }

/* Series were added at the head, so reverse before writing to a separate file. */
slReverse(&seriesList);
writeSeriesList(outSeries, seriesList);

/* Write out results to a separate file. */
writeMdbListAsResults(mdbList, outResults);

/* Clean up and go home. */
carefulClose(&expOut);
sqlFreeResult(&expRows);
sqlDisconnect(&expDbConn);
sqlDisconnect(&cvDbConn);
}
Ejemplo n.º 9
0
int findSize(struct hash *hash, char *name)
/* Return the integer stored under name in hash.  The lookup goes through
 * hashMustFindVal, so a missing name does not return normally. */
{
return ptToInt(hashMustFindVal(hash, name));
}
Ejemplo n.º 10
0
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, 
	struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount, boolean tabSep)
/* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
 * collect max field sizes there too.
 * Rows are split on tabs when tabSep is set, otherwise with lineFileChopNext.
 * Returns one bbiChromUsage per chromosome, in file order.  *retMinDiff gets the
 * smallest gap between consecutive starts on a chromosome (BIGNUM if there was none),
 * *retAveSize the mean item size and *retBedCount the number of items.
 * Aborts if the file is not sorted by chromosome then start, if an item ends before
 * it starts or past the chromosome end, or if a chromosome is not in chromSizesHash. */
{
int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
/* chrom, start and end are always read from row[0..2], so the row buffer needs
 * at least three slots whatever the highest indexed field is. */
if (maxRowSize < 3)
    maxRowSize = 3;
char *row[maxRowSize];
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;     /* Start of previous item on this chromosome, -1 if none. */
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = 0;

    if (tabSep)
        rowSize = lineFileChopCharNext(lf, '\t', row, maxRowSize);
    else
        rowSize = lineFileChopNext(lf, row, maxRowSize);
    if (rowSize == 0)
        break;
    lineFileExpectAtLeast(lf, maxRowSize, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (eim != NULL)
        bbExIndexMakerUpdateMaxFieldSize(eim, row);
    if (start > end)
        {
        errAbort("end (%d) before start (%d) line %d of %s",
            end, start, lf->lineIx, lf->fileName);
        }
    ++bedCount;
    totalBases += (end - start);
    if (usage == NULL || differentString(usage->name, chrom))
        {
        /* make sure chrom names are sorted in ASCII order */
        if ((usage != NULL) && strcmp(usage->name, chrom) > 0)
            {
            errAbort("%s is not case-sensitive sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" with LC_COLLATE=C,  or bedSort and try again.",
                lf->fileName, lf->lineIx);
            }
        /* New chromosome: look up its size and start a fresh usage record. */
        struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
        if (chromHashEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);
        int chromSize = ptToInt(chromHashEl->val);
        AllocVar(usage);
        usage->name = cloneString(chrom);
        usage->id = id++;
        usage->size = chromSize;
        slAddHead(&usageList, usage);
        lastStart = -1;
        }
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;
    if (lastStart >= 0)
        {
        /* Track the smallest start-to-start gap; a negative gap means unsorted. */
        int diff = start - lastStart;
        if (diff < minDiff)
            {
            if (diff < 0)
                errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                    lf->fileName, lf->lineIx);
            minDiff = diff;
            }
        }
    lastStart = start;
    }
slReverse(&usageList);
double aveSize = 0;
if (bedCount > 0)
    aveSize = (double)totalBases/bedCount;
*retMinDiff = minDiff;
*retAveSize = aveSize;
*retBedCount = bedCount;
return usageList;
}