struct cdwBamFile *cdwBamFileFromNextRa(struct lineFile *lf, struct raToStructReader *reader) /* Return next stanza put into an cdwBamFile. */ { enum fields { isPairedField, isSortedByTargetField, readCountField, readBaseCountField, mappedCountField, uniqueMappedCountField, readSizeMeanField, readSizeStdField, readSizeMinField, readSizeMaxField, u4mReadCountField, u4mUniquePosField, u4mUniqueRatioField, targetBaseCountField, targetSeqCountField, }; if (!raSkipLeadingEmptyLines(lf, NULL)) return NULL; struct cdwBamFile *el; AllocVar(el); bool *fieldsObserved = reader->fieldsObserved; bzero(fieldsObserved, reader->fieldCount); char *tag, *val; while (raNextTagVal(lf, &tag, &val, NULL)) { struct hashEl *hel = hashLookup(reader->fieldIds, tag); if (hel != NULL) { int id = ptToInt(hel->val); if (fieldsObserved[id]) errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName); fieldsObserved[id] = TRUE; switch (id) { case isPairedField: { el->isPaired = sqlSigned(val); break; } case isSortedByTargetField: { el->isSortedByTarget = sqlSigned(val); break; } case readCountField: { el->readCount = sqlLongLong(val); break; } case readBaseCountField: { el->readBaseCount = sqlLongLong(val); break; } case mappedCountField: { el->mappedCount = sqlLongLong(val); break; } case uniqueMappedCountField: { el->uniqueMappedCount = sqlLongLong(val); break; } case readSizeMeanField: { el->readSizeMean = sqlDouble(val); break; } case readSizeStdField: { el->readSizeStd = sqlDouble(val); break; } case readSizeMinField: { el->readSizeMin = sqlSigned(val); break; } case readSizeMaxField: { el->readSizeMax = sqlSigned(val); break; } case u4mReadCountField: { el->u4mReadCount = sqlSigned(val); break; } case u4mUniquePosField: { el->u4mUniquePos = sqlSigned(val); break; } case u4mUniqueRatioField: { el->u4mUniqueRatio = sqlDouble(val); break; } case targetBaseCountField: { el->targetBaseCount = sqlLongLong(val); break; } case targetSeqCountField: { el->targetSeqCount = 
sqlUnsigned(val); break; } default: internalErr(); break; } } } raToStructReaderCheckRequiredFields(reader, lf); return el; }
struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, double forceJoinScore, double weakLevel) /* Convert a list of items to a list of clusters of items. This may break up clusters that * have weakly linked parts. [ ] AAAAAAAAAAAAAAAAAA BBBBBB DDDDDD CCCC EEEE gets tranformed into [ ] [ ] AAAAAAAAAAAAAAAAAA BBBBBB DDDDDD CCCC EEEE The strategy is to build a rangeTree of coverage, which might look something like so: 123333211123333211 then define cluster ends that exceed the minimum limit, which is either weakLevel (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is more than forceJoinScore. This will go to something like so: [---] [----] Finally the items that are overlapping a cluster are assigned to it. Note that this may mean that an item may be in multiple clusters. [ABC] [ ADE] */ { int easyMax = round(1.0/weakLevel); int itemCount = slCount(itemList); struct peakCluster *clusterList = NULL; if (itemCount < easyMax) { struct peakItem *item = itemList; int chromStart = item->chromStart; int chromEnd = item->chromEnd; for (item = item->next; item != NULL; item = item->next) { if (item->chromStart < chromStart) chromStart = item->chromStart; if (item->chromEnd > chromEnd) chromEnd = item->chromEnd; } addCluster(lm, itemList, chromStart, chromEnd, &clusterList); } else { /* Make up coverage tree. */ struct rbTree *covTree = rangeTreeNew(); struct peakItem *item; for (item = itemList; item != NULL; item = item->next) rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd); struct range *range, *rangeList = rangeTreeList(covTree); /* Figure out maximum coverage. */ int maxCov = 0; for (range = rangeList; range != NULL; range = range->next) { int cov = ptToInt(range->val); if (cov > maxCov) maxCov = cov; } /* Figure coverage threshold. 
*/ int threshold = round(maxCov * weakLevel); if (threshold > forceJoinScore-1) threshold = forceJoinScore-1; /* Loop through emitting sections over threshold as clusters */ boolean inRange = FALSE; boolean start = 0, end = 0; for (range = rangeList; range != NULL; range = range->next) { int cov = ptToInt(range->val); if (cov > threshold) { if (inRange) end = range->end; else { inRange = TRUE; start = range->start; end = range->end; } } else { if (inRange) { addCluster(lm, itemList, start, end, &clusterList); inRange = FALSE; } } } if (inRange) addCluster(lm, itemList, start, end, &clusterList); } slReverse(&clusterList); return clusterList; }
static int bedToGffLines(struct bed *bedList, struct slName *exonFramesList,
        struct hTableInfo *hti, int fieldCount, char *source, boolean gtf2StopCodons)
/* Translate a (list of) bed into gff and print out.
 * Note that field count (perhaps reduced by bitwise intersection)
 * can in effect override hti.
 *   bedList        - beds to translate; NULL yields 0.
 *   exonFramesList - optional list walked in step with bedList; each element's name
 *                    is handed to computeFrames() for the matching bed.  May be NULL
 *                    or shorter than bedList, in which case the unmatched beds get
 *                    a NULL frames string.
 *   hti            - hasBlocks/hasCDS select which kind of lines are produced.
 *   fieldCount     - block and CDS handling is used only when this exceeds 4.
 *   source         - passed through to the line-writing helpers.
 *   gtf2StopCodons - passed through to addCdsStartStop().
 * Returns the number of beds processed. */
{
if (! bedList)
    return 0;
struct hash *nameHash = newHash(20);
struct bed *bed;
struct slName *exonFrames = exonFramesList;
int i, exonStart, exonEnd;
char txName[256];
int itemCount = 0;
static int namelessIx = 0;

for (bed = bedList;  bed != NULL;  bed = bed->next)
    {
    /* Enforce unique transcript_ids. */
    if (bed->name != NULL)
        {
        struct hashEl *hel = hashLookup(nameHash, bed->name);
        int dupCount = (hel != NULL ? ptToInt(hel->val) : 0);
        if (dupCount > 0)
            {
            safef(txName, sizeof(txName), "%s_dup%d", bed->name, dupCount);
            hel->val = intToPt(dupCount + 1);
            }
        else
            {
            safef(txName, sizeof(txName), "%s", bed->name);
            hashAddInt(nameHash, bed->name, 1);
            }
        }
    else
        safef(txName, sizeof(txName), "tx%d", ++namelessIx);

    if (hti->hasBlocks && hti->hasCDS && fieldCount > 4)
        {
        /* first pass: compute frames, in order dictated by strand. */
        int startIndx = 0, stopIndx = 0;
        char *frames = NULL;
        char *ef = NULL;
        /* Test the cursor, not the list head: the cursor becomes NULL once the
         * frames list is exhausted, even though the head is still non-NULL. */
        if (exonFrames != NULL)
            ef = exonFrames->name;
        frames = computeFrames(bed, ef, &startIndx, &stopIndx);

        /* second pass: one exon (possibly CDS, start/stop_codon) per block. */
        for (i=0;  i < bed->blockCount;  i++)
            {
            exonStart = bed->chromStart + bed->chromStarts[i];
            exonEnd = exonStart + bed->blockSizes[i];
            if ((exonStart < bed->thickEnd) && (exonEnd > bed->thickStart))
                {
                int exonCdsStart = max(exonStart, bed->thickStart);
                int exonCdsEnd = min(exonEnd, bed->thickEnd);
                addCdsStartStop(bed, source, exonCdsStart, exonCdsEnd,
                                frames, i, startIndx, stopIndx, gtf2StopCodons, txName);
                }
            addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
            }
        freeMem(frames);
        }
    else if (hti->hasBlocks && fieldCount > 4)
        {
        /* Blocks but no CDS: one exon line per block. */
        for (i=0;  i < bed->blockCount;  i++)
            {
            exonStart = bed->chromStart + bed->chromStarts[i];
            exonEnd = exonStart + bed->blockSizes[i];
            addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
            }
        }
    else if (hti->hasCDS && fieldCount > 4)
        {
        /* CDS but no blocks: split the item into exon / CDS / exon around the
         * thick region.  A 0,0 thick region is treated as empty at chromStart;
         * note that this rewrites the bed's thickStart/thickEnd in place. */
        if (bed->thickStart == 0 && bed->thickEnd == 0)
            bed->thickStart = bed->thickEnd = bed->chromStart;
        if (bed->thickStart > bed->chromStart)
            {
            addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->thickStart, '.',
                              txName);
            }
        if (bed->thickEnd > bed->thickStart)
            addGffLineFromBed(bed, source, "CDS", bed->thickStart, bed->thickEnd, '0',
                              txName);
        if (bed->thickEnd < bed->chromEnd)
            {
            addGffLineFromBed(bed, source, "exon", bed->thickEnd, bed->chromEnd, '.',
                              txName);
            }
        }
    else
        {
        /* Neither blocks nor CDS: the whole item is a single exon. */
        addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->chromEnd, '.',
                          txName);
        }
    itemCount++;
    if (exonFrames)
        exonFrames = exonFrames->next;
    }
hashFree(&nameHash);
return itemCount;
}
struct cdwQaPairedEndFastq *cdwQaPairedEndFastqFromNextRa(struct lineFile *lf, struct raToStructReader *reader) /* Return next stanza put into an cdwQaPairedEndFastq. */ { enum fields { fileId1Field, concordanceField, distanceMeanField, distanceStdField, distanceMinField, distanceMaxField, }; if (!raSkipLeadingEmptyLines(lf, NULL)) return NULL; struct cdwQaPairedEndFastq *el; AllocVar(el); bool *fieldsObserved = reader->fieldsObserved; bzero(fieldsObserved, reader->fieldCount); char *tag, *val; while (raNextTagVal(lf, &tag, &val, NULL)) { struct hashEl *hel = hashLookup(reader->fieldIds, tag); if (hel != NULL) { int id = ptToInt(hel->val); if (fieldsObserved[id]) errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName); fieldsObserved[id] = TRUE; switch (id) { case fileId1Field: { el->fileId1 = sqlUnsigned(val); break; } case concordanceField: { el->concordance = sqlDouble(val); break; } case distanceMeanField: { el->distanceMean = sqlDouble(val); break; } case distanceStdField: { el->distanceStd = sqlDouble(val); break; } case distanceMinField: { el->distanceMin = sqlDouble(val); break; } case distanceMaxField: { el->distanceMax = sqlDouble(val); break; } default: internalErr(); break; } } } raToStructReaderCheckRequiredFields(reader, lf); return el; }
void bioImageLoad(char *setRaFile, char *itemTabFile) /* bioImageLoad - Load data into bioImage database. */ { struct hash *raHash = raReadSingle(setRaFile); struct hash *rowHash; struct lineFile *lf = lineFileOpen(itemTabFile, TRUE); char *line, *words[256]; struct sqlConnection *conn = sqlConnect(database); int rowSize; int submissionSetId; struct hash *fullDirHash = newHash(0); struct hash *screenDirHash = newHash(0); struct hash *thumbDirHash = newHash(0); struct hash *treatmentHash = newHash(0); struct hash *bodyPartHash = newHash(0); struct hash *sliceTypeHash = newHash(0); struct hash *imageTypeHash = newHash(0); struct hash *sectionSetHash = newHash(0); struct dyString *dy = dyStringNew(0); /* Read first line of tab file, and from it get all the field names. */ if (!lineFileNext(lf, &line, NULL)) errAbort("%s appears to be empty", lf->fileName); if (line[0] != '#') errAbort("First line of %s needs to start with #, and then contain field names", lf->fileName); rowHash = hashRowOffsets(line+1); rowSize = rowHash->elCount; if (rowSize >= ArraySize(words)) errAbort("Too many fields in %s", lf->fileName); /* Check that have all required fields */ { char *fieldName; int i; for (i=0; i<ArraySize(requiredSetFields); ++i) { fieldName = requiredSetFields[i]; if (!hashLookup(raHash, fieldName)) errAbort("Field %s is not in %s", fieldName, setRaFile); } for (i=0; i<ArraySize(requiredItemFields); ++i) { fieldName = requiredItemFields[i]; if (!hashLookup(rowHash, fieldName)) errAbort("Field %s is not in %s", fieldName, itemTabFile); } for (i=0; i<ArraySize(requiredFields); ++i) { fieldName = requiredFields[i]; if (!hashLookup(rowHash, fieldName) && !hashLookup(raHash, fieldName)) errAbort("Field %s is not in %s or %s", fieldName, setRaFile, itemTabFile); } } /* Create/find submission record. */ submissionSetId = saveSubmissionSet(conn, raHash); /* Process rest of tab file. 
*/ while (lineFileNextRowTab(lf, words, rowSize)) { int fullDir = cachedId(conn, "location", "name", fullDirHash, "fullDir", raHash, rowHash, words); int screenDir = cachedId(conn, "location", "name", screenDirHash, "screenDir", raHash, rowHash, words); int thumbDir = cachedId(conn, "location", "name", thumbDirHash, "thumbDir", raHash, rowHash, words); int bodyPart = cachedId(conn, "bodyPart", "name", bodyPartHash, "bodyPart", raHash, rowHash, words); int sliceType = cachedId(conn, "sliceType", "name", sliceTypeHash, "sliceType", raHash, rowHash, words); int imageType = cachedId(conn, "imageType", "name", imageTypeHash, "imageType", raHash, rowHash, words); int treatment = cachedId(conn, "treatment", "conditions", treatmentHash, "treatment", raHash, rowHash, words); char *fileName = getVal("fileName", raHash, rowHash, words, NULL); char *submitId = getVal("submitId", raHash, rowHash, words, NULL); char *taxon = getVal("taxon", raHash, rowHash, words, NULL); char *isEmbryo = getVal("isEmbryo", raHash, rowHash, words, NULL); char *age = getVal("age", raHash, rowHash, words, NULL); char *sectionSet = getVal("sectionSet", raHash, rowHash, words, ""); char *sectionIx = getVal("sectionIx", raHash, rowHash, words, "0"); char *gene = getVal("gene", raHash, rowHash, words, ""); char *locusLink = getVal("locusLink", raHash, rowHash, words, ""); char *refSeq = getVal("refSeq", raHash, rowHash, words, ""); char *genbank = getVal("genbank", raHash, rowHash, words, ""); char *priority = getVal("priority", raHash, rowHash, words, "200"); int sectionId = 0; int oldId; // char *xzy = getVal("xzy", raHash, rowHash, words, xzy); if (sectionSet[0] != 0 && !sameString(sectionSet, "0")) { struct hashEl *hel = hashLookup(sectionSetHash, sectionSet); if (hel != NULL) sectionId = ptToInt(hel->val); else { sqlUpdate(conn, "insert into sectionSet values(default)"); sectionId = sqlLastAutoId(conn); hashAdd(sectionSetHash, sectionSet, intToPt(sectionId)); } } dyStringClear(dy); 
dyStringAppend(dy, "select id from image "); dyStringPrintf(dy, "where fileName = '%s' ", fileName); dyStringPrintf(dy, "and fullLocation = %d", fullDir); oldId = sqlQuickNum(conn, dy->string); if (oldId != 0) { if (replace) { dyStringClear(dy); dyStringPrintf(dy, "delete from image where id = %d", oldId); sqlUpdate(conn, dy->string); } else errAbort("%s is already in database line %d of %s", fileName, lf->lineIx, lf->fileName); } dyStringClear(dy); dyStringAppend(dy, "insert into image set\n"); dyStringPrintf(dy, " id = default,\n"); dyStringPrintf(dy, " fileName = '%s',\n", fileName); dyStringPrintf(dy, " fullLocation = %d,\n", fullDir); dyStringPrintf(dy, " screenLocation = %d,\n", screenDir); dyStringPrintf(dy, " thumbLocation = %d,\n", thumbDir); dyStringPrintf(dy, " submissionSet = %d,\n", submissionSetId); dyStringPrintf(dy, " sectionSet = %d,\n", sectionId); dyStringPrintf(dy, " sectionIx = %s,\n", sectionIx); dyStringPrintf(dy, " submitId = '%s',\n", submitId); dyStringPrintf(dy, " gene = '%s',\n", gene); dyStringPrintf(dy, " locusLink = '%s',\n", locusLink); dyStringPrintf(dy, " refSeq = '%s',\n", refSeq); dyStringPrintf(dy, " genbank = '%s',\n", genbank); dyStringPrintf(dy, " priority = %s,\n", priority); dyStringPrintf(dy, " taxon = %s,\n", taxon); dyStringPrintf(dy, " isEmbryo = %s,\n", isEmbryo); dyStringPrintf(dy, " age = %s,\n", age); dyStringPrintf(dy, " bodyPart = %d,\n", bodyPart); dyStringPrintf(dy, " sliceType = %d,\n", sliceType); dyStringPrintf(dy, " imageType = %d,\n", imageType); dyStringPrintf(dy, " treatment = %d\n", treatment); sqlUpdate(conn, dy->string); } }
void bedItemOverlapCount(struct hash *chromHash, char *infile, char *outfile){ unsigned maxChromSize = 0; unitSize *counts = (unitSize *)NULL; FILE *f = mustOpen(outfile, "w"); struct hashCookie hc = hashFirst(chromHash); struct hashEl *hel; while( (hel = hashNext(&hc)) != NULL) { unsigned num = (unsigned) ptToInt(hel->val); maxChromSize = max(num, maxChromSize); } verbose(2,"#\tmaxChromSize: %u\n", maxChromSize); if (maxChromSize < 1) errAbort("maxChromSize is zero ?"); /* Allocate just once for the largest chrom and reuse this array */ counts = needHugeMem(sizeof(unitSize) * maxChromSize); /* Reset the array to be zero to be reused */ memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize); unsigned chromSize = 0; char *prevChrom = (char *)NULL; boolean outputToDo = FALSE; struct hash *seenHash = newHash(5); struct lineFile *bf = lineFileOpen(infile , TRUE); struct bed *bed = (struct bed *)NULL; char *row[12]; int numFields = doBed12 ? 12 : 3; while (lineFileNextRow(bf,row, numFields)) { int i; bed = bedLoadN(row, numFields); verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd); if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr { verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize); if (outputToDo) outputCounts(counts, prevChrom, chromSize, f); outputToDo = FALSE; memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */ freez(&prevChrom); // prevChrom is now NULL so it will be caught by next if! } if ((char *)NULL == prevChrom) // begin a chr { if (hashLookup(seenHash, bed->chrom)) errAbort("ERROR:input file not sorted. 
%s seen before on line %d\n", bed->chrom, bf->lineIx); hashAdd(seenHash, bed->chrom, NULL); prevChrom = cloneString(bed->chrom); chromSize = hashIntVal(chromHash, prevChrom); verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize); } if (bed->chromEnd > chromSize) { // check for circular chrM if (doBed12 || bed->chromStart>=chromSize || differentWord(bed->chrom,"chrM")) { warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart, bed->chromEnd); errAbort("chromEnd > chromSize ? %d > %d", bed->chromEnd,chromSize); } for (i = bed->chromStart; i < chromSize; ++i) INCWOVERFLOW(counts,i); for (i = 0; i < (bed->chromEnd - chromSize); ++i) INCWOVERFLOW(counts,i); } else if (doBed12) { int *starts = bed->chromStarts; int *sizes = bed->blockSizes; int *endStarts = &bed->chromStarts[bed->blockCount]; for(; starts < endStarts; starts++, sizes++) { unsigned int end = *starts + *sizes + bed->chromStart; for (i = *starts + bed->chromStart; i < end; ++i) INCWOVERFLOW(counts,i); } } else { for (i = bed->chromStart; i < bed->chromEnd; ++i) INCWOVERFLOW(counts, i); } outputToDo = TRUE; bedFree(&bed); // plug the memory leak } lineFileClose(&bf); // Note, next file could be on same chr! if (outputToDo) outputCounts(counts, prevChrom, chromSize, f); if (doOutBounds) fprintf(stderr, "min %lu max %lu\n", (unsigned long)overMin, (unsigned long)overMax); verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize); carefulClose(&f); freeMem(counts); freez(&prevChrom); // hashFreeWithVals(&chromHash, freez); freeHash(&seenHash); }
struct edwFastqFile *edwFastqFileFromNextRa(struct lineFile *lf, struct raToStructReader *reader) /* Return next stanza put into an edwFastqFile. */ { enum fields { sampleCountField, basesInSampleField, readCountField, baseCountField, readSizeMeanField, readSizeStdField, readSizeMinField, readSizeMaxField, qualMeanField, qualStdField, qualMinField, qualMaxField, qualTypeField, qualZeroField, atRatioField, aRatioField, cRatioField, gRatioField, tRatioField, nRatioField, qualPosField, aAtPosField, cAtPosField, gAtPosField, tAtPosField, nAtPosField, }; if (!raSkipLeadingEmptyLines(lf, NULL)) return NULL; struct edwFastqFile *el; AllocVar(el); bool *fieldsObserved = reader->fieldsObserved; bzero(fieldsObserved, reader->fieldCount); char *tag, *val; while (raNextTagVal(lf, &tag, &val, NULL)) { struct hashEl *hel = hashLookup(reader->fieldIds, tag); if (hel != NULL) { int id = ptToInt(hel->val); if (fieldsObserved[id]) errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName); fieldsObserved[id] = TRUE; switch (id) { case sampleCountField: { el->sampleCount = sqlLongLong(val); break; } case basesInSampleField: { el->basesInSample = sqlLongLong(val); break; } case readCountField: { el->readCount = sqlLongLong(val); break; } case baseCountField: { el->baseCount = sqlLongLong(val); break; } case readSizeMeanField: { el->readSizeMean = sqlDouble(val); break; } case readSizeStdField: { el->readSizeStd = sqlDouble(val); break; } case readSizeMinField: { el->readSizeMin = sqlSigned(val); break; } case readSizeMaxField: { int arraySize = sqlSigned(val); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "readSizeMax"); break; } case qualMeanField: { el->qualMean = sqlDouble(val); break; } case qualStdField: { el->qualStd = sqlDouble(val); break; } case qualMinField: { el->qualMin = sqlDouble(val); break; } case qualMaxField: { el->qualMax = sqlDouble(val); break; } case qualTypeField: { el->qualType = cloneString(val); break; } case qualZeroField: { 
el->qualZero = sqlSigned(val); break; } case atRatioField: { el->atRatio = sqlDouble(val); break; } case aRatioField: { el->aRatio = sqlDouble(val); break; } case cRatioField: { el->cRatio = sqlDouble(val); break; } case gRatioField: { el->gRatio = sqlDouble(val); break; } case tRatioField: { el->tRatio = sqlDouble(val); break; } case nRatioField: { el->nRatio = sqlDouble(val); break; } case qualPosField: { int arraySize; sqlDoubleDynamicArray(val, &el->qualPos, &arraySize); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "qualPos"); break; } case aAtPosField: { int arraySize; sqlDoubleDynamicArray(val, &el->aAtPos, &arraySize); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "aAtPos"); break; } case cAtPosField: { int arraySize; sqlDoubleDynamicArray(val, &el->cAtPos, &arraySize); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "cAtPos"); break; } case gAtPosField: { int arraySize; sqlDoubleDynamicArray(val, &el->gAtPos, &arraySize); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "gAtPos"); break; } case tAtPosField: { int arraySize; sqlDoubleDynamicArray(val, &el->tAtPos, &arraySize); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "tAtPos"); break; } case nAtPosField: { int arraySize; sqlDoubleDynamicArray(val, &el->nAtPos, &arraySize); raToStructArraySignedSizer(lf, arraySize, &el->readSizeMax, "nAtPos"); break; } default: internalErr(); break; } } } raToStructReaderCheckRequiredFields(reader, lf); return el; }
void encodeExpToTab(char *outExp, char *outSeries, char *outResults) /* encodeExpToCvDb - Convert encode experiments table to a table more suitable for cvDb. */ { struct hash *optHash = optionalFieldsHash(); struct mdbObj *mdbList = getMdbList(metaDbs, ArraySize(metaDbs)); struct hash *mdbHash = mdbHashKeyedByExpId(mdbList); struct hash *seriesHash = hashNew(0); struct series *seriesList = NULL; verbose(1, "read %d mdb objects from %s in %d databases\n", mdbHash->elCount, metaTable, (int)ArraySize(metaDbs)); struct sqlConnection *expDbConn = sqlConnect(expDb); struct sqlConnection *cvDbConn = sqlConnect(cvDb); char query[256]; sqlSafef(query, sizeof(query), "select * from %s", expTable); struct sqlResult *sr = sqlGetResult(expDbConn, query); FILE *f = mustOpen(outExp, "w"); char **row; while ((row = sqlNextRow(sr)) != NULL) { /* Read in database structure. */ struct encodeExp *ee = encodeExpLoad(row); /* Much of the data we're processing comes from lists of the form * "a=aVal b=bVal c=cVal." We'll convert these to id's in the appropriate * tables and store the IDs in the optCol array declared below. */ int optColCount = ArraySize(expOptionalFields); int optCol[optColCount]; int i; for (i=0; i<optColCount; ++i) optCol[i] = 0; /* Convert var=val string in encodeExp.expVars into list of slPairs, and loop through it. */ struct slPair *varList = slPairListFromString(ee->expVars, TRUE); struct slPair *var; for (var = varList; var != NULL; var = var->next) { /* Figure out name of table and the term within that table. */ char *table = var->name; char *term = var->val; if (sameString(table, "antibody")) // Deal with antibody special case { if (sameString(term, "Control") || sameString(term, "Input") || sameString(term, "RevXlinkChromatin") || sameString(term, "ripInput")) { table = "control"; } } /* If it looks like we have a valid table and term, store result in * optCol array we'll output soon. 
*/ struct hashEl *hel; if ((hel = hashLookup(optHash, table)) != NULL) { int id = lookupId(cvDbConn, table, term); if (id == 0) { warn("No id in cvDb for %s=%s\n", table, term); continue; } int optColIx = ptToInt(hel->val); optCol[optColIx] = id; } else verbose(2, "%s %s ?\n", table, term); } /* Now we want to process metaDb, which has some info encodeExp does not. */ char *composite = NULL; char ixAsString[16]; safef(ixAsString, sizeof(ixAsString), "%d", ee->ix); struct mdbObj *mdb = hashFindVal(mdbHash, ixAsString); if (mdb != NULL) { struct mdbVar *v; for (v = mdb->vars; v != NULL; v = v->next) { /* Look up table and term and change table name if need be */ char *table = v->var; char *term = v->val; if (sameString(table, "antibody")) table = "ab"; else if (sameString(table, "grant")) table = "grantee"; /* Squirrel away the ever-important composite term for later. */ if (sameString("composite", table)) composite = term; struct hashEl *hel; if ((hel = hashLookup(optHash, table)) != NULL) { int optColIx = ptToInt(hel->val); if (optCol[optColIx] == 0) // Only use mdb if encodeExp has no data. { int id = lookupId(cvDbConn, table, term); optCol[optColIx] = id; } } } } /* If we've got a composite, then make up a series record. */ if (composite != NULL) { assert(mdb != NULL); struct series *series = hashFindVal(seriesHash, composite); if (series == NULL) { series = seriesFromMdb(mdb, composite); hashAdd(seriesHash, composite, series); slAddHead(&seriesList, series); } } if (ee->accession != NULL) { /* Write out required fields. Order of required fields * here needs to follow order in expRequiredFields. 
*/ fprintf(f, "%u", ee->ix); fprintf(f, "\t%s", ee->updateTime); fprintf(f, "\t%s", naForNull(composite)); fprintf(f, "\t%s", ee->accession); fprintf(f, "\t%d", lookupId(cvDbConn, "organism", ee->organism)); fprintf(f, "\t%d", lookupId(cvDbConn, "lab", ee->lab)); fprintf(f, "\t%d", lookupId(cvDbConn, "dataType", ee->dataType)); fprintf(f, "\t%d", lookupId(cvDbConn, "cellType", ee->cellType)); /* Now write out optional fields. */ for (i=0; i<optColCount; ++i) fprintf(f, "\t%d", optCol[i]); /* End output record. */ fprintf(f, "\n"); } } /* Write out series list to a separate file. */ slReverse(&seriesList); writeSeriesList(outSeries, seriesList); /* Write out results to a separate file. */ writeMdbListAsResults(mdbList, outResults); /* Clean up and go home. */ carefulClose(&f); sqlFreeResult(&sr); sqlDisconnect(&expDbConn); sqlDisconnect(&cvDbConn); }
int findSize(struct hash *hash, char *name)
/* Return the size stored under name in hash, converted from the hash value
 * with ptToInt().  The lookup goes through hashMustFindVal(), so a missing
 * name is dealt with there rather than reported to the caller. */
{
return ptToInt(hashMustFindVal(hash, name));
}
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash,
        struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount,
        boolean tabSep)
/* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
 * collect max field sizes there too.
 *   lf             - bed input, read to the end.
 *   chromSizesHash - chromosome name -> size; every chromosome in lf must be present.
 *   tabSep         - split rows on tabs when TRUE, otherwise with lineFileChopNext().
 *   retMinDiff     - smallest gap between consecutive starts on one chromosome
 *                    (left at BIGNUM if no such pair exists).
 *   retAveSize     - total bases divided by item count, or 0 when there are no items.
 *   retBedCount    - number of items read.
 * Returns one bbiChromUsage per chromosome, in file order.  Aborts if the file is
 * not sorted by chromosome then start, or an item runs past its chromosome. */
{
int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
char *row[maxRowSize];
struct bbiChromUsage *curUsage = NULL, *usageList = NULL;
int prevStart = -1;    /* -1 means "no previous item on this chromosome". */
bits32 nextId = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = tabSep
        ? lineFileChopCharNext(lf, '\t', row, maxRowSize)
        : lineFileChopNext(lf, row, maxRowSize);
    if (rowSize == 0)
        break;
    lineFileExpectAtLeast(lf, maxRowSize, rowSize);

    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (eim != NULL)
        bbExIndexMakerUpdateMaxFieldSize(eim, row);
    if (start > end)
        errAbort("end (%d) before start (%d) line %d of %s",
            end, start, lf->lineIx, lf->fileName);

    ++bedCount;
    totalBases += (end - start);

    /* Switch to a new usage record whenever the chromosome changes. */
    if (curUsage == NULL || differentString(curUsage->name, chrom))
        {
        /* make sure chrom names are sorted in ASCII order */
        if (curUsage != NULL && strcmp(curUsage->name, chrom) > 0)
            errAbort("%s is not case-sensitive sorted at line %d. Please use \"sort -k1,1 -k2,2n\" with LC_COLLATE=C, or bedSort and try again.",
                lf->fileName, lf->lineIx);

        struct hashEl *sizeEl = hashLookup(chromSizesHash, chrom);
        if (sizeEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);
        int chromSize = ptToInt(sizeEl->val);

        AllocVar(curUsage);
        curUsage->name = cloneString(chrom);
        curUsage->id = nextId++;
        curUsage->size = chromSize;
        slAddHead(&usageList, curUsage);
        prevStart = -1;
        }

    if (end > curUsage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
            end, curUsage->name, curUsage->size, lf->lineIx, lf->fileName);
    curUsage->itemCount += 1;

    /* Track the smallest start-to-start gap; a negative gap means unsorted input.
     * minDiff never goes negative, so checking the sign first is enough. */
    if (prevStart >= 0)
        {
        int diff = start - prevStart;
        if (diff < 0)
            errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                lf->fileName, lf->lineIx);
        if (diff < minDiff)
            minDiff = diff;
        }
    prevStart = start;
    }
slReverse(&usageList);

double aveSize = 0;
if (bedCount > 0)
    aveSize = (double)totalBases/bedCount;
*retMinDiff = minDiff;
*retAveSize = aveSize;
*retBedCount = bedCount;
return usageList;
}