/* compress - LZ78-compress inputFile into outputFile through a bitwise
 * buffered writer.
 *
 * Stream layout: 3 bits of compressionLevel, then a sequence of dictionary
 * indexes whose bit width (indexLength) grows as the dictionary fills.
 * The stream is terminated by the last pending index followed by ROOT_INDEX.
 *
 * Returns the result of closeBitwiseBufferedFile() on success, -1 on error
 * (errno = EINVAL for bad arguments; EBADFD on a read error -- note EBADFD
 * is Linux-specific, not portable).
 *
 * NOTE(review): assumes FIRST_CHILD == 257 so that the childIndex = 257
 * initializer matches the post-reset state -- confirm against the header. */
int compress(FILE* inputFile, FILE* outputFile, int compressionLevel)
{
    struct BitwiseBufferedFile* w = openBitwiseBufferedFile(NULL, 1, -1, outputFile);
    uint8_t readByte[LOCAL_BYTE_BUFFER_LENGTH]; /* staging buffer for fread */
    size_t indexLength = INITIAL_INDEX_LENGTH;  /* current emitted index width in bits */
    size_t bufferedBytes;                       /* bytes actually read this iteration */
    int byteIndex = 0;
    struct LZ78HashTableEntry* hashTable;
    uint32_t childIndex = 257;                  /* next dictionary code to assign */
    uint32_t lookupIndex = ROOT_INDEX;          /* current match position in the trie */
    uint32_t indexLengthMask = (1 << INITIAL_INDEX_LENGTH) - 1;
    uint32_t child;
    if((compressionLevel < MIN_COMPRESSION_LEVEL || compressionLevel > MAX_COMPRESSION_LEVEL) || inputFile == NULL || w == NULL)
    {
        errno = EINVAL;
        if(w != NULL) closeBitwiseBufferedFile(w);
        return -1;
    }
    /* Per-level tuning: hash table capacity and the code value at which the
     * dictionary is considered full and gets reset. */
    uint32_t hashTableEntries = compressionParameters[compressionLevel - MIN_COMPRESSION_LEVEL].hashTableEntries;
    uint32_t moduloMask = hashTableEntries - 1; /* hashTableEntries is assumed to be a power of 2 */
    uint32_t maxChild = compressionParameters[compressionLevel - MIN_COMPRESSION_LEVEL].maxChild;
    hashTable = hashCreate(hashTableEntries, moduloMask);
    if(hashTable == NULL)
    {
        closeBitwiseBufferedFile(w);
        return -1;
    }
    /* Header: the compression level in 3 bits, so the decompressor can size
     * its own tables identically. */
    if(writeBitBuffer(w, (uint32_t)compressionLevel, 3) == -1) goto exceptionHandler;
    while(!feof(inputFile) && !ferror(inputFile))
    {
        bufferedBytes = fread(readByte, 1, LOCAL_BYTE_BUFFER_LENGTH, inputFile);
        for(byteIndex = 0; byteIndex < bufferedBytes; byteIndex++)
        {
            child = hashLookup(hashTable, lookupIndex, readByte[byteIndex], moduloMask);
            if(child != ROOT_INDEX) lookupIndex = child; //ROOT_INDEX means NOT FOUND
            else
            {
                /* Longest match ended: emit its code and extend the
                 * dictionary with (match, current byte). */
                if(writeBitBuffer(w, lookupIndex, indexLength) == -1
                   || hashInsert(hashTable, lookupIndex, readByte[byteIndex], childIndex, moduloMask) == -1)
                    goto exceptionHandler;
                childIndex++;
                if((childIndex & indexLengthMask) == 0) //A power of 2 is reached
                {
                    //The length of the transmitted index is incremented
                    indexLength++;
                    //The next power of 2 mask is set
                    indexLengthMask = (indexLengthMask << 1) | 1;
                }
                //readByte value is also the right index to start with next time
                //because you have to start from the last character recognized
                lookupIndex = readByte[byteIndex] + 1; //ROOT_INDEX = 0 so single bytes are indexed in [1, 256]
                if (childIndex == maxChild) //hash table is full
                {
                    if(hashReset(hashTable, hashTableEntries, moduloMask) == NULL)
                        goto exceptionHandler; //hash table was not successfully created
                    childIndex = FIRST_CHILD; //starts from the beginning
                    indexLength = INITIAL_INDEX_LENGTH;
                    indexLengthMask = (1 << INITIAL_INDEX_LENGTH) - 1;
                }
            }
        }
    }
    if(ferror(inputFile))
    {
        errno = EBADFD; /* NOTE(review): Linux-specific errno value */
        goto exceptionHandler;
    }
    /* Flush the pending match and write the ROOT_INDEX end-of-stream marker. */
    if(writeBitBuffer(w, lookupIndex, indexLength) == -1 || writeBitBuffer(w, ROOT_INDEX, indexLength) == -1)
        goto exceptionHandler;
    hashDestroy(hashTable, hashTableEntries);
    return closeBitwiseBufferedFile(w);

exceptionHandler: /* common cleanup for all failure paths after allocation */
    hashDestroy(hashTable, hashTableEntries);
    closeBitwiseBufferedFile(w);
    return -1;
}
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table..
 * Loads several evidence files into hashes keyed by transcript name, then
 * streams through the bed12 transcripts in txBedFile, filling in one txInfo
 * row per transcript and writing it to outFile. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
    hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls.  hashAdd (not hashAddUnique) -- a qName may align to the
 * genome more than once; all alignments are kept under the same key. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    info.sourceAcc = txAccFromTempName(bed->name);
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc))
        {
        /* Fake up some things for antibody frag and CCDS that don't have alignments. */
        info.sourceSize = bedTotalBlockSize(bed);
        info.aliCoverage = 1.0;
        info.aliIdRatio = 1.0;
        info.genoMapCount = 1;
        }
    else
        {
        /* Loop through all psl's associated with our RNA. Figure out
         * our overlap with each, and pick best one. */
        struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
        if (firstPslHel == NULL)
            errAbort("%s is not in %s", info.sourceAcc, pslFile);
        int mapCount = 0;
        struct psl *psl, *bestPsl = NULL; /* shadows the outer psl on purpose */
        int coverage, bestCoverage = 0;
        boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
        for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
            {
            psl = hel->val;
            mapCount += 1;
            coverage = pslBedOverlap(psl, bed);
            if (coverage > bestCoverage)
                {
                bestCoverage = coverage;
                bestPsl = psl;
                }
            /* If we flipped it, try it on the opposite strand too.
             * Strand is toggled, tested, then toggled back so the shared
             * psl record is left unmodified for other transcripts. */
            if (isFlipped)
                {
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                coverage = pslBedOverlap(psl, bed);
                if (coverage > bestCoverage)
                    {
                    bestCoverage = coverage;
                    bestPsl = psl;
                    }
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                }
            }
        if (bestPsl == NULL)
            errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile);

        /* Figure out and save alignment statistics.
         * sourceSize excludes the polyA tail; aliIdRatio counts repMatch
         * as matching. */
        int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
        info.sourceSize = bestPsl->qSize - polyA;
        info.aliCoverage = (double)bestCoverage / info.sourceSize;
        info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
        info.genoMapCount = mapCount;
        }

    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
        {
        cdsEv = hashFindVal(cdsEvHash, bed->name);
        if (cdsEv != NULL)
            {
            info.orfSize = cdsEv->end - cdsEv->start;
            info.startComplete = cdsEv->startComplete;
            info.endComplete = cdsEv->endComplete;
            }
        }

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * Gaps of 1-2 bases look like genomic frame shifts, exactly 3 like an
     * in-frame genomic stop; anything larger counts as a real intron/exon
     * boundary. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
        int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
        int gapEnd = bed->chromStarts[i+1];
        int gapSize = gapEnd - gapStart;
        switch (gapSize)
            {
            case 1:
            case 2:
                info.genomicFrameShift = TRUE;
                break;
            case 3:
                info.genomicStop = TRUE;
                break;
            default:
                exonCount += 1;
                break;
            }
        }
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home.  NOTE(review): lf and the loaded evidence lists are
 * never freed -- acceptable for a one-shot command-line tool, but worth
 * confirming if this is ever called in a loop. */
carefulClose(&f);
}
static void gapToLift(char *db, char *outFile)
/* gapToLift - create lift file from gap table(s).
 * Walks the gap list chromosome by chromosome, emitting one lift line per
 * contiguous non-gap segment (segments are broken at non-bridged gaps and
 * at gaps that reach the chromosome end).  Chromosomes with no gaps at all
 * get a single whole-chromosome lift line at the end.
 * Relies on file-scope globals: bedFileName/bedFile (optional bed output,
 * written by liftOutLine presumably) and cInfoHash (chrom name -> size). */
{
FILE *out = mustOpen(outFile, "w");
struct sqlConnection *conn = sqlConnect(db);
struct chromInfo *cInfoList = loadChromInfo(conn);
struct agpGap *gapList = loadAllGaps(conn, db, cInfoList);
struct agpGap *gap;
int start = 0;
int end = 0;
char *prevChr = NULL;       /* previous chrom name, to detect chrom changes */
int liftCount = 0;          /* per-chrom segment counter */
int chrSize = 0;
static struct hash *chrDone = NULL;  /* chroms seen in the gap list */
chrDone = newHash(0);
if (isNotEmpty(bedFileName))
    {
    bedFile = mustOpen(bedFileName, "w");
    verbose(2,"#\tbed output requested to %s\n", bedFileName);
    }
for (gap = gapList; gap; gap = gap->next)
    {
    verbose(3,"#\t%s\t%d\t%d\t%s\n", gap->chrom, gap->chromStart, gap->chromEnd, gap->bridge);
    if (prevChr && sameWord(prevChr, gap->chrom))
        {
        /* continuing same segment, check for gap break,
         * or gap at end of chrom */
        if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize))
            {
            end = gap->chromStart;
            liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize);
            start = gap->chromEnd;
            end = start;
            }
        else
            end = gap->chromEnd; /* bridged gap: extend current segment */
        }
    else /* new chrom encountered */
        {
        /* output last segment of previous chrom when necessary */
        if (prevChr && differentWord(prevChr, gap->chrom))
            {
            if (end < chrSize)
                liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize);
            }
        liftCount = 0;
        chrSize = hashIntVal(cInfoHash, gap->chrom);
        hashAddInt(chrDone, gap->chrom, 1);
        if (gap->chromStart > 0)
            {
            /* starting first segment at position 0 */
            start = 0;
            end = gap->chromStart;
            /* does the first gap break it ? Or gap goes to end of chrom. */
            if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize))
                {
                liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize);
                start = gap->chromEnd;
                end = start;
                }
            }
        else /* first gap is actually the beginning of the chrom */
            {
            /* thus, first segment starts after this first gap */
            start = gap->chromEnd;
            end = start;
            }
        }
    prevChr = gap->chrom; /* remember prev chrom to detect next chrom */
    }
/* potentially a last one */
if (end < chrSize)
    liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize);
/* check that all chroms have been used: any chrom in cInfoHash that never
 * appeared in the gap list is emitted as one full-length segment. */
struct hashCookie cookie = hashFirst(cInfoHash);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
    {
    if (NULL == hashLookup(chrDone, hel->name))
        {
        chrSize = hashIntVal(cInfoHash, hel->name);
        verbose(2, "#\tno gaps on chrom: %s, size: %d\n", hel->name, chrSize);
        liftCount = liftOutLine(out, hel->name, 0, chrSize, 0, chrSize);
        }
    }
carefulClose(&out);
sqlDisconnect(&conn);
}
struct hash *readKeyHash(char *db, struct joiner *joiner, struct joinerField *keyField, struct keyHitInfo **retList)
/* Read key-field into hash. Check for dupes if need be.
 * Scans every (possibly split) table holding keyField in database db,
 * normalizes each id with doChopsAndUpper, and builds a hash of unique ids
 * mapped to freshly allocated keyHitInfo records.  The same records are
 * returned as a list through *retList (caller owns hash and list).
 * Duplicate ids in a unique field are counted and reported with warn()
 * after the scan, unless listed in keyField->exclude.
 * Returns NULL (without touching *retList) if the connection fails. */
{
struct sqlConnection *conn = sqlWarnConnect(db);
struct hash *keyHash = NULL;
struct keyHitInfo *khiList = NULL, *khi;
if (conn == NULL)
    {
    return NULL;
    }
else
    {
    struct slName *table;
    struct slName *tableList = getTablesForField(conn,keyField->splitPrefix, keyField->table, keyField->splitSuffix);
    /* Size the hash by the total row count, capped at hashMaxSize. */
    int rowCount = totalTableRows(conn, tableList);
    int hashSize = digitsBaseTwo(rowCount)+1;
    char query[256], **row;
    struct sqlResult *sr;
    int itemCount = 0;
    int dupeCount = 0;
    char *dupe = NULL;  /* first duplicate id seen, kept for the warning */
    if (rowCount > 0)
        {
        if (hashSize > hashMaxSize)
            hashSize = hashMaxSize;
        keyHash = hashNew(hashSize);
        for (table = tableList; table != NULL; table = table->next)
            {
            safef(query, sizeof(query), "select %s from %s", keyField->field, table->name);
            sr = sqlGetResult(conn, query);
            while ((row = sqlNextRow(sr)) != NULL)
                {
                char *id = doChopsAndUpper(keyField, row[0]);
                if (hashLookup(keyHash, id))
                    {
                    if (keyField->unique)
                        {
                        if (keyField->exclude == NULL || !slNameInList(keyField->exclude, id))
                            {
                            if (dupeCount == 0)
                                dupe = cloneString(id);
                            ++dupeCount;
                            }
                        }
                    }
                else
                    {
                    AllocVar(khi);
                    /* hashAddSaveName points khi->name at the hash's copy
                     * of the id, so khi doesn't own the string. */
                    hashAddSaveName(keyHash, id, khi, &khi->name);
                    slAddHead(&khiList, khi);
                    ++itemCount;
                    }
                }
            sqlFreeResult(&sr);
            }
        if (dupe != NULL)
            {
            warn("Error: %d duplicates in %s.%s.%s including '%s'", dupeCount, db, keyField->table, keyField->field, dupe);
            freez(&dupe);
            }
        verbose(2, " %s.%s.%s - %d unique identifiers\n", db, keyField->table, keyField->field, itemCount);
        }
    slFreeList(&tableList);
    }
sqlDisconnect(&conn);
*retList = khiList;
return keyHash;
}
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *geneList = NULL, *curGene, *gene; int geneIx, geneCount = 0; struct microData **geneArray = NULL; float *weights = NULL; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); long time1, time2; time1 = clock1000(); /* Get list/hash of all items with expression values. */ /* uglyf("warning: temporarily limited to 1000 records\n"); */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); time2 = clock1000(); verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Get an array for sorting. */ AllocArray(geneArray, geneCount); for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx) geneArray[geneIx] = gene; /* Print out closest 1000 in tab file. */ for (curGene = geneList; curGene != NULL; curGene = curGene->next) { calcDistances(curGene, geneList, weights); qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance); for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx) { gene = geneArray[geneIx]; fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance); } dotOut(); } printf("Made %s.tab\n", outTable); time1 = time2; time2 = clock1000(); verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); time1 = time2; time2 = clock1000(); verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0); }
void processSnps(char *chromName) /* read through all rows in snpTmp */ /* look up class and observed */ /* write to output file */ { char query[512]; struct sqlConnection *conn = hAllocConn(); struct sqlResult *sr; char **row; char tableName[64]; char fileName[64]; FILE *f; struct hashEl *univarElement = NULL; struct snpData *dataElement = NULL; int classInt = 0; char *classString = NULL; int loc_type = 0; int skipCount = 0; safef(tableName, ArraySize(tableName), "%s_snpTmp", chromName); safef(fileName, ArraySize(fileName), "%s_snpTmp.tab", chromName); f = mustOpen(fileName, "w"); safef(query, sizeof(query), "select snp_id, chromStart, chromEnd, loc_type, orientation, allele, refUCSC, refUCSCReverseComp, weight from %s ", tableName); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { loc_type = sqlUnsigned(row[3]); /* get univarElement from snpHash */ univarElement = hashLookup(snpHash, row[0]); if (univarElement == NULL) { { fprintf(errorFileHandle, "no data for %s (dropping)\n", row[0]); skipCount++; continue; } } dataElement = (struct snpData *)univarElement->val; classInt = dataElement->classInt; // verbose(1, "classInt = %d\n", classInt); assert(classInt >= 1 && classInt <= classCount); /* lookup classString */ classString = classStrings[classInt-1]; /* special handling for class = in-del; split into classes of our own construction */ if (sameString(classString, "in-del")) { if (loc_type == 3) classString = cloneString("insertion"); if (loc_type == 1 || loc_type == 2) classString = cloneString("deletion"); } fprintf(f, "%s\t%s\t%s\t%d\t%s\t", row[0], row[1], row[2], loc_type, classString); fprintf(f, "%s\t%s\t%s\t%s\t%s\t%s\n", row[4], row[5], row[6], row[7], dataElement->observed, row[8]); } sqlFreeResult(&sr); hFreeConn(&conn); carefulClose(&f); if (skipCount > 0) verbose(1, "%d rows dropped\n", skipCount); }
INLINE boolean fieldOk(char *field, struct hash *fieldHash)
/* Return TRUE if fieldHash is NULL or field exists in fieldHash. */
{
if (fieldHash == NULL)
    return TRUE;
return hashLookup(fieldHash, field) != NULL;
}
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *gene; int rc, t; pthread_t *threads = NULL; pthread_attr_t attr; int *threadID = NULL; void *status; char *tempDir = "."; int arrayNum; struct microDataDistance *geneDistPtr = NULL; struct microDataDistance *geneDistArray = NULL; int geneIx; FILE *f = NULL; /* Get list/hash of all items with expression values. */ safef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); f = hgCreateTabFile(tempDir, outTable); synQ = synQueueNew(); /* instantiate threads */ AllocArray( threadID, numThreads ); AllocArray( threads, numThreads ); pthread_attr_init( &attr ); pthread_mutex_init( &mutexDotOut, NULL ); pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ); for (t = 0; t < numThreads; t++) { threadID[t] = t; rc = pthread_create( &threads[t], &attr, computeDistance, (void *) &threadID[t]); if (rc) errAbort("ERROR: in pthread_create() %d\n", rc ); } /* this thread will write to the file from the queue */ for (arrayNum = 0; arrayNum < geneCount; arrayNum++) { geneDistArray = (struct microDataDistance *)synQueueGet( synQ ); geneDistPtr = geneDistArray; /* Print out closest GENEDISTS distances in tab file. */ for (geneIx=0; geneIx < GENEDISTS && geneIx < geneCount; ++geneIx, geneDistPtr++) if (geneDistPtr != NULL) fprintf(f, "%s\t%s\t%f\n", geneDistPtr->name1, geneDistPtr->name2, geneDistPtr->distance); else errAbort("ERROR: writing distance %d to file\n", geneIx); freeMem( geneDistArray ); } /* synchronize all threads */ for (t = 0; t < numThreads; t++) { rc = pthread_join( threads[t], &status); if (rc) errAbort("ERROR: in pthread_join() %d\n", rc ); } printf("Made %s.tab\n", outTable); slFreeList( &geneList ); pthread_mutex_destroy( &mutexDotOut ); pthread_attr_destroy( &attr ); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ safef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { safef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); }
static struct grp *makeGroupList(char *db, struct trackDb *trackList, struct grp **pHubGrpList, boolean allTablesOk)
/* Get list of groups that actually have something in them.
 * Keeps only groups referenced by at least one track, splices hub groups in
 * after the custom-track ("user") group when present, warns about tracks
 * whose group is missing from the grp table, and appends synthetic
 * "allTracks" (and optionally "allTables") groups.
 * Consumes *pHubGrpList (entries are popped; duplicates of kept ones are
 * made with grpDup).  Caller owns the returned list. */
{
struct grp *groupsAll, *groupList = NULL, *group;
struct hash *groupsInTrackList = newHash(0);  /* group names used by tracks */
struct hash *groupsInDatabase = newHash(0);   /* group names kept in output */
struct trackDb *track;

/* Stream through track list building up hash of active groups. */
for (track = trackList; track != NULL; track = track->next)
    {
    if (!hashLookup(groupsInTrackList,track->grp))
        hashAdd(groupsInTrackList, track->grp, NULL);
    }

/* Scan through group table, putting in ones where we have data.
 * slPopHead transfers ownership one node at a time; unused groups are
 * freed immediately. */
groupsAll = hLoadGrps(db);
for (group = slPopHead(&groupsAll); group != NULL; group = slPopHead(&groupsAll))
    {
    if (hashLookup(groupsInTrackList, group->name))
        {
        slAddTail(&groupList, group);
        hashAdd(groupsInDatabase, group->name, group);
        }
    else
        grpFree(&group);
    }

/* if we have custom tracks, we want to add the track hubs
 * after that group */
struct grp *addAfter = NULL;
if ((groupList != NULL) && sameString(groupList->name, "user"))
    addAfter = groupList;

/* Add in groups from hubs. */
for (group = slPopHead(pHubGrpList); group != NULL; group = slPopHead(pHubGrpList))
    {
    // if the group isn't represented in any track, don't add it to list
    if (!hashLookup(groupsInTrackList,group->name))
        continue;
    /* check to see if we're inserting hubs rather than
     * adding them to the front of the list */
    struct grp *newGrp = grpDup(group);
    if (addAfter != NULL)
        {
        /* splice newGrp in directly after the "user" group */
        newGrp->next = addAfter->next;
        addAfter->next = newGrp;
        }
    else
        slAddHead(&groupList, newGrp);
    hashAdd(groupsInDatabase, newGrp->name, newGrp);
    }

/* Do some error checking for tracks with group names that are
 * not in database. Just warn about them. */
if (!trackHubDatabase(db))
    for (track = trackList; track != NULL; track = track->next)
        {
        if (!hashLookup(groupsInDatabase, track->grp))
            warn("Track %s has group %s, which isn't in grp table", track->table, track->grp);
        }

/* Create dummy group for all tracks. */
AllocVar(group);
group->name = cloneString("allTracks");
group->label = cloneString("All Tracks");
slAddTail(&groupList, group);

/* Create another dummy group for all tables. */
if (allTablesOk)
    {
    AllocVar(group);
    group->name = cloneString("allTables");
    group->label = cloneString("All Tables");
    slAddTail(&groupList, group);
    }

hashFree(&groupsInTrackList);
hashFree(&groupsInDatabase);
return groupList;
}
static void agpMergeChromScaf(char *agpFile, char *agpOut, boolean filtering)
/* Create a combined agp file from the chrom.agp and scaffold.agp,
 * merging in only scaffolds from scaffold.agp
 * that are not already in chroms.
 * Two-pass protocol via the function-static hash: call first with
 * filtering=FALSE (records every fragment root name seen and writes all
 * lines), then with filtering=TRUE (appends, skipping objects whose name is
 * already in the hash).  The hash deliberately persists across calls.
 * Also validates that each object's lines are contiguous (each start must
 * equal the previous end). */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;        /* expected chromStart of the next line */
struct agpFrag *agp;
struct agpGap *gap;
FILE *f;
char *lastObj = NULL;        /* previous object name, to detect new objects */
f = mustOpen(agpOut, filtering ? "a" : "w");
char *newChrom = NULL;
static struct hash *hash = NULL;  /* fragment roots seen in pass 1; persists */
boolean skipping = FALSE;

if (!hash)
    hash = hashNew(0);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n", lf->lineIx, lf->fileName, wordCount);
    if (!lastObj || !sameString(words[0],lastObj))
        {
        /* new object starts: reset position check */
        freez(&newChrom);
        newChrom = cloneString(words[0]);
        lastPos = 0;
        }

    skipping = FALSE;
    if (filtering)
        {
        if (hashLookup(hash, words[0]))
            skipping = TRUE;
        }

    if (words[4][0] != 'N')  /* fragment line (not a gap) */
        {
        lineFileExpectAtLeast(lf, 9, wordCount);
        agp = agpFragLoad(words);
        /* agp is 1-based but agp loaders do not adjust for 0-based: */
        agp->chromStart -= 1;
        agp->fragStart -= 1;
        if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n", agp->chrom, agp->frag, lf->lineIx, lf->fileName);
        if (!filtering)
            {
            /* pass 1: remember the fragment accession root (suffix after
             * '.' chopped) so pass 2 can skip these objects */
            char *root = cloneString(agp->frag);
            chopSuffixAt(root, '.');
            hashStore(hash, root);
            freeMem(root);
            }
        }
    else  /* gap line */
        {
        lineFileExpectAtLeast(lf, 8, wordCount);
        gap = agpGapLoad(words);
        /* to be consistent with agpFrag */
        gap->chromStart -= 1;
        agp = (struct agpFrag*)gap;  /* shared leading fields make this safe for start/end checks */
        }

    if (agp->chromStart != lastPos)
        errAbort("Start doesn't match previous end line %d of %s\n"
                 "agp->chromStart: %u\n"
                 "agp->chromEnd: %u\n"
                 "lastPos: %u\n"
                 ,lf->lineIx, lf->fileName
                 ,agp->chromStart
                 ,agp->chromEnd
                 ,lastPos
                 );

    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */

    if (words[4][0] != 'N')
        {
        /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
        if (!skipping)
            agpFragOutput(agp, f, '\t', '\n');
        agpFragFree(&agp);
        }
    else
        {
        /* restore back to 1-based for agp
         * because agpGapOutput doesn't compensate */
        gap->chromStart += 1;
        if (!skipping)
            agpGapOutput(gap, f, '\t', '\n');
        agpGapFree(&gap);
        }
    }
/* NOTE(review): lastObj/newChrom are not freed here and lf is not closed --
 * harmless for a short-lived tool but worth confirming. */
carefulClose(&f);
}
void bamTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f)
/* Print out selected fields from BAM. If fields is NULL, then print out all fields.
 * Streams SAM alignments for each selected region, applies any cart-defined
 * column filters and identifier restrictions, and writes the requested
 * columns tab-separated to f (stdout when f is NULL), up to the configured
 * output limit.
 * NOTE(review): despite the doc comment, a NULL fields would be passed
 * straight to chopByChar -- callers appear to always supply a field list;
 * confirm before relying on the NULL case. */
{
struct hTableInfo *hti = NULL;
hti = getHti(db, table, conn);
struct hash *idHash = NULL;       /* identifiers to keep, or NULL for all */
char *idField = getIdField(db, curTrack, table, hti);
int idFieldNum = 0;               /* column index of idField in the SAM row */

/* if we know what field to use for the identifiers, get the hash of names */
if (idField != NULL)
    idHash = identifierHash(db, table);

if (f == NULL)
    f = stdout;

/* Convert comma separated list of fields to array. */
int fieldCount = chopByChar(fields, ',', NULL, 0);
char **fieldArray;
AllocArray(fieldArray, fieldCount);
chopByChar(fields, ',', fieldArray, fieldCount);

/* Get list of all fields in big bed and turn it into a hash of column indexes keyed by
 * column name. */
struct hash *fieldHash = hashNew(0);
struct slName *bb, *bbList = bamGetFields(table);
int i;
for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i)
    {
    /* if we know the field for identifiers, save it away */
    if ((idField != NULL) && sameString(idField, bb->name))
        idFieldNum = i;
    hashAddInt(fieldHash, bb->name, i);
    }

/* Create an array of column indexes corresponding to the selected field list.
 * hashIntVal aborts if a requested field is unknown. */
int *columnArray;
AllocArray(columnArray, fieldCount);
for (i=0; i<fieldCount; ++i)
    {
    columnArray[i] = hashIntVal(fieldHash, fieldArray[i]);
    }

/* Output row of labels */
fprintf(f, "#%s", fieldArray[0]);
for (i=1; i<fieldCount; ++i)
    fprintf(f, "\t%s", fieldArray[i]);
fprintf(f, "\n");

struct asObject *as = bamAsObj();
struct asFilter *filter = NULL;
if (anyFilter())
    {
    filter = asFilterFromCart(cart, db, table, as);
    if (filter)
        {
        fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
        }
    }

/* Loop through outputting each region */
struct region *region, *regionList = getRegions();
int maxOut = bigFileMaxOutput();   /* remaining output-line budget */
for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
    {
    struct lm *lm = lmInit(0);     /* per-region local memory pool */
    char *fileName = bamFileName(table, conn, region->chrom);
    struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom, region->start, region->end, lm);
    char *row[SAMALIGNMENT_NUM_COLS];
    char numBuf[BAM_NUM_BUF_SIZE];
    for (sam = samList; sam != NULL && (maxOut > 0); sam = sam->next)
        {
        samAlignmentToRow(sam, numBuf, row);
        if (asFilterOnRow(filter, row))
            {
            /* if we're looking for identifiers, check if this matches */
            if ((idHash != NULL)&&(hashLookup(idHash, row[idFieldNum]) == NULL))
                continue;
            int i;
            fprintf(f, "%s", row[columnArray[0]]);
            for (i=1; i<fieldCount; ++i)
                fprintf(f, "\t%s", row[columnArray[i]]);
            fprintf(f, "\n");
            maxOut --;
            }
        }
    freeMem(fileName);
    lmCleanup(&lm);
    }

if (maxOut == 0)
    warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());

/* Clean up and exit. */
hashFree(&fieldHash);
freeMem(fieldArray);
freeMem(columnArray);
}
void ffaToFa(char *inFile, char *outDir, char *outTabName) /* convert Greg Schulers .ffa fasta files to our .fa files */ { struct lineFile *in; FILE *out = NULL, *tab; int lineSize; char *line; char ucscName[128]; char path[512]; static char lastPath[512]; int outFileCount = 0; struct hash *uniqClone = newHash(16); struct hash *uniqFrag = newHash(19); boolean ignore = FALSE; makeDir(outDir); errLog = mustOpen("ffaToFa.err", "w"); tab = mustOpen(outTabName, "w"); printf("Converting %s", inFile); fflush(stdout); if (sameString(inFile, "stdin")) in = lineFileStdin(TRUE); else in = lineFileOpen(inFile, TRUE); while (lineFileNext(in, &line, &lineSize)) { if (line[0] == '>') { ignore = FALSE; gsToUcsc(line+1, ucscName); faRecNameToFaFileName(outDir, ucscName, path); if (hashLookup(uniqFrag, ucscName)) { ignore = TRUE; warn("Duplicate %s in %s, ignoring all but first", ucscName, inFile); } else { hashAdd(uniqFrag, ucscName, NULL); } if (!sameString(path, lastPath)) { strcpy(lastPath, path); carefulClose(&out); if (hashLookup(uniqClone, path)) { warn("Duplicate %s in %s ignoring all but first", ucscName, inFile); } else { hashAdd(uniqClone, path, NULL); out = mustOpen(path, "w"); ++outFileCount; if ((outFileCount&7) == 0) { putc('.', stdout); fflush(stdout); } } } if (out != NULL && !ignore) { fprintf(out, ">%s\n", ucscName); fprintf(tab, "%s\t%s\n", ucscName, line+1); } } else { if (out != NULL && !ignore) { fputs(line, out); fputc('\n', out); } } } carefulClose(&out); fclose(tab); lineFileClose(&in); printf("Made %d .fa files in %s\n", outFileCount, outDir); }
void doGenePredNongenomic(struct sqlConnection *conn, int typeIx)
/* Get mrna or protein associated with selected genes.
 * typeIx selects which word of the track's type line names the sequence
 * table (1 = protein, 2 = mrna for non-refGene tracks).  Collects the names
 * of all beds passing the current filters, then scans the sequence table
 * and prints matching records as FASTA. */
{
/* Note this does do the whole genome at once rather than one
 * chromosome at a time, but that's ok because the gene prediction
 * tracks this serves are on the small side. */
char *typeWords[3];
char *table;
struct lm *lm = lmInit(64*1024);
int fieldCount;
struct bed *bed, *bedList = cookedBedsOnRegions(conn, curTable, getRegions(), lm, &fieldCount);
int typeWordCount;

textOpen();

/* Figure out which table to use. */
if (isRefGeneTrack(curTable))
    {
    /* refGene has dedicated helpers rather than a generic name/seq table */
    if (typeIx == 1) /* Protein */
        doRefGeneProteinSequence(conn, bedList);
    else
        doRefGeneMrnaSequence(conn, bedList);
    }
else
    {
    char *dupType = cloneString(findTypeForTable(database, curTrack, curTable, ctLookupName));
    typeWordCount = chopLine(dupType, typeWords);
    if (typeIx >= typeWordCount)
        internalErr();
    table = typeWords[typeIx];
    if (sqlTableExists(conn, table))
        {
        struct sqlResult *sr;
        char **row;
        char query[256];
        struct hash *hash = newHash(18);
        boolean gotResults = FALSE;

        /* Make hash of all id's passing filters. */
        for (bed = bedList; bed != NULL; bed = bed->next)
            hashAdd(hash, bed->name, NULL);

        /* Scan through table, outputting ones that match. */
        sqlSafef(query, sizeof(query), "select name, seq from %s", table);
        sr = sqlGetResult(conn, query);
        while ((row = sqlNextRow(sr)) != NULL)
            {
            if (hashLookup(hash, row[0]))
                {
                hPrintf(">%s\n", row[0]);
                writeSeqWithBreaks(stdout, row[1], strlen(row[1]), 60);
                gotResults = TRUE;
                }
            }
        sqlFreeResult(&sr);
        hashFree(&hash);
        if (!gotResults)
            hPrintf(NO_RESULTS);
        }
    else
        {
        internalErr();
        }
    freez(&dupType);
    }
lmCleanup(&lm);
}
void knownToVisiGene(char *database)
/* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables.
 * Builds hashes mapping gene identifiers (symbol, locusLink, refSeq, genbank, probe)
 * to the best-priority VisiGene image, then for each known gene searches those hashes
 * in priority order and writes name/imageId/geneId rows into a tab file that is
 * loaded into outTable. */
{
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);
struct sqlConnection *hConn = sqlConnect(database);   /* genome database */
struct sqlConnection *iConn = sqlConnect(visiDb);     /* VisiGene image database */
struct sqlResult *sr;
char **row;
/* One image hash per identifier namespace, each keyed by that identifier. */
struct hash *geneImageHash = newHash(18);
struct hash *locusLinkImageHash = newHash(18);
struct hash *refSeqImageHash = newHash(18);
struct hash *genbankImageHash = newHash(18);
struct hash *probeImageHash = newHash(18);
/* knownGene id -> other-namespace id mappings. */
struct hash *knownToLocusLinkHash = newHash(18);
struct hash *knownToRefSeqHash = newHash(18);
struct hash *knownToGeneHash = newHash(18);
struct hash *favorHugoHash = newHash(18);
struct hash *knownToProbeHash = newHash(18);
struct hash *knownToAllProbeHash = newHash(18);
struct genePred *knownList = NULL, *known;
struct hash *dupeHash = newHash(17);  /* suppress duplicate knownGene names */

probesDb  = optionVal("probesDb", database);
struct sqlConnection *probesConn = sqlConnect(probesDb);
vgProbes = sqlTableExists(probesConn,"vgProbes");
vgAllProbes = sqlTableExists(probesConn,"vgAllProbes");

/* Go through and make up hashes of images keyed by various fields. */
sr = sqlGetResult(iConn, "NOSQLINJ select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank"
    ",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id"
    " from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap"
    " where image.imageFile = imageFile.id"
    " and image.id = imageProbe.image"
    " and imageProbe.probe = probe.id"
    " and probe.gene = gene.id"
    " and image.submissionSet=submissionSet.id"
    " and vgPrbMap.probe = probe.id");
while ((row = sqlNextRow(sr)) != NULL)
    {
    int id = sqlUnsigned(row[0]);
    float priority = atof(row[1]);
    int privateUser = sqlSigned(row[7]);
    char vgPrb_Id[256];
    safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]);
    int geneId = sqlUnsigned(row[9]);
    /* Only public images (privateUser == 0) are considered. */
    if (privateUser == 0)
	{
	addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id);
	addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]);
	addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]);
	addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]);
	addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]);
	}
    }
verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d"
    ", genbankImageHash %d probeImageHash %d\n", geneImageHash->elCount, locusLinkImageHash->elCount,
    refSeqImageHash->elCount, genbankImageHash->elCount, probeImageHash->elCount);
sqlFreeResult(&sr);

/* Build up list of known genes. */
sr = sqlGetResult(hConn, "NOSQLINJ select * from knownGene");
while ((row = sqlNextRow(sr)) != NULL)
    {
    /* NOTE(review): this declaration shadows the outer 'known' variable;
     * harmless here since the outer one is only used in the later loop. */
    struct genePred *known = genePredLoad(row);
    if (!hashLookup(dupeHash, known->name))
	{
	hashAdd(dupeHash, known->name, NULL);
	slAddHead(&knownList, known);
	}
    }
slReverse(&knownList);
sqlFreeResult(&sr);
verbose(2, "Got %d known genes\n", slCount(knownList));

/* Build up hashes from knownGene to other things. */
if (vgProbes)
    bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash);
if (vgAllProbes)
    bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash);

foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE);
foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE);
foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE);
foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);
foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);
verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n",
    knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount,
    knownToProbeHash->elCount, knownToAllProbeHash->elCount);

/* Try and find an image for each gene.  Search order below defines the
 * preference: locusLink, refSeq, genbank (direct), symbol, then probes. */
for (known = knownList; known != NULL; known = known->next)
    {
    char *name = known->name;
    struct prioritizedImage *best = NULL;
    {
    best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash);
    if (!best)
	best = bestImage(name, knownToRefSeqHash, refSeqImageHash);
    if (!best)
	{
	best = hashFindVal(genbankImageHash, name);
	}
    if (!best)
	best = bestImage(name, knownToGeneHash, geneImageHash);
    if (vgProbes && !best)
	best = bestImage(name, knownToProbeHash, probeImageHash);
    if (vgAllProbes && !best)
	best = bestImage(name, knownToAllProbeHash, probeImageHash);
    }
    if (best)
	{
	fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId);
	}
    }
createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}
void checkMetaFiles(struct mdbObj *mdbObj, char *downDir, struct hash *allNames) { verbose(1, "----------------------------------------------\n"); verbose(1, "Checking that files specified in metaDb exist in download dir\n"); verbose(1, "----------------------------------------------\n"); for(; mdbObj != NULL; mdbObj=mdbObj->next) { struct mdbVar *mdbVar = hashFindVal(mdbObj->varHash, "objType"); if (mdbVar == NULL) { warn("objType not found in object %s", mdbObj->obj); continue; } if (sameString(mdbVar->val, "composite")) // skip objType composite continue; mdbObj->deleteThis = FALSE; mdbVar = hashFindVal(mdbObj->varHash, "composite"); if (mdbVar == NULL) { warn("composite not found in object %s", mdbObj->obj); continue; } // char *composite = mdbVar->val; mdbVar = hashFindVal(mdbObj->varHash, "fileName"); if (mdbVar == NULL) { mdbObj->deleteThis = TRUE; warn("fileName not found in object %s", mdbObj->obj); continue; } char *fileName = mdbVar->val; char buffer[10 * 1024]; struct hash *bamNames = hashNew(8); struct slName *list = slNameListFromString(fileName, ','), *el; for(el=list; el; el=el->next) { if (hashLookup(allNames, el->name)) { warn("duplicate fileName entry: %s", el->name); } else { hashAdd(allNames, el->name, NULL); } if (endsWith(el->name,".bam")) { hashAdd(bamNames, el->name, NULL); } if (endsWith(el->name,".bam.bai")) { el->name[strlen(el->name)-4] = 0; struct hashEl *hel = hashLookup(bamNames, el->name); el->name[strlen(el->name)] = '.'; if (hel == NULL) { warn(".bam.bai without corresponding .bam: %s", el->name); } else { hel->val = (void *)1; } } } // see if we have to add any .bam.bai to the list for(el=list; el; el=el->next) { if (endsWith(el->name,".bam")) { struct hashEl *hel = hashLookup(bamNames, el->name); if (hel->val == NULL) { // we have to add a .bam.bai to the list char *bambai = addSuffix(el->name, ".bai"); warn(".bam.bai not found for corresponding .bam in meta.fileName: %s", el->name); slNameAddTail(&list, bambai); if 
(hashLookup(allNames, bambai)) { warn("duplicate fileName entry: %s", bambai); } else hashAdd(allNames, bambai, NULL); } } } // make sure the files are there for(el=list; el; el=el->next) { if (!startsWith(mdbObj->obj, el->name)) { warn("fileName %s does not start with object name %s", el->name, mdbObj->obj); } safef(buffer, sizeof buffer, "%s/%s", downDir, el->name); verbose(2, "checking for fileExists %s\n", buffer); if (!fileExists(buffer)) { mdbObj->deleteThis = TRUE; warn("metaDb file %s not found in download dir %s",buffer, downDir); } } } }
void submitRefToFiles(struct sqlConnection *conn, struct sqlConnection *conn2, struct sqlConnection *connSp, char *ref, char *fileRoot, char *inJax) /* Create a .ra and a .tab file for given reference. */ { /* Initially the tab file will have some duplicate lines, so * write to temp file, and then filter. */ char raName[PATH_LEN], tabName[PATH_LEN], capName[PATH_LEN]; FILE *ra = NULL, *tab = NULL, *cap = NULL; struct dyString *query = dyStringNew(0); struct sqlResult *sr; char **row; char *pubMed; struct slName *list, *el; boolean gotAny = FALSE; struct hash *uniqImageHash = newHash(0); struct hash *captionHash = newHash(0); int imageWidth = 0, imageHeight = 0; char path[PATH_LEN]; struct dyString *caption = dyStringNew(0); struct dyString *copyright = dyStringNew(0); struct dyString *probeNotes = dyStringNew(0); boolean lookedForCopyright = FALSE; safef(raName, sizeof(raName), "%s.ra", fileRoot); safef(tabName, sizeof(tabName), "%s.tab", fileRoot); safef(capName, sizeof(capName), "%s.txt", fileRoot); tab = mustOpen(tabName, "w"); cap = mustOpen(capName, "w"); sqlDyStringPrintf(query, "select authors,journal,title,year from BIB_Refs where "); sqlDyStringPrintf(query, "_Refs_key = '%s'", ref); sr = sqlGetResultVerbose(conn, query->string); row = sqlNextRow(sr); if (row == NULL) errAbort("Can't find _Refs_key %s in BIB_Refs", ref); /* Make ra file with stuff common to whole submission set. */ ra = mustOpen(raName, "w"); fprintf(ra, "submissionSource MGI\n"); fprintf(ra, "acknowledgement Thanks to the Gene Expression Database group at " "Mouse Genome Informatics (MGI) for collecting, annotating and sharing " "this image. The MGI images were last updated in VisiGene on March 28, 2006. 
" "Additional and more up to date annotations and images may be available " "directly at <A HREF='http://www.informatics.jax.org' target='_blank'>MGI.</A>\n"); fprintf(ra, "submitSet jax%s\n", ref); fprintf(ra, "taxon 10090\n"); /* Mus musculus taxon */ fprintf(ra, "fullDir http://hgwdev.gi.ucsc.edu/visiGene/full/inSitu/Mouse/jax\n"); fprintf(ra, "thumbDir http://hgwdev.gi.ucsc.edu/visiGene/200/inSitu/Mouse/jax\n"); fprintf(ra, "setUrl http://www.informatics.jax.org/\n"); fprintf(ra, "itemUrl http://www.informatics.jax.org/searches/image.cgi?%%s\n"); fprintf(ra, "abUrl http://www.informatics.jax.org/searches/antibody.cgi?%%s\n"); fprintf(ra, "journal %s\n", row[1]); fprintf(ra, "publication %s\n", row[2]); fprintf(ra, "year %s\n", row[3]); /* The contributor (author) list is in format Kent WJ; Haussler DH; format in * Jackson. We convert it to Kent W.J.,Haussler D.H., format for visiGene. */ fprintf(ra, "contributor "); list = charSepToSlNames(row[0], ';'); for (el = list; el != NULL; el = el->next) { char *lastName = skipLeadingSpaces(el->name); char *initials = strrchr(lastName, ' '); if (initials == NULL) initials = ""; else *initials++ = 0; fprintf(ra, "%s", lastName); if (initials[0] != 0) { char c; fprintf(ra, " "); while ((c = *initials++) != 0) fprintf(ra, "%c.", c); } fprintf(ra, ","); } fprintf(ra, "\n"); slNameFreeList(&list); sqlFreeResult(&sr); /* Add in link to PubMed record on publication. 
*/ dyStringClear(query); sqlDyStringPrintf(query, "select ACC_Accession.accID from ACC_Accession,ACC_LogicalDB " "where ACC_Accession._Object_key = %s " "and ACC_Accession._LogicalDB_key = ACC_LogicalDB._LogicalDB_key " "and ACC_LogicalDB.name = 'PubMed'", ref); pubMed = sqlQuickStringVerbose(conn, query->string); if (pubMed != NULL) fprintf(ra, "pubUrl https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=%s\n", pubMed); freez(&pubMed); dyStringClear(query); sqlDyStringPrintf(query, "select distinct MRK_Marker.symbol as gene," "GXD_Specimen.sex as sex," "GXD_Specimen.age as age," "GXD_Specimen.ageMin as ageMin," "GXD_Specimen.ageMax as ageMax," "IMG_ImagePane.paneLabel as paneLabel," "ACC_Accession.numericPart as fileKey," "IMG_Image._Image_key as imageKey," "GXD_Assay._ProbePrep_key as probePrepKey," "GXD_Assay._AntibodyPrep_key as antibodyPrepKey," "GXD_Assay._ReporterGene_key as reporterGeneKey," "GXD_FixationMethod.fixation as fixation," "GXD_EmbeddingMethod.embeddingMethod as embedding," "GXD_Assay._Assay_key as assayKey," "GXD_Specimen.hybridization as sliceType," "GXD_Specimen._Genotype_key as genotypeKey," "IMG_ImagePane._ImagePane_key as imagePaneKey\n" "from MRK_Marker," "GXD_Assay," "GXD_Specimen," "GXD_InSituResult," "GXD_InSituResultImage," "GXD_FixationMethod," "GXD_EmbeddingMethod," "IMG_ImagePane," "IMG_Image," "ACC_Accession\n" "where MRK_Marker._Marker_key = GXD_Assay._Marker_key " "and GXD_Assay._Assay_key = GXD_Specimen._Assay_key " "and GXD_Specimen._Specimen_key = GXD_InSituResult._Specimen_key " "and GXD_InSituResult._Result_key = GXD_InSituResultImage._Result_key " "and GXD_InSituResultImage._ImagePane_key = IMG_ImagePane._ImagePane_key " "and GXD_FixationMethod._Fixation_key = GXD_Specimen._Fixation_key " "and GXD_EmbeddingMethod._Embedding_key = GXD_Specimen._Embedding_key " "and IMG_ImagePane._Image_key = IMG_Image._Image_key " "and IMG_Image._Image_key = ACC_Accession._Object_key " "and 
ACC_Accession.prefixPart = 'PIX:' " "and GXD_Assay._ImagePane_key is NULL " ); sqlDyStringPrintf(query, "and GXD_Assay._Refs_key = '%s'", ref); sr = sqlGetResultVerbose(conn, query->string); fprintf(tab, "#"); fprintf(tab, "gene\t"); fprintf(tab, "probeColor\t"); fprintf(tab, "sex\t"); fprintf(tab, "age\t"); fprintf(tab, "ageMin\t"); fprintf(tab, "ageMax\t"); fprintf(tab, "paneLabel\t"); fprintf(tab, "fileName\t"); fprintf(tab, "submitId\t"); fprintf(tab, "fPrimer\t"); fprintf(tab, "rPrimer\t"); fprintf(tab, "abName\t"); fprintf(tab, "abTaxon\t"); fprintf(tab, "abSubmitId\t"); fprintf(tab, "fixation\t"); fprintf(tab, "embedding\t"); fprintf(tab, "bodyPart\t"); fprintf(tab, "sliceType\t"); fprintf(tab, "genotype\t"); fprintf(tab, "strain\t"); fprintf(tab, "priority\t"); fprintf(tab, "captionId\t"); fprintf(tab, "imageWidth\t"); fprintf(tab, "imageHeight\n"); while ((row = sqlNextRow(sr)) != NULL) { char *gene = row[0]; char *sex = row[1]; char *age = row[2]; char *ageMin = row[3]; char *ageMax = row[4]; char *paneLabel = row[5]; char *fileKey = row[6]; char *imageKey = row[7]; char *probePrepKey = row[8]; char *antibodyPrepKey = row[9]; char *reporterGeneKey = row[10]; char *fixation = row[11]; char *embedding = row[12]; char *assayKey = row[13]; char *sliceType = row[14]; char *genotypeKey = row[15]; char *imagePaneKey = row[16]; double calcAge = -1; char *probeColor = ""; char *bodyPart = ""; char *abName = NULL; char *rPrimer = NULL, *fPrimer = NULL; char *genotype = NULL; char *strain = NULL; char *priority = NULL; char abTaxon[32]; char *captionId = ""; char *abSubmitId = NULL; verbose(3, " "); dumpRow(row, 16); if (age == NULL) continue; if (!lookedForCopyright) { struct sqlResult *sr = NULL; char **row; lookedForCopyright = TRUE; dyStringClear(query); sqlDyStringPrintf(query, "select note from MGI_NoteChunk,MGI_Note,MGI_NoteType,ACC_MGIType " "where MGI_Note._Object_key = %s " "and ACC_MGIType.name = 'Image' " "and ACC_MGIType._MGIType_key = 
MGI_Note._MGIType_key " "and MGI_NoteType.noteType='Copyright' " "and MGI_Note._NoteType_key = MGI_NoteType._NoteType_key " "and MGI_Note._Note_key = MGI_NoteChunk._Note_key " "order by sequenceNum" , imageKey); sr = sqlGetResultVerbose(conn2, query->string); while ((row = sqlNextRow(sr)) != NULL) dyStringAppend(copyright, row[0]); sqlFreeResult(&sr); verbose(2,"imageKey=%s\n",imageKey); if (copyright->stringSize != 0) { fprintf(ra, "copyright %s\n", copyright->string); } } /* Massage sex */ { if (sameString(sex, "Male")) sex = "male"; else if (sameString(sex, "Female")) sex = "female"; else sex = ""; } /* Massage age */ { char *embryoPat = "embryonic day "; char *newbornPat = "postnatal newborn"; char *dayPat = "postnatal day "; char *weekPat = "postnatal week "; char *adultPat = "postnatal adult"; double calcMinAge = atof(ageMin); double calcMaxAge = atof(ageMax); double mouseBirthAge = 21.0; //double mouseAdultAge = 63.0; /* Relative to conception, not birth */ if (age[0] == 0) { warn("age null, ageMin %s, ageMax %s\n", ageMin, ageMax); calcAge = (calcMinAge + calcMaxAge) * 0.5; } else if (startsWith(embryoPat, age)) calcAge = atof(age+strlen(embryoPat)); else if (sameString(newbornPat, age)) calcAge = mouseBirthAge; else if (startsWith(dayPat, age)) calcAge = atof(age+strlen(dayPat)) + mouseBirthAge; else if (startsWith(weekPat, age)) calcAge = 7.0 * atof(age+strlen(weekPat)) + mouseBirthAge; else if (sameString(adultPat, age) && calcMaxAge - calcMinAge > 1000 && calcMinAge < 365) calcAge = 365; /* Most adult mice are relatively young */ else { warn("Calculating age from %s", age); calcAge = (calcMinAge + calcMaxAge) * 0.5; } if (calcAge < calcMinAge) calcAge = calcMinAge; if (calcAge > calcMaxAge) calcAge = calcMaxAge; } /* Massage probeColor */ { if (!isStrNull(reporterGeneKey)) { /* Fixme: make sure that reporterGene's end up in probeType table. 
*/ char *name = NULL; dyStringClear(query); sqlDyStringPrintf(query, "select term from VOC_Term where _Term_key = %s", reporterGeneKey); name = sqlQuickStringVerbose(conn2, query->string); if (name == NULL) warn("Can't find _ReporterGene_key %s in VOC_Term", reporterGeneKey); else if (sameString(name, "GFP")) probeColor = "green"; else if (sameString(name, "lacZ")) probeColor = "blue"; else warn("Don't know color of reporter gene %s", name); freez(&name); } if (!isStrNull(probePrepKey)) { char *name = NULL; dyStringClear(query); sqlDyStringPrintf(query, "select GXD_VisualizationMethod.visualization " "from GXD_VisualizationMethod,GXD_ProbePrep " "where GXD_ProbePrep._ProbePrep_key = %s " "and GXD_ProbePrep._Visualization_key = GXD_VisualizationMethod._Visualization_key" , probePrepKey); name = sqlQuickStringVerbose(conn2, query->string); if (name == NULL) warn("Can't find visualization from _ProbePrep_key %s", probePrepKey); probeColor = colorFromLabel(name, gene); freez(&name); if (probeColor[0] == 0) { dyStringClear(query); sqlDyStringPrintf(query, "select GXD_Label.label from GXD_Label,GXD_ProbePrep " "where GXD_ProbePrep._ProbePrep_key = %s " "and GXD_ProbePrep._Label_key = GXD_Label._Label_key" , probePrepKey); name = sqlQuickStringVerbose(conn2, query->string); if (name == NULL) warn("Can't find label from _ProbePrep_key %s", probePrepKey); probeColor = colorFromLabel(name, gene); } freez(&name); } if (!isStrNull(antibodyPrepKey) && probeColor[0] == 0 ) { char *name = NULL; dyStringClear(query); sqlDyStringPrintf(query, "select GXD_Label.label from GXD_Label,GXD_AntibodyPrep " "where GXD_AntibodyPrep._AntibodyPrep_key = %s " "and GXD_AntibodyPrep._Label_key = GXD_Label._Label_key" , antibodyPrepKey); name = sqlQuickStringVerbose(conn2, query->string); if (name == NULL) warn("Can't find label from _AntibodyPrep_key %s", antibodyPrepKey); probeColor = colorFromLabel(name, gene); freez(&name); } } /* Get abName, abTaxon, abSubmitId */ abTaxon[0] = 0; if 
(!isStrNull(antibodyPrepKey)) { struct sqlResult *sr = NULL; int orgKey = 0; char **row; dyStringClear(query); sqlDyStringPrintf(query, "select antibodyName,_Organism_key,GXD_Antibody._Antibody_key " "from GXD_AntibodyPrep,GXD_Antibody " "where GXD_AntibodyPrep._AntibodyPrep_key = %s " "and GXD_AntibodyPrep._Antibody_key = GXD_Antibody._Antibody_key" , antibodyPrepKey); sr = sqlGetResultVerbose(conn2, query->string); row = sqlNextRow(sr); if (row != NULL) { abName = cloneString(row[0]); orgKey = atoi(row[1]); abSubmitId = cloneString(row[2]); } sqlFreeResult(&sr); if (orgKey > 0) { char *latinName = NULL, *commonName = NULL; int spTaxon = 0; dyStringClear(query); sqlDyStringPrintf(query, "select latinName from MGI_Organism " "where _Organism_key = %d", orgKey); latinName = sqlQuickStringVerbose(conn2, query->string); if (latinName != NULL && !sameString(latinName, "Not Specified") && !sameString(latinName, "Not Applicable")) { char *e = strchr(latinName, '/'); if (e != NULL) *e = 0; /* Chop off / and after. 
*/ spTaxon = spBinomialToTaxon(connSp, latinName); } else { dyStringClear(query); sqlDyStringPrintf(query, "select commonName from MGI_Organism " "where _Organism_key = %d", orgKey); commonName = sqlQuickStringVerbose(conn2, query->string); if (commonName != NULL && !sameString(commonName, "Not Applicable") && !sameString(commonName, "Not Specified")) { spTaxon = spCommonToTaxon(connSp, commonName); } } if (spTaxon != 0) safef(abTaxon, sizeof(abTaxon), "%d", spTaxon); freez(&latinName); freez(&commonName); } } if (abName == NULL) abName = cloneString(""); if (abSubmitId == NULL) abSubmitId = cloneString(""); /* Get rPrimer, lPrimer */ if (!isStrNull(probePrepKey)) { struct sqlResult *sr = NULL; char **row; dyStringClear(query); sqlDyStringPrintf(query, "select primer1sequence,primer2sequence " "from PRB_Probe,GXD_ProbePrep " "where PRB_Probe._Probe_key = GXD_ProbePrep._Probe_key " "and GXD_ProbePrep._ProbePrep_key = %s" , probePrepKey); sr = sqlGetResultVerbose(conn2, query->string); row = sqlNextRow(sr); if (row != NULL) { fPrimer = cloneString(row[0]); rPrimer = cloneString(row[1]); } sqlFreeResult(&sr); } /* Note Jackson database actually stores the primers very * erratically. In all the cases I can find for in situs * the primers are actually stored in free text in the PRB_Notes * e.g. ... primers CGCGGATCCAGGGGAAACAGAAGGGCTGCG and CCCAAGCTTAGACTGTACAGGCTGAGCC ... 
*/ if (fPrimer == NULL || fPrimer[0]==0) { struct sqlResult *sr = NULL; char **row; dyStringClear(query); sqlDyStringPrintf(query, "select PRB_Notes.note from GXD_ProbePrep, PRB_Notes" " where GXD_ProbePrep._ProbePrep_key = %s" " and GXD_ProbePrep._Probe_key = PRB_Notes._Probe_key" " order by PRB_Notes.sequenceNum" , probePrepKey); sr = sqlGetResultVerbose(conn2, query->string); dyStringClear(probeNotes); while ((row = sqlNextRow(sr)) != NULL) dyStringAppend(probeNotes, row[0]); sqlFreeResult(&sr); if (probeNotes->stringSize > 0) { char f[256]; char r[256]; int i = 0; char *s = strstr(probeNotes->string," primers "); if (s) { s += strlen(" primers "); i = 0; while (strchr("ACGT",*s) && (i<sizeof(f))) f[i++] = *s++; f[i]=0; if (strstr(s," and ")==s) { s += strlen(" and "); i = 0; while (strchr("ACGT",*s) && (i<sizeof(r))) r[i++] = *s++; r[i]=0; if (strlen(f) >= 10 && strlen(r) >= 10) { fPrimer = cloneString(f); rPrimer = cloneString(r); } else { verbose(1, "bad primer parse:_ProbePrep_key=%s fPrimer=[%s], rPrimer=[%s]\n", probePrepKey,f,r); } } } } } if (fPrimer == NULL) fPrimer = cloneString(""); if (rPrimer == NULL) rPrimer = cloneString(""); fixation = blankOutUnknown(fixation); embedding = blankOutUnknown(embedding); /* Massage body part and slice type. We only handle whole mounts. */ if (sameString(sliceType, "whole mount")) { bodyPart = "whole"; priority = "100"; } else { sliceType = ""; priority = "1000"; } genotypeAndStrainFromKey(genotypeKey, conn2, &genotype, &strain); if (isStrNull(paneLabel)) paneLabel = cloneString(""); /* trying to suppress nulls in output */ stripChar(paneLabel, '"'); /* Get rid of a difficult quote to process. */ /* Fetch image dimensions from file. */ imageWidth=0; imageHeight=0; safef(path, sizeof(path), "%s/%s.jpg", inJax, fileKey); if (fileExists(path)) jpegSize(path,&imageWidth,&imageHeight); /* will errAbort if no valid .jpeg exists */ else warn("Picture Missing! %s ",path); /* Deal caption if any. 
Most of the work only happens the * first time see the image. */ if (!hashLookup(uniqImageHash, imageKey)) { struct sqlResult *sr = NULL; char **row; hashAdd(uniqImageHash, imageKey, NULL); dyStringClear(caption); dyStringClear(query); sqlDyStringPrintf(query, "select note from MGI_NoteChunk,MGI_Note,MGI_NoteType,ACC_MGIType " "where MGI_Note._Object_key = %s " "and ACC_MGIType.name = 'Image' " "and ACC_MGIType._MGIType_key = MGI_Note._MGIType_key " "and MGI_NoteType.noteType='Caption' " "and MGI_Note._NoteType_key = MGI_NoteType._NoteType_key " "and MGI_Note._Note_key = MGI_NoteChunk._Note_key " "order by sequenceNum" , imageKey); sr = sqlGetResultVerbose(conn2, query->string); while ((row = sqlNextRow(sr)) != NULL) dyStringAppend(caption, row[0]); sqlFreeResult(&sr); if (caption->stringSize > 0) { subChar(caption->string, '\t', ' '); subChar(caption->string, '\n', ' '); fprintf(cap, "%s\t%s\n", imageKey, caption->string); hashAdd(captionHash, imageKey, imageKey); } } if (hashLookup(captionHash, imageKey)) captionId = imageKey; else captionId = ""; fprintf(tab, "%s\t", gene); fprintf(tab, "%s\t", probeColor); fprintf(tab, "%s\t", sex); fprintf(tab, "%3.2f\t", calcAge); fprintf(tab, "%s\t", ageMin); fprintf(tab, "%s\t", ageMax); fprintf(tab, "%s\t", paneLabel); /* may have to change NULL to empty string or "0" ? 
*/ fprintf(tab, "%s.jpg\t", fileKey); fprintf(tab, "%s\t", imageKey); fprintf(tab, "%s\t", fPrimer); fprintf(tab, "%s\t", rPrimer); fprintf(tab, "%s\t", abName); fprintf(tab, "%s\t", abTaxon); fprintf(tab, "%s\t", abSubmitId); fprintf(tab, "%s\t", fixation); fprintf(tab, "%s\t", embedding); fprintf(tab, "%s\t", bodyPart); fprintf(tab, "%s\t", sliceType); fprintf(tab, "%s\t", genotype); fprintf(tab, "%s\t", strain); fprintf(tab, "%s\t", priority); fprintf(tab, "%s\t", captionId); fprintf(tab, "%d\t", imageWidth); fprintf(tab, "%d\n", imageHeight); printExpression(tab, conn2, imagePaneKey, assayKey); gotAny = TRUE; freez(&genotype); freez(&abName); freez(&abSubmitId); freez(&rPrimer); freez(&fPrimer); } sqlFreeResult(&sr); carefulClose(&ra); carefulClose(&tab); carefulClose(&cap); if (!gotAny) { remove(raName); remove(capName); remove(tabName); } dyStringFree(&probeNotes); dyStringFree(©right); dyStringFree(&caption); dyStringFree(&query); hashFree(&uniqImageHash); hashFree(&captionHash); }
void checkMetaTrackDb(struct mdbObj *mdbObj, struct hash *trackHash) { verbose(1, "----------------------------------------------\n"); verbose(1, "Checking that tables specified in metaDb exist in trackDb\n"); verbose(1, "----------------------------------------------\n"); for(; mdbObj != NULL; mdbObj=mdbObj->next) { struct mdbVar *mdbVar = hashFindVal(mdbObj->varHash, "objType"); if (mdbVar == NULL) { warn("objType not found in object %s", mdbObj->obj); continue; } if (differentString(mdbVar->val, "table")) continue; if (mdbObj->deleteThis) continue; mdbVar = hashFindVal(mdbObj->varHash, "tableName"); if (mdbVar == NULL) { warn("tableName not found in object %s", mdbObj->obj); continue; } char *tableName = mdbVar->val; struct mdbVar *atticVar = hashFindVal(mdbObj->varHash, "attic"); struct mdbVar *statusVar = hashFindVal(mdbObj->varHash, "objStatus"); char *reason = NULL; if (atticVar) reason = "attic"; if (statusVar) { if (startsWith("renamed", statusVar->val)) reason = "renamed"; if (startsWith("replaced", statusVar->val)) reason = "replaced"; if (startsWith("revoked", statusVar->val)) reason = "revoked"; } if (hashLookup(trackHash, tableName)) { if (reason) { warn("%s table %s: should NOT be found in trackDb", reason, tableName); continue; } } else { if (reason) { // ok because attic, replaced, revoked, renamed should not be in trackDb continue; } warn("table %s: not found in trackDb",tableName); } } }
void trimUniq(bioSeq *seqList)
/* Check that all seq's in list have a unique name.  Try and
 * abbreviate longer sequence names.
 * Fixes: removed a dead null-check inside the character loop (the loop
 * condition already guarantees *c != '\0'), freed nameClone on the
 * wordCount <= 1 path where it previously leaked, and cast to
 * unsigned char before isalnum (char may be signed). */
{
struct hash *hash = newHash(0);
bioSeq *seq;

for (seq = seqList; seq != NULL; seq = seq->next)
    {
    char *saferString = needMem(strlen(seq->name)+1);
    char *c, *s;

    /* Some chars are safe to allow through, other chars cause
     * problems.  It isn't necessarily a URL safe string that is
     * being calculated here.  The original problem was a user had
     * the fasta header line of:
     * chr8|59823648:59825047|+
     * The plus sign was being taken as the query name and this
     * created problems as that name was passed on to hgc via
     * the ss cart variable.  The + sign became part of a URL
     * eventually.  This loop allows only isalnum and =_/.:;_|
     * to get through as part of the header name.  These characters
     * all proved to be safe as single character names, or all
     * together. */
    s = saferString;
    for (c = seq->name; *c != '\0'; ++c)
	{
	if (isalnum((unsigned char)*c) || (*c == '=') || (*c == '-') || (*c == '/') ||
	    (*c == '.') || (*c == ':') || (*c == ';') || (*c == '_') || (*c == '|'))
	    *s++ = *c;
	}
    *s = '\0';
    freeMem(seq->name);
    if (*saferString == '\0')
	{
	/* Nothing survived filtering; fall back to a generic name. */
	freeMem(saferString);
	saferString = cloneString("YourSeq");
	}
    seq->name = saferString;

    if (strlen(seq->name) > 14)	/* Try and get rid of long NCBI .fa cruft. */
	{
	char *nameClone = NULL;
	char *abbrv = NULL;
	char *words[32];
	int wordCount;
	boolean isEns = (stringIn("ENSEMBL:", seq->name) != NULL);

	nameClone = cloneString(seq->name);
	wordCount = chopString(nameClone, "|", words, ArraySize(words));
	if (wordCount > 1)	/* Looks like it's an Ensembl/NCBI long name alright. */
	    {
	    if (isEns)
		{
		abbrv = words[0];
		if (abbrv[0] == 0)
		    abbrv = words[1];
		}
	    else if (sameString(words[1], "dbSNP"))
		{
		if (wordCount > 2)
		    abbrv = words[2];
		else
		    abbrv = nameClone;
		}
	    else
		{
		abbrv = words[wordCount-1];
		if (abbrv[0] == 0)
		    abbrv = words[wordCount-2];
		}
	    if (hashLookup(hash, abbrv) == NULL)
		{
		freeMem(seq->name);
		seq->name = cloneString(abbrv);
		}
	    }
	freez(&nameClone);	/* fix: was inside the wordCount>1 branch, leaking otherwise */
	}
    hashAddUnique(hash, seq->name, hash);	/* errAborts on a remaining duplicate */
    }
freeHash(&hash);
}
boolean outputProtein(struct cdsEvidence *cds, struct dnaSeq *txSeq, FILE *f) /* Translate txSeq to protein guided by cds, and output to file. * The implementation is a little complicated by checking for internal * stop codons and other error conditions. Return True if a sequence was * generated and was not impacted by error conditions, False otherwise */ { boolean selenocysteine = FALSE; if (selenocysteineHash != NULL) { if (hashLookup(selenocysteineHash, txSeq->name)) selenocysteine = TRUE; } struct dyString *dy = dyStringNew(4*1024); int blockIx; for (blockIx=0; blockIx<cds->cdsCount; ++blockIx) { DNA *dna = txSeq->dna + cds->cdsStarts[blockIx]; int rnaSize = cds->cdsSizes[blockIx]; if (rnaSize%3 != 0) { warn("size of block (%d) #%d not multiple of 3 in %s (source %s %s)", rnaSize, blockIx, cds->name, cds->source, cds->accession); return FALSE; } int aaSize = rnaSize/3; int i; for (i=0; i<aaSize; ++i) { AA aa = lookupCodon(dna); if (aa == 0) { aa = '*'; if (selenocysteine) { if (!isReallyStopCodon(dna, TRUE)) aa = 'U'; } } dyStringAppendC(dy, aa); dna += 3; } } int lastCharIx = dy->stringSize-1; if (dy->string[lastCharIx] == '*') { dy->string[lastCharIx] = 0; dy->stringSize = lastCharIx; } char *prematureStop = strchr(dy->string, '*'); if (prematureStop != NULL) { warn("Stop codons in CDS at position %d for %s, (source %s %s)", (int)(prematureStop - dy->string), cds->name, cds->source, cds->accession); return(FALSE); } else { faWriteNext(f, cds->name, dy->string, dy->stringSize); dyStringFree(&dy); return(TRUE); } }
static void getRepeatsUnsplit(struct sqlConnection *conn, struct hash *chromHash, struct hash *arHash) /* Return a tree of ranges for sequence gaps all chromosomes, * assuming an unsplit table -- when the table is unsplit, it's * probably for a scaffold assembly where we *really* don't want * to do one query per scaffold! */ { struct sqlResult *sr; char **row; struct rbTreeNode **stack = lmAlloc(qLm, 256 * sizeof(stack[0])); struct rbTree *allTree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack); struct rbTreeNode **newstack = lmAlloc(qLm, 256 * sizeof(newstack[0])); struct rbTree *newTree = rbTreeNewDetailed(simpleRangeCmp, qLm, newstack); char *prevChrom = NULL; struct simpleRange *prevRange = NULL, *prevNewRange = NULL; sr = sqlGetResult(conn, "NOSQLINJ select genoName,genoStart,genoEnd,repName,repClass,repFamily from rmsk " "order by genoName,genoStart"); while ((row = sqlNextRow(sr)) != NULL) { struct simpleRange *range; char arKey[512]; if (prevChrom == NULL) prevChrom = cloneString(row[0]); else if (! sameString(prevChrom, row[0])) { rbTreeAdd(allTree, prevRange); if (prevNewRange != NULL) rbTreeAdd(newTree, prevNewRange); setRepeats(prevChrom, chromHash, allTree, newTree); freeMem(prevChrom); prevRange = prevNewRange = NULL; stack = lmAlloc(qLm, 256 * sizeof(stack[0])); allTree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack); if (arHash != NULL) { stack = lmAlloc(qLm, 256 * sizeof(stack[0])); newTree = rbTreeNewDetailed(simpleRangeCmp, qLm, stack); } prevChrom = cloneString(row[0]); } lmAllocVar(allTree->lm, range); range->start = sqlUnsigned(row[1]); range->end = sqlUnsigned(row[2]); if (prevRange == NULL) prevRange = range; else if (overlap(range, prevRange)) { /* merge r into prevR & discard; prevR gets passed forward. 
*/ if (range->end > prevRange->end) prevRange->end = range->end; if (range->start < prevRange->start) prevRange->start = range->start; } else { rbTreeAdd(allTree, prevRange); prevRange = range; } sprintf(arKey, "%s.%s.%s", row[3], row[4], row[5]); if (arHash != NULL && hashLookup(arHash, arKey)) { lmAllocVar(newTree->lm, range); range->start = sqlUnsigned(row[1]); range->end = sqlUnsigned(row[2]); if (prevNewRange == NULL) prevNewRange = range; else if (overlap(range, prevNewRange)) { /* merge r into prevR & discard; prevR gets passed forward. */ if (range->end > prevNewRange->end) prevNewRange->end = range->end; if (range->start < prevNewRange->start) prevNewRange->start = range->start; } else { rbTreeAdd(newTree, prevNewRange); prevNewRange = range; } } } if (prevChrom != NULL) { rbTreeAdd(allTree, prevRange); if (prevNewRange != NULL) rbTreeAdd(newTree, prevNewRange); setRepeats(prevChrom, chromHash, allTree, newTree); freeMem(prevChrom); } sqlFreeResult(&sr); }
static struct jsonWrite *rTdbToJw(struct trackDb *tdb, struct hash *fieldHash, struct hash *excludeTypesHash, int depth, int maxDepth)
/* Recursively build and return a new jsonWrite object with JSON for tdb and its children,
 * or NULL if tdb or all children have been filtered out by excludeTypesHash.
 * If excludeTypesHash is non-NULL, omit any tracks/views/subtracks with type in excludeTypesHash.
 * If fieldHash is non-NULL, include only the field names indexed in fieldHash.
 * maxDepth < 0 means unlimited recursion depth. */
{
if (maxDepth >= 0 && depth > maxDepth)
    return NULL;
boolean doSubtracks = (tdb->subtracks && fieldOk("subtracks", fieldHash));
// If excludeTypesHash is given and tdb is a leaf track/subtrack, look up the first word
// of tdb->type in excludeTypesHash; if found, return NULL.
if (excludeTypesHash && !doSubtracks)
    {
    char typeCopy[PATH_LEN];
    safecpy(typeCopy, sizeof(typeCopy), tdb->type);
    if (hashLookup(excludeTypesHash, firstWordInLine(typeCopy)))
        return NULL;
    }
// A leaf always counts as content; a container only counts if at least one
// child survives filtering (gotSomething flips TRUE below when one does).
boolean gotSomething = !doSubtracks;
struct jsonWrite *jwNew = jsonWriteNew();
jsonWriteObjectStart(jwNew, NULL);
writeTdbSimple(jwNew, tdb, fieldHash);
if (tdb->parent && fieldOk("parent", fieldHash))
    {
    // We can't link to an object in JSON and better not recurse here or else infinite loop.
    if (tdbIsSuperTrackChild(tdb))
        {
        // Supertracks have been omitted from fullTrackList, so add the supertrack object's
        // non-parent/child info here.
        jsonWriteObjectStart(jwNew, "parent");
        writeTdbSimple(jwNew, tdb->parent, fieldHash);
        jsonWriteObjectEnd(jwNew);
        }
    else
        // Just the name so we don't have infinite loops.
        jsonWriteString(jwNew, "parent", tdb->parent->track);
    }
if (doSubtracks)
    {
    jsonWriteListStart(jwNew, "subtracks");
    slSort(&tdb->subtracks, trackDbViewCmp);
    struct trackDb *subTdb;
    for (subTdb = tdb->subtracks; subTdb != NULL; subTdb = subTdb->next)
        {
        struct jsonWrite *jwSub = rTdbToJw(subTdb, fieldHash, excludeTypesHash, depth+1, maxDepth);
        if (jwSub)
            {
            gotSomething = TRUE;
            jsonWriteAppend(jwNew, NULL, jwSub);
            jsonWriteFree(&jwSub);
            }
        }
    jsonWriteListEnd(jwNew);
    }
jsonWriteObjectEnd(jwNew);
if (! gotSomething)
    // All children were excluded; clean up and null out jwNew.
    jsonWriteFree(&jwNew);
return jwNew;
}
static void getRepeats(struct sqlConnection *conn, struct hash *arHash, char *chrom, struct rbTree **retAllRepeats, struct rbTree **retNewRepeats) /* Return a tree of ranges for sequence gaps in chromosome */ { char *db = sqlGetDatabase(conn); struct sqlResult *sr; char **row; struct rbTree *allTree = rbTreeNew(simpleRangeCmp); struct rbTree *newTree = rbTreeNew(simpleRangeCmp); char tableName[64]; char query[256]; boolean splitRmsk = TRUE; struct simpleRange *prevRange = NULL, *prevNewRange = NULL; safef(tableName, sizeof(tableName), "%s_rmsk", chrom); if (! sqlTableExists(conn, tableName)) { safef(tableName, sizeof(tableName), "rmsk"); if (! sqlTableExists(conn, tableName)) errAbort("Can't find rmsk table for %s (%s.%s_rmsk or %s.rmsk)\n", chrom, db, chrom, db); splitRmsk = FALSE; } if (splitRmsk) sqlSafef(query, sizeof query, "select genoStart,genoEnd,repName,repClass,repFamily from %s", tableName); else sqlSafef(query, sizeof query, "select genoStart,genoEnd,repName,repClass,repFamily from %s " "where genoName = \"%s\"", tableName, chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct simpleRange *range; char arKey[512]; lmAllocVar(allTree->lm, range); range->start = sqlUnsigned(row[0]); range->end = sqlUnsigned(row[1]); if (prevRange == NULL) prevRange = range; else if (overlap(range, prevRange)) { /* merge r into prevR & discard; prevR gets passed forward. */ if (range->end > prevRange->end) prevRange->end = range->end; if (range->start < prevRange->start) prevRange->start = range->start; } else { rbTreeAdd(allTree, prevRange); prevRange = range; } sprintf(arKey, "%s.%s.%s", row[2], row[3], row[4]); if (arHash != NULL && hashLookup(arHash, arKey)) { lmAllocVar(newTree->lm, range); range->start = sqlUnsigned(row[0]); range->end = sqlUnsigned(row[1]); if (prevNewRange == NULL) prevNewRange = range; else if (overlap(range, prevNewRange)) { /* merge r into prevR & discard; prevR gets passed forward. 
*/ if (range->end > prevNewRange->end) prevNewRange->end = range->end; if (range->start < prevNewRange->start) prevNewRange->start = range->start; } else { rbTreeAdd(allTree, prevNewRange); prevNewRange = range; } } } if (prevRange != NULL) rbTreeAdd(allTree, prevRange); if (prevNewRange != NULL) rbTreeAdd(newTree, prevNewRange); sqlFreeResult(&sr); *retAllRepeats = allTree; *retNewRepeats = newTree; }
/* Small FSM. States indicate what the machine is looking for *next*,
 * so eg _cfgKEYSTART means "looking for the token that indicates the start
 * of a key" */
struct configFile *_cfgParseConfigFile (struct configFile *cfg)
/* Parse cfg->bbdg (the raw file bytes, cfg->bbdgSize long) in place:
 * section/key/value strings are terminated by overwriting delimiters with
 * '\0' directly in the buffer, and pointers into the buffer are stored in
 * the per-section hash tables under cfg->sections.  Returns cfg. */
{
char *currentSectionString="DEFAULT";   /* section for keys seen before any [section] header */
char *currentStringStart=NULL;          /* start of the token currently being scanned */
char *currentKey=NULL;                  /* key awaiting its value */
unsigned int filePos=0, state=_cfgKEYSTART;
hash_table *tempHash;

/* Create the default section. */
tempHash=hashConstructTable (31);
hashInsert (currentSectionString, tempHash, cfg->sections);

while (filePos < cfg->bbdgSize)
    {
    switch (state)
        {
        case _cfgKEYSTART:
            /* Skip whitespace; '[' opens a section header, a comment char
             * opens a comment, anything else begins a key. */
            if (cfg->bbdg[filePos]=='[')
                {
                filePos++;
                currentStringStart=(char *) &(cfg->bbdg[filePos]);
                state=_cfgSECTIONEND;
                break;
                }
            if (isCommentStart(cfg->bbdg[filePos]))
                {
                filePos++;
                state=_cfgCOMMENTEND;
                break;
                }
            if ( !isspace (cfg->bbdg[filePos]) )
                {
                currentStringStart=(char *) &(cfg->bbdg[filePos]);
                state=_cfgKEYEND;
                }
            else
                {
                filePos ++;
                }
            break;
        case _cfgCOMMENTEND:
            /* Discard everything up to end of line. */
            if (cfg->bbdg[filePos]=='\n')
                {
                state=_cfgKEYSTART;
                }
            filePos++;
            break;
        case _cfgSECTIONEND:
            /* ']' terminates the section name in place. */
            if (cfg->bbdg[filePos]==']')
                {
                cfg->bbdg[filePos]='\0';
                currentSectionString=currentStringStart;
                state=_cfgKEYSTART;
                }
            filePos++;
            break;
        case _cfgKEYEND:
            /* Key ends at whitespace or the key/value separator.  If we hit
             * the separator itself we leave filePos on it so _cfgCOLON can
             * consume it; on whitespace we advance past the byte we zeroed. */
            if (isspace (cfg->bbdg[filePos]) || isKeyValSep(cfg->bbdg[filePos]))
                {
                if (isKeyValSep(cfg->bbdg[filePos]))
                    {
                    cfg->bbdg[filePos]='\0';
                    }
                else
                    {
                    cfg->bbdg[filePos]='\0';
                    filePos++;
                    }
                currentKey=currentStringStart;
                state=_cfgCOLON;
                }
            else
                {
                //Do this in search routine instead (with strcasecmp)
                //cfg->bbdg[filePos] = tolower(cfg->bbdg[filePos]);
                filePos++;
                }
            break;
        case _cfgCOLON:
            /* Scan forward past the key/value separator (or the '\0' we
             * already wrote over it) before looking for the value. */
            if (isKeyValSep(cfg->bbdg[filePos]) || cfg->bbdg[filePos]=='\0')
                {
                state=_cfgVALSTART;
                }
            filePos++;
            break;
        case _cfgVALSTART:
            /* Skip blanks; first non-blank byte begins the value. */
            if (!myisblank(cfg->bbdg[filePos]))
                {
                currentStringStart=(char *) &(cfg->bbdg[filePos]);
                state=_cfgVALEND;
                }
            else
                {
                filePos ++;
                }
            break;
        case _cfgVALEND:
            /* Value runs to end of line or to a trailing comment. */
            if (cfg->bbdg[filePos]=='\n' || isCommentStart(cfg->bbdg[filePos]))
                {
                /* First see if the current section exists. */
                tempHash=hashLookup (currentSectionString, cfg->sections);
                if (tempHash==NULL)
                    {
                    tempHash=hashConstructTable (31);
                    hashInsert (currentSectionString, tempHash, cfg->sections);
                    }
                /* Now stick it in the table. */
                if (isCommentStart(cfg->bbdg[filePos]))
                    {
                    cfg->bbdg[filePos]='\0';
                    hashInsert (currentKey, currentStringStart, tempHash);
                    state=_cfgCOMMENTEND;
                    }
                else
                    {
                    cfg->bbdg[filePos]='\0';
                    hashInsert (currentKey, currentStringStart, tempHash);
                    state=_cfgKEYSTART;
                    }
                }
            filePos++;
            break;
        }
    }
return cfg;
}
static void filterBed(struct track *tg, struct linkedFeatures **pLfList) /* Apply filters if any to mRNA linked features. */ { struct linkedFeatures *lf, *next, *newList = NULL, *oldList = NULL; struct mrnaUiData *mud = tg->extraUiData; struct mrnaFilter *fil; char *type; boolean anyFilter = FALSE; boolean colorIx = 0; boolean isExclude = FALSE; boolean andLogic = TRUE; if (*pLfList == NULL || mud == NULL) return; /* First make a quick pass through to see if we actually have * to do the filter. */ for (fil = mud->filterList; fil != NULL; fil = fil->next) { fil->pattern = cartUsualString(cart, fil->key, ""); if (fil->pattern[0] != 0) anyFilter = TRUE; } if (!anyFilter) return; type = cartUsualString(cart, mud->filterTypeVar, "red"); if (sameString(type, "exclude")) isExclude = TRUE; else if (sameString(type, "include")) isExclude = FALSE; else colorIx = getFilterColor(type, MG_BLACK); type = cartUsualString(cart, mud->logicTypeVar, "and"); andLogic = sameString(type, "and"); /* Make a pass though each filter, and start setting up search for * those that have some text. */ for (fil = mud->filterList; fil != NULL; fil = fil->next) { fil->pattern = cartUsualString(cart, fil->key, ""); if (fil->pattern[0] != 0) { fil->hash = newHash(10); } } /* Scan tables id/name tables to build up hash of matching id's. 
*/ for (fil = mud->filterList; fil != NULL; fil = fil->next) { struct hash *hash = fil->hash; int wordIx, wordCount; char *words[128]; if (hash != NULL) { boolean anyWild; char *dupPat = cloneString(fil->pattern); wordCount = chopLine(dupPat, words); for (wordIx=0; wordIx <wordCount; ++wordIx) { char *pattern = cloneString(words[wordIx]); if (lastChar(pattern) != '*') { int len = strlen(pattern)+1; pattern = needMoreMem(pattern, len, len+1); pattern[len-1] = '*'; } anyWild = (strchr(pattern, '*') != NULL || strchr(pattern, '?') != NULL); touppers(pattern); for(lf = *pLfList; lf != NULL; lf=lf->next) { char copy[SMALLBUF]; boolean gotMatch; safef(copy, sizeof(copy), "%s", lf->name); touppers(copy); if (anyWild) gotMatch = wildMatch(pattern, copy); else gotMatch = sameString(pattern, copy); if (gotMatch) { hashAdd(hash, lf->name, NULL); } } freez(&pattern); } freez(&dupPat); } } /* Scan through linked features coloring and or including/excluding ones that * match filter. */ for (lf = *pLfList; lf != NULL; lf = next) { boolean passed = andLogic; next = lf->next; for (fil = mud->filterList; fil != NULL; fil = fil->next) { if (fil->hash != NULL) { if (hashLookup(fil->hash, lf->name) == NULL) { if (andLogic) passed = FALSE; } else { if (!andLogic) passed = TRUE; } } } if (passed ^ isExclude) { slAddHead(&newList, lf); if (colorIx > 0) lf->filterColor = colorIx; } else { slAddHead(&oldList, lf); } } slReverse(&newList); slReverse(&oldList); if (colorIx > 0) { /* Draw stuff that passes filter first in full mode, last in dense. */ if (tg->visibility == tvDense) { newList = slCat(oldList, newList); } else { newList = slCat(newList, oldList); } } *pLfList = newList; tg->limitedVisSet = FALSE; /* Need to recalculate this after filtering. */ /* Free up hashes, etc. */ for (fil = mud->filterList; fil != NULL; fil = fil->next) { hashFree(&fil->hash); } }
struct bed *searchStrand(struct hash *sixers, struct cutter *ACGTo[], struct dnaSeq *seq, int startOffset, char strand) /* Cheesey function that checks a strand for the enzymes after they're put in the hash/array structures. This used to be a part of the matchEnzymes function but I do it twice now. */ { struct cutter *enz; struct bed *bedList = NULL; int seqPos; if (ACGTo[0] || ACGTo[1] || ACGTo[2] || ACGTo[3] || (sixers->elCount > 0)) { for (seqPos = 0; seqPos < seq->size; seqPos++) { struct cutter *enzList = NULL; char sixer[7]; int bedPos = (strand == '-') ? (seq->size - seqPos) : seqPos; if (seq->size - seqPos >= 6) { struct hashEl *el = NULL; sixer[6] = '\0'; memcpy(sixer, seq->dna+seqPos, 6); el = hashLookup(sixers, sixer); if (el) { struct bed *add; enz = el->val; add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand); slAddHead(&bedList, add); /* Just in case there's another one with the same sequence. */ while ((el = hashLookupNext(el))) { enz = el->val; add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand); slAddHead(&bedList, add); } } } /* Use a certain list depending on which letter we're on in the sequence. */ if (seq->dna[seqPos] == 'A') enzList = ACGTo[0]; else if (seq->dna[seqPos] == 'C') enzList = ACGTo[1]; else if (seq->dna[seqPos] == 'G') enzList = ACGTo[2]; else if (seq->dna[seqPos] == 'T') enzList = ACGTo[3]; for (enz = enzList; enz != NULL; enz = enz->next) { int enzPos = 0; int seqCurPos = seqPos; while (enzPos < enz->size && seqCurPos < seq->size && matchingBase(enz->seq[enzPos],seq->dna[seqCurPos])) { enzPos++; seqCurPos++; } if (enzPos == enz->size) { struct bed *add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand); slAddHead(&bedList, add); } } } } return bedList; }
void writeGap(struct gapInfo *gap, struct xaAli *xa, int symStart, int symEnd, char geneStrand, FILE *f) /* Write out info on one gap to file. */ { char qStart[totSize+1], qEnd[totSize+1]; char tStart[totSize+1], tEnd[totSize+1]; char hStart[totSize+1], hEnd[totSize+1]; int s, e, size; int midSize; int i; char *threePrime, *fivePrime; boolean isQgap; fprintf(f, "%s %s %s hom %s:%d-%d %c %s:%d-%d %c slide %d\n", gapTypeStrings[gap->type], (gap->hasIntronEnds ? " intron" : "!intron"), (gap->hasStrongHomology ? "heavy" : "light"), gap->query, gap->qStart, gap->qEnd, xa->qStrand, gap->target, gap->tStart, gap->tEnd, geneStrand, gap->slideCount); s = symStart-exSize; e = symStart + inSize; if (s < 0) s = 0; size = e-s; uglyf("s %d size %d e %d totSize %d\n", s, size, e, totSize); strncpy(qStart, xa->qSym+s, size); strncpy(tStart, xa->tSym+s, size); strncpy(hStart, xa->hSym+s, size); qStart[size] = tStart[size] = hStart[size] = 0; // uglyf - crashes by here s = symEnd-inSize; midSize = s - e; e = symEnd+exSize; if (e > xa->symCount) e = xa->symCount; size = e-s; strncpy(qEnd, xa->qSym+s, size); strncpy(tEnd, xa->tSym+s, size); strncpy(hEnd, xa->hSym+s, size); qEnd[size] = tEnd[size] = hEnd[size] = 0; if (gap->isRc) { swapBytes(qStart, qEnd, totSize); swapBytes(tStart, tEnd, totSize); swapBytes(hStart, hEnd, totSize); reverseComplement(qStart, totSize); reverseComplement(qEnd, totSize); reverseComplement(tStart, totSize); reverseComplement(tEnd, totSize); reverseBytes(hStart, totSize); reverseBytes(hEnd, totSize); } /* Write out ends of gap to file. */ fprintf(f, "%s ...%d... %s\n", qStart, midSize, qEnd); fprintf(f, "%s ...%d... %s\n", tStart, midSize, tEnd); fprintf(f, "%s ...%d... %s\n\n", hStart, midSize, hEnd); /* Add intron ends to consensus sequence histogram. 
*/ if (gap->hasIntronEnds && gap->type == cCodingGap) { isQgap = (qStart[exSize] == '-'); if (isQgap) { fivePrime = tStart; threePrime = tEnd; } else { fivePrime = qStart; threePrime = qEnd; } if (noInserts(threePrime, totSize) && noInserts(fivePrime, totSize) ) { int *homoCount; for (i=0; i<totSize; ++i) { hist5[i][histIx(fivePrime[i])] += 1; hist3[i][histIx(threePrime[i])] += 1; } ++histCount; if (isQgap) { ++ceOnlyCount; homoCount = ceOnlyHomoCount; } else { ++cbOnlyCount; homoCount = cbOnlyHomoCount; } ++bothCount; for (i=0; i<totSize; ++i) { if (fivePrime[i] == threePrime[i]) { homoCount[i] += 1; bothHomoCount[i] += 1; } } /* Add introns to list. */ { char idBuf[2*intronEndsSize+1]; struct intronList *il; struct hashEl *hel; memcpy(idBuf, fivePrime+exSize, intronEndsSize); memcpy(idBuf+intronEndsSize, threePrime, intronEndsSize); idBuf[ sizeof(idBuf)-1 ] = 0; if ((hel = hashLookup(intronHash, idBuf)) != NULL) { il = hel->val; il->count += 1; fprintf(f, ">>>%d of set<<<\n", il->count); if (il->isQgap != isQgap) { il->onBoth = TRUE; } } else { AllocVar(il); strcpy(il->ends, idBuf); il->count = 1; il->isQgap = isQgap; slAddHead(&intronList, il); hashAdd(intronHash, idBuf, il); } } } else { static insertCount = 0; warn("Skipping intron with flanking inserts %d", ++insertCount); } } }
int main(int argc, char *argv[])
/* Entry point: match cDNA/EST files against genomic files using a pattern
 * space index, with optional paired-EST gluing.  Usage (5 args):
 * genoList cdnaList oocFile pairFile outRoot.  Writes outRoot.hit,
 * outRoot.glu, and outRoot.ok (success marker). */
{
char *genoListName;
char *cdnaListName;
char *oocFileName;
char *pairFileName;
struct patSpace *patSpace;
long startTime, endTime;
char **genoList;
int genoListSize;
char *genoListBuf;
char **cdnaList;
int cdnaListSize;
char *cdnaListBuf;
char *genoName;
int i;
int estIx = 0;
struct dnaSeq **seqListList = NULL, *seq;
static char hitFileName[512], mergerFileName[512], okFileName[512];
char *outRoot;
struct hash *pairHash;

if ((hostName = getenv("HOST")) == NULL)
    hostName = "";
if (argc != 6)
    usage();
pushWarnHandler(warnHandler);
startTime = clock1000();
dnaUtilOpen();
genoListName = argv[1];
cdnaListName = argv[2];
oocFileName = argv[3];
pairFileName = argv[4];
outRoot = argv[5];
sprintf(hitFileName, "%s.hit", outRoot);
sprintf(mergerFileName, "%s.glu", outRoot);
sprintf(okFileName, "%s.ok", outRoot);

/* Load file lists, the EST pair table, and open the output files. */
readAllWords(genoListName, &genoList, &genoListSize, &genoListBuf);
readAllWords(cdnaListName, &cdnaList, &cdnaListSize, &cdnaListBuf);
pairHash = makePairHash(pairFileName);
hitOut = mustOpen(hitFileName, "w");
mergerOut = mustOpen(mergerFileName, "w");
seqListList = needMem(genoListSize*sizeof(seqListList[0]) );
fprintf(hitOut, "Pattern space 0.2 cDNA matcher\n");
fprintf(hitOut, "cDNA files: ");
for (i=0; i<cdnaListSize; ++i)
    fprintf(hitOut, " %s", cdnaList[i]);
fprintf(hitOut, "\n");
fprintf(hitOut, "%d genomic files\n", genoListSize);

/* Read all genomic sequence (entries starting with "//" are skipped). */
for (i=0; i<genoListSize; ++i)
    {
    genoName = genoList[i];
    if (!startsWith("//", genoName) )
	{
	seqListList[i] = seq = faReadAllDna(genoName);
	fprintf(hitOut, "%d els in %s ", slCount(seq), genoList[i]);
	for (; seq != NULL; seq = seq->next)
	    fprintf(hitOut, "%d ", seq->size);
	fprintf(hitOut, "\n");
	}
    }
patSpace = makePatSpace(seqListList, genoListSize, 10, oocFileName, 4, 100000);

/* Process each cDNA file in turn. */
for (i=0; i<cdnaListSize; ++i)
    {
    FILE *f;
    char *estFileName;
    DNA *dna;
    char *estName;
    int size;
    int c;
    int maxSizeForFuzzyFind = 20000;
    int dotCount = 0;
    estFileName = cdnaList[i];
    if (startsWith("//", estFileName) )
	continue;
    f = mustOpen(estFileName, "rb");
    /* Skip to the first FASTA record marker. */
    while ((c = fgetc(f)) != EOF)
	if (c == '>')
	    break;
    printf("%s", cdnaList[i]);
    fflush(stdout);
    while (fastFaReadNext(f, &dna, &size, &estName))
	{
	aliSeqName = estName;
	if (size < maxSizeForFuzzyFind)  /* Some day need to fix this somehow... */
	    {
	    struct hashEl *hel;
	    struct cdnaAliList *calList = NULL;
	    hel = hashLookup(pairHash, estName);
	    if (hel != NULL)     /* Do pair processing. */
		{
		struct estPair *ep;
		struct seq *thisSeq, *otherSeq;
		ep = hel->val;
		/* Figure out which end of the pair this EST is. */
		if (hel->name == ep->name3)
		    {
		    thisSeq = &ep->seq3;
		    otherSeq = &ep->seq5;
		    }
		else
		    {
		    thisSeq = &ep->seq5;
		    otherSeq = &ep->seq3;
		    }
		if (otherSeq->dna == NULL)  /* First in pair - need to save sequence. */
		    {
		    thisSeq->size = size;
		    thisSeq->dna = needMem(size);
		    memcpy(thisSeq->dna, dna, size);
		    }
		else   /* Second in pair - do gluing and free partner. */
		    {
		    char mergedName[64];
		    thisSeq->dna = dna;
		    thisSeq->size = size;
		    sprintf(mergedName, "%s_AND_%s", ep->name5, ep->name3);
		    /* Search both strands of both ends. */
		    glueFindOne(patSpace, ep->seq5.dna, ep->seq5.size, '+', '5', ep->name5, &calList);
		    reverseComplement(ep->seq5.dna, ep->seq5.size);
		    glueFindOne(patSpace, ep->seq5.dna, ep->seq5.size, '-', '5', ep->name5, &calList);
		    glueFindOne(patSpace, ep->seq3.dna, ep->seq3.size, '+', '3', ep->name3, &calList);
		    reverseComplement(ep->seq3.dna, ep->seq3.size);
		    glueFindOne(patSpace, ep->seq3.dna, ep->seq3.size, '-', '3', ep->name3, &calList);
		    slReverse(&calList);
		    writeMergers(calList, mergedName, genoList);
		    freez(&otherSeq->dna);
		    thisSeq->dna = NULL;
		    thisSeq->size =otherSeq->size = 0;
		    }
		}
	    else
		{
		/* Unpaired EST: search both strands by itself. */
		glueFindOne(patSpace, dna, size, '+', '5', estName, &calList);
		reverseComplement(dna, size);
		glueFindOne(patSpace, dna, size, '-', '5', estName, &calList);
		slReverse(&calList);
		writeMergers(calList, estName, genoList);
		}
	    ++estIx;
	    /* Progress dot every 4096 ESTs. */
	    if ((estIx & 0xfff) == 0)
		{
		printf(".");
		++dotCount;
		fflush(stdout);
		}
	    }
	}
    printf("\n");
    }
aliSeqName = "";
printf("ffSubmitted %3d ffAccepted %3d ffOkScore %3d ffSolidMatch %2d\n",
       ffSubmitted, ffAccepted, ffOkScore, ffSolidMatch);
endTime = clock1000();
printf("Total time is %4.2f\n", 0.001*(endTime-startTime));

/* Write out file who's presence say's we succeeded */
    {
    FILE *f = mustOpen(okFileName, "w");
    fputs("ok", f);
    fclose(f);
    }
return 0;
}
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics.
 * Requires input sorted by chrom then start; aborts with a helpful message
 * otherwise.  Returns one bbiChromUsage per chromosome (in file order),
 * and via the ret pointers: the minimum start-to-start gap, the average
 * item size, and the total item count. */
{
char *row[3];
struct hash *uniqHash = hashNew(0);   /* chroms already seen; catches unsorted input */
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;                   /* start of previous item on current chrom, -1 = none */
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, 3, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
        {
	errAbort("end (%d) before start (%d) line %d of %s",
		 end, start, lf->lineIx, lf->fileName);
        }
    ++bedCount;
    totalBases += (end - start);
    /* New chromosome?  It must not have appeared earlier in the file. */
    if (usage == NULL || differentString(usage->name, chrom))
        {
	if (hashLookup(uniqHash, chrom))
	    {
	    errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
		     lf->fileName, lf->lineIx);
	    }
	hashAdd(uniqHash, chrom, NULL);
	struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
	if (chromHashEl == NULL)
	    errAbort("%s is not found in chromosome sizes file", chrom);
	int chromSize = ptToInt(chromHashEl->val);
	AllocVar(usage);
	usage->name = cloneString(chrom);
	usage->id = id++;
	usage->size = chromSize;
	slAddHead(&usageList, usage);
	lastStart = -1;
	}
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
		 end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;
    /* Track the smallest start-to-start gap; a negative gap means the file
     * is not sorted by start within the chromosome. */
    if (lastStart >= 0)
        {
        int diff = start - lastStart;
        if (diff < minDiff)
            {
            if (diff < 0)
		errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
			 lf->fileName, lf->lineIx);
            minDiff = diff;
            }
        }
    lastStart = start;
    }
slReverse(&usageList);
*retMinDiff = minDiff;
/* NOTE(review): if the bed file has no data lines, bedCount is 0 and this
 * division yields NaN -- presumably callers require a non-empty bed;
 * confirm upstream. */
*retAveSize = (double)totalBases/bedCount;
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}
struct edwQaWigSpot *edwQaWigSpotFromNextRa(struct lineFile *lf, struct raToStructReader *reader) /* Return next stanza put into an edwQaWigSpot. */ { enum fields { spotRatioField, enrichmentField, basesInGenomeField, basesInSpotsField, sumSignalField, spotSumSignalField, }; if (!raSkipLeadingEmptyLines(lf, NULL)) return NULL; struct edwQaWigSpot *el; AllocVar(el); bool *fieldsObserved = reader->fieldsObserved; bzero(fieldsObserved, reader->fieldCount); char *tag, *val; while (raNextTagVal(lf, &tag, &val, NULL)) { struct hashEl *hel = hashLookup(reader->fieldIds, tag); if (hel != NULL) { int id = ptToInt(hel->val); if (fieldsObserved[id]) errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName); fieldsObserved[id] = TRUE; switch (id) { case spotRatioField: { el->spotRatio = sqlDouble(val); break; } case enrichmentField: { el->enrichment = sqlDouble(val); break; } case basesInGenomeField: { el->basesInGenome = sqlLongLong(val); break; } case basesInSpotsField: { el->basesInSpots = sqlLongLong(val); break; } case sumSignalField: { el->sumSignal = sqlDouble(val); break; } case spotSumSignalField: { el->spotSumSignal = sqlDouble(val); break; } default: internalErr(); break; } } } raToStructReaderCheckRequiredFields(reader, lf); return el; }
void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, char *fileName, int lineIx) /* Process one row of GFF file (a non-comment line parsed by tabs normally). */ { struct hashEl *hel; struct gffLine *gl; if (wordCount < 8) gffSyntaxError(fileName, lineIx, "Word count less than 8 "); AllocVar(gl); if ((hel = hashLookup(gff->seqHash, words[0])) == NULL) { struct gffSeqName *el; AllocVar(el); hel = hashAdd(gff->seqHash, words[0], el); el->name = hel->name; slAddHead(&gff->seqList, el); } gl->seq = hel->name; if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL) { struct gffSource *el; AllocVar(el); hel = hashAdd(gff->sourceHash, words[1], el); el->name = hel->name; slAddHead(&gff->sourceList, el); } gl->source = hel->name; if ((hel = hashLookup(gff->featureHash, words[2])) == NULL) { struct gffFeature *el; AllocVar(el); hel = hashAdd(gff->featureHash, words[2], el); el->name = hel->name; slAddHead(&gff->featureList, el); } gl->feature = hel->name; if (!isdigit(words[3][0]) || !isdigit(words[4][0])) gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number "); gl->start = atoi(words[3])-1 + baseOffset; gl->end = atoi(words[4]) + baseOffset; gl->score = atof(words[5]); gl->strand = words[6][0]; gl->frame = words[7][0]; if (wordCount >= 9) { if (!gff->typeKnown) { gff->typeKnown = TRUE; gff->isGtf = isGtfGroup(words[8]); } if (gff->isGtf) { parseGtfEnd(words[8], gff, gl, fileName, lineIx); } else { char *tnName = gffTnName(gl->seq, trimSpaces(words[8])); if ((hel = hashLookup(gff->groupHash, tnName)) == NULL) { struct gffGroup *group; AllocVar(group); hel = hashAdd(gff->groupHash, tnName, group); group->name = hel->name; group->seq = gl->seq; group->source = gl->source; slAddHead(&gff->groupList, group); } gl->group = hel->name; } } slAddHead(&gff->lineList, gl); }