struct altGraphX *txGraphToAltGraphX(struct txGraph *tx) /* Copy transcription graph to altSpliceX format. */ { /* Allocate struct and deal with easy fields. */ struct altGraphX *ag; AllocVar(ag); ag->tName = cloneString(tx->tName); ag->tStart = tx->tStart; ag->tEnd = tx->tEnd; ag->name = cloneString(tx->name); ag->id = 0; ag->strand[0] = tx->strand[0]; /* Deal with vertices. */ int vertexCount = ag->vertexCount = tx->vertexCount; AllocArray(ag->vTypes, vertexCount); AllocArray(ag->vPositions, vertexCount); int i; for (i=0; i<vertexCount; ++i) { struct txVertex *v = &tx->vertices[i]; ag->vTypes[i] = v->type; ag->vPositions[i] = v->position; } /* Deal with edges. */ int edgeCount = ag->edgeCount = tx->edgeCount; AllocArray(ag->edgeStarts, edgeCount); AllocArray(ag->edgeEnds, edgeCount); AllocArray(ag->edgeTypes, edgeCount); struct txEdge *edge; for (edge = tx->edgeList, i=0; edge != NULL; edge = edge->next, ++i) { assert(i < edgeCount); ag->edgeStarts[i] = edge->startIx; ag->edgeEnds[i] = edge->endIx; ag->edgeTypes[i] = edge->type; } /* Deal with evidence inside of edges. */ for (edge = tx->edgeList; edge != NULL; edge = edge->next) { struct evidence *ev; AllocVar(ev); int *mrnaIds = AllocArray(ev->mrnaIds, edge->evCount); int i; struct txEvidence *txEv; for (txEv = edge->evList, i=0; txEv != NULL; txEv = txEv->next, ++i) { assert(i < edge->evCount); struct txSource *source = &tx->sources[txEv->sourceId]; char *sourceType = source->type; if (sameString(sourceType, "refSeq") || sameString(sourceType, "mrna") || sameString(sourceType, "est")) { mrnaIds[ev->evCount] = txEv->sourceId; ev->evCount += 1; } } slAddHead(&ag->evidence, ev); } slReverse(&ag->evidence); /* Convert sources into mrnaRefs. 
*/ int sourceCount = ag->mrnaRefCount = tx->sourceCount; AllocArray(ag->mrnaRefs, sourceCount); int sourceIx; for (sourceIx=0; sourceIx<sourceCount; ++sourceIx) { struct txSource *source = &tx->sources[sourceIx]; ag->mrnaRefs[sourceIx] = cloneString(source->accession); } /* Deal with tissues and libs by just making arrays of all zero. */ AllocArray(ag->mrnaTissues, tx->sourceCount); AllocArray(ag->mrnaLibs, tx->sourceCount); return ag; }
void writeMergers(struct cdnaAliList *calList, char *cdnaName, char *bacNames[]) /* Write out any mergers indicated by this cdna. This destroys calList. */ { struct cdnaAliList *startBac, *endBac, *cal, *prevCal, *nextCal; int bacCount; int bacIx; { if (sameString(cdnaName, "R08304_AND_R08305")) { uglyf("Got you %s\n", cdnaName); } } slSort(&calList, cmpCal); for (startBac = calList; startBac != NULL; startBac = endBac) { /* Scan until find a cal that isn't pointing into the same BAC. */ bacCount = 1; bacIx = startBac->bacIx; prevCal = startBac; for (cal = startBac->next; cal != NULL; cal = cal->next) { if (cal->bacIx != bacIx) { prevCal->next = NULL; break; } ++bacCount; prevCal = cal; } endBac = cal; if (bacCount > 1) { while (startBac != NULL) { struct cdnaAliList *clumpList = NULL, *leftoverList = NULL; for (cal = startBac; cal != NULL; cal = nextCal) { nextCal = cal->next; if (noMajorOverlap(cal, clumpList)) { slAddHead(&clumpList, cal); } else { slAddHead(&leftoverList, cal); } } slReverse(&clumpList); slReverse(&leftoverList); if (slCount(clumpList) > 1) { char lastStrand = 0; boolean switchedStrand = FALSE; if (!allSameContig(clumpList)) { fprintf(mergerOut, "%s glues %s contigs", cdnaName, bacNames[bacIx]); lastStrand = clumpList->strand; for (cal = clumpList; cal != NULL; cal = cal->next) { if (cal->strand != lastStrand) switchedStrand = TRUE; fprintf(mergerOut, " %d %c %c' (%d-%d) %3.1f%%", cal->seqIx, cal->strand, cal->dir, cal->start, cal->end, 100.0*cal->cookedScore); } fprintf(mergerOut, "\n"); } } freeCalList(&clumpList); startBac = leftoverList; } } else { freeCalList(&startBac); } } }
void loadHumMusL(struct track *tg) /* Load humMusL track with 2 zoom levels and one normal level. * Also used for loading the musHumL track (called Human Cons) * on the mouse browser. It decides which of 4 tables to * load based on how large of a window the user is looking at*/ { struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; struct sample *sample; struct linkedFeatures *lfList = NULL, *lf; char *hasDense = NULL; char *where = NULL; char tableName[256]; int z; float pixPerBase = 0; if(tl.picWidth == 0) errAbort("hgTracks.c::loadHumMusL() - can't have pixel width of 0"); pixPerBase = (winEnd - winStart)/ tl.picWidth; /* Determine zoom level. */ if (!strstr(tg->table,"HMRConservation")) z = humMusZoomLevel(); else z=0; if(z == 1 ) safef(tableName, sizeof(tableName), "%s_%s", "zoom1", tg->table); else if( z == 2) safef(tableName, sizeof(tableName), "%s_%s", "zoom50", tg->table); else if(z == 3) safef(tableName, sizeof(tableName), "%s_%s", "zoom2500", tg->table); else safef(tableName, sizeof(tableName), "%s", tg->table); //printf("(%s)", tableName ); sr = hRangeQuery(conn, tableName, chromName, winStart, winEnd, where, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { sample = sampleLoad(row+rowOffset); lf = lfFromSample(sample); slAddHead(&lfList, lf); sampleFree(&sample); } if(where != NULL) freez(&where); sqlFreeResult(&sr); hFreeConn(&conn); slReverse(&lfList); /* sort to bring items with common names to the same line but only for tracks with a summary table (with name=shortLabel) in dense mode*/ if( hasDense != NULL ) { sortGroupList = tg; /* used to put track name at * top of sorted list. */ slSort(&lfList, lfNamePositionCmp); sortGroupList = NULL; } tg->items = lfList; }
static void parseFixedStepSection(struct lineFile *lf, boolean clipDontDie, struct lm *lm,
        int itemsPerSlot, char *chrom, bits32 chromSize, bits32 span, bits32 sectionStart,
        bits32 step, struct bwgSection **pSectionList)
/* Read the single column of values for one fixedStep section, stopping at
 * end of file or at the line that begins the next section (that line is
 * handed back with lineFileReuse).  A value whose span would extend past
 * chromSize triggers a warning; it is then dropped if clipDontDie is set,
 * otherwise noWarnAbort() is called.  Kept values are packed into
 * bwgSections of at most itemsPerSlot items each, allocated in lm and
 * pushed onto the head of *pSectionList. */
{
/* Scratch pool for the temporary value list; released before returning. */
struct lm *scratch = lmInit(0);
struct bwgFixedStepItem *valueList = NULL, *value;
int keptCount = 0;
bits32 itemStart = sectionStart;
char *words[1];
char *line;

/* Pass 1: collect values into a list. */
for (;;)
    {
    if (!lineFileNextReal(lf, &line))
        break;
    if (steppedSectionEnd(line, 1))
        {
        lineFileReuse(lf);
        break;
        }
    chopLine(line, words);
    lmAllocVar(scratch, value);
    value->val = lineFileNeedDouble(lf, words, 0);
    if (itemStart + span <= chromSize)
        {
        slAddHead(&valueList, value);
        ++keptCount;
        }
    else
        {
        warn("line %d of %s: chromosome %s has %u bases, but item ends at %u",
            lf->lineIx, lf->fileName, chrom, chromSize, itemStart + span);
        if (!clipDontDie)
            noWarnAbort();
        }
    itemStart += step;
    }
slReverse(&valueList);

/* Pass 2: cut the list into sections of no more than itemsPerSlot items and
 * convert each to the packed array form. */
int remaining = keptCount;
value = valueList;
while (value != NULL)
    {
    int chunkSize = (remaining > itemsPerSlot ? itemsPerSlot : remaining);
    remaining -= chunkSize;

    struct bwgSection *section;
    lmAllocVar(lm, section);
    section->type = bwgTypeFixedStep;
    section->chrom = chrom;
    section->itemStep = step;
    section->itemSpan = span;
    section->itemCount = chunkSize;
    section->start = sectionStart;
    sectionStart += chunkSize * step;
    /* The last item starts one step before the next section and covers span bases. */
    section->end = sectionStart - step + span;

    struct bwgFixedStepPacked *packed;   /* An array */
    section->items.fixedStepPacked = lmAllocArray(lm, packed, chunkSize);
    int i;
    for (i = 0; i < chunkSize; ++i)
        {
        packed[i].val = value->val;
        value = value->next;
        }

    slAddHead(pSectionList, section);
    }
lmCleanup(&scratch);
}
static void saveAxtBundle(char *chromName, int chromSize, int chromOffset,
        struct ffAli *ali, struct dnaSeq *tSeq, struct hash *t3Hash,
        struct dnaSeq *qSeq, boolean qIsRc, boolean tIsRc,
        enum ffStringency stringency, int minMatch, struct gfOutput *out)
/* Save alignment to axtBundle.  The ffAli chain is cut into pieces at the
 * points returned by ffNextBreak(); each piece becomes one axt in which the
 * gap between neighboring blocks is written as a run of '-' on the side that
 * has no bases.  All the axts go into one new axtBundle, which is pushed
 * onto the bundle list in out->data.  When t3Hash is not NULL, the trans3
 * list stored under tSeq->name is used for target coordinate mapping.
 * The stringency and minMatch parameters are not used here. */
{
struct axtData *axtOut = out->data;
struct dyString *qText = newDyString(1024);
struct dyString *tText = newDyString(1024);
struct trans3 *t3List = NULL;
struct axtBundle *bundle;
struct ffAli *pieceStart, *pieceEnd, *block, *lastBlock = NULL;

if (t3Hash != NULL)
    t3List = hashMustFindVal(t3Hash, tSeq->name);
AllocVar(bundle);
bundle->qSize = qSeq->size;
bundle->tSize = chromSize;

pieceStart = ali;
while (pieceStart != NULL)
    {
    pieceEnd = ffNextBreak(pieceStart, 8, tSeq, t3List);
    dyStringClear(qText);
    dyStringClear(tText);

    /* Append each block's bases, then the gap leading to the next block. */
    for (block = pieceStart; block != pieceEnd; block = block->right)
        {
        struct ffAli *nextBlock = block->right;
        dyStringAppendN(qText, block->nStart, block->nEnd - block->nStart);
        dyStringAppendN(tText, block->hStart, block->hEnd - block->hStart);
        if (nextBlock != pieceEnd)
            {
            int qGap = nextBlock->nStart - block->nEnd;
            int tNextStart = trans3GenoPos(nextBlock->hStart, tSeq, t3List, FALSE) + chromOffset;
            int tThisEnd = trans3GenoPos(block->hEnd, tSeq, t3List, TRUE) + chromOffset;
            int tGap = tNextStart - tThisEnd;
            if (qGap < 0 || tGap < 0)
                {
                errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name);
                }
            int gap = max(qGap, tGap);
            if (qGap == gap)
                {
                /* Query has the extra bases; pad the target. */
                dyStringAppendN(qText, block->nEnd, gap);
                dyStringAppendMultiC(tText, '-', gap);
                }
            else
                {
                /* Target has the extra bases; pad the query. */
                dyStringAppendN(tText, block->hEnd, gap);
                dyStringAppendMultiC(qText, '-', gap);
                }
            }
        lastBlock = block;      /* Keep track of last block in bunch */
        }
    assert(tText->stringSize == qText->stringSize);

    /* Turn the two symbol strings into an axt. */
    struct axt *axt;
    AllocVar(axt);
    axt->qName = cloneString(qSeq->name);
    axt->qStrand = (qIsRc ? '-' : '+');
    axt->qStart = pieceStart->nStart - qSeq->dna;
    axt->qEnd = lastBlock->nEnd - qSeq->dna;
    axt->tName = cloneString(chromName);
    axt->tStrand = (tIsRc ? '-' : '+');
    axt->tStart = trans3GenoPos(pieceStart->hStart, tSeq, t3List, FALSE) + chromOffset;
    axt->tEnd = trans3GenoPos(lastBlock->hEnd, tSeq, t3List, TRUE) + chromOffset;
    axt->symCount = tText->stringSize;
    axt->qSym = cloneString(qText->string);
    axt->tSym = cloneString(tText->string);
    axt->frame = trans3Frame(pieceStart->hStart, t3List);
    if (out->qIsProt)
        axt->score = axtScoreProteinDefault(axt);
    else
        axt->score = axtScoreDnaDefault(axt);
    slAddHead(&bundle->axtList, axt);

    pieceStart = pieceEnd;
    }
slReverse(&bundle->axtList);
dyStringFree(&qText);
dyStringFree(&tText);
slAddHead(&axtOut->bundleList, bundle);
}
static struct bigBedInterval *bigBedIntervalsMatchingName(struct bbiFile *bbi,
        struct fileOffsetSize *fosList, BbFirstWordMatch matcher, int fieldIx,
        void *target, struct lm *lm)
/* Return list of intervals inside of sectors of bbiFile defined by fosList where the name
 * matches target somehow.
 *
 *   bbi      - open file; its udc handle, isSwapped flag and
 *              uncompressBufSize are used.
 *   fosList  - file regions (offset and size) to read and scan.
 *   matcher  - called as matcher(rest, fieldIx, target) on the text that
 *              follows chrom/start/end in each record; nonzero keeps it.
 *   lm       - local memory pool the returned interval structs come from.
 *
 * The result is in the order the records were encountered. */
{
struct bigBedInterval *interval, *intervalList = NULL;
struct fileOffsetSize *fos;
boolean isSwapped = bbi->isSwapped;
for (fos = fosList; fos != NULL; fos = fos->next)
    {
    /* Read in raw data */
    udcSeek(bbi->udc, fos->offset);
    char *rawData = needLargeMem(fos->size);
    udcRead(bbi->udc, rawData, fos->size);

    /* Optionally uncompress data, and set data pointer to uncompressed version. */
    char *uncompressedData = NULL;
    char *data = NULL;
    int dataSize = 0;
    if (bbi->uncompressBufSize > 0)
        {
        data = uncompressedData = needLargeMem(bbi->uncompressBufSize);
        dataSize = zUncompress(rawData, fos->size, uncompressedData, bbi->uncompressBufSize);
        }
    else
        {
        data = rawData;
        dataSize = fos->size;
        }

    /* Set up for "memRead" routines to more or less treat memory block like file */
    char *blockPt = data, *blockEnd = data + dataSize;
    struct dyString *dy = dyStringNew(32);  /* Keep bits outside of chrom/start/end here */

    /* Read next record into local variables. */
    while (blockPt < blockEnd)
        {
        bits32 chromIx = memReadBits32(&blockPt, isSwapped);
        bits32 s = memReadBits32(&blockPt, isSwapped);
        bits32 e = memReadBits32(&blockPt, isSwapped);

        /* Copy the zero-terminated remainder of the record.  Only a zero
         * byte (or the end of the block) ends it: testing the sign of a
         * plain char would cut the record short at any byte >= 0x80 on
         * platforms where char is signed, and leave blockPt in the middle
         * of the record for the next pass. */
        dyStringClear(dy);
        while (blockPt < blockEnd)
            {
            char c = *blockPt++;
            if (c == 0)
                break;
            dyStringAppendC(dy, c);
            }

        if ((*matcher)(dy->string, fieldIx, target))
            {
            lmAllocVar(lm, interval);
            interval->start = s;
            interval->end = e;
            /* NOTE(review): rest is cloned on the heap while the interval
             * itself lives in lm, so it is not released with lm - confirm
             * whether callers expect this or it should be an lm copy. */
            interval->rest = cloneString(dy->string);
            interval->chromId = chromIx;
            slAddHead(&intervalList, interval);
            }
        }

    /* Clean up temporary buffers. */
    dyStringFree(&dy);
    freez(&uncompressedData);
    freez(&rawData);
    }
slReverse(&intervalList);
return intervalList;
}
static struct rTree *rTreeFromChromRangeArray( struct lm *lm, int blockSize, int itemsPerSlot,
        void *itemArray, int itemSize, bits64 itemCount, void *context,
        struct cirTreeRange (*fetchKey)(const void *va, void *context),
        bits64 (*fetchOffset)(const void *va, void *context), bits64 endFileOffset,
        int *retLevelCount)
/* Build an r-tree, allocated in lm, over an array of items and return the
 * top level node list (a single node once building is complete).
 *
 *   blockSize     - maximum number of children per node above the first level.
 *   itemsPerSlot  - maximum number of items covered by one first-level node.
 *   itemArray     - itemCount items of itemSize bytes each.
 *   fetchKey      - returns the chromIx/start/end range of an item.
 *   fetchOffset   - returns the file offset of an item.
 *   endFileOffset - end offset used for the final first-level node.
 *   retLevelCount - set to the number of levels built; always at least 2
 *                   for non-empty input.
 *
 * Returns NULL and sets *retLevelCount to 0 when itemCount is 0. */
{
/* Nothing to index: there is no first item to fetch an offset from and no
 * node to condense. */
if (itemCount == 0)
    {
    *retLevelCount = 0;
    return NULL;
    }

char *items = itemArray;
struct rTree *el, *list=NULL, *tree = NULL;

/* Make first level above leaf. */
bits64 i;
bits64 nextOffset = (*fetchOffset)(items, context);
for (i=0; i<itemCount; i += itemsPerSlot)
    {
    /* Figure out if we are on final iteration through loop, and the
     * count of items in this iteration. */
    boolean finalIteration = FALSE;
    int oneSize = itemCount-i;
    if (oneSize > itemsPerSlot)
        oneSize = itemsPerSlot;
    else
        finalIteration = TRUE;

    /* Allocate element and put on list. */
    lmAllocVar(lm, el);
    slAddHead(&list, el);

    /* Fill out most of element from first item in element. */
    char *startItem = items + itemSize * i;
    struct cirTreeRange key = (*fetchKey)(startItem, context);
    el->startChromIx = el->endChromIx = key.chromIx;
    el->startBase = key.start;
    el->endBase = key.end;
    el->startFileOffset = nextOffset;

    /* Figure out end of element from offset of next element (or file size
     * for final element.) */
    if (finalIteration)
        nextOffset = endFileOffset;
    else
        {
        char *endItem = startItem + itemSize*oneSize;
        nextOffset = (*fetchOffset)(endItem, context);
        }
    el->endFileOffset = nextOffset;

    /* Expand area spanned to include all items in block. */
    int j;
    for (j=1; j<oneSize; ++j)
        {
        void *item = items + itemSize*(i+j);
        key = (*fetchKey)(item, context);
        if (key.chromIx < el->startChromIx)
            {
            el->startChromIx = key.chromIx;
            el->startBase = key.start;
            }
        else if (key.chromIx == el->startChromIx)
            {
            if (key.start < el->startBase)
                el->startBase = key.start;
            }
        if (key.chromIx > el->endChromIx)
            {
            el->endChromIx = key.chromIx;
            el->endBase = key.end;
            }
        else if (key.chromIx == el->endChromIx)
            {
            if (key.end > el->endBase)
                el->endBase = key.end;
            }
        }
    }
slReverse(&list);
verbose(2, "Made %d primary index nodes out of %llu items\n", slCount(list), itemCount);

/* Now iterate through making more and more condensed versions until have just one.
 * The levelCount test forces at least one condensing pass even when the
 * first level already has a single node. */
int levelCount = 1;
tree = list;
while (tree->next != NULL || levelCount < 2)
    {
    list = NULL;
    int slotsUsed = blockSize;  /* Starts "full" so the first node opens a new parent. */
    struct rTree *parent = NULL, *next;
    for (el = tree; el != NULL; el = next)
        {
        next = el->next;
        if (slotsUsed >= blockSize)
            {
            /* Start a new parent as a copy of its first child, so that it
             * begins with that child's range and file offsets. */
            slotsUsed = 1;
            parent = lmCloneMem(lm, el, sizeof(*el));
            parent->children = el;
            el->parent = parent;
            el->next = NULL;
            slAddHead(&list, parent);
            }
        else
            {
            /* Add to the current parent and widen its range to cover el. */
            ++slotsUsed;
            slAddHead(&parent->children, el);
            el->parent = parent;
            if (el->startChromIx < parent->startChromIx)
                {
                parent->startChromIx = el->startChromIx;
                parent->startBase = el->startBase;
                }
            else if (el->startChromIx == parent->startChromIx)
                {
                if (el->startBase < parent->startBase)
                    parent->startBase = el->startBase;
                }
            if (el->endChromIx > parent->endChromIx)
                {
                parent->endChromIx = el->endChromIx;
                parent->endBase = el->endBase;
                }
            else if (el->endChromIx == parent->endChromIx)
                {
                if (el->endBase > parent->endBase)
                    parent->endBase = el->endBase;
                }
            }
        }

    /* Both the parent list and each child list were built head-first. */
    slReverse(&list);
    for (el = list; el != NULL; el = el->next)
        slReverse(&el->children);
    tree = list;
    levelCount += 1;
    }
*retLevelCount = levelCount;
return tree;
}
void cdwJobCleanFailed(int submitId) /* Check out the symlink to determine its type. */ { struct sqlConnection *conn = sqlConnect("cdw"); struct dyString *query = dyStringNew(0); sqlDyStringPrintf(query, "select id, commandLine, startTime, endTime, returnCode, pid from cdwJob where submitId=%d " "order by commandLine,CAST(returnCode AS unsigned)", submitId); // NOTE we need this CAST on returnCode since it can be -1. we want success 0 first. // TODO DO we need to add any other conditions such as distinguishing // between running, queued, and done? /* Scan through result set finding redundant rows beyond success row. */ struct sqlResult *sr = sqlGetResult(conn, query->string); char **row; char *lastCommand = ""; boolean success = FALSE; struct slInt *list = NULL; struct slInt *e; while ((row = sqlNextRow(sr)) != NULL) { unsigned int id = sqlUnsigned(row[0]); char *commandLine = row[1]; unsigned long startTime = sqlUnsignedLong(row[2]); unsigned long endTime = sqlUnsignedLong(row[3]); int returnCode = sqlSigned(row[4]); unsigned int pid = sqlUnsigned(row[5]); verbose(2, "%u\t%s\t%lu\t%lu\t%d\t%u\t%u\n", id, commandLine, startTime, endTime, returnCode, pid, submitId); if (sameString(lastCommand, commandLine)) { if (success) // we already succeeded, the old failure is unwanted baggage. { e = slIntNew(id); // or add it to a list of rows whose ids should get removed slAddHead(&list, e); } } else { if (returnCode == 0) success = TRUE; else success = FALSE; } // note fields pid and submitId are defined as signed integers in cdwJob table, probably should be unsigned. lastCommand = cloneString(commandLine); } sqlFreeResult(&sr); slReverse(&list); for(e=list;e;e=e->next) { dyStringClear(query); sqlDyStringPrintf(query, "delete from cdwJob where id=%u", (unsigned int) e->val); //printf("%s\n", query->string); sqlUpdate(conn, query->string); } /* Clean up and go home */ dyStringFree(&query); sqlDisconnect(&conn); }
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf,
        struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics.
 *
 *   lf             - bed file; each line must have exactly 3 words.
 *   chromSizesHash - chromosome name to size, stored as integer in pointer.
 *   retMinDiff     - smallest difference between starts of consecutive items
 *                    on the same chromosome; BIGNUM if there is no such pair.
 *   retAveSize     - average item size (end - start); 0 if there are no items.
 *   retBedCount    - number of items read.
 *
 * Returns one bbiChromUsage per chromosome in order of first appearance,
 * with ids assigned from 0.  Aborts if the file is not sorted by chromosome
 * and start, if a chromosome is missing from chromSizesHash, if end is
 * before start, or if end is past the chromosome size. */
{
char *row[3];
struct hash *uniqHash = hashNew(0);     /* Chromosomes already seen. */
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, 3, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
        {
        errAbort("end (%d) before start (%d) line %d of %s",
            end, start, lf->lineIx, lf->fileName);
        }
    ++bedCount;
    totalBases += (end - start);
    if (usage == NULL || differentString(usage->name, chrom))
        {
        /* New chromosome.  Seeing one again after moving on means the
         * file is not grouped by chromosome. */
        if (hashLookup(uniqHash, chrom))
            {
            errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                lf->fileName, lf->lineIx);
            }
        hashAdd(uniqHash, chrom, NULL);
        struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
        if (chromHashEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);
        int chromSize = ptToInt(chromHashEl->val);
        AllocVar(usage);
        usage->name = cloneString(chrom);
        usage->id = id++;
        usage->size = chromSize;
        slAddHead(&usageList, usage);
        lastStart = -1;     /* No previous item on this chromosome yet. */
        }
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
            end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;
    if (lastStart >= 0)
        {
        int diff = start - lastStart;
        if (diff < minDiff)
            {
            if (diff < 0)
                errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                    lf->fileName, lf->lineIx);
            minDiff = diff;
            }
        }
    lastStart = start;
    }
slReverse(&usageList);
*retMinDiff = minDiff;
/* Avoid 0/0 (NaN) when the file holds no items. */
*retAveSize = (bedCount > 0 ? (double)totalBases/bedCount : 0.0);
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}
struct gapInfo *findLargeGaps(struct xaAli *xa, struct gapInfo *oldList) /* Find large gaps in alignment and classify them. */ { struct gdfGene *gdfList; struct gapInfo *gapList = NULL, *gap; int ceIx=0, cbIx=0, symIx=0; int ceStart=0, cbStart=0, symStart=0; int runSize = 0; char sym, lastSym = 0; int symCount = xa->symCount; /* Fetch C. elegans region. */ gdfList = wormGdfGenesInRange(xa->target, xa->tStart, xa->tEnd, &wormSangerGdfCache); /* Run a little state machine that does something at the end of each solid run * of a symbol. */ for (symIx = 0; symIx <= symCount; ++symIx) { sym = xa->hSym[symIx]; if (sym != lastSym) { if (runSize > 32) /* Introns need to be at least this long. */ { /* We're at end of a solid run. */ if (lastSym == 'Q' || lastSym == 'T') { int ceGapStart = xa->tStart + ceStart; int ceGapEnd = xa->tStart + ceIx; struct gdfGene *gdf; char hBefore = xa->hSym[symStart-1]; char hAfter = sym; char strand = '.'; AllocVar(gap); gap->query = cloneString(xa->query); gap->qStart = xa->qStart + cbStart; gap->qEnd = xa->qStart + cbIx; gap->target = cloneString(xa->target); gap->tStart = ceGapStart; gap->tEnd = ceGapEnd; gap->name = cloneString(xa->name); gap->size = runSize; gap->hSym = lastSym; if (uniqueGap(oldList, gap)) { slAddHead(&gapList, gap); classifyGap(gdfList, xa->target, ceGapStart, ceGapEnd, lastSym, &gap->type, &gdf); if (gdf != NULL) strand = gdf->strand; gap->hasIntronEnds = isIntron(xa, symStart, symIx, lastSym, strand, &gap->slideCount, &gap->isRc); if (gap->hasIntronEnds) slideGap(gap, xa, lastSym, symStart, symIx); if (isConserved(hBefore) && isConserved(hAfter)) gap->hasStrongHomology = TRUE; if (gap->hasStrongHomology) { if (lastSym == 'T') writeGap(gap, xa, symStart+gap->slideCount, symIx+gap->slideCount, strand, out); } } } } runSize = 0; ceStart = ceIx; cbStart = cbIx; symStart = symIx; lastSym = sym; } ++runSize; if (xa->qSym[symIx] != '-') ++cbIx; if (xa->tSym[symIx] != '-') ++ceIx; } gdfFreeGeneList(&gdfList); 
slReverse(&gapList); return gapList; }
int main(int argc, char *argv[]) { FILE *xaFile; struct xaAli *xa; struct gapInfo *gapList = NULL, *gaps; int count = 0; long startTime = clock1000(); char *xaName, *newName; char *first; boolean cbFirst; if (argc != 4) usage(); first = argv[1]; xaName = argv[2]; newName = argv[3]; if (sameWord("elegans", first)) cbFirst = FALSE; else if (sameWord("briggsae", first)) cbFirst = TRUE; else usage(); dnaUtilOpen(); intronHash = newHash(0); out = mustOpen(newName, "w"); xaFile = mustOpen(xaName, "r"); while ((xa = xaReadNext(xaFile, FALSE)) != NULL) { char *s; if (!cbFirst) { char *swaps; int swapi; char swapc; uglyf("Swapping....\n"); swaps = xa->query; xa->query = xa->target; xa->target = swaps; swapi = xa->qStart; xa->qStart = xa->tStart; xa->tStart = swapi; swapi = xa->qEnd; xa->qEnd = xa->tEnd; xa->tEnd = swapi; swapc = xa->qStrand; xa->qStrand = xa->tStrand; xa->tStrand = swapc; swaps = xa->qSym; xa->qSym = xa->tSym; xa->tSym = swaps; swapSym(xa->hSym, xa->symCount); } uglyf("%d query %s target %s\n", count, xa->query, xa->target); s = chromFromPath(xa->target); freeMem(xa->target); xa->target = s; if (++count % 500 == 0) printf("Processing %d\n", count); gaps = findLargeGaps(xa, gapList); gapList = slCat(gaps, gapList); xaAliFree(xa); } slReverse(&gapList); report(out, "Processing took %f seconds\n", (clock1000()-startTime)*0.001); reportGaps(gapList, out); printAllHistograms(out); calcCeHomoCount(); printHomologousEndStats(out); printSameIntronStats(out); return 0; }
void txGeneCanonical(char *codingCluster, char *infoFile, char *noncodingGraph, char *genesBed, char *nearCoding, char *outCanonical, char *outIsoforms, char *outClusters) /* txGeneCanonical - Pick a canonical version of each gene - that is the form * to use when just interested in a single splicing varient. Produces final * transcript clusters as well. */ { /* Read in input into lists in memory. */ struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster); struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph); struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12); struct txInfo *info, *infoList = txInfoLoadAll(infoFile); struct bed *nearList = bedLoadNAll(nearCoding, 12); /* Make hash of all beds. */ struct hash *bedHash = hashNew(18); for (bed = bedList; bed != NULL; bed = bed->next) hashAdd(bedHash, bed->name, bed); /* Make has of all info. */ struct hash *infoHash = hashNew(18); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Make a binKeeper structure that we'll populate with coding genes. */ struct hash *sizeHash = minChromSizeFromBeds(bedList); struct hash *keeperHash = minChromSizeKeeperHash(sizeHash); /* Make list of coding genes and toss them into binKeeper. * This will eat up bed list, but bedHash is ok. */ struct gene *gene, *geneList = NULL; for (coding = codingList; coding != NULL; coding = coding->next) { gene = geneFromCluster(coding, bedHash, infoHash); slAddHead(&geneList, gene); struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom); binKeeperAdd(bk, gene->start, gene->end, gene); } /* Go through near-coding genes and add them to the coding gene * they most overlap. */ for (bed = nearList; bed != NULL; bed = nextBed) { nextBed = bed->next; gene = mostOverlappingGene(keeperHash, bed); if (gene == NULL) errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name); geneAddBed(gene, bed); } /* Add non-coding genes. 
*/ for (graph = graphList; graph != NULL; graph = graph->next) { gene = geneFromGraph(graph, bedHash); slAddHead(&geneList, gene); } /* Sort so it all looks nicer. */ slSort(&geneList, geneCmp); /* Open up output files. */ FILE *fCan = mustOpen(outCanonical, "w"); FILE *fIso = mustOpen(outIsoforms, "w"); FILE *fClus = mustOpen(outClusters, "w"); /* Loop through, making up gene name, and writing output. */ int geneId = 0; for (gene = geneList; gene != NULL; gene = gene->next) { /* Make up name. */ char name[16]; safef(name, sizeof(name), "g%05d", ++geneId); /* Reverse transcript list just to make it look better. */ slReverse(&gene->txList); /* Write out canonical file output */ bed = hashMustFindVal(bedHash, gene->niceTx->name); fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n", bed->chrom, bed->chromStart, bed->chromEnd, geneId, gene->niceTx->name, gene->niceTx->name); /* Write out isoforms output. */ for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fIso, "%d\t%s\n", geneId, bed->name); /* Write out cluster output, starting with bed 6 standard fields. */ fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t", gene->chrom, gene->start, gene->end, name, 0, gene->strand); /* Write out thick-start/thick end. */ if (gene->isCoding) { int thickStart = gene->end, thickEnd = gene->start; for (bed = gene->txList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { thickStart = min(thickStart, bed->thickStart); thickEnd = max(thickEnd, bed->thickEnd); } } fprintf(fClus, "%d\t%d\t", thickStart, thickEnd); } else { fprintf(fClus, "%d\t%d\t", gene->start, gene->start); } /* We got no rgb value, just write out zero. */ fprintf(fClus, "0\t"); /* Get exons from exonTree. 
*/ struct range *exon, *exonList = rangeTreeList(gene->exonTree); fprintf(fClus, "%d\t", slCount(exonList)); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->start - gene->start); fprintf(fClus, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->end - exon->start); fprintf(fClus, "\t"); /* Write out associated transcripts. */ fprintf(fClus, "%d\t", slCount(gene->txList)); for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fClus, "%s,", bed->name); fprintf(fClus, "\t"); /* Write out nice value */ fprintf(fClus, "%s\t", gene->niceTx->name); /* Write out coding/noncoding value. */ fprintf(fClus, "%d\n", gene->isCoding); } /* Close up files. */ carefulClose(&fCan); carefulClose(&fIso); carefulClose(&fClus); }
void loadSimpleBed(struct track *tg) /* Load the items in one track - just move beds in * window... */ { struct bed *(*loader)(char **row); struct bed *bed, *list = NULL; struct sqlConnection *conn = hAllocConnTrack(database, tg->tdb); char **row; int rowOffset; char *words[3]; int wordCt; char query[128]; char *setting = NULL; bool doScoreCtFilter = FALSE; int scoreFilterCt = 0; char *topTable = NULL; if (tg->bedSize <= 3) loader = bedLoad3; else if (tg->bedSize == 4) loader = bedLoad; else if (tg->bedSize == 5) loader = bedLoad5; else loader = bedLoad6; // pairedTagAlign loader is required for base coloring using sequence from seq1 & seq2 // after removing optional bin column, this loader assumes seq1 and seq2 are in // row[6] and row[7] respectively of the sql result. if ((setting = trackDbSetting(tg->tdb, BASE_COLOR_USE_SEQUENCE)) && sameString(setting, "seq1Seq2")) loader = bedLoadPairedTagAlign; /* limit to a specified count of top scoring items. * If this is selected, it overrides selecting item by specified score */ if ((setting = trackDbSettingClosestToHome(tg->tdb, "filterTopScorers")) != NULL) { wordCt = chopLine(cloneString(setting), words); if (wordCt >= 3) { doScoreCtFilter = cartUsualBooleanClosestToHome(cart, tg->tdb, FALSE, "filterTopScorersOn",sameString(words[0], "on")); scoreFilterCt = cartUsualIntClosestToHome(cart, tg->tdb, FALSE, "filterTopScorersCt", atoi(words[1])); topTable = words[2]; /* if there are not too many rows in the table then can define */ /* top table as the track or subtrack table */ if (sameWord(topTable, "self")) topTable = cloneString(tg->table); } } /* Get list of items */ if (tg->isBigBed) { char *scoreFilter = cartOrTdbString(cart, tg->tdb, "scoreFilter", NULL); if (scoreFilter != NULL || tg->visibility != tvDense) { struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedSelectRange(tg, chromName, winStart, winEnd, lm); char *bedRow[32]; char startBuf[16], endBuf[16]; int minScore = 0; if (scoreFilter) 
minScore = atoi(scoreFilter); for (bb = bbList; bb != NULL; bb = bb->next) { bigBedIntervalToRow(bb, chromName, startBuf, endBuf, bedRow, ArraySize(bedRow)); bed = loader(bedRow); if (scoreFilter == NULL || bed->score >= minScore) slAddHead(&list, bed); } lmCleanup(&lm); } } else { struct sqlResult *sr = NULL; /* limit to items above a specified score */ char *scoreFilterClause = getScoreFilterClause(cart, tg->tdb,NULL); if (doScoreCtFilter && (topTable != NULL) && hTableExists(database, topTable)) { safef(query, sizeof(query),"select * from %s order by score desc limit %d", topTable, scoreFilterCt); sr = sqlGetResult(conn, query); rowOffset = hOffsetPastBin(database, hDefaultChrom(database), topTable); } else if(scoreFilterClause != NULL && tg->bedSize >= 5) { sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd, scoreFilterClause, &rowOffset); } else { sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd, NULL, &rowOffset); } freeMem(scoreFilterClause); while ((row = sqlNextRow(sr)) != NULL) { bed = loader(row+rowOffset); slAddHead(&list, bed); } sqlFreeResult(&sr); } if (doScoreCtFilter) { /* filter out items not in this window */ struct bed *newList = bedFilterListInRange(list, NULL, chromName, winStart, winEnd); list = newList; } slReverse(&list); hFreeConn(&conn); tg->items = list; }
static void filterBed(struct track *tg, struct linkedFeatures **pLfList) /* Apply filters if any to mRNA linked features. */ { struct linkedFeatures *lf, *next, *newList = NULL, *oldList = NULL; struct mrnaUiData *mud = tg->extraUiData; struct mrnaFilter *fil; char *type; boolean anyFilter = FALSE; boolean colorIx = 0; boolean isExclude = FALSE; boolean andLogic = TRUE; if (*pLfList == NULL || mud == NULL) return; /* First make a quick pass through to see if we actually have * to do the filter. */ for (fil = mud->filterList; fil != NULL; fil = fil->next) { fil->pattern = cartUsualString(cart, fil->key, ""); if (fil->pattern[0] != 0) anyFilter = TRUE; } if (!anyFilter) return; type = cartUsualString(cart, mud->filterTypeVar, "red"); if (sameString(type, "exclude")) isExclude = TRUE; else if (sameString(type, "include")) isExclude = FALSE; else colorIx = getFilterColor(type, MG_BLACK); type = cartUsualString(cart, mud->logicTypeVar, "and"); andLogic = sameString(type, "and"); /* Make a pass though each filter, and start setting up search for * those that have some text. */ for (fil = mud->filterList; fil != NULL; fil = fil->next) { fil->pattern = cartUsualString(cart, fil->key, ""); if (fil->pattern[0] != 0) { fil->hash = newHash(10); } } /* Scan tables id/name tables to build up hash of matching id's. 
*/ for (fil = mud->filterList; fil != NULL; fil = fil->next) { struct hash *hash = fil->hash; int wordIx, wordCount; char *words[128]; if (hash != NULL) { boolean anyWild; char *dupPat = cloneString(fil->pattern); wordCount = chopLine(dupPat, words); for (wordIx=0; wordIx <wordCount; ++wordIx) { char *pattern = cloneString(words[wordIx]); if (lastChar(pattern) != '*') { int len = strlen(pattern)+1; pattern = needMoreMem(pattern, len, len+1); pattern[len-1] = '*'; } anyWild = (strchr(pattern, '*') != NULL || strchr(pattern, '?') != NULL); touppers(pattern); for(lf = *pLfList; lf != NULL; lf=lf->next) { char copy[SMALLBUF]; boolean gotMatch; safef(copy, sizeof(copy), "%s", lf->name); touppers(copy); if (anyWild) gotMatch = wildMatch(pattern, copy); else gotMatch = sameString(pattern, copy); if (gotMatch) { hashAdd(hash, lf->name, NULL); } } freez(&pattern); } freez(&dupPat); } } /* Scan through linked features coloring and or including/excluding ones that * match filter. */ for (lf = *pLfList; lf != NULL; lf = next) { boolean passed = andLogic; next = lf->next; for (fil = mud->filterList; fil != NULL; fil = fil->next) { if (fil->hash != NULL) { if (hashLookup(fil->hash, lf->name) == NULL) { if (andLogic) passed = FALSE; } else { if (!andLogic) passed = TRUE; } } } if (passed ^ isExclude) { slAddHead(&newList, lf); if (colorIx > 0) lf->filterColor = colorIx; } else { slAddHead(&oldList, lf); } } slReverse(&newList); slReverse(&oldList); if (colorIx > 0) { /* Draw stuff that passes filter first in full mode, last in dense. */ if (tg->visibility == tvDense) { newList = slCat(oldList, newList); } else { newList = slCat(newList, oldList); } } *pLfList = newList; tg->limitedVisSet = FALSE; /* Need to recalculate this after filtering. */ /* Free up hashes, etc. */ for (fil = mud->filterList; fil != NULL; fil = fil->next) { hashFree(&fil->hash); } }
struct mafAli *mafFromBed12(char *database, char *track, struct bed *bed, struct slName *orgList) /* Construct a maf out of exons in bed. */ { /* Loop through all block in bed, collecting a list of mafs, one * for each block. While we're at make a hash of all species seen. */ struct hash *speciesHash = hashNew(0); struct mafAli *mafList = NULL, *maf, *bigMaf; struct mafComp *comp, *bigComp; int totalTextSize = 0; int i; for (i=0; i<bed->blockCount; ++i) { int start = bed->chromStart + bed->chromStarts[i]; int end = start + bed->blockSizes[i]; if (thickOnly) { start = max(start, bed->thickStart); end = min(end, bed->thickEnd); } if (start < end) { maf = hgMafFrag(database, track, bed->chrom, start, end, '+', database, NULL); slAddHead(&mafList, maf); for (comp = maf->components; comp != NULL; comp = comp->next) hashStore(speciesHash, comp->src); totalTextSize += maf->textSize; } } slReverse(&mafList); /* Add species in order list too */ struct slName *org; for (org = orgList; org != NULL; org = org->next) hashStore(speciesHash, org->name); /* Allocate memory for return maf that contains all blocks concatenated together. * Also fill in components with any species seen at all. */ AllocVar(bigMaf); bigMaf->textSize = totalTextSize; struct hashCookie it = hashFirst(speciesHash); struct hashEl *hel; while ((hel = hashNext(&it)) != NULL) { AllocVar(bigComp); bigComp->src = cloneString(hel->name); bigComp->text = needLargeMem(totalTextSize + 1); memset(bigComp->text, '.', totalTextSize); bigComp->text[totalTextSize] = 0; bigComp->strand = '+'; bigComp->srcSize = totalTextSize; /* It's safe if a bit of a lie. */ hel->val = bigComp; slAddHead(&bigMaf->components, bigComp); } /* Loop through maf list copying in data. 
*/ int textOffset = 0; for (maf = mafList; maf != NULL; maf = maf->next) { for (comp = maf->components; comp != NULL; comp = comp->next) { bigComp = hashMustFindVal(speciesHash, comp->src); memcpy(bigComp->text + textOffset, comp->text, maf->textSize); bigComp->size += comp->size; } textOffset += maf->textSize; } /* Cope with strand of darkness. */ if (bed->strand[0] == '-') { for (comp = bigMaf->components; comp != NULL; comp = comp->next) reverseComplement(comp->text, bigMaf->textSize); } /* If got an order list then reorder components according to it. */ if (orgList != NULL) { struct mafComp *newList = NULL; for (org = orgList; org != NULL; org = org->next) { comp = hashMustFindVal(speciesHash, org->name); slAddHead(&newList, comp); } slReverse(&newList); bigMaf->components = newList; } /* Rename our own component to bed name */ comp = hashMustFindVal(speciesHash, database); freeMem(comp->src); comp->src = cloneString(bed->name); /* Clean up and go home. */ hashFree(&speciesHash); mafAliFreeList(&mafList); return bigMaf; }
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed) /* Create a new broken-up that excludes part of gene between CDS breaks. * Also jiggles cds->end coordinate to cope with the sequence we remove. * Deals with transcript to genome coordinate mapping including negative * strand. Be afraid, be very afraid! */ { /* Create range tree covering all breaks. The coordinates here * are transcript coordinates. While we're out it shrink outer CDS * since we are actually shrinking transcript. */ struct rbTree *gapTree = rangeTreeNew(); int bedSize = bed->chromEnd - bed->chromStart; struct lm *lm = gapTree->lm; /* Convenient place to allocate memory. */ int i, lastCds = cds->cdsCount-1; for (i=0; i<lastCds; ++i) { int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i]; int gapEnd = cds->cdsStarts[i+1]; int gapSize = gapEnd - gapStart; cds->end -= gapSize; rangeTreeAdd(gapTree, gapStart, gapEnd); } /* Get list of exons in bed, flipped to reverse strand if need be. */ struct range *exon, *exonList = bedToExonList(bed, lm); if (bed->strand[0] == '-') flipExonList(&exonList, bedSize); /* Go through exon list, mapping each exon to transcript * coordinates. Check if exon needs breaking up, and if * so do so, as we copy it to new list. */ /* Copy exons to new list, breaking them up if need be. */ struct range *newList = NULL, *nextExon, *newExon; int txStartPos = 0, txEndPos; for (exon = exonList; exon != NULL; exon = nextExon) { txEndPos = txStartPos + exon->end - exon->start; nextExon = exon->next; struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos); if (gapList != NULL) { verbose(3, "Splitting exon because of CDS gap\n"); /* Make up exons from current position up to next gap. This is a little * complicated by possibly the gap starting before the exon. 
*/ int exonStart = exon->start; int txStart = txStartPos; struct range *gap; for (gap = gapList; gap != NULL; gap = gap->next) { int txEnd = gap->start; int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos); int exonSize = txEnd - txStart; if (exonSize > 0) { lmAllocVar(lm, newExon); newExon->start = exonStart; newExon->end = exonStart + exonSize; slAddHead(&newList, newExon); } else /* This case happens if gap starts before exon */ { exonSize = 0; } /* Update current position in both transcript and genome space. */ exonStart += exonSize + gapSize; txStart += exonSize + gapSize; } /* Make up final exon from last gap to end, at least if we don't end in a gap. */ if (exonStart < exon->end) { lmAllocVar(lm, newExon); newExon->start = exonStart; newExon->end = exon->end; slAddHead(&newList, newExon); } } else { /* Easy case where we don't intersect any gaps. */ slAddHead(&newList, exon); } txStartPos= txEndPos; } slReverse(&newList); /* Flip exons back to forward strand if need be */ if (bed->strand[0] == '-') flipExonList(&newList, bedSize); /* Convert exons to bed12 */ struct bed *newBed; AllocVar(newBed); newBed->chrom = cloneString(bed->chrom); newBed->chromStart = newList->start + bed->chromStart; newBed->chromEnd = newList->end + bed->chromStart; newBed->name = cloneString(bed->name); newBed->score = bed->score; newBed->strand[0] = bed->strand[0]; newBed->blockCount = slCount(newList); AllocArray(newBed->blockSizes, newBed->blockCount); AllocArray(newBed->chromStarts, newBed->blockCount); for (exon = newList, i=0; exon != NULL; exon = exon->next, i++) { newBed->chromStarts[i] = exon->start; newBed->blockSizes[i] = exon->end - exon->start; newBed->chromEnd = exon->end + bed->chromStart; } /* Clean up and go home. */ rbTreeFree(&gapTree); return newBed; }
struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom,
	bits32 start, bits32 end, int maxItems, struct lm *lm)
/* Get data for interval.  Return list allocated out of lm.  Set maxItems to maximum
 * number of items to return, or to 0 for all items.
 *   bbi        - open bigBed file.
 *   chrom      - chromosome to query.
 *   start, end - query range; items with s < end and e > start are returned.
 *   lm         - local memory pool that owns the returned list and strings.
 * Items are returned in the order they are read from the file. */
{
struct bigBedInterval *el, *list = NULL;
int itemCount = 0;
bbiAttachUnzoomedCir(bbi);
bits32 chromId;
struct fileOffsetSize *blockList = bbiOverlappingBlocks(bbi, bbi->unzoomedCir,
	chrom, start, end, &chromId);
struct fileOffsetSize *block, *beforeGap, *afterGap;
struct udcFile *udc = bbi->udc;
boolean isSwapped = bbi->isSwapped;
struct dyString *dy = dyStringNew(32);
char *mergedBuf = NULL;	/* Declared out here so the maxItems exit can free it too. */

/* Set up for uncompression optionally. */
char *uncompressBuf = NULL;
if (bbi->uncompressBufSize > 0)
    uncompressBuf = needLargeMem(bbi->uncompressBufSize);

for (block = blockList; block != NULL; )
    {
    /* Find contiguous blocks and read them into mergedBuf. */
    fileOffsetSizeFindGap(block, &beforeGap, &afterGap);
    bits64 mergedOffset = block->offset;
    bits64 mergedSize = beforeGap->offset + beforeGap->size - mergedOffset;
    udcSeek(udc, mergedOffset);
    mergedBuf = needLargeMem(mergedSize);
    udcMustRead(udc, mergedBuf, mergedSize);
    char *blockBuf = mergedBuf;

    /* Loop through individual blocks within merged section. */
    for (;block != afterGap; block = block->next)
        {
        /* Uncompress if necessary. */
        char *blockPt, *blockEnd;
        if (uncompressBuf)
            {
            blockPt = uncompressBuf;
            int uncSize = zUncompress(blockBuf, block->size, uncompressBuf,
                    bbi->uncompressBufSize);
            blockEnd = blockPt + uncSize;
            }
        else
            {
            blockPt = blockBuf;
            blockEnd = blockPt + block->size;
            }

        while (blockPt < blockEnd)
            {
            /* Read next record into local variables. */
            bits32 chr = memReadBits32(&blockPt, isSwapped);
            bits32 s = memReadBits32(&blockPt, isSwapped);
            bits32 e = memReadBits32(&blockPt, isSwapped);
            int c;
            dyStringClear(dy);
            /* Copy the zero-terminated rest of the record.  Only a zero byte
             * (or the end of the block) stops the copy, so bytes with the
             * high bit set are kept whether or not char is signed. */
            while (blockPt < blockEnd && (c = *blockPt++) != 0)
                dyStringAppendC(dy, c);

            /* If we're actually in range then copy it into a new element and add to list. */
            if (chr == chromId && s < end && e > start)
                {
                ++itemCount;
                if (maxItems > 0 && itemCount > maxItems)
                    break;

                lmAllocVar(lm, el);
                el->start = s;
                el->end = e;
                if (dy->stringSize > 0)
                    el->rest = lmCloneString(lm, dy->string);
                el->chromId = chromId;
                slAddHead(&list, el);
                }
            }
        if (maxItems > 0 && itemCount > maxItems)
            break;
        blockBuf += block->size;
        }
    if (maxItems > 0 && itemCount > maxItems)
        break;
    freez(&mergedBuf);
    }
freez(&mergedBuf);	/* Still allocated only if we broke out early on maxItems. */
freeMem(uncompressBuf);
dyStringFree(&dy);
slFreeList(&blockList);
slReverse(&list);
return list;
}
static struct linkedFeatures *cgapSageToLinkedFeatures(struct cgapSage *tag, struct hash *libHash, struct hash *libTotHash, enum trackVisibility vis) /* Convert a single CGAP tag to a list of linkedFeatures. */ { struct linkedFeatures *libList = NULL; struct linkedFeatures *skel = skeletonLf(tag); int i; if (vis == tvDense) /* Just use the skeleton one. */ { int tagTotal = 0; int freqTotal = 0; int libsUsed = 0; for (i = 0; i < tag->numLibs; i++) { char libId[16]; char *libName; safef(libId, sizeof(libId), "%d", tag->libIds[i]); libName = hashMustFindVal(libHash, libId); if (keepThisLib(libName, libId)) { int libTotal = hashIntVal(libTotHash, libId); tagTotal += libTotal; freqTotal += tag->freqs[i]; libsUsed++; } } if (libsUsed > 0) { skel->name = cloneString("whatever"); skel->score = (float)((double)freqTotal * (1000000/tagTotal)); skel->grayIx = grayIxForCgap(skel->score); addSimpleFeature(skel); libList = skel; } } else if (vis == tvPack) { /* If it's pack mode, average tissues into one linkedFeature. 
*/ struct hash *tpmHash = combineCgapSages(tag, libHash, libTotHash); struct hashEl *tpmList = hashElListHash(tpmHash); struct hashEl *tpmEl; slSort(&tpmList, slNameCmp); for (tpmEl = tpmList; tpmEl != NULL; tpmEl = tpmEl->next) { struct linkedFeatures *tiss = CloneVar(skel); struct cgapSageTpmHashEl *tpm = (struct cgapSageTpmHashEl *)tpmEl->val; char link[256]; char *encTissName = NULL; double score = 0; int len = strlen(tpmEl->name) + 32; tiss->name = needMem(len); safef(tiss->name, len, "%s (%d)", tpmEl->name, tpm->count); encTissName = cgiEncode(tpmEl->name); safef(link, sizeof(link), "i=%s&tiss=%s", tag->name, encTissName); score = (double)tpm->freqTotal*(1000000/(double)tpm->libTotals); tiss->score = (float)score; tiss->grayIx = grayIxForCgap(score); tiss->extra = cloneString(link); freeMem(encTissName); addSimpleFeature(tiss); slAddHead(&libList, tiss); } hashElFreeList(&tpmList); freeHashAndVals(&tpmHash); } else /* full mode */ { for (i = 0; i < tag->numLibs; i++) { char libId[16]; char *libName; char link[256]; struct linkedFeatures *lf; safef(libId, sizeof(libId), "%d", tag->libIds[i]); libName = hashMustFindVal(libHash, libId); if (keepThisLib(libName, libId)) { lf = CloneVar(skel); lf->name = cloneString(libName); safef(link, sizeof(link), "i=%s&lib=%s", tag->name, libId); lf->score = (float)tag->tagTpms[i]; lf->grayIx = grayIxForCgap(tag->tagTpms[i]); lf->extra = cloneString(link); addSimpleFeature(lf); slAddHead(&libList, lf); } } } slSort(&libList, cgapLinkedFeaturesCmp); slReverse(&libList); return libList; }
int main(int argc, char *argv[]) { char *genoListName; char *cdnaListName; char *oocFileName; char *hitFileName; char *mergerFileName; struct patSpace *patSpace; long startTime, endTime; char **genoList; int genoListSize; char *genoListBuf; char **cdnaList; int cdnaListSize; char *cdnaListBuf; char *genoName; int i; int estIx = 0; struct dnaSeq **seqListList = NULL, *seq; if (dumpMe) { bigHtmlFile = mustOpen("C:\\inetpub\\wwwroot\\test\\patAli.html", "w"); littleHtmlFile = mustOpen("C:\\inetpub\\wwwroot\\test\\patSpace.html", "w"); htmStart(bigHtmlFile, "PatSpace Alignments"); htmStart(littleHtmlFile, "PatSpace Index"); } if (argc != 6) usage(); startTime = clock1000(); dnaUtilOpen(); makePolys(); genoListName = argv[1]; cdnaListName = argv[2]; oocFileName = argv[3]; hitFileName = argv[4]; mergerFileName = argv[5]; readAllWords(genoListName, &genoList, &genoListSize, &genoListBuf); readAllWords(cdnaListName, &cdnaList, &cdnaListSize, &cdnaListBuf); hitOut = mustOpen(hitFileName, "w"); mergerOut = mustOpen(mergerFileName, "w"); dumpOut = mustOpen("dump.out", "w"); seqListList = needMem(genoListSize*sizeof(seqListList[0]) ); fprintf(hitOut, "Pattern space 0.2 cDNA matcher\n"); fprintf(hitOut, "cDNA files: ", cdnaListSize); for (i=0; i<cdnaListSize; ++i) fprintf(hitOut, " %s", cdnaList[i]); fprintf(hitOut, "\n"); fprintf(hitOut, "%d genomic files\n", genoListSize); for (i=0; i<genoListSize; ++i) { genoName = genoList[i]; if (!startsWith("//", genoName) ) { seqListList[i] = seq = faReadAllDna(genoName); fprintf(hitOut, "%d els in %s ", slCount(seq), genoList[i]); for (; seq != NULL; seq = seq->next) fprintf(hitOut, "%d ", seq->size); fprintf(hitOut, "\n"); } } patSpace = makePatSpace(seqListList, genoListSize, oocFileName); for (i=0; i<cdnaListSize; ++i) { FILE *f; char *estFileName; DNA *dna; char *estName; int size; int c; int maxSizeForFuzzyFind = 20000; int dotCount = 0; estFileName = cdnaList[i]; if (startsWith("//", estFileName) ) continue; f = 
mustOpen(estFileName, "rb"); while ((c = fgetc(f)) != EOF) if (c == '>') break; printf("%s", cdnaList[i]); fflush(stdout); while (fastFaReadNext(f, &dna, &size, &estName)) { if (size < maxSizeForFuzzyFind) /* Some day need to fix this somehow... */ { struct cdnaAliList *calList = NULL; patSpaceFindOne(patSpace, dna, size, '+', estName, estIx, &calList); reverseComplement(dna, size); patSpaceFindOne(patSpace, dna, size, '-', estName, estIx, &calList); slReverse(&calList); writeMergers(calList, estName, size, genoList); ++estIx; if ((estIx & 0xfff) == 0) { printf("."); ++dotCount; fflush(stdout); } } } printf("\n"); } printf("raw %4d ffSubmitted %3d ffAccepted %3d ffOkScore %3d ffSolidMatch %2d\n", grandTotalHits, ffSubmitted, ffAccepted, ffOkScore, ffSolidMatch); endTime = clock1000(); printf("Total time is %4.2f\n", 0.001*(endTime-startTime)); if (dumpMe) { htmEnd(bigHtmlFile); htmEnd(littleHtmlFile); } return 0; }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *geneList = NULL, *curGene, *gene; int geneIx, geneCount = 0; struct microData **geneArray = NULL; float *weights = NULL; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); long time1, time2; time1 = clock1000(); /* Get list/hash of all items with expression values. */ /* uglyf("warning: temporarily limited to 1000 records\n"); */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); time2 = clock1000(); verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Get an array for sorting. */ AllocArray(geneArray, geneCount); for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx) geneArray[geneIx] = gene; /* Print out closest 1000 in tab file. */ for (curGene = geneList; curGene != NULL; curGene = curGene->next) { calcDistances(curGene, geneList, weights); qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance); for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx) { gene = geneArray[geneIx]; fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance); } dotOut(); } printf("Made %s.tab\n", outTable); time1 = time2; time2 = clock1000(); verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); time1 = time2; time2 = clock1000(); verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0); }
void cloneSpan(char *fileName) /* cloneSpan - List clones and the amount the span by looking at .gl file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount, lineSize; char *words[16], *line; struct hash *hash = newHash(0); struct hashEl *hel; char *cloneName; int start, end; struct clone *cloneList = NULL, *clone; int totalSpan = 0, totalBases = 0; while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '#') continue; wordCount = chopLine(line, words); if (wordCount == 0) continue; if (wordCount < 3) lineFileExpectWords(lf, 3, wordCount); cloneName = words[0]; chopSuffix(cloneName); start = sqlUnsigned(words[1]); end = sqlUnsigned(words[2]); clone = hashFindVal(hash, cloneName); if (clone == NULL) { AllocVar(clone); hel = hashAdd(hash, cloneName, clone); clone->name = hel->name; clone->start = start; clone->end = end; slAddHead(&cloneList, clone); } else { if (clone->start > start) clone->start = start; if (clone->end < end) clone->end = end; } clone->baseCount += end-start; } lineFileClose(&lf); slReverse(&cloneList); for (clone = cloneList; clone != NULL; clone = clone->next) { int span = clone->end - clone->start; #ifdef SOMETIMES printf("clone %s, bases %d, spans %d, density %4.2f%%\n", clone->name, clone->baseCount, span, 100.0 * (double)clone->baseCount/(double)span); #endif totalSpan += span; totalBases += clone->baseCount; } printf("%s bases %d, spans %d, density %4.2f%%\n", fileName, totalBases, totalSpan, 100.0 * (double)totalBases/(double)totalSpan); }
struct cutter *readGcg(char *gcgFile) /* Parse a GCG file and load it into cutter format. */ { struct lineFile *lf = lineFileOpen(gcgFile,TRUE); struct cutter *enzList = NULL; char *line = "whatever", *words[10], numWords; /* Skip to the right line. */ while (lineFileNext(lf,&line,NULL) && !startsWith("..",line)); /* */ while ((numWords=lineFileChop(lf,words))) { struct cutter *newone = NULL; int comIx = (numWords==7) ? 5 : 6; int refIx = (numWords==7) ? 6 : 7; int i; char *items[100]; /* Skip ones */ if (words[4][0] == '?') continue; AllocVar(newone); newone->semicolon = (words[0][0] == ';') ? TRUE : FALSE; /* Deal with the first few columns */ if (!isdigit(words[1][0])) errAbort("Error: expecting a number in cut site column on line %d\n", lf->lineIx+1); if (!isdigit(words[3][0]) && words[3][0]!='-') errAbort("Error: expecting a number in the overhang column on line %d\n", lf->lineIx+1); if (words[comIx][0] != '>') errAbort("Error: expecting a \'>\' in the commercial sources column of line %d\n", lf->lineIx+1); newone->name = (words[0][0] == ';') ? 
cloneString(words[0]+1) : cloneString(words[0]); newone->cut = atoi(words[1]); newone->seq = cloneString(words[2]); touppers(newone->seq); stripChar(newone->seq,'\''); stripChar(newone->seq,'_'); newone->size = strlen(newone->seq); newone->matchSize = newone->size - countChars(newone->seq, 'N'); newone->palindromic = isPalindrome(newone->seq); newone->overhang = atoi(words[3]); newone->numCompanies = strlen(words[comIx]+1); if (newone->numCompanies > 0) newone->companies = cloneMem(words[comIx]+1, newone->numCompanies*sizeof(char)); newone->numRefs = chopString(words[refIx], ",", items, ArraySize(items)); AllocArray(newone->refs, newone->numRefs); for (i = 0; i < newone->numRefs; i++) { if (i == 100) errAbort("Error: Andy didn't make the array for holding references big enough\n"); if (!isdigit(items[i][0])) errAbort("Error: expecting number in references column in line %d\n", lf->lineIx+1); newone->refs[i] = atoi(items[i]); } /* Deal with isoscizomers. */ if (numWords == 8) { newone->numSciz = chopString(words[5], ",", items, ArraySize(items)); AllocArray(newone->scizs, newone->numSciz*sizeof(int)); for (i = 0; i < newone->numSciz; i++) { if (i == 100) errAbort("Error: Andy didn't make the array for having isoscizomers big enough\n"); newone->scizs[i] = cloneString(items[i]); } } else newone->numSciz = 0; slAddHead(&enzList, newone); } slReverse(&enzList); lineFileClose(&lf); return enzList; }
struct commit* getCommits() /* Get all commits from startTag to endTag */ { int numCommits = 0; safef(gitCmd,sizeof(gitCmd), "" "git log %s..%s --name-status > commits.tmp" , startTag, endTag); runShell(gitCmd); struct lineFile *lf = lineFileOpen("commits.tmp", TRUE); int lineSize; char *line; struct commit *commits = NULL, *commit = NULL; struct files *files = NULL, *f = NULL; char *sep = ""; while (lineFileNext(lf, &line, &lineSize)) { boolean isMerge = FALSE; char *w = nextWord(&line); AllocVar(commit); if (!sameString("commit", w)) errAbort("expected keyword commit parsing commits.tmp\n"); commit->commitId = cloneString(nextWord(&line)); commit->commitNumber = ++numCommits; lineFileNext(lf, &line, &lineSize); w = nextWord(&line); if (sameString("Merge:", w)) { isMerge = TRUE; lineFileNext(lf, &line, &lineSize); w = nextWord(&line); } if (!sameString("Author:", w)) errAbort("expected keyword Author: parsing commits.tmp\n"); /* by request, keep just the email account name */ char *lc = strchr(line, '<'); if (!lc) errAbort("expected '<' char in email address in Author: parsing commits.tmp\n"); ++lc; char *rc = strchr(lc, '>'); if (!rc) errAbort("expected '>' char in email address in Author: parsing commits.tmp\n"); char *ac = strchr(lc, '@'); if (ac) rc = ac; commit->author = cloneStringZ(lc, rc-lc); lineFileNext(lf, &line, &lineSize); w = nextWord(&line); if (!sameString("Date:", w)) errAbort("expected keyword Date: parsing commits.tmp\n"); commit->date = cloneString(line); lineFileNext(lf, &line, &lineSize); if (!sameString("", line)) errAbort("expected blank line parsing commits.tmp\n"); /* collect the comment-lines */ struct dyString *dy = NULL; dy = dyStringNew(0); sep = ""; files = NULL; while (lineFileNext(lf, &line, &lineSize)) { if (sameString("", line)) break; w = skipLeadingSpaces(line); dyStringPrintf(dy, "%s%s", w, sep); sep = "\n"; } commit->comment = cloneString(dy->string); freeDyString(&dy); if (!isMerge) { /* collect the files-list */ while 
(lineFileNext(lf, &line, &lineSize)) { if (sameString("", line)) break; AllocVar(f); w = nextWord(&line); f->type = w[0]; f->path = cloneString(line); slAddHead(&files, f); } slReverse(&files); } commit->files = files; if (!isMerge /* for now, default to filtering out the records for automatic-merges */ && !endsWith(commit->comment, "elease log update")) /* filter out automatic release log commits */ slAddHead(&commits, commit); verbose(2, "commitId: %s\n" "author: %s\n" "date: %s\n" "comment: [%s]\n" "file(s): \n" , commit->commitId , commit->author , commit->date , commit->comment); for (f=commit->files; f; f = f->next) { verbose(2, "%c %s\n", f->type, f->path); // anything other than M or A? if (f->type != 'M' && f->type != 'A' ) verbose(2, "special type: %c %s\n", f->type, f->path); } verbose(2, "------------\n"); } lineFileClose(&lf); /* We want to keep them chronological order, so do not need slReverse since the addHead reversed git log's rev chron order already */ unlink("commits.tmp"); return commits; }
struct bed *matchEnzymes(struct cutter *cutters, struct dnaSeq *seq, int startOffset) /* Match the enzymes to sequence and return a bed list in all cases. */ { struct hash *sixers = newHash(8), *palinSixers = newHash(8); struct cutter *enz; struct cutter *ACGTo[5], *palinACGTo[5]; struct bed *bedList = NULL, *tmp; int i; if (!cutters) return NULL; for (i = 0; i < 5; i++) ACGTo[i] = palinACGTo[i] = NULL; /* Put each of the enzymes in either a hash table of six-cutters or */ enz = cutters; while (enz != NULL) { int acgtCount = 0; struct cutter *next = enz->next; acgtCount = countChars(enz->seq,'A') + countChars(enz->seq,'C') + countChars(enz->seq,'G') + countChars(enz->seq,'T'); /* Super dumb coding here but it's quick. */ if (enz->palindromic) { if (enz->size==6 && acgtCount==6) hashAdd(palinSixers, enz->seq, enz); else { if (enz->seq[0] == 'A') slAddHead(&palinACGTo[0], enz); else if (enz->seq[0] == 'C') slAddHead(&palinACGTo[1], enz); else if (enz->seq[0] == 'G') slAddHead(&palinACGTo[2], enz); else if (enz->seq[0] == 'T') slAddHead(&palinACGTo[3], enz); else { slAddHead(&palinACGTo[4], enz); } } } else { if (enz->size==6 && acgtCount==6) hashAdd(sixers, enz->seq, enz); else { if (enz->seq[0] == 'A') slAddHead(&ACGTo[0], enz); else if (enz->seq[0] == 'C') slAddHead(&ACGTo[1], enz); else if (enz->seq[0] == 'G') slAddHead(&ACGTo[2], enz); else if (enz->seq[0] == 'T') slAddHead(&ACGTo[3], enz); else slAddHead(&ACGTo[4], enz); } } enz = next; } /* At this point we got a hash for the palindromes and non-palindromic six-cutters, plus an array for each too. The array is set up so the enzymes starting with 'A' go into [0], 'C' into [1], 'G' into [2], 'T' into [3], and other bases into [4]. 
*/ if (ACGTo[4]) { ACGTo[0] = slCat(ACGTo[0], ACGTo[4]); ACGTo[1] = slCat(ACGTo[1], ACGTo[4]); ACGTo[2] = slCat(ACGTo[2], ACGTo[4]); ACGTo[3] = slCat(ACGTo[3], ACGTo[4]); } if (palinACGTo[4]) { palinACGTo[0] = slCat(palinACGTo[0], palinACGTo[4]); palinACGTo[1] = slCat(palinACGTo[1], palinACGTo[4]); palinACGTo[2] = slCat(palinACGTo[2], palinACGTo[4]); palinACGTo[3] = slCat(palinACGTo[3], palinACGTo[4]); } /* Search the DNA in three ways: on the plus strand for both palindromes and nonpalindromes, and then just nonpalindromes on the minus strand. */ bedList = searchStrand(palinSixers, palinACGTo, seq, startOffset, '+'); tmp = searchStrand(sixers, ACGTo, seq, startOffset, '+'); bedList = slCat(bedList, tmp); reverseComplement(seq->dna, seq->size); tmp = searchStrand(sixers, ACGTo, seq, startOffset, '-'); bedList = slCat(bedList, tmp); if (bedList) slReverse(&bedList); return bedList; }
void oneChrom(char *database, char *chrom, char *refAliTrack, char *bedTrack, struct hash *otherHash, struct stats *stats) /* Process one chromosome. */ { struct bed *bedList = NULL, *bed; struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); struct psl *pslList = NULL; struct dnaSeq *chromSeq = NULL; if (endsWith(bedTrack, ".bed")) { struct lineFile *lf = lineFileOpen(bedTrack, TRUE); char *row[3]; while (lineFileRow(lf, row)) { if (sameString(chrom, row[0])) { bed = bedLoad3(row); slAddHead(&bedList, bed); } } lineFileClose(&lf); } else { sr = hChromQuery(conn, bedTrack, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { bed = bedLoad3(row+rowOffset); slAddHead(&bedList, bed); } sqlFreeResult(&sr); } slReverse(&bedList); uglyf("Loaded beds\n"); sr = hChromQuery(conn, refAliTrack, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct psl *psl = pslLoad(row + rowOffset); slAddHead(&pslList, psl); binKeeperAdd(bk, psl->tStart, psl->tEnd, psl); } sqlFreeResult(&sr); uglyf("Loaded psls\n"); chromSeq = hLoadChrom(database, chrom); /* Fetch entire chromosome into memory. */ uglyf("Loaded human seq\n"); for (bed = bedList; bed != NULL; bed = bed->next) { struct binElement *el, *list = binKeeperFind(bk, bed->chromStart, bed->chromEnd); for (el = list; el != NULL; el = el->next) { struct psl *fullPsl = el->val; struct psl *psl = pslTrimToTargetRange(fullPsl, bed->chromStart, bed->chromEnd); if (psl != NULL) { foldPslIntoStats(psl, chromSeq, otherHash, stats); pslFree(&psl); } } slFreeList(&list); stats->bedCount += 1; stats->bedBaseCount += bed->chromEnd - bed->chromStart; sqlFreeResult(&sr); } freeDnaSeq(&chromSeq); pslFreeList(&pslList); binKeeperFree(&bk); hFreeConn(&conn); }
void reportAlt3Prime(struct altGraphX *ag, bool **em, int vs, int ve1, int ve2, int altBpStart, int altBpEnd, int startV, int endV, FILE *out) /* Write out an altGraphX record for an alt3Prime splicing event. Variable names are consistent with the rest of the program, but can be misleading. Specifically vs = start of alt splicing, ve1 = first end of alt splicing, etc. even though "vs" is really the end of an exon. For an alt5Prime splice the edges are: Name Vertexes Class ------ ---------- ----- exon1: startV->vs constituative (0) junction1: vs->ve1 alternative (1) junction2: vs->ve2 alternative (2) exon2: ve1->e2 alternative (1) exon3: ve2->endV constituative (0) */ { struct altGraphX *agLoc = NULL; /* Local altGraphX. */ struct evidence *ev = NULL, *evLoc = NULL; int *vPos = ag->vPositions; unsigned char *vT = ag->vTypes; int *vPosLoc = NULL; /* Vertex Positions. */ int *eStartsLoc = NULL; /* Edge Starts. */ int *eEndsLoc = NULL; /* Edge ends. */ unsigned char *vTLoc = NULL; /* Vertex Types. */ int *eTLoc = NULL; /* Edge Types. */ int vCLoc = 0; int eCLoc = 0; int edgeIx = 0, vertexIx = 0; int i =0; struct dyString *dy = NULL; if(out == NULL) return; AllocVar(agLoc); agLoc->tName = cloneString(ag->tName); agLoc->name = cloneString(ag->name); agLoc->tStart = vPos[startV]; agLoc->tEnd = vPos[endV]; agLoc->strand[0] = ag->strand[0]; agLoc->vertexCount = vCLoc = 6; agLoc->edgeCount = eCLoc = 5; agLoc->id = alt3Prime; /* Allocate some arrays. */ AllocArray(vPosLoc, vCLoc); AllocArray(eStartsLoc, eCLoc); AllocArray(eEndsLoc, eCLoc); AllocArray(vTLoc, vCLoc); AllocArray(eTLoc, eCLoc); /* Fill in the vertex positions. */ vertexIx = 0; vPosLoc[vertexIx++] = vPos[startV]; /* 0 */ vPosLoc[vertexIx++] = vPos[vs]; /* 1 */ vPosLoc[vertexIx++] = vPos[ve1]; /* 2 */ vPosLoc[vertexIx++] = vPos[ve2]; /* 3 */ vPosLoc[vertexIx++] = vPos[ve2]; /* 4 */ vPosLoc[vertexIx++] = vPos[endV]; /* 5 */ /* Fill in the vertex types. 
*/ vertexIx = 0; vTLoc[vertexIx++] = vT[startV]; vTLoc[vertexIx++] = vT[vs]; vTLoc[vertexIx++] = vT[ve1]; vTLoc[vertexIx++] = vT[vs]; /* Faking a separate exon for the alt spliced portion. */ vTLoc[vertexIx++] = vT[ve2]; vTLoc[vertexIx++] = vT[endV]; edgeIx = 0; /* Constitutive first exon. */ eStartsLoc[edgeIx] = 0; eEndsLoc[edgeIx] = 1; eTLoc[edgeIx] = 0; ev = evidenceForEdge(ag, startV, vs); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Alternative1 junction (shorter). */ eStartsLoc[edgeIx] = 1; eEndsLoc[edgeIx] = 2; eTLoc[edgeIx] = 1; ev = evidenceForEdge(ag, vs, ve1); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Alt2 junction (longer). */ eStartsLoc[edgeIx] = 1; eEndsLoc[edgeIx] = 4; eTLoc[edgeIx] = 2; ev = evidenceForEdge(ag, vs, ve2); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Alt1 portion of second exon. */ eStartsLoc[edgeIx] = 2; eEndsLoc[edgeIx] = 3; eTLoc[edgeIx] = 1; ev = evidenceForEdge(ag, ve1, endV); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Exon 2 constitutive (shorter exon) */ eStartsLoc[edgeIx] = 4; eEndsLoc[edgeIx] = 5; eTLoc[edgeIx] = 0; ev = evidenceForEdge(ag, ve2, endV); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Package up the evidence, tissues, etc. 
*/ slReverse(&agLoc->evidence); dy = newDyString(ag->mrnaRefCount*36); agLoc->mrnaRefCount = ag->mrnaRefCount; for(i=0; i<ag->mrnaRefCount; i++) dyStringPrintf(dy, "%s,", ag->mrnaRefs[i]); sqlStringDynamicArray(dy->string, &agLoc->mrnaRefs, &i); dyStringFree(&dy); agLoc->mrnaTissues = CloneArray(ag->mrnaTissues, ag->mrnaRefCount); agLoc->mrnaLibs = CloneArray(ag->mrnaLibs, ag->mrnaRefCount); agLoc->vPositions = vPosLoc; agLoc->edgeStarts = eStartsLoc; agLoc->edgeEnds = eEndsLoc; agLoc->vTypes = vTLoc; agLoc->edgeTypes = eTLoc; altGraphXTabOut(agLoc, out); altGraphXFree(&agLoc); }
int main(int argc, char *argv[]) { char *genoListName; char *cdnaListName; char *oocFileName; char *pairFileName; struct patSpace *patSpace; long startTime, endTime; char **genoList; int genoListSize; char *genoListBuf; char **cdnaList; int cdnaListSize; char *cdnaListBuf; char *genoName; int i; int estIx = 0; struct dnaSeq **seqListList = NULL, *seq; static char hitFileName[512], mergerFileName[512], okFileName[512]; char *outRoot; struct hash *pairHash; if (dumpMe) { bigHtmlFile = mustOpen("C:\\inetpub\\wwwroot\\test\\patAli.html", "w"); littleHtmlFile = mustOpen("C:\\inetpub\\wwwroot\\test\\patSpace.html", "w"); htmStart(bigHtmlFile, "PatSpace Alignments"); htmStart(littleHtmlFile, "PatSpace Index"); } if ((hostName = getenv("HOST")) == NULL) hostName = ""; if (argc != 6) usage(); pushWarnHandler(patSpaceWarnHandler); startTime = clock1000(); dnaUtilOpen(); makePolys(); genoListName = argv[1]; cdnaListName = argv[2]; oocFileName = argv[3]; pairFileName = argv[4]; outRoot = argv[5]; sprintf(hitFileName, "%s.hit", outRoot); sprintf(mergerFileName, "%s.glu", outRoot); sprintf(okFileName, "%s.ok", outRoot); readAllWords(genoListName, &genoList, &genoListSize, &genoListBuf); readAllWords(cdnaListName, &cdnaList, &cdnaListSize, &cdnaListBuf); pairHash = makePairHash(pairFileName); hitOut = mustOpen(hitFileName, "w"); mergerOut = mustOpen(mergerFileName, "w"); dumpOut = mustOpen("dump.out", "w"); seqListList = needMem(genoListSize*sizeof(seqListList[0]) ); fprintf(hitOut, "Pattern space 0.2 cDNA matcher\n"); fprintf(hitOut, "cDNA files: ", cdnaListSize); for (i=0; i<cdnaListSize; ++i) fprintf(hitOut, " %s", cdnaList[i]); fprintf(hitOut, "\n"); fprintf(hitOut, "%d genomic files\n", genoListSize); for (i=0; i<genoListSize; ++i) { genoName = genoList[i]; if (!startsWith("//", genoName) ) { seqListList[i] = seq = faReadAllDna(genoName); fprintf(hitOut, "%d els in %s ", slCount(seq), genoList[i]); for (; seq != NULL; seq = seq->next) fprintf(hitOut, "%d ", seq->size); 
fprintf(hitOut, "\n"); } } patSpace = makePatSpace(seqListList, genoListSize, oocFileName); for (i=0; i<cdnaListSize; ++i) { FILE *f; char *estFileName; DNA *dna; char *estName; int size; int c; int maxSizeForFuzzyFind = 20000; int dotCount = 0; estFileName = cdnaList[i]; if (startsWith("//", estFileName) ) continue; f = mustOpen(estFileName, "rb"); while ((c = fgetc(f)) != EOF) if (c == '>') break; printf("%s", cdnaList[i]); fflush(stdout); while (fastFaReadNext(f, &dna, &size, &estName)) { aliSeqName = estName; if (size < maxSizeForFuzzyFind) /* Some day need to fix this somehow... */ { struct hashEl *hel; struct cdnaAliList *calList = NULL; hel = hashLookup(pairHash, estName); if (hel != NULL) /* Do pair processing. */ { struct estPair *ep; struct seq *thisSeq, *otherSeq; ep = hel->val; if (hel->name == ep->name3) { thisSeq = &ep->seq3; otherSeq = &ep->seq5; } else { thisSeq = &ep->seq5; otherSeq = &ep->seq3; } if (otherSeq->dna == NULL) /* First in pair - need to save sequence. */ { thisSeq->size = size; thisSeq->dna = needMem(size); memcpy(thisSeq->dna, dna, size); } else /* Second in pair - do gluing and free partner. 
*/ { char mergedName[64]; thisSeq->dna = dna; thisSeq->size = size; sprintf(mergedName, "%s_AND_%s", ep->name5, ep->name3); patSpaceFindOne(patSpace, ep->seq5.dna, ep->seq5.size, '+', '5', ep->name5, &calList); reverseComplement(ep->seq5.dna, ep->seq5.size); patSpaceFindOne(patSpace, ep->seq5.dna, ep->seq5.size, '-', '5', ep->name5, &calList); patSpaceFindOne(patSpace, ep->seq3.dna, ep->seq3.size, '+', '3', ep->name3, &calList); reverseComplement(ep->seq3.dna, ep->seq3.size); patSpaceFindOne(patSpace, ep->seq3.dna, ep->seq3.size, '-', '3', ep->name3, &calList); slReverse(&calList); writeMergers(calList, mergedName, genoList); freez(&otherSeq->dna); thisSeq->dna = NULL; thisSeq->size =otherSeq->size = 0; } } else { patSpaceFindOne(patSpace, dna, size, '+', '5', estName, &calList); reverseComplement(dna, size); patSpaceFindOne(patSpace, dna, size, '-', '5', estName, &calList); slReverse(&calList); writeMergers(calList, estName, genoList); } ++estIx; if ((estIx & 0xfff) == 0) { printf("."); ++dotCount; fflush(stdout); } } } printf("\n"); } aliSeqName = ""; printf("ffSubmitted %3d ffAccepted %3d ffOkScore %3d ffSolidMatch %2d\n", ffSubmitted, ffAccepted, ffOkScore, ffSolidMatch); endTime = clock1000(); printf("Total time is %4.2f\n", 0.001*(endTime-startTime)); /* Write out file who's presense say's we succeeded */ { FILE *f = mustOpen(okFileName, "w"); fputs("ok", f); fclose(f); } if (dumpMe) { htmEnd(bigHtmlFile); htmEnd(littleHtmlFile); } return 0; }
void reportCassette(struct altGraphX *ag, bool **em, int vs, int ve1, int ve2, int altBpStart, int altBpEnd, int startV, int endV, FILE *out) /* Write out both an altGraphX and two bed files. For a cassette exon the edges are - Name Vertexes Class ------ ---------- ----- exon1: startV->vs constitutive (cons 0) junction1: vs->ve1 alternative1 (alt1 1) exon2: ve1->altBpEnd alternative1 (alt1 1) junction2: altBpEnd->ve2 alternative1 (alt1 1) exon3: ve2->endV constitutive (cons 0) junction3: vs->ve2 alternative2 (alt2 2) */ { struct altGraphX *agLoc = NULL; /* Local altGraphX. */ struct evidence *ev = NULL, *evLoc = NULL; int *vPos = ag->vPositions; unsigned char *vT = ag->vTypes; int *vPosLoc = NULL; /* Vertex Positions. */ int *eStartsLoc = NULL; /* Edge Starts. */ int *eEndsLoc = NULL; /* Edge ends. */ unsigned char *vTLoc = NULL; /* Vertex Types. */ int *eTLoc = NULL; /* Edge Types. */ int vCLoc = 0; int eCLoc = 0; int i =0; struct dyString *dy = NULL; if(out == NULL) return; AllocVar(agLoc); agLoc->tName = cloneString(ag->tName); agLoc->name = cloneString(ag->name); agLoc->tStart = vPos[startV]; agLoc->tEnd = vPos[endV]; agLoc->strand[0] = ag->strand[0]; agLoc->vertexCount = vCLoc = 6; agLoc->edgeCount = eCLoc = 6; agLoc->id = altCassette; /* Allocate some arrays. */ AllocArray(vPosLoc, vCLoc); AllocArray(eStartsLoc, vCLoc); AllocArray(eEndsLoc, vCLoc); AllocArray(vTLoc, vCLoc); AllocArray(eTLoc, vCLoc); /* Fill in the vertex positions. */ vPosLoc[0] = vPos[startV]; vPosLoc[1] = vPos[vs]; vPosLoc[2] = vPos[ve1]; vPosLoc[3] = vPos[altBpEnd]; vPosLoc[4] = vPos[ve2]; vPosLoc[5] = vPos[endV]; /* Fill in the vertex types. */ vTLoc[0] = vT[startV]; vTLoc[1] = vT[vs]; vTLoc[2] = vT[ve1]; vTLoc[3] = vT[altBpEnd]; vTLoc[4] = vT[ve2]; vTLoc[5] = vT[endV]; /* Fill in the edges. */ /* Constitutive first exon. 
*/ eStartsLoc[0] = 0; eEndsLoc[0] = 1; eTLoc[0] = 0; ev = evidenceForEdge(ag, startV, vs); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Exon inclusion junction. */ eStartsLoc[1] = 1; eEndsLoc[1] = 2; eTLoc[1] = 1; ev = evidenceForEdge(ag, vs, ve1); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Exon exclusion junction. */ eStartsLoc[2] = 1; eEndsLoc[2] = 4; eTLoc[2] = 2; ev = evidenceForEdge(ag, vs, ve2); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Cassette exon. */ eStartsLoc[3] = 2; eEndsLoc[3] = 3; eTLoc[3] = 1; ev = evidenceForEdge(ag, ve1, altBpEnd); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Exon inclusion junction. */ eStartsLoc[4] = 3; eEndsLoc[4] = 4; eTLoc[4] = 1; ev = evidenceForEdge(ag, altBpEnd, ve2); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Constitutive second exon. */ eStartsLoc[5] = 4; eEndsLoc[5] = 5; eTLoc[5] = 0; ev = evidenceForEdge(ag, ve2, endV); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); slReverse(&agLoc->evidence); dy = newDyString(ag->mrnaRefCount*36); agLoc->mrnaRefCount = ag->mrnaRefCount; for(i=0; i<ag->mrnaRefCount; i++) dyStringPrintf(dy, "%s,", ag->mrnaRefs[i]); sqlStringDynamicArray(dy->string, &agLoc->mrnaRefs, &i); dyStringFree(&dy); agLoc->mrnaTissues = CloneArray(ag->mrnaTissues, ag->mrnaRefCount); agLoc->mrnaLibs = CloneArray(ag->mrnaLibs, ag->mrnaRefCount); agLoc->vPositions = vPosLoc; agLoc->edgeStarts = eStartsLoc; agLoc->edgeEnds = eEndsLoc; agLoc->vTypes = vTLoc; agLoc->edgeTypes = eTLoc; altGraphXTabOut(agLoc, out); altGraphXFree(&agLoc); }
void loadSampleZoo(struct track *tg) /* Convert sample info in window to linked feature. */ { int maxWiggleTrackHeight = 2500; struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; struct sample *sample; struct linkedFeatures *lfList = NULL, *lf; char *hasDense = NULL; char *where = NULL; char query[256]; char option[64]; zooSpeciesHashInit(); /*see if we have a summary table*/ safef(query, sizeof(query), "select name from %s where name = '%s' limit 1", tg->table, tg->shortLabel); //errAbort( "%s", query ); hasDense = sqlQuickQuery(conn, query, query, sizeof(query)); /* If we're in dense mode and have a summary table load it. */ if(tg->visibility == tvDense) { if(hasDense != NULL) { safef(query, sizeof(query), " name = '%s' ", tg->shortLabel); where = cloneString(query); } } sr = hRangeQuery(conn, tg->table, chromName, winStart, winEnd, where, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { sample = sampleLoad(row + rowOffset); lf = lfFromSample(sample); safef( option, sizeof(option), "zooSpecies.%s", sample->name ); if( cartUsualBoolean(cart, option, TRUE )) slAddHead(&lfList, lf); sampleFree(&sample); } if(where != NULL) freez(&where); sqlFreeResult(&sr); hFreeConn(&conn); slReverse(&lfList); /* sort to bring items with common names to the same line * but only for tracks with a summary table * (with name=shortLabel) in dense mode */ if( hasDense != NULL ) { sortGroupList = tg; /* used to put track name at top of sorted list. */ slSort(&lfList, lfNamePositionCmp); sortGroupList = NULL; } /* Sort in species phylogenetic order */ slSort(&lfList, lfZooCmp); tg->items = lfList; /*turn off full mode if there are too many rows or each row is too * large. A total of maxWiggleTrackHeight is allowed for number of * rows times the rowHeight*/ if( tg->visibility == tvFull && sampleTotalHeight( tg, tvFull ) > maxWiggleTrackHeight ) { tg->limitedVisSet = TRUE; tg->limitedVis = tvDense; } }
void agpVsMap(char *agpName, char *infoName, char *gifName) /* agpVsMap - Plot clones in agp vs. map coordinates. */ { struct mapPos *mapList, *mp; struct agpFrag *agpList, *bp; struct hash *cloneHash = newHash(14); struct hashEl *hel; struct cloneInfo *cloneList = NULL, *clone; struct memGfx *mg = NULL; int pixWidth = 600; int pixHeight = 600; int rulerHeight = 20; int maxMapPos = 0, maxAgpPos = 0; double scaleMap, scaleAgp; Color orange, green; mapList = readInfoFile(infoName); agpList = readAgpFile(agpName); for (mp = mapList; mp != NULL; mp = mp->next) { if (mp->phase > 0) { AllocVar(clone); hel = hashAddUnique(cloneHash, mp->cloneName, clone); clone->name = hel->name; clone->mp = mp; slAddHead(&cloneList, clone); if (mp->pos > maxMapPos) maxMapPos = mp->pos; } } slReverse(&cloneList); for (bp = agpList; bp != NULL; bp = bp->next) { if (bp->chromStart > maxAgpPos) maxAgpPos = bp->chromStart; } /* Draw scatterplot on bitmap. */ mg = mgNew(pixWidth, pixHeight); mgClearPixels(mg); orange = mgFindColor(mg, 210, 150, 0); green = mgFindColor(mg, 0, 200, 0); mgDrawRuler(mg, 0, pixHeight-rulerHeight, rulerHeight, pixWidth, MG_BLACK, mgSmallFont(), 0, maxMapPos+1); scaleMap = (double)pixWidth/(double)(maxMapPos+1.0); scaleAgp = (double)(pixHeight)/(double)(maxAgpPos+1.0); for (bp = agpList; bp != NULL; bp = bp->next) { char cloneName[128]; fragToCloneName(bp->frag, cloneName); clone = hashFindVal(cloneHash, cloneName); if (clone == NULL) warn("%s is in %s but not %s", cloneName, agpName, infoName); else { int x = round(scaleMap*clone->mp->pos); int y = pixHeight - round(scaleAgp*bp->chromStart); int phase = clone->mp->phase; int back; if (phase <= 1) back = green; else if (phase == 2) back = orange; else back = MG_RED; drawPlus(mg, x, y, back); } } mgSaveGif(mg, gifName); }