struct genoLay *genoLayNew(struct genoLayChrom *chromList,
	MgFont *font, int picWidth, int betweenChromHeight,
	int minLeftLabelWidth, int minRightLabelWidth,
	char *how)
/* Figure out layout.  For human and most mammals this will be
 * two columns with sex chromosomes on bottom.  This is complicated
 * by the platypus having a bunch of sex chromosomes.
 *
 * Parameters:
 *   chromList          - chromosomes to lay out; x, y, width and height
 *                        of each element are filled in here.
 *   font               - font used to measure label widths and line height.
 *   picWidth           - total picture width in pixels.
 *   betweenChromHeight - vertical gap added to the font height per line.
 *   minLeftLabelWidth,
 *   minRightLabelWidth - minimum pixel widths reserved for side labels.
 *   how                - genoLayOnePerLine puts every chromosome in the left
 *                        column, genoLayAllOneLine puts them all on one line,
 *                        anything else gives the two-column layout with the
 *                        chromosomes picked by separateSexChroms() on the
 *                        bottom line.
 * Returns a newly allocated genoLay that refers to (does not copy) chromList. */
{
int margin = 3;
struct slRef *refList = NULL, *ref, *left, *right;
struct genoLayChrom *chrom;
struct genoLay *gl;
int autoCount, halfCount, bases;
int leftLabelWidth=0, rightLabelWidth=0, labelWidth;
int spaceWidth = mgFontCharWidth(font, ' ');
int extraLabelPadding = 0;
int autosomeOtherPixels=0, sexOtherPixels=0;
int autosomeBasesInLine=0;	/* Maximum bases in a line for autosome. */
int sexBasesInLine=0;		/* Bases in line for sex chromosome. */
double sexBasesPerPixel, basesPerPixel;
int pos = margin;
int y = 0;
int fontHeight = mgFontLineHeight(font);
int chromHeight = fontHeight;
int lineHeight = chromHeight + betweenChromHeight;
boolean allOneLine = FALSE;

refList = refListFromSlList(chromList);

/* Allocate genoLay object and fill in simple fields.  The code below
 * relies on the remaining fields (lineCount, rightList, bottomList ...)
 * starting out as zero/NULL. */
AllocVar(gl);
gl->chromList = chromList;
gl->chromHash = hashNew(0);
gl->font = font;
gl->picWidth = picWidth;
gl->margin = margin;
gl->spaceWidth = spaceWidth;
gl->lineHeight = lineHeight;
gl->betweenChromHeight = betweenChromHeight;
gl->betweenChromOffsetY = 0;
gl->chromHeight = chromHeight;
gl->chromOffsetY = lineHeight - chromHeight;

/* Save chromosomes in hash too, for easy access */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    hashAdd(gl->chromHash, chrom->fullName, chrom);

if (sameString(how, genoLayOnePerLine))
    {
    gl->leftList = refList;
    }
else if (sameString(how, genoLayAllOneLine))
    {
    gl->bottomList = refList;
    allOneLine = TRUE;
    }
else
    {
    /* Put sex chromosomes on bottom, and rest on left. */
    separateSexChroms(refList, &refList, &gl->bottomList);
    autoCount = slCount(refList);
    gl->leftList = refList;

    /* If there are a lot of chromosomes, then move later
     * (and smaller) chromosomes to a new right column */
    if (autoCount > 12)
	{
	halfCount = (autoCount+1)/2;
	ref = slElementFromIx(refList, halfCount-1);
	gl->rightList = ref->next;
	ref->next = NULL;
	slReverse(&gl->rightList);
	}
    }

if (allOneLine)
    {
    unsigned long totalBases = 0, bStart=0, bEnd;
    int chromCount = 0, chromIx=0;
    for (ref = gl->bottomList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	totalBases += chrom->size;
	chromCount += 1;
	}
    /* One pixel is left between neighbouring chromosomes, hence the
     * (chromCount-1) term. */
    int availablePixels = picWidth - minLeftLabelWidth - minRightLabelWidth
	    - 2*margin - (chromCount-1);
    basesPerPixel = (double)totalBases/availablePixels;
    for (ref = gl->bottomList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	bEnd = bStart + chrom->size;
	int pixStart = round(bStart / basesPerPixel);
	int pixEnd = round(bEnd / basesPerPixel);
	chrom->width = pixEnd - pixStart;
	chrom->height = lineHeight;
	chrom->x = pixStart + margin + chromIx + minLeftLabelWidth;
	chrom->y = 0;
	chromIx += 1;
	bStart = bEnd;
	}
    gl->lineCount = 1;
    gl->picHeight = 2*margin + lineHeight + fontHeight + 1;
    gl->allOneLine = TRUE;
    gl->leftLabelWidth = minLeftLabelWidth;
    gl->rightLabelWidth = minRightLabelWidth;
    gl->basesPerPixel = basesPerPixel;
    gl->pixelsPerBase = 1.0/basesPerPixel;
    }
else
    {
    /* Figure out space needed for autosomes. */
    left = gl->leftList;
    right = gl->rightList;
    while (left || right)
	{
	bases = 0;
	if (left)
	    {
	    chrom = left->val;
	    labelWidth = mgFontStringWidth(font, chrom->shortName) + spaceWidth;
	    if (leftLabelWidth < labelWidth)
		leftLabelWidth = labelWidth;
	    bases = chrom->size;
	    left = left->next;
	    }
	if (right)
	    {
	    chrom = right->val;
	    labelWidth = mgFontStringWidth(font, chrom->shortName) + spaceWidth;
	    if (rightLabelWidth < labelWidth)
		rightLabelWidth = labelWidth;
	    bases += chrom->size;
	    right = right->next;
	    }
	if (autosomeBasesInLine < bases)
	    autosomeBasesInLine = bases;
	gl->lineCount += 1;
	}

    /* Figure out space needed for bottom chromosomes. */
    if (gl->bottomList)
	{
	gl->lineCount += 1;
	/* NOTE(review): this initial value is overwritten by the first loop
	 * iteration below (plain '=' rather than '+='); kept as-is because
	 * changing it would alter existing layouts - confirm intent. */
	sexOtherPixels = spaceWidth + 2*margin;
	for (ref = gl->bottomList; ref != NULL; ref = ref->next)
	    {
	    chrom = ref->val;
	    sexBasesInLine += chrom->size;
	    labelWidth = mgFontStringWidth(font, chrom->shortName) + spaceWidth;
	    if (ref == gl->bottomList )
		{
		if (leftLabelWidth < labelWidth)
		    leftLabelWidth = labelWidth;
		sexOtherPixels = leftLabelWidth;
		}
	    else if (ref->next == NULL)
		{
		if (rightLabelWidth < labelWidth)
		    rightLabelWidth = labelWidth;
		sexOtherPixels += rightLabelWidth + spaceWidth;
		}
	    else
		{
		sexOtherPixels += labelWidth + spaceWidth;
		}
	    }
	}

    /* Do some adjustments if side labels are bigger than needed for
     * chromosome names. */
    if (leftLabelWidth < minLeftLabelWidth)
	{
	extraLabelPadding += (minLeftLabelWidth - leftLabelWidth);
	leftLabelWidth = minLeftLabelWidth;
	}
    if (rightLabelWidth < minRightLabelWidth)
	{
	extraLabelPadding += (minRightLabelWidth - rightLabelWidth);
	rightLabelWidth = minRightLabelWidth;
	}
    sexOtherPixels += extraLabelPadding;

    /* Figure out the number of bases needed per pixel.  The division is
     * done in floating point: integer division truncated the scale and
     * produced 0 (followed by a division by zero) whenever a line held
     * fewer bases than pixels. */
    autosomeOtherPixels = 2*margin + spaceWidth + leftLabelWidth + rightLabelWidth;
    basesPerPixel = (double)autosomeBasesInLine/(picWidth-autosomeOtherPixels);
    if (gl->bottomList)
	{
	sexBasesPerPixel = (double)sexBasesInLine/(picWidth-sexOtherPixels);
	if (sexBasesPerPixel > basesPerPixel)
	    basesPerPixel = sexBasesPerPixel;
	}

    /* Save positions and sizes of some things in layout structure. */
    gl->leftLabelWidth = leftLabelWidth;
    gl->rightLabelWidth = rightLabelWidth;
    gl->basesPerPixel = basesPerPixel;
    gl->pixelsPerBase = 1.0/basesPerPixel;

    /* Set pixel positions for left autosomes */
    for (ref = gl->leftList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	chrom->x = leftLabelWidth + margin;
	chrom->y = y;
	chrom->width = round(chrom->size/basesPerPixel);
	chrom->height = lineHeight;
	y += lineHeight;
	}

    /* Set pixel positions for right autosomes */
    y = 0;
    for (ref = gl->rightList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	chrom->width = round(chrom->size/basesPerPixel);
	chrom->height = lineHeight;
	chrom->x = picWidth - margin - rightLabelWidth - chrom->width;
	chrom->y = y;
	y += lineHeight;
	}
    gl->picHeight = 2*margin + lineHeight * gl->lineCount;
    y = gl->picHeight - margin - lineHeight;

    /* Set pixel positions for sex chromosomes: first one is flush left,
     * last one flush right, the ones in between follow the previous
     * chromosome with room for their own label. */
    for (ref = gl->bottomList; ref != NULL; ref = ref->next)
	{
	chrom = ref->val;
	chrom->y = y;
	chrom->width = round(chrom->size/basesPerPixel);
	chrom->height = lineHeight;
	if (ref == gl->bottomList)
	    chrom->x = leftLabelWidth + margin;
	else if (ref->next == NULL)
	    chrom->x = picWidth - margin - rightLabelWidth - chrom->width;
	else
	    chrom->x = 2*spaceWidth+mgFontStringWidth(font,chrom->shortName) + pos;
	pos = chrom->x + chrom->width;
	}
    }
return gl;
}
static bioSeq *nextSeqFromMem(char **pText, boolean isDna, boolean doFilter)
/* Convert fa in memory to bioSeq.  Update *pText to point to next
 * record.  Returns NULL when no more sequences left.
 *
 * The text buffer is modified in place: the name is NUL-terminated inside
 * the header line and the sequence letters are compacted to the start of
 * the buffer, which then becomes seq->dna (not a copy).  When doFilter is
 * set, letters are mapped through ntChars (isDna) or aaChars, and letters
 * the table maps to 0 become 'n' or 'X' respectively.
 * Aborts if a '>' line has no name or is not terminated by a newline. */
{
char *name = "";
char *s, *d;
struct dnaSeq *seq;
int size = 0;
char c;
char *filter = (isDna ? ntChars : aaChars);
char *text = *pText;
char *p = skipLeadingSpaces(text);
if (p == NULL)
    return NULL;
dnaUtilOpen();
if (*p == '>')
    {
    char *end;
    s = strchr(p, '\n');
    if (s != NULL)
	++s;
    name = skipLeadingSpaces(p+1);
    end = skipToSpaces(name);
    /* The NULL cases are spelled out instead of relying on relational
     * comparison with a null pointer.  The outcome matches the old
     * "end >= s || name >= s" test, which in practice also aborted when
     * the header line had no newline. */
    if (s == NULL || end == NULL || end >= s || name >= s)
	errAbort("No name in line starting with '>'");
    *end = 0;
    }
else
    {
    s = p;
    if (s == NULL || s[0] == 0)
	return NULL;
    }
name = cloneString(name);

d = text;
if (s != NULL)
    {
    for (;;)
	{
	c = *s;
	if (c == 0 || c == '>')
	    break;
	++s;
	/* Go through unsigned char: a negative char is undefined for
	 * isalpha() and would index before the start of the filter table. */
	if (!isalpha((unsigned char)c))
	    continue;
	if (doFilter)
	    {
	    if ((c = filter[(unsigned char)c]) == 0)
		{
		if (isDna)
		    c = 'n';
		else
		    c = 'X';
		}
	    }
	d[size++] = c;
	}
    }
d[size] = 0;

/* Put sequence into our little sequence structure. */
AllocVar(seq);
seq->name = name;
seq->dna = text;
seq->size = size;
*pText = s;
return seq;
}
void reportAlt3Prime(struct altGraphX *ag, bool **em, int vs, int ve1, int ve2, int altBpStart, int altBpEnd, int startV, int endV, FILE *out) /* Write out an altGraphX record for an alt3Prime splicing event. Variable names are consistent with the rest of the program, but can be misleading. Specifically vs = start of alt splicing, ve1 = first end of alt splicing, etc. even though "vs" is really the end of an exon. For an alt5Prime splice the edges are: Name Vertexes Class ------ ---------- ----- exon1: startV->vs constituative (0) junction1: vs->ve1 alternative (1) junction2: vs->ve2 alternative (2) exon2: ve1->e2 alternative (1) exon3: ve2->endV constituative (0) */ { struct altGraphX *agLoc = NULL; /* Local altGraphX. */ struct evidence *ev = NULL, *evLoc = NULL; int *vPos = ag->vPositions; unsigned char *vT = ag->vTypes; int *vPosLoc = NULL; /* Vertex Positions. */ int *eStartsLoc = NULL; /* Edge Starts. */ int *eEndsLoc = NULL; /* Edge ends. */ unsigned char *vTLoc = NULL; /* Vertex Types. */ int *eTLoc = NULL; /* Edge Types. */ int vCLoc = 0; int eCLoc = 0; int edgeIx = 0, vertexIx = 0; int i =0; struct dyString *dy = NULL; if(out == NULL) return; AllocVar(agLoc); agLoc->tName = cloneString(ag->tName); agLoc->name = cloneString(ag->name); agLoc->tStart = vPos[startV]; agLoc->tEnd = vPos[endV]; agLoc->strand[0] = ag->strand[0]; agLoc->vertexCount = vCLoc = 6; agLoc->edgeCount = eCLoc = 5; agLoc->id = alt3Prime; /* Allocate some arrays. */ AllocArray(vPosLoc, vCLoc); AllocArray(eStartsLoc, eCLoc); AllocArray(eEndsLoc, eCLoc); AllocArray(vTLoc, vCLoc); AllocArray(eTLoc, eCLoc); /* Fill in the vertex positions. */ vertexIx = 0; vPosLoc[vertexIx++] = vPos[startV]; /* 0 */ vPosLoc[vertexIx++] = vPos[vs]; /* 1 */ vPosLoc[vertexIx++] = vPos[ve1]; /* 2 */ vPosLoc[vertexIx++] = vPos[ve2]; /* 3 */ vPosLoc[vertexIx++] = vPos[ve2]; /* 4 */ vPosLoc[vertexIx++] = vPos[endV]; /* 5 */ /* Fill in the vertex types. 
*/ vertexIx = 0; vTLoc[vertexIx++] = vT[startV]; vTLoc[vertexIx++] = vT[vs]; vTLoc[vertexIx++] = vT[ve1]; vTLoc[vertexIx++] = vT[vs]; /* Faking a separate exon for the alt spliced portion. */ vTLoc[vertexIx++] = vT[ve2]; vTLoc[vertexIx++] = vT[endV]; edgeIx = 0; /* Constitutive first exon. */ eStartsLoc[edgeIx] = 0; eEndsLoc[edgeIx] = 1; eTLoc[edgeIx] = 0; ev = evidenceForEdge(ag, startV, vs); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Alternative1 junction (shorter). */ eStartsLoc[edgeIx] = 1; eEndsLoc[edgeIx] = 2; eTLoc[edgeIx] = 1; ev = evidenceForEdge(ag, vs, ve1); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Alt2 junction (longer). */ eStartsLoc[edgeIx] = 1; eEndsLoc[edgeIx] = 4; eTLoc[edgeIx] = 2; ev = evidenceForEdge(ag, vs, ve2); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Alt1 portion of second exon. */ eStartsLoc[edgeIx] = 2; eEndsLoc[edgeIx] = 3; eTLoc[edgeIx] = 1; ev = evidenceForEdge(ag, ve1, endV); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Exon 2 constitutive (shorter exon) */ eStartsLoc[edgeIx] = 4; eEndsLoc[edgeIx] = 5; eTLoc[edgeIx] = 0; ev = evidenceForEdge(ag, ve2, endV); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); edgeIx++; /* Package up the evidence, tissues, etc. 
*/ slReverse(&agLoc->evidence); dy = newDyString(ag->mrnaRefCount*36); agLoc->mrnaRefCount = ag->mrnaRefCount; for(i=0; i<ag->mrnaRefCount; i++) dyStringPrintf(dy, "%s,", ag->mrnaRefs[i]); sqlStringDynamicArray(dy->string, &agLoc->mrnaRefs, &i); dyStringFree(&dy); agLoc->mrnaTissues = CloneArray(ag->mrnaTissues, ag->mrnaRefCount); agLoc->mrnaLibs = CloneArray(ag->mrnaLibs, ag->mrnaRefCount); agLoc->vPositions = vPosLoc; agLoc->edgeStarts = eStartsLoc; agLoc->edgeEnds = eEndsLoc; agLoc->vTypes = vTLoc; agLoc->edgeTypes = eTLoc; altGraphXTabOut(agLoc, out); altGraphXFree(&agLoc); }
struct dgNodeRef *dgFindPath(struct diGraph *dg, struct dgNode *a, struct dgNode *b) /* Find shortest path from a to b. Return NULL if can't be found. */ { struct dgNodeRef *refList = NULL, *ref; struct dgConnection *con; struct dgNode *node, *nNode; struct dlList *fifo; struct dlNode *ffNode; struct dgNode endNode; int fifoSize = 1; /* Do some quick and easy tests first to return if have no way out * of node A, or if B directly follows A. */ if (a->nextList == NULL) return NULL; if (a == b) { AllocVar(ref); ref->node = a; return ref; } if ((con = dgFindNodeInConList(a->nextList, b)) != NULL) { AllocVar(refList); refList->node = a; node = con->node; AllocVar(ref); ref->node = node; slAddTail(&refList, ref); return refList; } /* Set up for breadth first traversal. Will use a doubly linked * list as a fifo. */ for (node = dg->nodeList; node != NULL; node = node->next) node->tempEntry = NULL; fifo = newDlList(); dlAddValTail(fifo, a); a->tempEntry = &endNode; while ((ffNode = dlPopHead(fifo)) != NULL) { --fifoSize; node = ffNode->val; freeMem(ffNode); for (con = node->nextList; con != NULL; con = con->next) { nNode = con->node; if (nNode->tempEntry == NULL) { nNode->tempEntry = node; if (nNode == b) { while (nNode != &endNode && nNode != NULL) { AllocVar(ref); ref->node = nNode; slAddHead(&refList, ref); nNode = nNode->tempEntry; } break; } else { dlAddValTail(fifo, nNode); ++fifoSize; if (fifoSize > 100000) errAbort("Internal error in dgFindPath"); } } } } freeDlList(&fifo); return refList; }
struct sufa *sufaRead(char *fileName, boolean memoryMap)
/* Read in a sufa from a file.  Does this via memory mapping if you like,
 * which will be faster typically for about 100 reads, and slower for more
 * than that (_much_ slower for thousands of reads and more).
 *
 * Aborts with a message if the file can't be opened, read or mapped, has
 * the wrong magic number, or has a newer major version than this code.
 * The returned sufa points into the mapped or loaded file image; only the
 * chromNames and chromOffsets arrays are allocated separately. */
{
/* Open file (low level), read in header, and check it. */
int fd = open(fileName, O_RDONLY);
if (fd < 0)
    errnoAbort("Can't open %s", fileName);
struct sufaFileHeader h;
/* read() returns a signed count; comparing it directly with sizeof()
 * converts -1 to a huge unsigned value and hides the error. */
ssize_t headerRead = read(fd, &h, sizeof(h));
if (headerRead < 0 || (size_t)headerRead < sizeof(h))
    errnoAbort("Couldn't read header of file %s", fileName);
if (h.magic != SUFA_MAGIC)
    errAbort("%s does not seem to be a sufa file.", fileName);
if (h.majorVersion > SUFA_MAJOR_VERSION)
    errAbort("%s is a newer, incompatible version of sufa format. "
             "This program works on version %d and below. "
	     "%s is version %d.",  fileName, SUFA_MAJOR_VERSION, fileName, h.majorVersion);

struct sufa *sufa;
verbose(2, "sufa file %s size %lld\n", fileName, (long long)h.size);

/* Get a pointer to data in memory, via memory map, or allocation and read. */
struct sufaFileHeader *header ;
if (memoryMap)
    {
#ifdef MACHTYPE_sparc
    header = (struct sufaFileHeader *)mmap(NULL, h.size, PROT_READ, MAP_SHARED, fd, 0);
#else
    header = mmap(NULL, h.size, PROT_READ, MAP_FILE|MAP_SHARED, fd, 0);
#endif
    if (header == MAP_FAILED)
	errnoAbort("Couldn't mmap %s, sorry", fileName);
    }
else
    {
    header = needHugeMem(h.size);
    if (lseek(fd, 0, SEEK_SET) < 0)
	errnoAbort("Couldn't seek back to start of sufa file %s.  "
		   "Sufa files must be random access files, not pipes and the like"
		   , fileName);
    /* Read in bounded chunks until everything is in: a single read() may
     * legitimately return less than requested, and always does for
     * requests of several gigabytes. */
    const size_t maxChunk = (size_t)1 << 30;
    char *dest = (char *)header;
    bits64 remaining = h.size;
    while (remaining > 0)
	{
	size_t want = (remaining > maxChunk) ? maxChunk : (size_t)remaining;
	ssize_t got = read(fd, dest, want);
	if (got <= 0)
	    errnoAbort("Couldn't read all of sufa file %s.", fileName);
	dest += got;
	remaining -= got;
	}
    }
/* The descriptor is no longer needed: the data has been copied, or the
 * mapping stays valid after close.  Result ignored for a read-only fd. */
close(fd);

/* Allocate wrapper structure and fill it in. */
AllocVar(sufa);
sufa->header = header;
sufa->isMapped = memoryMap;

/* Make an array for easy access to chromosome names, which are stored as
 * consecutive NUL-terminated strings right after the header. */
int chromCount = header->chromCount;
char **chromNames = AllocArray(sufa->chromNames, chromCount);
char *s = pointerOffset(header, sizeof(*header) );
int i;
for (i=0; i<chromCount; ++i)
    {
    chromNames[i] = s;
    s += strlen(s)+1;
    }

/* Keep track of where we are in memmap. */
bits64 mapOffset = sizeof(*header) + header->chromNamesSize;

/* Point into chromSizes array. */
bits32 *chromSizes = sufa->chromSizes
	= pointerOffset(header, mapOffset);
mapOffset += sizeof(bits32) * chromCount;

verbose(2, "total dna size %lld in %d chromosomes\n", (long long)header->dnaDiskSize, header->chromCount);
sufa->allDna = pointerOffset(header, mapOffset);
mapOffset += header->dnaDiskSize;

/* Calculate chromOffset array.  Each chromosome takes its size plus one
 * position in the offset space. */
bits32 offset = 0;
bits32 *chromOffsets = AllocArray(sufa->chromOffsets, chromCount);
for (i=0; i<chromCount; ++i)
    {
    chromOffsets[i] = offset;
    offset += chromSizes[i] + 1;
    verbose(2, "sufa contains %s,  %d bases, %d offset\n",
	sufa->chromNames[i], (int)sufa->chromSizes[i], (int)chromOffsets[i]);
    }

/* Finally point to the suffix array!. */
sufa->array = pointerOffset(header, mapOffset);
mapOffset += header->arraySize * sizeof(bits32);

assert(mapOffset == header->size);	/* Sanity check */
return sufa;
}
void doChainScore(char *chainIn, char *tNibDir, char *qNibDir, char *chainOut) { char qStrand = 0, tStrand = 0; struct dnaSeq *qSeq = NULL, *tSeq = NULL; char *qName = "", *tName = ""; FILE *f = mustOpen(chainOut, "w"); struct chain *chainList = NULL, *chain; struct chain *inputChains, *next; FILE *details = NULL; struct lineFile *lf = NULL; struct dnaSeq *seq, *seqList = NULL; struct hash *faHash = newHash(0); struct hash *chainHash = newHash(0); char comment[1024]; FILE *faF; struct seqPair *spList = NULL, *sp; struct dyString *dy = newDyString(512); struct lineFile *chainsLf = lineFileOpen(chainIn, TRUE); while ((chain = chainRead(chainsLf)) != NULL) { dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", chain->qName, chain->qStrand, chain->tName); sp = hashFindVal(chainHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(chainHash, dy->string, sp, &sp->name); sp->qName = cloneString(chain->qName); sp->tName = cloneString(chain->tName); sp->qStrand = chain->qStrand; } slAddHead(&sp->chain, chain); } slSort(&spList, seqPairCmp); lineFileClose(&chainsLf); if (optionExists("faQ")) { faF = mustOpen(qNibDir, "r"); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) { hashAdd(faHash, seq->name, seq); slAddHead(&seqList, seq); } fclose(faF); } for (sp = spList; sp != NULL; sp = sp->next) { if (optionExists("faQ")) { assert (faHash != NULL); loadFaSeq(faHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } else loadIfNewSeq(qNibDir, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); loadIfNewSeq(tNibDir, sp->tName, '+', &tName, &tSeq, &tStrand); scorePair(sp, qSeq, tSeq, &chainList, sp->chain); } slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { assert(chain->qStart == chain->blockList->qStart && chain->tStart == chain->blockList->tStart); chainWrite(chain, f); } carefulClose(&f); }
static struct dnaSeq *dnaLoadNextFromStack(struct dnaLoad *dl) /* Load next piece of DNA from stack of files. Return NULL * when stack is empty. */ { struct dnaLoadStack *dls; struct dnaSeq *seq = NULL; while ((dls = dl->stack) != NULL) { if (dls->twoBit) { if (dls->tbi != NULL) { seq = twoBitReadSeqFrag(dls->twoBit, dls->tbi->name, 0, 0); dls->tbi = dls->tbi->next; return seq; } else { dl->stack = dls->next; dnaLoadStackFree(&dls); } } else if (dls->textIsFa) { DNA *dna; char *name; int size; if (faMixedSpeedReadNext(dls->textFile, &dna, &size, &name)) { AllocVar(seq); seq->dna = needLargeMem(size+1); memcpy((void *)seq->dna, (void *)dna, size); seq->dna[size] = 0; seq->size = size; seq->name = cloneString(name); dl->curStart = 0; dl->curEnd = size; dl->curSize = size; return seq; } else { dl->stack = dls->next; dnaLoadStackFree(&dls); } } else /* It's a file full of file names. */ { char *line; if (lineFileNextReal(dls->textFile, &line)) { line = trimSpaces(line); if ((seq = dnaLoadSingle(line, &dl->curStart, &dl->curEnd, &dl->curSize)) != NULL) return seq; else { struct dnaLoadStack *newDls; newDls = dnaLoadStackNew(line); slAddHead(&dl->stack, newDls); } } else { dl->stack = dls->next; dnaLoadStackFree(&dls); } } } dl->finished = TRUE; return NULL; }
int main (int argc, char **argv) { LineStream ls; Texta tokens = NULL; char *line; int hasQual = 0; int hasSeqs = 0; int start=1; ls = ls_createFromFile ("-"); while (line = ls_nextLine (ls)) { // Put all the lines of the SAM header in comments if (line[0] == '@') { printf ("# %s\n", line); continue; } // Parse each SAM entry and store into array tokens = textFieldtokP (line, "\t"); if (arrayMax (tokens) < 11) { textDestroy( tokens ); ls_destroy (ls); die ("Invalid SAM entry: %s", line); } SamEntry *currSamE = NULL; SamEntry *mateSamE = NULL; AllocVar(currSamE ); int ret = generateSamEntry( tokens, currSamE, &hasSeqs, &hasQual ); textDestroy( tokens ); if ( ret==0 ) { if ( isPaired ( currSamE ) ) ls_nextLine( ls ); // discarding next entry too (the mate) destroySamEntry( currSamE ); freeMem( currSamE ); continue; } if ( isPaired( currSamE ) ) { int hasQual2, hasSeq2; AllocVar( mateSamE ); Texta secondEnd = NULL; secondEnd = textFieldtok (ls_nextLine( ls ) , "\t"); ret = generateSamEntry( secondEnd, mateSamE, &hasSeq2, &hasQual2 ); textDestroy( secondEnd ); if( ret == 0 ) { destroySamEntry( currSamE ); destroySamEntry( mateSamE ); freeMem( currSamE ); freeMem( mateSamE ); continue; } if (strcmp (currSamE->qname, mateSamE->qname) != 0) { die ("Please note that for paired-end data, sam2mrf requires the mate pairs to be on subsequent lines. 
You may want to sort the SAM file first.\nEx: sort -r file.sam | sam2mrf > file.mrf\n"); } } // Print MRF headers if( start ) { printf ("%s", MRF_COLUMN_NAME_BLOCKS); if (hasSeqs) printf("\t%s", MRF_COLUMN_NAME_SEQUENCE); if (hasQual) printf("\t%s", MRF_COLUMN_NAME_QUALITY_SCORES); printf ("\t%s\n", MRF_COLUMN_NAME_QUERY_ID); start=0; } // Print AlignmentBlocks printMrfAlignBlocks (currSamE, R_FIRST); if( isPaired ( currSamE ) ) { printf ("|"); printMrfAlignBlocks (mateSamE, R_SECOND); } seq_init(); // Print Sequence if (hasSeqs) { if (!currSamE->seq) die ("Entry missing sequence column\n"); if( currSamE->flags & S_QUERY_STRAND ) seq_reverseComplement( currSamE->seq, strlen(currSamE->seq)); printf ("\t%s", currSamE->seq); if (mateSamE) { if (!mateSamE->seq) die ("Entry missing sequence column\n"); if( mateSamE->flags & S_MATE_STRAND ) seq_reverseComplement( mateSamE->seq, strlen(mateSamE->seq)); printf ("|%s", mateSamE->seq); } } // Print quality scores if (hasQual) { if (!currSamE->qual) die ("Entry missing quality scores column\n"); printf ("\t%s", currSamE->qual); if (mateSamE) { if (!mateSamE->qual) die ("Entry missing quality scores column\n"); printf ("|%s", mateSamE->qual); } } // Print queryID if (mateSamE) { printf ("\t%s|%s", currSamE->qname,"2"); // No need to print out both IDs, but need the pipe symbol for consistency } else { printf ("\t%s", currSamE->qname); } printf("\n"); destroySamEntry( currSamE ); freeMem( currSamE ); if( isPaired( currSamE ) ) { destroySamEntry ( mateSamE ); freeMem( mateSamE ); } } // clean up ls_destroy (ls); return EXIT_SUCCESS; }
void *esmStartHandler(struct xap *xp, char *name, char **atts) /* Called by expat with start tag. Does most of the parsing work. */ { struct xapStack *st = xp->stack+1; int depth = xp->stackDepth; int i; if (sameString(name, "Motifs")) { struct esmMotifs *obj; AllocVar(obj); for (i=0; atts[i] != NULL; i += 2) { char *name = atts[i], *val = atts[i+1]; if (sameString(name, "SeqFile")) obj->SeqFile = cloneString(val); } if (obj->SeqFile == NULL) xapError(xp, "missing SeqFile"); return obj; } else if (sameString(name, "Motif")) { struct esmMotif *obj; AllocVar(obj); for (i=0; atts[i] != NULL; i += 2) { char *name = atts[i], *val = atts[i+1]; if (sameString(name, "Consensus")) obj->Consensus = cloneString(val); else if (sameString(name, "Source")) obj->Source = cloneString(val); else if (sameString(name, "Name")) obj->Name = cloneString(val); else if (sameString(name, "Description")) obj->Description = cloneString(val); } if (obj->Consensus == NULL) xapError(xp, "missing Consensus"); if (obj->Source == NULL) xapError(xp, "missing Source"); if (obj->Name == NULL) xapError(xp, "missing Name"); if (depth > 1) { if (sameString(st->elName, "Motifs")) { struct esmMotifs *parent = st->object; slAddHead(&parent->esmMotif, obj); } } return obj; } else if (sameString(name, "Weights")) { struct esmWeights *obj; AllocVar(obj); for (i=0; atts[i] != NULL; i += 2) { char *name = atts[i], *val = atts[i+1]; if (sameString(name, "ZeroWeight")) obj->ZeroWeight = atof(val); } if (depth > 1) { if (sameString(st->elName, "Motif")) { struct esmMotif *parent = st->object; slAddHead(&parent->esmWeights, obj); } } return obj; } else if (sameString(name, "Position")) { struct esmPosition *obj; AllocVar(obj); for (i=0; atts[i] != NULL; i += 2) { char *name = atts[i], *val = atts[i+1]; if (sameString(name, "Num")) obj->Num = atoi(val); else if (sameString(name, "Weights")) obj->Weights = cloneString(val); } if (obj->Weights == NULL) xapError(xp, "missing Weights"); if (depth > 1) { if 
(sameString(st->elName, "Weights")) { struct esmWeights *parent = st->object; slAddHead(&parent->esmPosition, obj); } } return obj; } else { xapSkip(xp); return NULL; } }
void agpVsMap(char *agpName, char *infoName, char *gifName) /* agpVsMap - Plot clones in agp vs. map coordinates. */ { struct mapPos *mapList, *mp; struct agpFrag *agpList, *bp; struct hash *cloneHash = newHash(14); struct hashEl *hel; struct cloneInfo *cloneList = NULL, *clone; struct memGfx *mg = NULL; int pixWidth = 600; int pixHeight = 600; int rulerHeight = 20; int maxMapPos = 0, maxAgpPos = 0; double scaleMap, scaleAgp; Color orange, green; mapList = readInfoFile(infoName); agpList = readAgpFile(agpName); for (mp = mapList; mp != NULL; mp = mp->next) { if (mp->phase > 0) { AllocVar(clone); hel = hashAddUnique(cloneHash, mp->cloneName, clone); clone->name = hel->name; clone->mp = mp; slAddHead(&cloneList, clone); if (mp->pos > maxMapPos) maxMapPos = mp->pos; } } slReverse(&cloneList); for (bp = agpList; bp != NULL; bp = bp->next) { if (bp->chromStart > maxAgpPos) maxAgpPos = bp->chromStart; } /* Draw scatterplot on bitmap. */ mg = mgNew(pixWidth, pixHeight); mgClearPixels(mg); orange = mgFindColor(mg, 210, 150, 0); green = mgFindColor(mg, 0, 200, 0); mgDrawRuler(mg, 0, pixHeight-rulerHeight, rulerHeight, pixWidth, MG_BLACK, mgSmallFont(), 0, maxMapPos+1); scaleMap = (double)pixWidth/(double)(maxMapPos+1.0); scaleAgp = (double)(pixHeight)/(double)(maxAgpPos+1.0); for (bp = agpList; bp != NULL; bp = bp->next) { char cloneName[128]; fragToCloneName(bp->frag, cloneName); clone = hashFindVal(cloneHash, cloneName); if (clone == NULL) warn("%s is in %s but not %s", cloneName, agpName, infoName); else { int x = round(scaleMap*clone->mp->pos); int y = pixHeight - round(scaleAgp*bp->chromStart); int phase = clone->mp->phase; int back; if (phase <= 1) back = green; else if (phase == 2) back = orange; else back = MG_RED; drawPlus(mg, x, y, back); } } mgSaveGif(mg, gifName); }
struct fullExperiment *getFullExperimentList(struct sqlConnection *conn, struct edwExperiment *eeList, char *assembly, struct hash **retHash) /* Given list of edwExperiments, return list of ones replicated with full file sets on * both replicates. If optional retHash is non-NULL then return a hash full of same * experiments keyed by experiment accession */ { /* Build up a list of fullExperiments and a hash keyed by name. */ struct hash *hash = hashNew(14); struct fullExperiment *fullList = NULL; struct edwExperiment *ee; for (ee = eeList; ee != NULL; ee = ee->next) { struct fullExperiment *full = hashFindVal(hash, ee->accession); if (full == NULL) { AllocVar(full); full->name = cloneString(ee->accession); full->exp = ee; slAddHead(&fullList, full); hashAdd(hash, full->name, full); } } uglyf("Got %d in eeList, %d in fullList, %d in hash\n", slCount(eeList), slCount(fullList), hash->elCount); /* Build up SQL query to efficiently fetch all good files and valid files from our experiment */ struct dyString *q = dyStringNew(16*1024); sqlDyStringPrintf(q, "select edwValidFile.*,edwFile.*,eapOutput.* " " from edwValidFile,edwFile,eapOutput " " where edwValidFile.fileId = edwFile.id and edwFile.id = eapOutput.fileId " " and edwFile.deprecated='' and edwFile.errorMessage='' " " and edwValidFile.ucscDb != 'centro.hg19' " " and edwValidFile.ucscDb like '%%%s' and edwValidFile.experiment in (" , assembly); for (ee = eeList; ee != NULL; ee = ee->next) { dyStringPrintf(q, "'%s'", ee->accession); if (ee->next != NULL) dyStringAppendC(q, ','); } dyStringAppendC(q, ')'); /* Loop through this making up vFiles that ultimately are attached to replicates. 
*/ int vCount = 0; struct sqlResult *sr = sqlGetResult(conn, q->string); char **row; while ((row = sqlNextRow(sr)) != NULL) { ++vCount; struct edwValidFile *valid = edwValidFileLoad(row); fixOutputType(valid); struct edwFile *file = edwFileLoad(row + EDWVALIDFILE_NUM_COLS); struct eapOutput *eapOutput = eapOutputLoad(row + EDWVALIDFILE_NUM_COLS + EDWFILE_NUM_COLS); struct vFile *vf = vFileNew(file, valid, eapOutput); struct fullExperiment *full = hashMustFindVal(hash, valid->experiment); struct replicate *rep = findOrMakeReplicate(valid->replicate, &full->repList); char *format = valid->format; if (sameString(format, "bam")) slAddHead(&rep->bamList, vf); else if (sameString(format, "bigWig")) slAddHead(&rep->bigWigList, vf); else if (sameString(format, "narrowPeak") && !sameString(valid->outputType, "replicated_narrowPeak")) slAddHead(&rep->narrowList, vf); else if (sameString(format, "broadPeak") && !sameString(valid->outputType, "replicated_broadPeak")) slAddHead(&rep->broadList, vf); } sqlFreeResult(&sr); uglyf("Got %d vFiles\n", vCount); dyStringFree(&q); /* Free hash or return it, and return list. */ if (retHash == NULL) hashFree(&hash); else *retHash = hash; return fullList; }
static void trackConfig(struct track *trackList, struct group *groupList, char *groupTarget, int changeVis) /* Put up track configurations. If groupTarget is * NULL then set visibility for tracks in all groups. Otherwise, * just set it for the given group. If vis is -2, then visibility is * unchanged. If -1 then set visibility to default, otherwise it should * be tvHide, tvDense, etc. */ { #ifdef PRIORITY_CHANGES_IN_CONFIG_UI char pname[512]; char gname[512]; #endif///def PRIORITY_CHANGES_IN_CONFIG_UI struct group *group; boolean showedRuler = FALSE; setRulerMode(); changeTrackVis(groupList, groupTarget, changeVis); /* Set up ruler mode according to changeVis. */ #ifdef BOB_DOESNT_LIKE if (changeVis != -2) { if (groupTarget == NULL || (groupList != NULL && sameString(groupTarget, groupList->name))) { if (changeVis == -1) rulerMode = tvFull; else rulerMode = changeVis; } } #endif /* BOB_DOESNT_LIKE */ jsInit(); cgiMakeHiddenVar(configGroupTarget, "none"); boolean isFirstNotCtGroup = TRUE; for (group = groupList; group != NULL; group = group->next) { struct trackRef *tr; if (group->trackList == NULL) continue; /* check if group section should be displayed */ char *otherState; char *indicator; char *indicatorImg; boolean isOpen = !isCollapsedGroup(group); collapseGroupGoodies(isOpen, FALSE, &indicatorImg, &indicator, &otherState); hTableStart(); hPrintf("<TR NOWRAP>"); hPrintf("<TH NOWRAP align=\"left\" colspan=3 BGCOLOR=#536ED3>"); hPrintf("\n<A NAME='%sGroup'></A>",group->name); hPrintf("<input type=hidden name='%s' id='%s' value=%d>", collapseGroupVar(group->name),collapseGroupVar(group->name), (isOpen?0:1)); hPrintf("<A HREF='%s?%s&%s=%s#%sGroup' class='bigBlue'><IMG height=22 width=22 onclick=\"return toggleTrackGroupVisibility(this,'%s');\" id='%s_button' src='%s' alt='%s' class='bigBlue' title='%s this group'></A> ", hgTracksName(), cartSidUrlString(cart),collapseGroupVar(group->name), otherState, group->name, group->name, group->name, indicatorImg, 
indicator,isOpen?"Collapse":"Expand"); hPrintf("<B> %s</B> ", wrapWhiteFont(group->label)); hPrintf(" "); hPrintf("<INPUT TYPE=SUBMIT NAME=\"%s\" VALUE=\"%s\" " "onClick=\"document.mainForm.%s.value='%s'; %s\" title='Hide all tracks in this groups'>", configHideAll, "hide all", configGroupTarget, group->name, jsSetVerticalPosition("mainForm")); hPrintf(" "); hPrintf("<INPUT TYPE=SUBMIT NAME=\"%s\" VALUE=\"%s\" " "onClick=\"document.mainForm.%s.value='%s'; %s\" title='Show all tracks in this groups'>", configShowAll, "show all", configGroupTarget, group->name, jsSetVerticalPosition("mainForm")); hPrintf(" "); hPrintf("<INPUT TYPE=SUBMIT NAME=\"%s\" VALUE=\"%s\" " "onClick=\"document.mainForm.%s.value='%s'; %s\" title='Show default tracks in this group'>", configDefaultAll, "default", configGroupTarget, group->name, jsSetVerticalPosition("mainForm")); hPrintf(" "); /* do not want all the submit buttons named the same. It is * confusing to the javascript submit() function. */ char submitName[256]; safef(submitName, sizeof(submitName), "%sSubmit", group->name); cgiMakeButtonWithMsg(submitName, "submit","Submit your selections and view them in the browser"); #ifdef PRIORITY_CHANGES_IN_CONFIG_UI if (withPriorityOverride) { hPrintf(" "); hPrintf(" "); hPrintf(" "); hPrintf("%s", wrapWhiteFont("Group Order: ")); } #endif///def PRIORITY_CHANGES_IN_CONFIG_UI hPrintf("</TH>\n"); #ifdef PRIORITY_CHANGES_IN_CONFIG_UI if (withPriorityOverride) { hPrintf("<TH>\n"); safef(pname, sizeof(pname), "%s.priority",group->name); hDoubleVar(pname, (double)group->priority, 4); hPrintf("</TH>\n"); if (isOpen) hPrintf("<TH align=CENTER BGCOLOR=#536ED3><B> %s</B></TH> ", wrapWhiteFont("Group")); hPrintf("\n"); } #endif///def PRIORITY_CHANGES_IN_CONFIG_UI hPrintf("</TR>\n"); /* First non-CT group gets ruler. */ if (!showedRuler && isFirstNotCtGroup && differentString(group->name, "user")) { showedRuler = TRUE; hPrintf("<TR %sid='%s-0'>",(isOpen ? 
"" : "style='display: none'"), group->name); hPrintf("<TD>"); hPrintf("<A HREF=\"%s?%s=%u&c=%s&g=%s&hgTracksConfigPage=configure\">", hgTrackUiName(), cartSessionVarName(), cartSessionId(cart), chromName, RULER_TRACK_NAME); hPrintf("%s</A>", RULER_TRACK_LABEL); hPrintf("</TD>"); hPrintf("<TD>"); hTvDropDownClass("ruler", rulerMode, FALSE, rulerMode ? "normalText" : "hiddenText"); hPrintf("</TD>"); hPrintf("<TD>"); hPrintf("Chromosome position in bases. (Clicks here zoom in 3x)"); hPrintf("</TD>"); #ifdef PRIORITY_CHANGES_IN_CONFIG_UI if (withPriorityOverride) { hPrintf("<TD>"); hPrintf("</TD>"); hPrintf("<TD>"); hPrintf("</TD>"); } #endif///def PRIORITY_CHANGES_IN_CONFIG_UI hPrintf("</TR>\n"); } if (differentString(group->name, "user")) isFirstNotCtGroup = FALSE; /* Scan track list to determine which supertracks have visible member * tracks, and to insert a track in the list for the supertrack. * Sort tracks and supertracks together by priority */ groupTrackListAddSuper(cart, group); if (!withPriorityOverride) { /* sort hierarchically by priority, considering supertracks */ struct trackRef *refList = NULL, *ref; for (tr = group->trackList; tr != NULL; tr = tr->next) { struct track *track = tr->track; if (tdbIsSuperTrackChild(track->tdb)) /* ignore supertrack member tracks till supertrack is found */ continue; AllocVar(ref); ref->track = track; slAddTail(&refList, ref); if (tdbIsSuper(track->tdb)) { struct trackRef *tr2; for (tr2 = group->trackList; tr2 != NULL; tr2 = tr2->next) { char *parent = tr2->track->tdb->parentName; if (parent && sameString(parent, track->track)) { AllocVar(ref); ref->track = tr2->track; slAddTail(&refList, ref); } } } } group->trackList = refList; } /* Loop through this group and display */ int rowCount=1; for (tr = group->trackList; tr != NULL; tr = tr->next) { struct track *track = tr->track; struct trackDb *tdb = track->tdb; hPrintf("<TR %sid='%s-%d'>",(isOpen ? 
"" : "style='display: none'"),group->name, rowCount++); hPrintf("<TD NOWRAP>"); if (tdbIsSuperTrackChild(tdb)) /* indent members of a supertrack */ hPrintf(" "); // Print an icon before the title when one is defined hPrintPennantIcon(track->tdb); if (track->hasUi) hPrintf("<A %s%s%s HREF=\"%s?%s=%u&g=%s&hgTracksConfigPage=configure\">", tdb->parent ? "TITLE=\"Part of super track: " : "", tdb->parent ? tdb->parent->shortLabel : "", tdb->parent ? "...\"" : "", hgTrackUiName(), cartSessionVarName(), cartSessionId(cart), track->track); hPrintf(" %s", track->shortLabel); if (tdbIsSuper(track->tdb)) hPrintf("..."); if (track->hasUi) hPrintf("</A>"); hPrintf("</TD>"); hPrintf("<TD NOWRAP>"); if (tdbIsSuperTrackChild(tdb)) /* indent members of a supertrack */ hPrintf(" "); /* If track is not on this chrom print an informational message for the user. */ if (hTrackOnChrom(track->tdb, chromName)) { if (tdbIsSuper(track->tdb)) { /* supertrack dropdown is hide/show */ superTrackDropDown(cart, track->tdb, 1); } else { /* check for option of limiting visibility to one mode */ hTvDropDownClassVisOnly(track->track, track->visibility, track->canPack, (track->visibility == tvHide) ? "hiddenText" : "normalText", trackDbSetting(track->tdb, "onlyVisibility")); } } else hPrintf("[No data-%s]", chromName); hPrintf("</TD>"); hPrintf("<TD NOWRAP>"); hPrintf("%s", track->longLabel); hPrintf("</TD>"); #ifdef PRIORITY_CHANGES_IN_CONFIG_UI if (withPriorityOverride) { hPrintf("<TD>"); safef(pname, sizeof(pname), "%s.priority",track->track); hDoubleVar(pname, (double)track->priority, 4); hPrintf("</TD>"); hPrintf("<TD>\n"); /* suppress group pull-down for supertrack members */ if (tdbIsSuperTrackChild(track->tdb)) hPrintf(" "); else { safef(gname, sizeof(gname), "%s.group",track->track); printGroupListHtml(gname, groupList, track->groupName); } hPrintf("</TD>"); } #endif///def PRIORITY_CHANGES_IN_CONFIG_UI hPrintf("</TR>\n"); } hTableEnd(); hPrintf("<BR>"); } }
static struct grp *makeGroupList(char *db, struct trackDb *trackList,
        struct grp **pHubGrpList, boolean allTablesOk)
/* Return list of groups that are used by at least one track in trackList.
 * Groups come from the database's group table and from *pHubGrpList, followed
 * by an "allTracks" pseudo-group and, if allTablesOk, an "allTables" one.
 * *pHubGrpList is consumed: every element is popped off it, and copies of the
 * ones in use are placed in the result. */
{
struct hash *usedNames = newHash(0);    /* Group names seen on some track. */
struct hash *knownGroups = newHash(0);  /* Groups that made it into result. */
struct grp *result = NULL;
struct grp *grp;
struct trackDb *tdb;

/* Collect the set of group names referenced by the tracks. */
for (tdb = trackList; tdb != NULL; tdb = tdb->next)
    {
    if (!hashLookup(usedNames, tdb->grp))
        hashAdd(usedNames, tdb->grp, NULL);
    }

/* Keep database groups that are in use, in table order.  Discard the rest. */
struct grp *dbGroups = hLoadGrps(db);
while ((grp = slPopHead(&dbGroups)) != NULL)
    {
    if (!hashLookup(usedNames, grp->name))
        {
        grpFree(&grp);
        continue;
        }
    slAddTail(&result, grp);
    hashAdd(knownGroups, grp->name, grp);
    }

/* When the custom track group leads the list, hub groups go right after it
 * rather than at the very front. */
struct grp *insertPoint = NULL;
if (result != NULL && sameString(result->name, "user"))
    insertPoint = result;

/* Add copies of hub groups that some track actually uses. */
while ((grp = slPopHead(pHubGrpList)) != NULL)
    {
    if (hashLookup(usedNames, grp->name))
        {
        struct grp *copy = grpDup(grp);
        if (insertPoint == NULL)
            slAddHead(&result, copy);
        else
            {
            copy->next = insertPoint->next;
            insertPoint->next = copy;
            }
        hashAdd(knownGroups, copy->name, copy);
        }
    }

/* Warn, but do not fail, about tracks naming a group we did not find.
 * Skipped for track hub databases. */
if (!trackHubDatabase(db))
    {
    for (tdb = trackList; tdb != NULL; tdb = tdb->next)
        {
        if (hashLookup(knownGroups, tdb->grp))
            continue;
        warn("Track %s has group %s, which isn't in grp table",
             tdb->table, tdb->grp);
        }
    }

/* Pseudo-group covering all tracks. */
struct grp *allTracks;
AllocVar(allTracks);
allTracks->name = cloneString("allTracks");
allTracks->label = cloneString("All Tracks");
slAddTail(&result, allTracks);

/* Pseudo-group covering all tables, only on request. */
if (allTablesOk)
    {
    struct grp *allTables;
    AllocVar(allTables);
    allTables->name = cloneString("allTables");
    allTables->label = cloneString("All Tables");
    slAddTail(&result, allTables);
    }

hashFree(&usedNames);
hashFree(&knownGroups);
return result;
}
struct improbRunInfo * analyseOneMotifRun(char *runName, char *seqDir, char *motifDir, int controlCount, char *controls[]) /* Bundle up data on one improbizer run and associated control runs. */ { char fileName[512]; char motifName[256]; int seqCount, baseCount; struct improbRunInfo *iriList = NULL, *iri; struct lineFile *lf = NULL; struct motif motif; int motifIx = 0; int i; float acc, best, mean, x; printf("%s\n", runName); /* Count bases in sequences - this will be used in each iri. */ sprintf(fileName, "%s/%s.fa", seqDir, runName); countSeq(fileName, &seqCount, &baseCount); /* Allocate iri and read the main run. */ sprintf(fileName, "%s/%s", motifDir, runName); lf = lineFileOpen(fileName, TRUE); while (readMotif(lf, &motif)) { AllocVar(iri); slAddTail(&iriList, iri); ++motifIx; snprintf(motifName, sizeof(motifName), "%s.%d", runName, motifIx); iri->name = cloneString(motifName); iri->seqCount = seqCount; iri->runScore = motif.score; iri->runPos = motif.pos; iri->runPosSd = motif.posSd; iri->columnCount = motif.size; iri->consensus = cloneString(motif.consensus); iri->aProb = CloneArray(motif.profile[0], motif.size); iri->cProb = CloneArray(motif.profile[1], motif.size); iri->gProb = CloneArray(motif.profile[2], motif.size); iri->tProb = CloneArray(motif.profile[3], motif.size); iri->controlCount = controlCount; AllocArray(iri->controlScores, controlCount); } lineFileClose(&lf); /* Read the control runs. */ for (i=0; i<controlCount; ++i) { sprintf(fileName, "%s/%s", controls[i], runName); lf = lineFileOpen(fileName, TRUE); for (iri = iriList; iri != NULL; iri = iri->next) { if (!readMotif(lf, &motif)) errAbort("%s doesn't contain the expected number of motifs", lf->fileName); iri->controlScores[i] = motif.score; } lineFileClose(&lf); } /* Calculate best and mean on control runs. 
*/ for (iri = iriList; iri != NULL; iri = iri->next) { acc = best = 0; for (i=0; i<controlCount; ++i) { x = iri->controlScores[i]; acc += x; if (x > best) best = x; } iri->bestControlScore = best; iri->meanControlScore = acc/controlCount; } /* Calculate standard deviation of control runs. */ for (iri = iriList; iri != NULL; iri = iri->next) { acc = 0; mean = iri->meanControlScore; for (i=0; i<controlCount; ++i) { x = iri->controlScores[i] - mean; acc += x*x; } if (controlCount > 1) acc /= controlCount; iri->sdControlScore = sqrt(acc); } return iriList; }
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf,
        struct hash *chromSizesHash, int *retMinDiff, double *retAveSize,
        bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics.
 *   lf             - open bed file; only the first three columns are used.
 *   chromSizesHash - chromosome name to size, with size stored in the hash value.
 *   retMinDiff     - receives smallest distance between starts of consecutive
 *                    items on the same chromosome, or BIGNUM if there is none.
 *   retAveSize     - receives mean item size, or 0 if the file has no items.
 *   retBedCount    - receives number of items.
 * Returns list of chromosomes in the order they appear in the file, each with
 * its id, size and item count.  Aborts if the file is not sorted by chromosome
 * and start, if a chromosome is missing from chromSizesHash, or if an item ends
 * before it starts or past the end of its chromosome. */
{
char *row[3];
struct hash *uniqHash = hashNew(0);    /* Chromosomes already seen. */
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;                    /* Start of previous item on this chrom. */
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, 3, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
        {
        errAbort("end (%d) before start (%d) line %d of %s",
                 end, start, lf->lineIx, lf->fileName);
        }
    ++bedCount;
    totalBases += (end - start);

    /* Moving on to a new chromosome?  It must not have been seen before. */
    if (usage == NULL || differentString(usage->name, chrom))
        {
        if (hashLookup(uniqHash, chrom))
            {
            errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                     lf->fileName, lf->lineIx);
            }
        hashAdd(uniqHash, chrom, NULL);
        struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
        if (chromHashEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);
        int chromSize = ptToInt(chromHashEl->val);
        AllocVar(usage);
        usage->name = cloneString(chrom);
        usage->id = id++;
        usage->size = chromSize;
        slAddHead(&usageList, usage);
        lastStart = -1;
        }
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
                 end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;

    /* Track the minimum gap between starts.  A negative gap means unsorted. */
    if (lastStart >= 0)
        {
        int diff = start - lastStart;
        if (diff < minDiff)
            {
            if (diff < 0)
                errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                         lf->fileName, lf->lineIx);
            minDiff = diff;
            }
        }
    lastStart = start;
    }
slReverse(&usageList);

/* An empty file has no average.  Report zero rather than dividing by zero. */
double aveSize = 0;
if (bedCount > 0)
    aveSize = (double)totalBases/bedCount;

*retMinDiff = minDiff;
*retAveSize = aveSize;
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}
static struct blastBlock *nextBlock(struct blastFile *bf, struct blastQuery *bq,
        struct blastGappedAli *bga, boolean *skipRet)
/* Read in next blast block.  Return NULL at EOF or end of gapped
 * alignment.  If an unparsable block is found, set *skipRet to TRUE and return
 * NULL.
 * Strands are stored as +1 or -1 in every branch, so that the coordinate flip
 * at the end, which tests for a negative strand, applies to all input styles. */
{
struct blastBlock *bb;
char *line;
char *words[16];
int wordCount;
char *parts[3];
int partCount;
/* Sequence accumulators, allocated once and reused across calls. */
static struct dyString *qString = NULL, *tString = NULL;

verbose(TRACE_LEVEL, "blastFileNextBlock\n");
*skipRet = FALSE;

/* Seek until get something like:
 *   Score = 8770 bits (4424), Expect = 0.0
 * or something that looks like we're done with this gapped
 * alignment. */
for (;;)
    {
    if (!nextBlockLine(bf, bq, &line))
        return NULL;
    if (startsWith(" Score", line))
        break;
    }
AllocVar(bb);
bb->gappedAli = bga;
wordCount = chopLine(line, words);
if (wordCount < 8 || !sameWord("Score", words[0])
    || !isdigit(words[2][0]) || !(isdigit(words[7][0]) || words[7][0] == 'e')
    || !startsWith("Expect", words[5]))
    {
    bfError(bf, "Expecting something like:\n"
            "Score = 8770 bits (4424), Expect = 0.0");
    }
bb->bitScore = atof(words[2]);
bb->eVal = evalToDouble(words[7]);

/* Process something like:
 *   Identities = 8320/9618 (86%), Gaps = 3/9618 (0%)
 *     or
 *   Identities = 8320/9618 (86%)
 *     or
 *   Identities = 10/19 (52%), Positives = 15/19 (78%), Frame = +2
 *     (wu-tblastn)
 *     or
 *   Identities = 256/400 (64%), Positives = 306/400 (76%)
 *   Frame = +1 / -2
 *     (tblastn)
 *
 *   Identities = 1317/10108 (13%), Positives = 2779/10108 (27%), Gaps = 1040/10108
 *   (10%)
 *      - wrap on long lines
 *
 * Handle weird cases where the is only a `Score' line, with no `Identities'
 * lines by skipping the alignment; they seem line small, junky alignments. */
line = bfNeedNextLine(bf);
wordCount = chopLine(line, words);
if (wordCount < 3 || !sameWord("Identities", words[0]))
    {
    /* NOTE(review): when wordCount is 0 this reads words[0] left over from the
     * Score line above - confirm whether a blank line here should skip. */
    if (wordCount > 1 || sameWord("Score", words[0]))
        {
        /* ugly hack to skip block with no identities */
        *skipRet = TRUE;
        blastBlockFree(&bb);
        return NULL;
        }
    bfError(bf, "Expecting identity count");
    }
partCount = chopByChar(words[2], '/', parts, ArraySize(parts));
if (partCount != 2 || !isdigit(parts[0][0]) || !isdigit(parts[1][0]))
    bfSyntax(bf);
bb->matchCount = atoi(parts[0]);
bb->totalCount = atoi(parts[1]);
if (wordCount >= 7 && sameWord("Gaps", words[4]))
    {
    if (!isdigit(words[6][0]))
        bfSyntax(bf);
    bb->insertCount = atoi(words[6]);
    }
if ((wordCount >= 11) && sameWord("Frame", words[8]))
    {
    /* Frame on the Identities line (wu-tblastn).  Use the same +1/-1 strand
     * encoding as the Frame/Strand branches below; a '-' character is a
     * positive number and would never be seen as the minus strand. */
    bb->qStrand = 1;
    bb->tStrand = (words[10][0] == '-') ? -1 : 1;
    bb->tFrame = atoi(words[10]);
    }

line = bfNeedNextLine(bf);
boolean wrapped = (startsWith("(", line));

/* Process something like:
 *     Strand = Plus / Plus (blastn)
 *     Frame = +1 (tblastn)
 *     Frame = +1 / -2 (tblastx)
 *     <blank line> (blastp)
 * note that wu-tblastn puts frame on Identities line */
if (wrapped)
    line = bfNeedNextLine(bf);
wordCount = chopLine(line, words);
if ((wordCount >= 5) && sameWord("Strand", words[0]))
    {
    bb->qStrand = getStrand(bf, words[2]);
    bb->tStrand = getStrand(bf, words[4]);
    }
else if ((wordCount >= 5) && sameWord("Frame", words[0]) && (words[3][0] == '/'))
    {
    /* Frame = +1 / -2 (tblastx) */
    bb->qStrand = (words[2][0] == '-') ? -1 : 1;
    bb->tStrand = (words[4][0] == '-') ? -1 : 1;
    bb->qFrame = atoi(words[2]);
    bb->tFrame = atoi(words[4]);
    }
else if ((wordCount >= 3) && sameWord("Frame", words[0]))
    {
    /* Frame = +1 (tblastn) */
    bb->qStrand = 1;
    bb->tStrand = (words[2][0] == '-') ? -1 : 1;
    bb->qFrame = atoi(words[2]);
    bb->tFrame = 1;
    }
else if (wordCount == 0)
    {
    /* if we didn't parse frame, default it */
    if (bb->qStrand == 0)
        {
        bb->qStrand = 1;
        bb->tStrand = 1;
        }
    }
else
    bfError(bf, "Expecting Strand, Frame or blank line");

/* Process alignment lines.  They come in groups of three
 * separated by a blank line - something like:
 * Query: 26429 taccttgacattcctcagtgtgtcatcatcgttctctcctccaaacggcgagagtccgga 26488
 *              |||||| |||||||||| ||| ||||||||||||||||||||||| || || ||||||||
 * Sbjct: 62966 taccttaacattcctcaatgtttcatcatcgttctctcctccaaatggtgaaagtccgga 63025
 */
if (qString == NULL)
    {
    qString = newDyString(50000);
    tString = newDyString(50000);
    }
clearBlastBlock(bb, qString, tString);
for (;;)
    {
    if (!findBlockSeqPair(bf, bq))
        break;
    parseBlockSeqPair(bf, bb, qString, tString);
    }

/* convert to [0..n) and move to strand coords if necessary */
bb->qStart--;
if (bb->qStrand < 0)
    reverseIntRange(&bb->qStart, &bb->qEnd, bq->queryBaseCount);
bb->tStart--;
if (bb->tStrand < 0)
    reverseIntRange(&bb->tStart, &bb->tEnd, bga->targetSize);
bb->qSym = cloneMem(qString->string, qString->stringSize+1);
bb->tSym = cloneMem(tString->string, tString->stringSize+1);
return bb;
}
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up bed that excludes part of gene between CDS breaks.
 * Also jiggles cds->end coordinate to cope with the sequence we remove.
 * Deals with transcript to genome coordinate mapping including negative
 * strand.  Be afraid, be very afraid!
 *   cds - CDS blocks in transcript coordinates.  Its end field is reduced by
 *         the total size of the gaps between consecutive blocks.
 *   bed - the transcript.  Read only here.
 * Returns a newly allocated bed with chrom, name, score, strand, start/end and
 * block arrays filled in.  Other fields are not assigned in this function. */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're at it shrink outer CDS
 * since we are actually shrinking transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;    /* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    /* A gap runs from the end of one CDS block to the start of the next. */
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates. Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;   /* Current exon's span in transcript coordinates. */
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    /* Save next pointer now, since exon may be moved onto newList below. */
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
        verbose(3, "Splitting exon because of CDS gap\n");

        /* Make up exons from current position up to next gap.  This is a little
         * complicated by possibly the gap starting before the exon. */
        int exonStart = exon->start;    /* Position in exon list coordinates. */
        int txStart = txStartPos;       /* Same position in transcript coordinates. */
        struct range *gap;
        for (gap = gapList; gap != NULL; gap = gap->next)
            {
            int txEnd = gap->start;
            /* Only the part of the gap inside what is left of this exon counts. */
            int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
            int exonSize = txEnd - txStart;
            if (exonSize > 0)
                {
                lmAllocVar(lm, newExon);
                newExon->start = exonStart;
                newExon->end = exonStart + exonSize;
                slAddHead(&newList, newExon);
                }
            else /* This case happens if gap starts before exon */
                {
                exonSize = 0;
                }

            /* Update current position in both transcript and genome space. */
            exonStart += exonSize + gapSize;
            txStart += exonSize + gapSize;
            }

        /* Make up final exon from last gap to end, at least if we don't end in a gap. */
        if (exonStart < exon->end)
            {
            lmAllocVar(lm, newExon);
            newExon->start = exonStart;
            newExon->end = exon->end;
            slAddHead(&newList, newExon);
            }
        }
    else
        {
        /* Easy case where we don't intersect any gaps. */
        slAddHead(&newList, exon);
        }
    txStartPos= txEndPos;
    }
/* newList was built head first, so put it back in ascending order. */
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12.
 * NOTE(review): newList is dereferenced without a NULL check, and chromStarts
 * are the exon starts as-is rather than offsets from the first exon - this
 * assumes at least one exon survives and the first one starts at 0; confirm. */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes, newBed->blockCount);
AllocArray(newBed->chromStarts, newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    /* Overwritten on each pass, so ends up as the end of the last exon. */
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home.  The exon ranges were allocated from the tree's lm;
 * presumably they are released along with the tree - TODO confirm. */
rbTreeFree(&gapTree);
return newBed;
}
struct altGraphX *txGraphToAltGraphX(struct txGraph *tx) /* Copy transcription graph to altSpliceX format. */ { /* Allocate struct and deal with easy fields. */ struct altGraphX *ag; AllocVar(ag); ag->tName = cloneString(tx->tName); ag->tStart = tx->tStart; ag->tEnd = tx->tEnd; ag->name = cloneString(tx->name); ag->id = 0; ag->strand[0] = tx->strand[0]; /* Deal with vertices. */ int vertexCount = ag->vertexCount = tx->vertexCount; AllocArray(ag->vTypes, vertexCount); AllocArray(ag->vPositions, vertexCount); int i; for (i=0; i<vertexCount; ++i) { struct txVertex *v = &tx->vertices[i]; ag->vTypes[i] = v->type; ag->vPositions[i] = v->position; } /* Deal with edges. */ int edgeCount = ag->edgeCount = tx->edgeCount; AllocArray(ag->edgeStarts, edgeCount); AllocArray(ag->edgeEnds, edgeCount); AllocArray(ag->edgeTypes, edgeCount); struct txEdge *edge; for (edge = tx->edgeList, i=0; edge != NULL; edge = edge->next, ++i) { assert(i < edgeCount); ag->edgeStarts[i] = edge->startIx; ag->edgeEnds[i] = edge->endIx; ag->edgeTypes[i] = edge->type; } /* Deal with evidence inside of edges. */ for (edge = tx->edgeList; edge != NULL; edge = edge->next) { struct evidence *ev; AllocVar(ev); int *mrnaIds = AllocArray(ev->mrnaIds, edge->evCount); int i; struct txEvidence *txEv; for (txEv = edge->evList, i=0; txEv != NULL; txEv = txEv->next, ++i) { assert(i < edge->evCount); struct txSource *source = &tx->sources[txEv->sourceId]; char *sourceType = source->type; if (sameString(sourceType, "refSeq") || sameString(sourceType, "mrna") || sameString(sourceType, "est")) { mrnaIds[ev->evCount] = txEv->sourceId; ev->evCount += 1; } } slAddHead(&ag->evidence, ev); } slReverse(&ag->evidence); /* Convert sources into mrnaRefs. 
*/ int sourceCount = ag->mrnaRefCount = tx->sourceCount; AllocArray(ag->mrnaRefs, sourceCount); int sourceIx; for (sourceIx=0; sourceIx<sourceCount; ++sourceIx) { struct txSource *source = &tx->sources[sourceIx]; ag->mrnaRefs[sourceIx] = cloneString(source->accession); } /* Deal with tissues and libs by just making arrays of all zero. */ AllocArray(ag->mrnaTissues, tx->sourceCount); AllocArray(ag->mrnaLibs, tx->sourceCount); return ag; }
struct mrnaAli *mrnaAliCommaIn(char **pS)
/* Parse one mrnaAli from the comma separated text at *pS and return it newly
 * allocated.  On return *pS points just past the text consumed.  The record is
 * sixteen scalar fields followed by six brace-enclosed lists, each holding
 * blockCount numbers. */
{
char *cursor = *pS;
struct mrnaAli *ali;
int ix;

AllocVar(ali);

/* Scalar fields, in column order.
 * NOTE(review): qAcc is read as a signed number although the name suggests an
 * accession string - check against the struct definition. */
ali->id = sqlUnsignedComma(&cursor);
ali->readDir = sqlSignedComma(&cursor);
ali->orientation = sqlSignedComma(&cursor);
ali->hasIntrons = sqlUnsignedComma(&cursor);
ali->isEst = sqlUnsignedComma(&cursor);
ali->score = sqlSignedComma(&cursor);
ali->qAcc = sqlSignedComma(&cursor);
ali->qId = sqlUnsignedComma(&cursor);
ali->qTotalSize = sqlUnsignedComma(&cursor);
ali->qStart = sqlUnsignedComma(&cursor);
ali->qEnd = sqlUnsignedComma(&cursor);
ali->tStartBac = sqlUnsignedComma(&cursor);
ali->tStartPos = sqlUnsignedComma(&cursor);
ali->tEndBac = sqlUnsignedComma(&cursor);
ali->tEndPos = sqlUnsignedComma(&cursor);
ali->blockCount = sqlUnsignedComma(&cursor);

/* blockSizes list. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ali->blockSizes, ali->blockCount);
ix = 0;
while (ix < ali->blockCount)
    {
    ali->blockSizes[ix] = sqlUnsignedComma(&cursor);
    ++ix;
    }
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* qBlockStarts list. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ali->qBlockStarts, ali->blockCount);
ix = 0;
while (ix < ali->blockCount)
    {
    ali->qBlockStarts[ix] = sqlUnsignedComma(&cursor);
    ++ix;
    }
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* tBlockBacs list. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ali->tBlockBacs, ali->blockCount);
ix = 0;
while (ix < ali->blockCount)
    {
    ali->tBlockBacs[ix] = sqlUnsignedComma(&cursor);
    ++ix;
    }
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* tBlockStarts list. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ali->tBlockStarts, ali->blockCount);
ix = 0;
while (ix < ali->blockCount)
    {
    ali->tBlockStarts[ix] = sqlUnsignedComma(&cursor);
    ++ix;
    }
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* startGoods list. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ali->startGoods, ali->blockCount);
ix = 0;
while (ix < ali->blockCount)
    {
    ali->startGoods[ix] = sqlUnsignedComma(&cursor);
    ++ix;
    }
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* endGoods list. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ali->endGoods, ali->blockCount);
ix = 0;
while (ix < ali->blockCount)
    {
    ali->endGoods[ix] = sqlUnsignedComma(&cursor);
    ++ix;
    }
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

*pS = cursor;
return ali;
}
struct correlate *correlateNew()
/* Allocate a correlation handler with AllocVar and return it. */
{
struct correlate *handler;
AllocVar(handler);
return handler;
}
struct hash *readKeyHash(char *db, struct joiner *joiner, struct joinerField *keyField, struct keyHitInfo **retList) /* Read key-field into hash. Check for dupes if need be. */ { struct sqlConnection *conn = sqlWarnConnect(db); struct hash *keyHash = NULL; struct keyHitInfo *khiList = NULL, *khi; if (conn == NULL) { return NULL; } else { struct slName *table; struct slName *tableList = getTablesForField(conn,keyField->splitPrefix, keyField->table, keyField->splitSuffix); int rowCount = totalTableRows(conn, tableList); int hashSize = digitsBaseTwo(rowCount)+1; char query[256], **row; struct sqlResult *sr; int itemCount = 0; int dupeCount = 0; char *dupe = NULL; if (rowCount > 0) { if (hashSize > hashMaxSize) hashSize = hashMaxSize; keyHash = hashNew(hashSize); for (table = tableList; table != NULL; table = table->next) { safef(query, sizeof(query), "select %s from %s", keyField->field, table->name); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *id = doChopsAndUpper(keyField, row[0]); if (hashLookup(keyHash, id)) { if (keyField->unique) { if (keyField->exclude == NULL || !slNameInList(keyField->exclude, id)) { if (dupeCount == 0) dupe = cloneString(id); ++dupeCount; } } } else { AllocVar(khi); hashAddSaveName(keyHash, id, khi, &khi->name); slAddHead(&khiList, khi); ++itemCount; } } sqlFreeResult(&sr); } if (dupe != NULL) { warn("Error: %d duplicates in %s.%s.%s including '%s'", dupeCount, db, keyField->table, keyField->field, dupe); freez(&dupe); } verbose(2, " %s.%s.%s - %d unique identifiers\n", db, keyField->table, keyField->field, itemCount); } slFreeList(&tableList); } sqlDisconnect(&conn); *retList = khiList; return keyHash; }
struct codonBias *codonLoadBias(char *fileName) /* Create scaled log codon bias tables based on .cod file. * You can freeMem it when you're done. */ { struct codonBias *cb; char line[1024]; int lineCount = 0; char *words[128]; int wordCount; int i = 0, j = 0; int skip = 0; boolean getMark0 = FALSE; boolean getMark1 = FALSE; FILE *f = mustOpen(fileName, "r"); int val; AllocVar(cb); while (fgets(line, sizeof(line), f) ) { ++lineCount; if (skip) { skip -= 1; continue; } if (getMark1) { wordCount = chopLine(line, words); if (wordCount != 65) errAbort("Bad line %d of %s\n", lineCount, fileName); for (j=0; j<64; ++j) { val = atoi(words[j+1]); if (val == 0) cb->mark1[i][j] = scaledLog(1.0E-20); else cb->mark1[i][j] = scaledLog(0.001*val); } if ((i += 1) == 64) getMark1 = FALSE; } else if (getMark0) { wordCount = chopLine(line, words); if (wordCount != 64) errAbort("Bad line %d of %s\n", lineCount, fileName); for (j=0; j<64; ++j) { val = atoi(words[j]); if (val == 0) cb->mark0[j] = scaledLog(1.0E-20); else cb->mark0[j] = scaledLog(0.001*val); } getMark0 = FALSE; } else if (startsWith("Markov", line)) { wordCount = chopLine(line, words); if (wordCount != 2) errAbort("Bad line %d of %s\n", lineCount, fileName); if (sameString(words[1], "0")) getMark0 = TRUE; else if (sameString(words[1], "1")) getMark1 = TRUE; else errAbort("Bad line %d of %s\n", lineCount, fileName); skip = 3; } } fclose(f); return cb; }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *geneList = NULL, *curGene, *gene; int geneIx, geneCount = 0; struct microData **geneArray = NULL; float *weights = NULL; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); long time1, time2; time1 = clock1000(); /* Get list/hash of all items with expression values. */ /* uglyf("warning: temporarily limited to 1000 records\n"); */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. 
*/ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); time2 = clock1000(); verbose(2, "records read time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Get an array for sorting. */ AllocArray(geneArray, geneCount); for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx) geneArray[geneIx] = gene; /* Print out closest 1000 in tab file. */ for (curGene = geneList; curGene != NULL; curGene = curGene->next) { calcDistances(curGene, geneList, weights); qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance); for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx) { gene = geneArray[geneIx]; fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance); } dotOut(); } printf("Made %s.tab\n", outTable); time1 = time2; time2 = clock1000(); verbose(2, "distance computation time: %.2f seconds\n", (time2 - time1) / 1000.0); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query(12))", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target(12))", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); time1 = time2; time2 = clock1000(); verbose(2, "table create/load/index time: %.2f seconds\n", (time2 - time1) / 1000.0); }
void writeGap(struct gapInfo *gap, struct xaAli *xa, int symStart, int symEnd, char geneStrand, FILE *f) /* Write out info on one gap to file. */ { char qStart[totSize+1], qEnd[totSize+1]; char tStart[totSize+1], tEnd[totSize+1]; char hStart[totSize+1], hEnd[totSize+1]; int s, e, size; int midSize; int i; char *threePrime, *fivePrime; boolean isQgap; fprintf(f, "%s %s %s hom %s:%d-%d %c %s:%d-%d %c slide %d\n", gapTypeStrings[gap->type], (gap->hasIntronEnds ? " intron" : "!intron"), (gap->hasStrongHomology ? "heavy" : "light"), gap->query, gap->qStart, gap->qEnd, xa->qStrand, gap->target, gap->tStart, gap->tEnd, geneStrand, gap->slideCount); s = symStart-exSize; e = symStart + inSize; if (s < 0) s = 0; size = e-s; uglyf("s %d size %d e %d totSize %d\n", s, size, e, totSize); strncpy(qStart, xa->qSym+s, size); strncpy(tStart, xa->tSym+s, size); strncpy(hStart, xa->hSym+s, size); qStart[size] = tStart[size] = hStart[size] = 0; // uglyf - crashes by here s = symEnd-inSize; midSize = s - e; e = symEnd+exSize; if (e > xa->symCount) e = xa->symCount; size = e-s; strncpy(qEnd, xa->qSym+s, size); strncpy(tEnd, xa->tSym+s, size); strncpy(hEnd, xa->hSym+s, size); qEnd[size] = tEnd[size] = hEnd[size] = 0; if (gap->isRc) { swapBytes(qStart, qEnd, totSize); swapBytes(tStart, tEnd, totSize); swapBytes(hStart, hEnd, totSize); reverseComplement(qStart, totSize); reverseComplement(qEnd, totSize); reverseComplement(tStart, totSize); reverseComplement(tEnd, totSize); reverseBytes(hStart, totSize); reverseBytes(hEnd, totSize); } /* Write out ends of gap to file. */ fprintf(f, "%s ...%d... %s\n", qStart, midSize, qEnd); fprintf(f, "%s ...%d... %s\n", tStart, midSize, tEnd); fprintf(f, "%s ...%d... %s\n\n", hStart, midSize, hEnd); /* Add intron ends to consensus sequence histogram. 
*/ if (gap->hasIntronEnds && gap->type == cCodingGap) { isQgap = (qStart[exSize] == '-'); if (isQgap) { fivePrime = tStart; threePrime = tEnd; } else { fivePrime = qStart; threePrime = qEnd; } if (noInserts(threePrime, totSize) && noInserts(fivePrime, totSize) ) { int *homoCount; for (i=0; i<totSize; ++i) { hist5[i][histIx(fivePrime[i])] += 1; hist3[i][histIx(threePrime[i])] += 1; } ++histCount; if (isQgap) { ++ceOnlyCount; homoCount = ceOnlyHomoCount; } else { ++cbOnlyCount; homoCount = cbOnlyHomoCount; } ++bothCount; for (i=0; i<totSize; ++i) { if (fivePrime[i] == threePrime[i]) { homoCount[i] += 1; bothHomoCount[i] += 1; } } /* Add introns to list. */ { char idBuf[2*intronEndsSize+1]; struct intronList *il; struct hashEl *hel; memcpy(idBuf, fivePrime+exSize, intronEndsSize); memcpy(idBuf+intronEndsSize, threePrime, intronEndsSize); idBuf[ sizeof(idBuf)-1 ] = 0; if ((hel = hashLookup(intronHash, idBuf)) != NULL) { il = hel->val; il->count += 1; fprintf(f, ">>>%d of set<<<\n", il->count); if (il->isQgap != isQgap) { il->onBoth = TRUE; } } else { AllocVar(il); strcpy(il->ends, idBuf); il->count = 1; il->isQgap = isQgap; slAddHead(&intronList, il); hashAdd(intronHash, idBuf, il); } } } else { static insertCount = 0; warn("Skipping intron with flanking inserts %d", ++insertCount); } } }
struct cutter *readGcg(char *gcgFile) /* Parse a GCG file and load it into cutter format. */ { struct lineFile *lf = lineFileOpen(gcgFile,TRUE); struct cutter *enzList = NULL; char *line = "whatever", *words[10], numWords; /* Skip to the right line. */ while (lineFileNext(lf,&line,NULL) && !startsWith("..",line)); /* */ while ((numWords=lineFileChop(lf,words))) { struct cutter *newone = NULL; int comIx = (numWords==7) ? 5 : 6; int refIx = (numWords==7) ? 6 : 7; int i; char *items[100]; /* Skip ones */ if (words[4][0] == '?') continue; AllocVar(newone); newone->semicolon = (words[0][0] == ';') ? TRUE : FALSE; /* Deal with the first few columns */ if (!isdigit(words[1][0])) errAbort("Error: expecting a number in cut site column on line %d\n", lf->lineIx+1); if (!isdigit(words[3][0]) && words[3][0]!='-') errAbort("Error: expecting a number in the overhang column on line %d\n", lf->lineIx+1); if (words[comIx][0] != '>') errAbort("Error: expecting a \'>\' in the commercial sources column of line %d\n", lf->lineIx+1); newone->name = (words[0][0] == ';') ? 
cloneString(words[0]+1) : cloneString(words[0]); newone->cut = atoi(words[1]); newone->seq = cloneString(words[2]); touppers(newone->seq); stripChar(newone->seq,'\''); stripChar(newone->seq,'_'); newone->size = strlen(newone->seq); newone->matchSize = newone->size - countChars(newone->seq, 'N'); newone->palindromic = isPalindrome(newone->seq); newone->overhang = atoi(words[3]); newone->numCompanies = strlen(words[comIx]+1); if (newone->numCompanies > 0) newone->companies = cloneMem(words[comIx]+1, newone->numCompanies*sizeof(char)); newone->numRefs = chopString(words[refIx], ",", items, ArraySize(items)); AllocArray(newone->refs, newone->numRefs); for (i = 0; i < newone->numRefs; i++) { if (i == 100) errAbort("Error: Andy didn't make the array for holding references big enough\n"); if (!isdigit(items[i][0])) errAbort("Error: expecting number in references column in line %d\n", lf->lineIx+1); newone->refs[i] = atoi(items[i]); } /* Deal with isoscizomers. */ if (numWords == 8) { newone->numSciz = chopString(words[5], ",", items, ArraySize(items)); AllocArray(newone->scizs, newone->numSciz*sizeof(int)); for (i = 0; i < newone->numSciz; i++) { if (i == 100) errAbort("Error: Andy didn't make the array for having isoscizomers big enough\n"); newone->scizs[i] = cloneString(items[i]); } } else newone->numSciz = 0; slAddHead(&enzList, newone); } slReverse(&enzList); lineFileClose(&lf); return enzList; }
struct gapInfo *findLargeGaps(struct xaAli *xa, struct gapInfo *oldList) /* Find large gaps in alignment and classify them. */ { struct gdfGene *gdfList; struct gapInfo *gapList = NULL, *gap; int ceIx=0, cbIx=0, symIx=0; int ceStart=0, cbStart=0, symStart=0; int runSize = 0; char sym, lastSym = 0; int symCount = xa->symCount; /* Fetch C. elegans region. */ gdfList = wormGdfGenesInRange(xa->target, xa->tStart, xa->tEnd, &wormSangerGdfCache); /* Run a little state machine that does something at the end of each solid run * of a symbol. */ for (symIx = 0; symIx <= symCount; ++symIx) { sym = xa->hSym[symIx]; if (sym != lastSym) { if (runSize > 32) /* Introns need to be at least this long. */ { /* We're at end of a solid run. */ if (lastSym == 'Q' || lastSym == 'T') { int ceGapStart = xa->tStart + ceStart; int ceGapEnd = xa->tStart + ceIx; struct gdfGene *gdf; char hBefore = xa->hSym[symStart-1]; char hAfter = sym; char strand = '.'; AllocVar(gap); gap->query = cloneString(xa->query); gap->qStart = xa->qStart + cbStart; gap->qEnd = xa->qStart + cbIx; gap->target = cloneString(xa->target); gap->tStart = ceGapStart; gap->tEnd = ceGapEnd; gap->name = cloneString(xa->name); gap->size = runSize; gap->hSym = lastSym; if (uniqueGap(oldList, gap)) { slAddHead(&gapList, gap); classifyGap(gdfList, xa->target, ceGapStart, ceGapEnd, lastSym, &gap->type, &gdf); if (gdf != NULL) strand = gdf->strand; gap->hasIntronEnds = isIntron(xa, symStart, symIx, lastSym, strand, &gap->slideCount, &gap->isRc); if (gap->hasIntronEnds) slideGap(gap, xa, lastSym, symStart, symIx); if (isConserved(hBefore) && isConserved(hAfter)) gap->hasStrongHomology = TRUE; if (gap->hasStrongHomology) { if (lastSym == 'T') writeGap(gap, xa, symStart+gap->slideCount, symIx+gap->slideCount, strand, out); } } } } runSize = 0; ceStart = ceIx; cbStart = cbIx; symStart = symIx; lastSym = sym; } ++runSize; if (xa->qSym[symIx] != '-') ++cbIx; if (xa->tSym[symIx] != '-') ++ceIx; } gdfFreeGeneList(&gdfList); 
slReverse(&gapList); return gapList; }
struct altSpliceSite *initASplice(struct altGraphX *ag, bool **agEm, int vs, int ve1, int ve2) /* Initialize an altSplice site report with vlaues. */ { struct altSpliceSite *as = NULL; struct evidence *ev = NULL; int edgeNum = 0; int altBpStart=0, altBpEnd=0; int i=0; int vMax = 2*ag->vertexCount+ag->edgeCount; AllocVar(as); as->chrom = cloneString(ag->tName); as->chromStart = ag->vPositions[vs]; as->chromEnd = ag->vPositions[vs]; as->agName = cloneString(ag->name); safef(as->strand, sizeof(as->strand), "%s", ag->strand); as->index = vs; as->type = ag->vTypes[vs]; as->altCount+=2; as->altMax = vMax; /* Return starts of vertices. */ AllocArray(as->altStarts, vMax); as->altStarts[0] = ag->vPositions[ve1]; as->altStarts[1] = ag->vPositions[ve2]; /* Record indices of vertices. */ AllocArray(as->vIndexes, vMax); as->vIndexes[0] = ve1; as->vIndexes[1] = ve2; /* Record type of vertices. */ AllocArray(as->altTypes, vMax); as->altTypes[0] = ag->vTypes[ve1]; as->altTypes[1] = ag->vTypes[ve2]; /* Record splice types and bases alt spliced. */ AllocArray(as->spliceTypes, vMax); AllocArray(as->altBpEnds, vMax); AllocArray(as->altBpStarts, vMax); as->spliceTypes[0] = altSpliceType(ag, agEm, vs, ve1, ve1, &altBpStart, &altBpEnd); as->altBpStarts[0] = ag->vPositions[altBpStart]; as->altBpEnds[0] = ag->vPositions[altBpEnd]; as->spliceTypes[1] = altSpliceType(ag, agEm,vs, ve1, ve2, &altBpStart, &altBpEnd); as->altBpStarts[1] = ag->vPositions[altBpStart]; as->altBpEnds[1] = ag->vPositions[altBpEnd]; /* Look up the evidence. */ AllocArray(as->support, vMax); edgeNum = getEdgeNum(ag, vs, ve1); ev = slElementFromIx(ag->evidence, edgeNum); as->support[0] = ev->evCount; edgeNum = getEdgeNum(ag, vs, ve2); ev = slElementFromIx(ag->evidence, edgeNum); as->support[1] = ev->evCount; AllocArray(as->altCons, vMax); AllocArray(as->upStreamCons, vMax); AllocArray(as->downStreamCons, vMax); for(i=0; i<vMax; i++) as->altCons[i] = as->upStreamCons[i] = as->downStreamCons[i] = -1; return as; }
void doRun(char *line, struct sockaddr_in *hubIp) /* Execute command. */ { char *jobMessage = cloneString(line); static char *args[1024]; int argCount; char hubDottedQuad[17]; nextRandom(); if (line == NULL) warn("Executing nothing..."); else if (!internetIpToDottedQuad(ntohl(hubIp->sin_addr.s_addr), hubDottedQuad)) warn("Can't convert ipToDottedQuad"); else { struct runJobMessage rjm; if (parseRunJobMessage(line, &rjm)) { int jobId = atoi(rjm.jobIdString); if (findRunningJob(jobId) == NULL && findFinishedJob(jobId) == NULL) { if (busyProcs < maxProcs) { int childPid; argCount = chopLine(rjm.command, args); if (argCount >= ArraySize(args)) warn("Too many arguments to run"); else { args[argCount] = NULL; if ((childPid = forkOrDie()) == 0) { /* Do JOB_ID substitutions */ struct subText *st = subTextNew("$JOB_ID", rjm.jobIdString); int i; rjm.in = subTextString(st, rjm.in); rjm.out = subTextString(st, rjm.out); rjm.err = subTextString(st, rjm.err); for (i=0; i<argCount; ++i) args[i] = subTextString(st, args[i]); execProc(hubDottedQuad, rjm.jobIdString, rjm.reserved, rjm.user, rjm.dir, rjm.in, rjm.out, rjm.err, rjm.ram, args[0], args); exit(0); } else { struct job *job; AllocVar(job); job->jobId = atoi(rjm.jobIdString); job->pid = childPid; job->startMessage = jobMessage; jobMessage = NULL; /* No longer own memory. */ job->node = dlAddValTail(jobsRunning, job); ++busyProcs; } } } else { warn("Trying to run when busy."); } } else { warn("Duplicate run-job %d\n", jobId); } } } freez(&jobMessage); }
void reportCassette(struct altGraphX *ag, bool **em, int vs, int ve1, int ve2, int altBpStart, int altBpEnd, int startV, int endV, FILE *out) /* Write out both an altGraphX and two bed files. For a cassette exon the edges are - Name Vertexes Class ------ ---------- ----- exon1: startV->vs constitutive (cons 0) junction1: vs->ve1 alternative1 (alt1 1) exon2: ve1->altBpEnd alternative1 (alt1 1) junction2: altBpEnd->ve2 alternative1 (alt1 1) exon3: ve2->endV constitutive (cons 0) junction3: vs->ve2 alternative2 (alt2 2) */ { struct altGraphX *agLoc = NULL; /* Local altGraphX. */ struct evidence *ev = NULL, *evLoc = NULL; int *vPos = ag->vPositions; unsigned char *vT = ag->vTypes; int *vPosLoc = NULL; /* Vertex Positions. */ int *eStartsLoc = NULL; /* Edge Starts. */ int *eEndsLoc = NULL; /* Edge ends. */ unsigned char *vTLoc = NULL; /* Vertex Types. */ int *eTLoc = NULL; /* Edge Types. */ int vCLoc = 0; int eCLoc = 0; int i =0; struct dyString *dy = NULL; if(out == NULL) return; AllocVar(agLoc); agLoc->tName = cloneString(ag->tName); agLoc->name = cloneString(ag->name); agLoc->tStart = vPos[startV]; agLoc->tEnd = vPos[endV]; agLoc->strand[0] = ag->strand[0]; agLoc->vertexCount = vCLoc = 6; agLoc->edgeCount = eCLoc = 6; agLoc->id = altCassette; /* Allocate some arrays. */ AllocArray(vPosLoc, vCLoc); AllocArray(eStartsLoc, vCLoc); AllocArray(eEndsLoc, vCLoc); AllocArray(vTLoc, vCLoc); AllocArray(eTLoc, vCLoc); /* Fill in the vertex positions. */ vPosLoc[0] = vPos[startV]; vPosLoc[1] = vPos[vs]; vPosLoc[2] = vPos[ve1]; vPosLoc[3] = vPos[altBpEnd]; vPosLoc[4] = vPos[ve2]; vPosLoc[5] = vPos[endV]; /* Fill in the vertex types. */ vTLoc[0] = vT[startV]; vTLoc[1] = vT[vs]; vTLoc[2] = vT[ve1]; vTLoc[3] = vT[altBpEnd]; vTLoc[4] = vT[ve2]; vTLoc[5] = vT[endV]; /* Fill in the edges. */ /* Constitutive first exon. 
*/ eStartsLoc[0] = 0; eEndsLoc[0] = 1; eTLoc[0] = 0; ev = evidenceForEdge(ag, startV, vs); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Exon inclusion junction. */ eStartsLoc[1] = 1; eEndsLoc[1] = 2; eTLoc[1] = 1; ev = evidenceForEdge(ag, vs, ve1); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Exon exclusion junction. */ eStartsLoc[2] = 1; eEndsLoc[2] = 4; eTLoc[2] = 2; ev = evidenceForEdge(ag, vs, ve2); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Cassette exon. */ eStartsLoc[3] = 2; eEndsLoc[3] = 3; eTLoc[3] = 1; ev = evidenceForEdge(ag, ve1, altBpEnd); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Exon inclusion junction. */ eStartsLoc[4] = 3; eEndsLoc[4] = 4; eTLoc[4] = 1; ev = evidenceForEdge(ag, altBpEnd, ve2); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); /* Constitutive second exon. */ eStartsLoc[5] = 4; eEndsLoc[5] = 5; eTLoc[5] = 0; ev = evidenceForEdge(ag, ve2, endV); evLoc = CloneVar(ev); evLoc->mrnaIds = CloneArray(ev->mrnaIds, ev->evCount); slAddHead(&agLoc->evidence, evLoc); slReverse(&agLoc->evidence); dy = newDyString(ag->mrnaRefCount*36); agLoc->mrnaRefCount = ag->mrnaRefCount; for(i=0; i<ag->mrnaRefCount; i++) dyStringPrintf(dy, "%s,", ag->mrnaRefs[i]); sqlStringDynamicArray(dy->string, &agLoc->mrnaRefs, &i); dyStringFree(&dy); agLoc->mrnaTissues = CloneArray(ag->mrnaTissues, ag->mrnaRefCount); agLoc->mrnaLibs = CloneArray(ag->mrnaLibs, ag->mrnaRefCount); agLoc->vPositions = vPosLoc; agLoc->edgeStarts = eStartsLoc; agLoc->edgeEnds = eEndsLoc; agLoc->vTypes = vTLoc; agLoc->edgeTypes = eTLoc; altGraphXTabOut(agLoc, out); altGraphXFree(&agLoc); }
struct xaAli *xaReadNext(FILE *f, boolean condensed)
/* Read next xaAli from file.  If condensed
 * don't fill in query, target, qSym, tSym, or hSym.
 *
 * The header line must have at least 9 words, with "align" as the second.
 * A 10-word header shifts the query/target columns right by one.
 * Words used:  0 name, 2 percent score, 4 symbol count,
 * 5 query as name:start-end, 6 query strand,
 * 7 target as name:start-end, 8 target strand.
 * Three symbol lines (query, target, homology) follow the header.
 * Returns NULL at end of file; aborts on a malformed header. */
{
char line[512];
char *words[16];
int wordCount;
struct xaAli *xa;
char *parts[5];
int partCount;
double percentScore;
int symCount;
int newOffset = 0;
char *s, *e;

/* Get first line and parse out everything but the sym lines. */
if (fgets(line, sizeof(line), f) == NULL)
    return NULL;
wordCount = chopLine(line, words);
if (wordCount < 9)
    errAbort("Short line in cross-species alignment file");
if (wordCount == 10)
    newOffset = 1;
if (!sameString(words[1], "align"))
    errAbort("Bad line in cross-species alignment file");
AllocVar(xa);
xa->name = cloneString(words[0]);

/* Query: split at the last colon so the name may itself contain colons. */
s = words[5+newOffset];
e = strrchr(s, ':');
if (e == NULL)
    errAbort("Bad line (no colon) in cross-species alignment file");
*e++ = 0;
partCount = chopString(e, "-", parts, ArraySize(parts));
if (partCount != 2)
    errAbort("Bad range format in cross-species alignment file");
if (!condensed)
    xa->query = cloneString(s);
xa->qStart = atoi(parts[0]);
xa->qEnd = atoi(parts[1]);
xa->qStrand = words[6+newOffset][0];

/* Target: name, start and end are all needed; with fewer parts the
 * entries read below would be left over from the query or unset. */
partCount = chopString(words[7+newOffset], ":-", parts, ArraySize(parts));
if (partCount < 3)
    errAbort("Bad target range format in cross-species alignment file");
if (!condensed)
    xa->target = cloneString(parts[0]);
xa->tStart = atoi(parts[1]);
xa->tEnd = atoi(parts[2]);
xa->tStrand = words[8+newOffset][0];

/* Score is stored in tenths of a percent. */
percentScore = atof(words[2]);
xa->milliScore = round(percentScore*10);
xa->symCount = symCount = atoi(words[4]);

/* Get symbol lines. */
if (condensed)
    {
    eatThroughLf(f);
    eatThroughLf(f);
    eatThroughLf(f);
    }
else
    {
    /* One extra byte per row beyond the symCount symbols read. */
    xa->qSym = needMem(symCount+1);
    mustRead(f, xa->qSym, symCount);
    eatLf(f);

    xa->tSym = needMem(symCount+1);
    mustRead(f, xa->tSym, symCount);
    eatLf(f);

    xa->hSym = needMem(symCount+1);
    mustRead(f, xa->hSym, symCount);
    eatLf(f);
    }
return xa;
}