DNA *wormGetNamelessClusterDna(char *name)
/* Fetch the chromosome DNA covered by the nameless cluster called name.
 * The range is looked up with wormGeneRange; if the lookup fails the
 * program is stopped through errAbort.  The strand reported by the lookup
 * is not used here. */
{
    char *chromId;
    char strandChar;
    int rangeStart, rangeEnd;
    int rangeSize;

    if (!wormGeneRange(name, &chromId, &strandChar, &rangeStart, &rangeEnd))
        errAbort("Can't find %s in database", name);

    rangeSize = rangeEnd - rangeStart;
    return wormChromPart(chromId, rangeStart, rangeSize);
}
boolean getWormGeneExonDna(char *name, DNA **retDna)
/* Get the DNA associated with a gene, without introns.
 *
 * name   - gene name; a biologist-style name is first translated to an
 *          ORF name, and a "g-" prefix selects the Genie cache instead of
 *          the Sanger one.
 * retDna - on success receives a freshly cloned string holding the
 *          concatenated exon sequence; the caller releases it.
 *
 * Returns TRUE when the gene was found, FALSE otherwise (in which case
 * *retDna is left untouched). */
{
    struct gdfGene *g;
    struct slName *syn = NULL;
    long lstart, lend;
    int start, end;
    int dnaSize;
    DNA *dna;
    int i;
    struct gdfDataPoint *pt;
    struct wormGdfCache *gdfCache;
    struct dyString *dy;

    /* Translate biologist type name to cosmid.N name */
    if (wormIsGeneName(name)) {
        syn = wormGeneToOrfNames(name);
        if (syn != NULL)
            name = syn->name;
    }
    /* NOTE(review): the list returned by wormGeneToOrfNames is never
     * released in this function; its ownership is not visible from here,
     * so it is left alone - confirm whether it should be freed. */

    if (strncmp(name, "g-", 2) == 0)
        gdfCache = &wormGenieGdfCache;
    else
        gdfCache = &wormSangerGdfCache;

    /* Look the gene up before allocating anything so that the not-found
     * path has nothing to clean up. */
    if ((g = wormGetSomeGdfGene(name, gdfCache)) == NULL)
        return FALSE;

    gdfGeneExtents(g, &lstart, &lend);
    start = lstart;
    end = lend;
    /*wormClipRangeToChrom(chromIds[g->chromIx], &start, &end);*/
    dnaSize = end - start;
    dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);

    /* Make the gene's coordinates relative to the fetched buffer, and flip
     * both the buffer and the coordinates for minus-strand genes. */
    gdfOffsetGene(g, -start);
    if (g->strand == '-') {
        reverseComplement(dna, dnaSize);
        gdfRcGene(g, dnaSize);
    }

    /* Data points are consumed in pairs: pt[i] opens a span and pt[i+1]
     * closes it.  Requiring i+1 to be in range avoids reading past the
     * array if the count is ever odd. */
    dy = newDyString(1000);
    pt = g->dataPoints;
    for (i = 0; i + 1 < g->dataCount; i += 2) {
        dyStringAppendN(dy, (dna + pt[i].start), (pt[i+1].start - pt[i].start));
    }

    *retDna = cloneString(dy->string);
    dyStringFree(&dy);
    freez(&dna);    /* the exon text was copied out; the full span is no longer needed */
    gdfFreeGene(g);
    return TRUE;
}
boolean getWormGeneDna(char *name, DNA **retDna, boolean upcExons)
/* Fetch the genomic DNA spanning a gene.
 *
 * name     - gene name; biologist-style names are mapped to an ORF name
 *            first, and names starting with "g-" are looked up in the
 *            Genie cache rather than the Sanger cache.
 * retDna   - receives the DNA buffer on success.
 * upcExons - when set, each span delimited by a pair of the gene's data
 *            points is converted to upper case in place.
 *
 * Returns FALSE (without touching *retDna) if the gene is not found,
 * TRUE otherwise. */
{
    struct slName *orfNames = NULL;
    struct wormGdfCache *cache;
    struct gdfGene *gene;
    long extentStart, extentEnd;
    int geneStart, geneEnd;
    int geneSize;
    DNA *geneDna;

    /* Map a biologist-style name onto its first ORF name, if any. */
    if (wormIsGeneName(name)) {
        orfNames = wormGeneToOrfNames(name);
        if (orfNames != NULL)
            name = orfNames->name;
    }

    cache = (strncmp(name, "g-", 2) == 0) ? &wormGenieGdfCache
                                          : &wormSangerGdfCache;

    gene = wormGetSomeGdfGene(name, cache);
    if (gene == NULL)
        return FALSE;

    /* Clipping the range to the chromosome is intentionally not done here. */
    gdfGeneExtents(gene, &extentStart, &extentEnd);
    geneStart = extentStart;
    geneEnd = extentEnd;
    geneSize = geneEnd - geneStart;

    geneDna = wormChromPart(chromIds[gene->chromIx], geneStart, geneSize);
    *retDna = geneDna;

    /* Re-base the gene onto the fetched buffer; minus-strand genes get
     * both the sequence and the coordinates reversed. */
    gdfOffsetGene(gene, -geneStart);
    if (gene->strand == '-') {
        reverseComplement(geneDna, geneSize);
        gdfRcGene(gene, geneSize);
    }

    if (upcExons) {
        struct gdfDataPoint *points = gene->dataPoints;
        int ix = 0;

        while (ix < gene->dataCount) {
            int spanStart = points[ix].start;
            int spanSize = points[ix+1].start - spanStart;

            toUpperN(geneDna + spanStart, spanSize);
            ix += 2;
        }
    }

    gdfFreeGene(gene);
    return TRUE;
}
DNA *wormChromPartExonsUpper(char *chromId, int start, int size)
/* Fetch size bases of chromosome chromId beginning at start, then let
 * gdfUpcExons mark the exons of every gene feature found in that range
 * (nameless clusters are skipped).  Returns the DNA buffer. */
{
    DNA *rangeDna = wormChromPart(chromId, start, size);
    struct wormFeature *featList = wormGenesInRange(chromId, start, start+size);
    struct wormFeature *cur = featList;

    while (cur != NULL) {
        char *featName = cur->name;

        if (!wormIsNamelessCluster(featName)) {
            struct gdfGene *gdf = wormGetGdfGene(featName);

            gdfUpcExons(gdf, cur->start, rangeDna, size, start);
            gdfFreeGene(gdf);
        }
        cur = cur->next;
    }

    slFreeList(&featList);
    return rangeDna;
}
void doMiddle() { char *seqName; boolean intronsLowerCase = TRUE; boolean intronsParenthesized = FALSE; boolean hiliteNear = FALSE; int startRange = 0; int endRange = 0; boolean gotRange = FALSE; struct dnaSeq *cdnaSeq; boolean isChromRange = FALSE; DNA *dna; char *translation = NULL; seqName = cgiString("geneName"); seqName = trimSpaces(seqName); if (cgiVarExists("intronsLowerCase")) intronsLowerCase = cgiBoolean("intronsLowerCase"); if (cgiVarExists("intronsParenthesized")) intronsParenthesized = cgiBoolean("intronsParenthesized"); if (cgiVarExists("startRange") && cgiVarExists("endRange" )) { startRange = cgiInt("startRange"); endRange = cgiInt("endRange"); gotRange = TRUE; } if (cgiVarExists("hiliteNear")) { hiliteNear = TRUE; } fprintf(stdout, "<P><TT>\n"); /* The logic here is a little complex to optimize speed. * If we can decide what type of thing the name refers to by * simply looking at the name we do. Otherwise we have to * search the database in various ways until we get a hit. 
*/ if (wormIsNamelessCluster(seqName)) { isChromRange = TRUE; } else if (wormIsChromRange(seqName)) { isChromRange = TRUE; } else if (getWormGeneDna(seqName, &dna, TRUE)) { if (cgiBoolean("litLink")) { char nameBuf[64]; char *geneName = NULL; char *productName = NULL; char *coding; int transSize; struct wormCdnaInfo info; printf("<H3>Information and Links for %s</H3>\n", seqName); if (wormInfoForGene(seqName, &info)) { if (info.description) printf("<P>%s</P>\n", info.description); geneName = info.gene; productName = info.product; } else { if (wormIsGeneName(seqName)) geneName = seqName; else if (wormGeneForOrf(seqName, nameBuf, sizeof(nameBuf))) geneName = nameBuf; } coding = cloneUpperOnly(dna); transSize = 1 + (strlen(coding)+2)/3; translation = needMem(1+strlen(coding)/3); dnaTranslateSome(coding, translation, transSize); freez(&coding); if (geneName) { printf("<A HREF=\"http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?form=4&db=m" "&term=C+elegans+%s&dispmax=50&relentrezdate=No+Limit\">", geneName); printf("PubMed search on gene: </A>%s<BR>\n", geneName); } if (productName) { char *encoded = cgiEncode(productName); printf("<A HREF=\"http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?form=4&db=m" "&term=%s&dispmax=50&relentrezdate=No+Limit\">", encoded); printf("PubMed search on product:</A> %s<BR>\n", productName); freeMem(encoded); } /* Process name to get rid of isoform letter for Proteome. */ if (geneName) strcpy(nameBuf, geneName); else { strcpy(nameBuf, seqName); #ifdef NEVER /* Sometimes Proteome requires the letter after the orf name * in alt-spliced cases, sometimes it can't handle it.... 
*/ nameLen = strlen(nameBuf); if (wormIsOrfName(nameBuf) && isalpha(nameBuf[nameLen-1])) { char *dotPos = strrchr(nameBuf, '.'); if (dotPos != NULL && isdigit(dotPos[1])) nameBuf[nameLen-1] = 0; } #endif /* NEVER */ } printf("<A HREF=\"http://www.wormbase.org/db/seq/sequence?name=%s;class=Sequence\">", seqName); printf("WormBase link on:</A> %s<BR>\n", seqName); printf("<A HREF=\"http://www.proteome.com/databases/WormPD/reports/%s.html\">", nameBuf); printf("Proteome link on:</A> %s<BR>\n<BR>\n", nameBuf); printf("<A HREF=#DNA>Genomic DNA Sequence</A><BR>\n"); if (hiliteNear) printf("<A HREF=\"#CLICKED\">Shortcut to where you clicked in gene</A><BR>"); printf("<A HREF=#protein>Translated Protein Sequence</A><BR>\n"); htmlHorizontalLine(); printf("<A NAME=DNA></A>"); printf("<H3>%s Genomic DNA sequence</H3>", seqName); } if (!intronsLowerCase) tolowers(dna); if (hiliteNear) { if (!gotRange) { double nearPos = cgiDouble("hiliteNear"); int rad = 5; int dnaSize = strlen(dna); long mid = (int)(dnaSize * nearPos); startRange = mid - rad; if (startRange < 0) startRange = 0; endRange = mid + rad; if (endRange >= dnaSize) endRange = dnaSize - 1; } } outputSeq(dna, strlen(dna), hiliteNear, startRange, endRange, stdout); freez(&dna); } else if (wormCdnaSeq(seqName, &cdnaSeq, NULL)) { outputSeq(cdnaSeq->dna, cdnaSeq->size, FALSE, 0, 0, stdout); } else { isChromRange = TRUE; } if (isChromRange) { char *chromId; int start, end; char strand = '+'; int size; if (!wormGeneRange(seqName, &chromId, &strand, &start, &end)) errAbort("Can't find %s",seqName); size = end - start; if (intronsLowerCase) dna = wormChromPartExonsUpper(chromId, start, size); else { dna = wormChromPart(chromId, start, size); touppers(dna); } if (cgiVarExists("strand")) strand = cgiString("strand")[0]; if (strand == '-') reverseComplement(dna, size); outputSeq(dna, size, FALSE, 0, 0, stdout); } if (translation != NULL) { htmlHorizontalLine(); printf("<A NAME=protein></A>"); printf("<H3>Translated Protein of 
%s</H3>\n", seqName); outputSeq(translation, strlen(translation), FALSE, 0, 0, stdout); freez(&translation); } fprintf(stdout, "</TT></P>\n"); }
void showClump(struct ernaClump *clump, FILE *f) /* Show detailed alignment for one clump. */ { int chromStart = clump->start - 1000; int chromEnd = clump->end + 1000; int chromSize; DNA *chromDna; struct wormFeature *cdnaNameList, *cdnaName; struct lineAli *laList = NULL, *la; struct ffAli *ali; struct dnaSeq *cdna; boolean rcCdna; int clumpSize = clump->end - clump->start + 1; int displaySize = lineSize; int displayStart = (clump->start+clump->end)/2 - displaySize/2; int displayEnd = displayStart + displaySize; int displayDnaOffset; DNA *displayDna; struct ernaHit *hit; /* Get genomic dna and list of all cDNAs in area around clump. */ wormClipRangeToChrom(clump->chrom, &chromStart, &chromEnd); chromSize = chromEnd - chromStart; chromDna = wormChromPart(clump->chrom, chromStart, chromSize); cdnaNameList = wormCdnasInRange(clump->chrom, chromStart, chromEnd); /* Figure out 60 bases to display alignment around clump. */ wormClipRangeToChrom(clump->chrom, &displayStart, &displayEnd); displaySize = displayEnd - displayStart; displayDnaOffset = displayStart - chromStart; displayDna = chromDna + displayDnaOffset; /* Make up detailed alignment on each cDNA */ for (cdnaName = cdnaNameList; cdnaName != NULL; cdnaName = cdnaName->next) { struct wormCdnaInfo info; if (!wormCdnaSeq(cdnaName->name, &cdna, &info)) { warn("Couldn't find %s", cdnaName->name); continue; } if (!ffFindEitherStrandN(cdna->dna, cdna->size, chromDna, chromSize, ffCdna, &ali, &rcCdna)) { warn("Couldn't align %s", cdnaName->name); continue; } if (rcCdna) reverseComplement(cdna->dna, cdna->size); la = makeLineAli(cdnaName->name, ali, chromDna, cdna->dna, displayDnaOffset); la->isEmbryo = info.isEmbryonic; slAddHead(&laList, la); freeDnaSeq(&cdna); ffFreeAli(&ali); } /* Display genomic with upper case at hot spots*/ displayDna[displaySize] = 0; for (hit = clump->hits; hit != NULL; hit = hit->next) { int doff = hit->pos - chromStart; chromDna[doff] = toupper(chromDna[doff]); } fprintf(f, "%s Genomic\n", 
displayDna); /* Display aligned list by sorted score. */ slSort(&laList, cmpLaScore); for (la = laList; la != NULL; la = la->next) { if (spaceCount(la->line) != lineSize) fprintf(f, "%s %s %s\n", la->line, la->name, (la->isEmbryo ? "emb" : " ")); } /* Clean up. */ slFreeList(&cdnaNameList); slFreeList(&laList); freeMem(chromDna); }
int main(int argc, char *argv[]) { #define stepSize 10000 #define extraBases 1000 static struct noiseTrack noiseTrack[stepSize]; int chromIx; int chromSize; int baseOff; char *chromName; int dnaStart, dnaEnd; char *outName; FILE *out; struct hash *dupeHash; if (argc != 2) { errAbort("editbase - lists bases for which there is evidence of RNA editing\n" "usage:\n" " editbase outfile.txt"); } dnaUtilOpen(); initVlookup(); outName = argv[1]; out = mustOpen(outName, "w"); printf("Scanning for cDNAs that align more than once.\n"); dupeHash = buildMultiAlignHash(); printf("Loading worm genome\n"); wormLoadNt4Genome(&chrom, &chromCount); wormChromNames(&chromNames, &chromCount); for (chromIx = 0; chromIx < chromCount; ++chromIx) { chromName = chromNames[chromIx]; printf("Processing chromosome %s\n", chromName); chromSize = wormChromSize(chromName); for (baseOff = 0; baseOff < chromSize; baseOff += stepSize) { struct wormFeature *cdnaNamesList, *name; struct cdnaAli *caList = NULL, *ca; int dnaSize; DNA *dna; int chunkSize; DNA *chunk; int i; /* Figure out how much DNA to get and get it. Include some * extra around chunk so can align better. */ chunkSize = chromSize - baseOff; if (chunkSize > stepSize) chunkSize = stepSize; dnaStart = baseOff - extraBases; dnaEnd = baseOff + stepSize + extraBases; wormClipRangeToChrom(chromName, &dnaStart, &dnaEnd); dnaSize = dnaEnd - dnaStart; dna = wormChromPart(chromName, dnaStart, dnaSize); /* Get the cDNAs */ cdnaNamesList = wormCdnasInRange(chromName, baseOff, baseOff + chunkSize); for (name = cdnaNamesList; name != NULL; name = name->next) { if (!hashLookup(dupeHash, name->name) ) { ca = makeCdnaAli(name->name, dna, dnaSize); slAddHead(&caList, ca); } } slReverse(&caList); /* Add cdnas to noise track. */ chunk = dna + baseOff - dnaStart; for (ca = caList; ca != NULL; ca = ca->next) { addNoiseTrack(noiseTrack, chunk, chunkSize, ca); } /* Step through base by base evaluating noise and reporting it if * it's interesting. 
*/ for (i=0; i<chunkSize; ++i) { struct noiseTrack *nt = &noiseTrack[i]; struct noise *noise = nt->noise; int noiseCount = slCount(noise); if (noiseCount > 1) { char commonVal; int commonCount; findCommon(noise, &commonVal, &commonCount); if (commonCount*2 > noiseCount && commonVal != 'n') { double ratio = (double)commonCount/noiseCount; double score; ratio = ratio * ratio * ratio; score = ratio * commonCount; if (score >= 4.0) { fprintf(stdout, "%f %s:%d %c->%c in %d out of %d out of %d %s\n", ratio*commonCount, chromName, i+baseOff+1, chunk[i], commonVal, commonCount, noiseCount, nt->cdnaCount, nt->noise->ca->cdna->srn->name); fprintf(out, "%f %s:%d %c->%c in %d out of %d out of %d %s\n", ratio*ratio*commonCount, chromName, i+baseOff+1, chunk[i], commonVal, commonCount, noiseCount, nt->cdnaCount, nt->noise->ca->cdna->srn->name); } } } } freeCdnaAliList(&caList); slFreeList(&cdnaNamesList); freez(&dna); recycleNoiseTrack(noiseTrack, chunkSize); printf("%s %d maxNoise %d\n", chromName, baseOff, slCount(freeNoiseList)); } } return 0; }