void writeSeqTable(char *faName, FILE *out, boolean unburyAccession, boolean isPep)
/* Copy every record of the fa file faName to out as one
 * "name<TAB>sequence" line.  !isPep is forwarded as the reader's last
 * flag (presumably "treat as DNA" -- confirm against faSomeSpeedReadNext).
 * When unburyAccession is set the name first goes through unburyAcc();
 * in every case it then goes through accWithoutSuffix() before being
 * written.  A progress dot is emitted every clDots records when clDots
 * is positive. */
{
struct lineFile *faFile = lineFileOpen(faName, TRUE);
bioSeq rec;
int sinceDot = 0;

printf("Reading %s\n", faName);
for (;;)
    {
    char *acc;

    if (!faSomeSpeedReadNext(faFile, &rec.dna, &rec.size, &rec.name, !isPep))
        break;

    /* The counter only advances while dot reporting is switched on. */
    if (clDots > 0)
        {
        if (++sinceDot == clDots)
            {
            sinceDot = 0;
            dotOut();
            }
        }

    acc = rec.name;
    if (unburyAccession)
        acc = unburyAcc(faFile, acc);
    acc = accWithoutSuffix(acc);
    fprintf(out, "%s\t%s\n", acc, rec.dna);
    }

/* Finish the line of progress dots. */
if (clDots)
    printf("\n");
lineFileClose(&faFile);
}
/////////////////////////////////////////////////////////////////////////////// // Get information about a dot from a specific place. /////////////////////////////////////////////////////////////////////////////// string DotPlotBackend::getDotDataByLocation( int x, int y, double scale ) { // Determine where the dot would be if scaled to 100%. double xScaled = (double)x / scale; double yScaled = (double)y / scale; // Go through each dot to see if it could be where the user clicked. // If a dot is identified, return its data. for( int i = 1; i <= length; i++ ) { for( int j = 1; j <= length; j++ ) { // Determine the X and Y coordinates of the dot. double dotX = BORDER + (j*4)-1; double dotY = BORDER + TEXTSIZE + LABEL_LINE_LENGTH + (i*4)-1; // Shave off the border, then add a 10 pixel border. dotX = ( dotX - BORDER ) + 10; dotY = ( dotY - BORDER ) + 10; // Check to make sure the point is inside the dot; if it is, return // the dot's data. if( ( dotX <= xScaled ) && ( xScaled <= ( dotX + 3 ) ) ) { if( ( dotY <= yScaled ) && ( yScaled <= ( dotY + 3 ) ) ) { // Build the dot pair to get the dot value. stringstream pair( stringstream::in | stringstream::out ); pair << i << "-" << j; // If the value is outside the default range, record this fact. double value = values[pair.str()]; double epsilon = numeric_limits<double>::epsilon(); bool min1 = ( defaultMin <= value ); bool min2 = ( fabs( defaultMin - value ) < epsilon ); bool max1 = ( defaultMax >= value ); bool max2 = ( fabs( defaultMax - value ) < epsilon ); bool outOfRange = ( (( min1 || min2 ) && ( max1 || max2 )) == false ); // Build the data string. stringstream dotOut( stringstream::in | stringstream::out ); dotOut << i << "(" << sequence[i-1] << ") -- " << j << "(" << sequence[j-1] << "): "; if( outOfRange == true ) { dotOut << "infinity"; } else { dotOut << values[pair.str()]; } // Return the data string. return dotOut.str(); } } } } // Return an empty string if no dot was identified. return ""; }
void *computeDistance(void *thread_ID) { struct microDataDistance *geneDistArray = NULL; struct microDataDistance *geneDistPtr; struct microData *curGene; int baseGenesPerThread, genesPerThread, rmdrPerThread, rmdr, xtra; int subListSize; int geneIx; int i; /* offset = thread ID */ int offset = *((int *)thread_ID); /* create subList size for each thread to process */ baseGenesPerThread = geneCount / numThreads; rmdr = geneCount % numThreads; rmdrPerThread = rmdr / numThreads; xtra = rmdr % numThreads; genesPerThread = baseGenesPerThread + rmdrPerThread; subListSize = (offset == numThreads-1) ? genesPerThread + xtra : genesPerThread; /* each thread positions initial current gene */ curGene = geneList; for (i = 0; i < offset*genesPerThread; i++) curGene = curGene->next; AllocArray(geneDistArray, geneCount); /* compute the pairwise experiment distances */ for (i = 0; i < subListSize; i++, curGene = curGene->next) { calcDistances(geneDistArray, curGene, geneList, weights); qsort(geneDistArray, geneCount, sizeof(geneDistArray[0]), cmpMicroDataDistance); /* Print out closest 1000 in tab file. */ pthread_mutex_lock( &mutexfilehandle ); geneDistPtr = geneDistArray; for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx, geneDistPtr++) fprintf(f, "%s\t%s\t%f\n", geneDistPtr->name1, geneDistPtr->name2, geneDistPtr->distance); dotOut(); pthread_mutex_unlock( &mutexfilehandle ); } freez( &geneDistArray ); pthread_exit(NULL); }
void searchOne(bioSeq *seq, struct genoFind *gf, FILE *f, boolean isProt, struct hash *maskHash, Bits *qMaskBits)
/* Search the index for seq and emit the query's output to f.  Protein
 * queries go through searchOneProt().  Other queries are searched as
 * given and then reverse-complemented and searched again; the second
 * reverse-complement puts seq->dna back the way it came in. */
{
dotOut();
if (!isProt)
    {
    int pass;

    gvo->maskHash = maskHash;
    for (pass = 0; pass < 2; ++pass)
        {
        /* First pass is the forward strand, second the reverse. */
        searchOneStrand(seq, gf, f, (pass == 0) ? FALSE : TRUE, maskHash, qMaskBits);
        reverseComplement(seq->dna, seq->size);
        }
    }
else
    {
    searchOneProt(seq, gf, f);
    }
gfOutputQuery(gvo, f);
}
void *computeDistance(void *thread_ID) { struct microDataDistance *geneDistArray = NULL; struct microData *curGene; int baseGenesPerThread, genesPerThread, rmdrPerThread, rmdr, xtra; int subListSize; int i; /* offset = thread ID */ int offset = *((int *)thread_ID); /* create subList size for each thread to process */ baseGenesPerThread = geneCount / numThreads; rmdr = geneCount % numThreads; rmdrPerThread = rmdr / numThreads; xtra = rmdr % numThreads; genesPerThread = baseGenesPerThread + rmdrPerThread; subListSize = (offset == numThreads-1) ? genesPerThread + xtra : genesPerThread; /* each thread positions initial current gene */ curGene = geneList; for (i = 0; i < offset*genesPerThread; i++) curGene = curGene->next; /* compute the pairwise experiment distances */ for (i = 0; i < subListSize; i++, curGene = curGene->next) { AllocArray(geneDistArray, geneCount); calcDistances(geneDistArray, curGene, geneList, weights); qsort(geneDistArray, geneCount, sizeof(geneDistArray[0]), cmpMicroDataDistance); synQueuePut( synQ, geneDistArray ); pthread_mutex_lock( &mutexDotOut ); dotOut(); pthread_mutex_unlock( &mutexDotOut ); } pthread_exit(NULL); }
void blatFilter(char *outName, int inCount, char *inNames[])
/* blatFilter - filter blat alignments somewhat.
 * Reads every psl file named in inNames[0..inCount-1] and writes to
 * outName each alignment that either has fewer than 260 bases counted in
 * match + repMatch + nCount, or passes detailTest().  A progress dot is
 * emitted per alignment read. */
{
int i;
FILE *f = mustOpen(outName, "w");

for (i=0; i<inCount; ++i)
    {
    struct lineFile *lf = pslFileOpen(inNames[i]);
    struct psl *psl;

    while ((psl = pslNext(lf)) != NULL)
        {
        dotOut();
        if (psl->match + psl->repMatch + psl->nCount < 260 || detailTest(psl))
            pslTabOut(psl, f);
        pslFree(&psl);
        }
    /* Release this input before opening the next one, so a long list
     * of inputs does not accumulate open files. */
    lineFileClose(&lf);
    }
printf("\n");
/* Close explicitly so a failed write is reported instead of ignored. */
carefulClose(&f);
}
void blatFlekFilter(char *outName, int inCount, char *inNames[])
/* blatFlekFilter - filter blat alignments somewhat.
 * Reads every psl file named in inNames[0..inCount-1].  An alignment
 * whose target span is below the size threshold is written to outName
 * unchanged; any other alignment is handed to writePslFrags().  A
 * progress dot is emitted per alignment read. */
{
int i;
FILE *f = mustOpen(outName, "w");

for (i=0; i<inCount; ++i)
    {
    struct lineFile *lf = pslFileOpen(inNames[i]);
    struct psl *psl;

    while ((psl = pslNext(lf)) != NULL)
        {
        dotOut();
        /* NOTE(review): the threshold uses qEnd + qStart; a query span
         * would be qEnd - qStart.  Left as is -- confirm the intent. */
        if (psl->tEnd - psl->tStart < (psl->qEnd + psl->qStart) * 3)
            pslTabOut(psl, f);
        else
            writePslFrags(psl, f);
        pslFree(&psl);
        }
    /* Release this input before opening the next one, so a long list
     * of inputs does not accumulate open files. */
    lineFileClose(&lf);
    }
printf("\n");
/* Close explicitly so a failed write is reported instead of ignored. */
carefulClose(&f);
}
void chainPreNet(char *inFile, char *targetSizes, char *querySizes, char *outFile) /* chainPreNet - Remove chains that don't have a chance of being netted. */ { struct hash *tHash = setupChroms(targetSizes); struct hash *qHash = setupChroms(querySizes); struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); struct chain *chain; double score, lastScore = 9e99; struct chrom *qChrom, *tChrom; lineFileSetMetaDataOutput(lf, f); while ((chain = chainRead(lf)) != NULL) { /* Report progress. */ dotOut(); /* Check to make sure it really is sorted by score. */ score = chain->score; if (score > lastScore) { errAbort("%s not sorted by score line %d", lf->fileName, lf->lineIx); } lastScore = score; /* Output chain if necessary and then free it. */ qChrom = hashMustFindVal(qHash, chain->qName); tChrom = hashMustFindVal(tHash, chain->tName); if (chainUsed(chain, qChrom, tChrom) && inclQuery(chain)) { chainWrite(chain, f); } chainFree(&chain); } }
void hgExpDistance(char *database, char *posTable, char *expTable, char *outTable) /* hgExpDistance - Create table that measures expression distance between pairs. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr; char query[256]; char **row; struct hash *expHash = hashNew(16); int realExpCount = -1; struct microData *geneList = NULL, *curGene, *gene; int geneIx, geneCount = 0; struct microData **geneArray = NULL; float *weights = NULL; char *tempDir = "."; FILE *f = hgCreateTabFile(tempDir, outTable); /* Get list/hash of all items with expression values. */ sqlSafef(query, sizeof(query), "select name,expCount,expScores from %s", posTable); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; if (!hashLookup(expHash, name)) { int expCount = sqlUnsigned(row[1]); int commaCount; float *expScores = NULL; sqlFloatDynamicArray(row[2], &expScores, &commaCount); if (expCount != commaCount) errAbort("expCount and expScores don't match on %s in %s", name, posTable); if (realExpCount == -1) realExpCount = expCount; if (expCount != realExpCount) errAbort("In %s some rows have %d experiments others %d", name, expCount, realExpCount); AllocVar(gene); gene->expCount = expCount; gene->expScores = expScores; hashAddSaveName(expHash, name, gene, &gene->name); slAddHead(&geneList, gene); } } sqlFreeResult(&sr); conn = sqlConnect(database); slReverse(&geneList); geneCount = slCount(geneList); printf("Have %d elements in %s\n", geneCount, posTable); weights = getWeights(realExpCount); if (optionExists("lookup")) geneList = lookupGenes(conn, optionVal("lookup", NULL), geneList); geneCount = slCount(geneList); printf("Got %d unique elements in %s\n", geneCount, posTable); sqlDisconnect(&conn); /* Disconnect because next step is slow. */ if (geneCount < 1) errAbort("ERROR: unique gene count less than one ?"); /* Get an array for sorting. 
*/ AllocArray(geneArray, geneCount); for (gene = geneList,geneIx=0; gene != NULL; gene = gene->next, ++geneIx) geneArray[geneIx] = gene; /* Print out closest 1000 in tab file. */ for (curGene = geneList; curGene != NULL; curGene = curGene->next) { calcDistances(curGene, geneList, weights); qsort(geneArray, geneCount, sizeof(geneArray[0]), cmpMicroDataDistance); for (geneIx=0; geneIx < 1000 && geneIx < geneCount; ++geneIx) { gene = geneArray[geneIx]; fprintf(f, "%s\t%s\t%f\n", curGene->name, gene->name, gene->distance); } dotOut(); } printf("Made %s.tab\n", outTable); /* Create and load table. */ conn = sqlConnect(database); distanceTableCreate(conn, outTable); hgLoadTabFile(conn, tempDir, outTable, &f); printf("Loaded %s\n", outTable); /* Add indices. */ sqlSafef(query, sizeof(query), "alter table %s add index(query)", outTable); sqlUpdate(conn, query); printf("Made query index\n"); if (optionExists("targetIndex")) { sqlSafef(query, sizeof(query), "alter table %s add index(target)", outTable); sqlUpdate(conn, query); printf("Made target index\n"); } hgRemoveTabFile(tempDir, outTable); }
void processRefSeq(char *database, char *faFile, char *raFile, char *pslFile, char *loc2refFile, char *pepFile, char *mim2locFile) /* hgRefSeqMrna - Load refSeq mRNA alignments and other info into * refSeqGene table. */ { struct lineFile *lf; struct hash *raHash, *rsiHash = newHash(0); struct hash *loc2mimHash = newHash(0); struct refSeqInfo *rsiList = NULL, *rsi; char *s, *line, *row[5]; int wordCount, dotMod = 0; int noLocCount = 0; int rsiCount = 0; int noProtCount = 0; struct psl *psl; struct sqlConnection *conn = hgStartUpdate(database); struct hash *productHash = loadNameTable(conn, "productName", 16); struct hash *geneHash = loadNameTable(conn, "geneName", 16); char *kgName = "refGene"; FILE *kgTab = hgCreateTabFile(".", kgName); FILE *productTab = hgCreateTabFile(".", "productName"); FILE *geneTab = hgCreateTabFile(".", "geneName"); FILE *refLinkTab = hgCreateTabFile(".", "refLink"); FILE *refPepTab = hgCreateTabFile(".", "refPep"); FILE *refMrnaTab = hgCreateTabFile(".", "refMrna"); struct exon *exonList = NULL, *exon; char *answer; char cond_str[200]; /* Make refLink and other tables table if they don't exist already. */ sqlMaybeMakeTable(conn, "refLink", refLinkTableDef); sqlUpdate(conn, "NOSQLINJ delete from refLink"); sqlMaybeMakeTable(conn, "refGene", refGeneTableDef); sqlUpdate(conn, "NOSQLINJ delete from refGene"); sqlMaybeMakeTable(conn, "refPep", refPepTableDef); sqlUpdate(conn, "NOSQLINJ delete from refPep"); sqlMaybeMakeTable(conn, "refMrna", refMrnaTableDef); sqlUpdate(conn, "NOSQLINJ delete from refMrna"); /* Scan through locus link to omim ID file and put in hash. */ { char *row[2]; printf("Scanning %s\n", mim2locFile); lf = lineFileOpen(mim2locFile, TRUE); while (lineFileRow(lf, row)) { hashAdd(loc2mimHash, row[1], intToPt(atoi(row[0]))); } lineFileClose(&lf); } /* Scan through .ra file and make up start of refSeqInfo * objects in hash and list. 
*/ printf("Scanning %s\n", raFile); lf = lineFileOpen(raFile, TRUE); while ((raHash = hashNextRa(lf)) != NULL) { if (clDots > 0 && ++dotMod == clDots ) { dotMod = 0; dotOut(); } AllocVar(rsi); slAddHead(&rsiList, rsi); if ((s = hashFindVal(raHash, "acc")) == NULL) errAbort("No acc near line %d of %s", lf->lineIx, lf->fileName); rsi->mrnaAcc = cloneString(s); if ((s = hashFindVal(raHash, "siz")) == NULL) errAbort("No siz near line %d of %s", lf->lineIx, lf->fileName); rsi->size = atoi(s); if ((s = hashFindVal(raHash, "gen")) != NULL) rsi->geneName = cloneString(s); //!!!else //!!! warn("No gene name for %s", rsi->mrnaAcc); if ((s = hashFindVal(raHash, "cds")) != NULL) parseCds(s, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd); else rsi->cdsEnd = rsi->size; if ((s = hashFindVal(raHash, "ngi")) != NULL) rsi->ngi = atoi(s); rsi->geneNameId = putInNameTable(geneHash, geneTab, rsi->geneName); s = hashFindVal(raHash, "pro"); if (s != NULL) rsi->productName = cloneString(s); rsi->productNameId = putInNameTable(productHash, productTab, s); hashAdd(rsiHash, rsi->mrnaAcc, rsi); freeHashAndVals(&raHash); } lineFileClose(&lf); if (clDots) printf("\n"); /* Scan through loc2ref filling in some gaps in rsi. */ printf("Scanning %s\n", loc2refFile); lf = lineFileOpen(loc2refFile, TRUE); while (lineFileNext(lf, &line, NULL)) { char *mrnaAcc; if (line[0] == '#') continue; wordCount = chopTabs(line, row); if (wordCount < 5) errAbort("Expecting at least 5 tab-separated words line %d of %s", lf->lineIx, lf->fileName); mrnaAcc = row[1]; mrnaAcc = accWithoutSuffix(mrnaAcc); if (mrnaAcc[2] != '_') warn("%s is and odd name %d of %s", mrnaAcc, lf->lineIx, lf->fileName); if ((rsi = hashFindVal(rsiHash, mrnaAcc)) != NULL) { rsi->locusLinkId = lineFileNeedNum(lf, row, 0); rsi->omimId = ptToInt(hashFindVal(loc2mimHash, row[0])); rsi->proteinAcc = cloneString(accWithoutSuffix(row[4])); } } lineFileClose(&lf); /* Report how many seem to be missing from loc2ref file. * Write out knownInfo file. 
*/ printf("Writing %s\n", "refLink.tab"); for (rsi = rsiList; rsi != NULL; rsi = rsi->next) { ++rsiCount; if (rsi->locusLinkId == 0) ++noLocCount; if (rsi->proteinAcc == NULL) ++noProtCount; fprintf(refLinkTab, "%s\t%s\t%s\t%s\t%u\t%u\t%u\t%u\n", emptyForNull(rsi->geneName), emptyForNull(rsi->productName), emptyForNull(rsi->mrnaAcc), emptyForNull(rsi->proteinAcc), rsi->geneNameId, rsi->productNameId, rsi->locusLinkId, rsi->omimId); } if (noLocCount) printf("Missing locusLinkIds for %d of %d\n", noLocCount, rsiCount); if (noProtCount) printf("Missing protein accessions for %d of %d\n", noProtCount, rsiCount); /* Process alignments and write them out as genes. */ lf = pslFileOpen(pslFile); dotMod = 0; while ((psl = pslNext(lf)) != NULL) { if (hashFindVal(rsiHash, psl->qName) != NULL) { if (clDots > 0 && ++dotMod == clDots ) { dotMod = 0; dotOut(); } sqlSafefFrag(cond_str, sizeof cond_str, "extAC='%s'", psl->qName); answer = sqlGetField(proteinDB, "spXref2", "displayID", cond_str); if (answer == NULL) { fprintf(stderr, "%s NOT FOUND.\n", psl->qName); fflush(stderr); } if (answer != NULL) { struct genePred *gp = NULL; exonList = pslToExonList(psl); fprintf(kgTab, "%s\t%s\t%c\t%d\t%d\t", psl->qName, psl->tName, psl->strand[0], psl->tStart, psl->tEnd); rsi = hashMustFindVal(rsiHash, psl->qName); gp = genePredFromPsl(psl, rsi->cdsStart, rsi->cdsEnd, genePredStdInsertMergeSize); if (!gp) errAbort("Cannot convert psl (%s) to genePred.\n", psl->qName); fprintf(kgTab, "%d\t%d\t", gp->cdsStart, gp->cdsEnd); fprintf(kgTab, "%d\t", slCount(exonList)); fflush(kgTab); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(kgTab, "%d,", exon->start); fprintf(kgTab, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(kgTab, "%d,", exon->end); fprintf(kgTab, "\n"); slFreeList(&exonList); } } else { fprintf(stderr, "%s found in psl, but not in .fa or .ra data files.\n", psl->qName); fflush(stderr); } } if (clDots) printf("\n"); if (!clTest) { 
writeSeqTable(pepFile, refPepTab, FALSE, TRUE); writeSeqTable(faFile, refMrnaTab, FALSE, FALSE); } carefulClose(&kgTab); carefulClose(&productTab); carefulClose(&geneTab); carefulClose(&refLinkTab); carefulClose(&refPepTab); carefulClose(&refMrnaTab); if (!clTest) { printf("Loading database with %s\n", kgName); fflush(stdout); hgLoadTabFile(conn, ".", kgName, NULL); printf("Loading database with %s\n", "productName"); fflush(stdout); hgLoadTabFile(conn, ".", "productName", NULL); printf("Loading database with %s\n", "geneName"); fflush(stdout); hgLoadTabFile(conn, ".", "geneName", NULL); printf("Loading database with %s\n", "refLink"); fflush(stdout); hgLoadTabFile(conn, ".", "refLink", NULL); printf("Loading database with %s\n", "refPep"); fflush(stdout); hgLoadTabFile(conn, ".", "refPep", NULL); printf("Loading database with %s\n", "refMrna"); fflush(stdout); hgLoadTabFile(conn, ".", "refMrna", NULL); } }
void bigBlat(struct dnaSeq *untransList, int queryCount, char *queryFiles[], char *outFile, boolean transQuery, boolean qIsDna, FILE *out, boolean showStatus)
/* Run query against translated DNA database (3 frames on each strand).
 * The database sequences are translated and indexed once per strand;
 * every query file is read in full against each strand, and the
 * database sequences are reverse-complemented in place after each pass
 * (so they end up back in their starting orientation).  Results go to
 * out, which is closed before returning.  outFile is not used here. */
{
enum caseMode {caseLower, caseUpper, caseMaskUpper};
enum caseMode mode;
boolean flipCase = FALSE;
struct dnaSeq trimmedSeq;
struct dnaSeq *dbSeq;
struct genoFind *frameIndex[3];
aaSeq *frameSeqs[3];
struct trans3 *t3List = NULL;
struct hash *t3Hash = NULL;
int isRc, frame, fileIx;

ZeroVar(&trimmedSeq);
if (showStatus)
    printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount);

/* Decide how query case is handled.  Untranslated queries are forced
 * to upper case.  Translated queries are forced to lower case unless a
 * query mask is set, in which case upper case bases get masked (after
 * swapping case first when the mask is anything but "upper"). */
if (!transQuery)
    mode = caseUpper;
else if (qMask == NULL)
    mode = caseLower;
else
    {
    mode = caseMaskUpper;
    flipCase = !sameString(qMask, "upper");
    }

if (gvo->fileHead != NULL)
    gvo->fileHead(gvo, out);

for (isRc = 0; isRc < 2; ++isRc)
    {
    /* Start from NULL pointers so nothing stale is ever freed or used. */
    for (frame = 0; frame < 3; ++frame)
        {
        frameIndex[frame] = NULL;
        frameSeqs[frame] = NULL;
        }

    /* Translate the database for this strand and index each frame. */
    t3List = seqListToTrans3List(untransList, frameSeqs, &t3Hash);
    for (frame = 0; frame < 3; ++frame)
        frameIndex[frame] = gfIndexSeq(frameSeqs[frame], minMatch, maxGap, tileSize,
                                       repMatch, ooc, TRUE, oneOff, FALSE, stepSize);

    for (fileIx = 0; fileIx < queryCount; ++fileIx)
        {
        aaSeq qSeq;
        struct lineFile *qFile = lineFileOpen(queryFiles[fileIx], TRUE);

        while (faMixedSpeedReadNext(qFile, &qSeq.dna, &qSeq.size, &qSeq.name))
            {
            dotOut();

            /* Put the query into the case chosen above. */
            switch (mode)
                {
                case caseLower:
                    toLowerN(qSeq.dna, qSeq.size);
                    break;
                case caseUpper:
                    toUpperN(qSeq.dna, qSeq.size);
                    break;
                case caseMaskUpper:
                    if (flipCase)
                        toggleCase(qSeq.dna, qSeq.size);
                    upperToN(qSeq.dna, qSeq.size);
                    break;
                }

            if (qSeq.size > qWarnSize)
                warn("Query sequence %s has size %d, it might take a while.",
                     qSeq.name, qSeq.size);

            trimSeq(&qSeq, &trimmedSeq);
            if (!transQuery)
                tripleSearch(&trimmedSeq, frameIndex, t3Hash, isRc, out);
            else
                transTripleSearch(&trimmedSeq, frameIndex, t3Hash, isRc, qIsDna, out);
            gfOutputQuery(gvo, out);
            }
        lineFileClose(&qFile);
        }

    /* Drop this strand's translations and indexes, then flip the
     * database over for the next pass. */
    trans3FreeList(&t3List);
    freeHash(&t3Hash);
    for (frame = 0; frame < 3; ++frame)
        genoFindFree(&frameIndex[frame]);

    for (dbSeq = untransList; dbSeq != NULL; dbSeq = dbSeq->next)
        reverseComplement(dbSeq->dna, dbSeq->size);
    }
carefulClose(&out);
}