void writeTabFile(struct itemAttr *itemAttrs, char *tabFile)
/* Create tabFile and write one row per element of the itemAttrs list.
 * Each row starts with a bin column computed by binFromRange from the
 * item's chromStart/chromEnd, followed by the itemAttrTabOut output. */
{
FILE *out = mustOpen(tabFile, "w");
struct itemAttr *item = itemAttrs;

while (item != NULL)
    {
    /* bin column first, then the item's own columns */
    fprintf(out, "%u\t", binFromRange(item->chromStart, item->chromEnd));
    itemAttrTabOut(item, out);
    item = item->next;
    }
carefulClose(&out);
}
void printBed(FILE *out, struct pslPair *ppList, struct clone *clone, char *pslTable) { struct pslPair *pp, *ppPrev; int count = 0, best = 0; for (pp = ppList; pp != NULL; pp=pp->next) if (pp->score > best) best = pp->score; while (((ppList->score)/best) < NEARTOP) { struct pslPair *temp = ppList; ppList = ppList->next; pslPairFree(&temp); } ppPrev = ppList; for (pp = ppList; pp != NULL; pp=pp->next) { if (((pp->score)/best) >= NEARTOP) count++; else { ppPrev->next = pp->next; pslPairFree(&pp); pp = ppPrev; } ppPrev = pp; } for (pp = ppList; pp != NULL; pp=pp->next) { int bin = binFromRange(pp->f->psl->tStart,pp->r->psl->tEnd); int score = 1000; char *strand; int d1, d2; if (count != 1) score = 1500/count; if (hashLookup(leftNames, pp->f->psl->qName)) strand = "+"; else strand = "-"; d1 = pp->f->psl->tEnd - pp->f->psl->tStart; d2 = pp->r->psl->tEnd - pp->r->psl->tStart; if (!NOBIN) fprintf(out, "%d\t",bin); fprintf(out, "%s\t%d\t%d\t%s\t%d\t%s\t%s\t2\t%d,%d\t%d,%d\t%s,%s\n", pp->f->psl->tName,pp->f->psl->tStart,pp->r->psl->tEnd,clone->name, score, strand, pslTable, pp->f->psl->tStart, pp->r->psl->tStart, d1, d2, pp->f->psl->qName, pp->r->psl->qName); } }
static void testOneBin(int start, int end, int expected)
/* Call binFromRange(start, end) and compare the result with expected.
 * A negative expected value means the answer is not known, so no
 * comparison is made.  A mismatch is reported at verbosity level 2 and
 * increments failureCount.  The computed bin is always reported at
 * verbosity level 3. */
{
int got = binFromRange(start, end);

if (expected >= 0)
    {
    if (got != expected)
        {
        verbose(2, "#\tERROR: expected: %d got: %d = binFromRange(%d, %d)\n",
                expected, got, start, end);
        ++failureCount;
        }
    }
verbose(3, "#\t%5d = binFromRange(%d, %d)\n", got, start, end);
}
void processFrameFile(FILE *sortFh, char *framesFile) /* read records from one frame file, adding bin and write to pipe to sort */ { struct lineFile *inLf = lineFileOpen(framesFile, TRUE); struct mafFrames mf; char *row[MAFFRAMES_NUM_COLS]; while (lineFileNextRowTab(inLf, row, MAFFRAMES_NUM_COLS)) { mafFramesStaticLoad(row, &mf); fprintf(sortFh, "%d\t", binFromRange(mf.chromStart, mf.chromEnd)); mafFramesTabOut(&mf, sortFh); } lineFileClose(&inLf); }
static void createCcdsGene(struct sqlConnection *conn, char *ccdsGeneFile, struct genomeInfo *genome, struct hash* ignoreTbl, struct hash *gotCcds) /* create the ccdsGene tab file from the ccds database */ { struct ccdsLocationsJoin *locs = loadLocations(conn, genome, ignoreTbl, gotCcds); struct genePred *gp, *genes = buildCcdsGene(&locs); FILE *genesFh; genesFh = mustOpen(ccdsGeneFile, "w"); for (gp = genes; gp != NULL; gp = gp->next) { if (loadDb) fprintf(genesFh, "%d\t", binFromRange(gp->txStart, gp->txEnd)); genePredTabOut(gp, genesFh); } carefulClose(&genesFh); genePredFreeList(&genes); }
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table, struct hash *geneToModuleHash, struct hash *moduleAndMotifHash, struct hash *motifHash, struct hash *positionsHash, char *regionTable) /* Load file which is a big matrix with genes for rows and motifs for * columns. There is a semicolon-separated list of numbers in the matrix * where a gene has the motif, and an empty (tab separated) field * where there is no motif. The numbers are relative to the * region associated with the gene in the positionsHash. * Only load bits of this where motif actually occurs in module associated * with gene. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; FILE *f = hgCreateTabFile(tmpDir, table); char *motifNames[32*1024], *row[32*1024]; int motifCount, rowSize, i; char *gene, *module; int geneCount = 0, total = 0; struct dyString *dy = dyStringNew(512); struct genomePos *motifPosList = NULL, *motifPosForGene; struct genomePos *regionPosList = NULL, *regionPos; /* Read first line, which is labels. */ if (!lineFileNextReal(lf, &line)) errAbort("Empty file %s", fileName); subChar(line, ' ', '_'); motifCount = chopLine(line, motifNames); if (motifCount >= ArraySize(motifNames)) errAbort("Too many motifs line 1 of %s", fileName); lineFileExpectAtLeast(lf, 2, motifCount); motifNames[0] = NULL; for (i=1; i<motifCount; ++i) { char name[64]; motifNames[i] = cloneString(fixMotifName(motifNames[i],name,sizeof(name))); if (!hashLookup(motifHash, motifNames[i])) errAbort("Motif %s is in %s but not modules_motifs.gxm", motifNames[i], fileName); } /* Read subsequent lines. 
*/ while ((rowSize = lineFileChopTab(lf, row)) != 0) { lineFileExpectWords(lf, motifCount, rowSize); gene = row[0]; module = hashFindVal(geneToModuleHash, gene); if (module == NULL) { warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", gene, lf->lineIx, lf->fileName); continue; } regionPos = NULL; for (i=1; i<rowSize; ++i) { if (row[i][0] != 0) { if (hashLookup2(moduleAndMotifHash, module, motifNames[i])) { regionPos = hashFindVal(positionsHash, gene); if (regionPos == NULL) { warn("WARNING: %s in %s but not gene_positions.tab", gene, fileName); i = rowSize; continue; } motifPosForGene = convertMotifPos(row[i], regionPos, hashMustFindVal(motifHash, motifNames[i]), lf); motifPosList = slCat(motifPosForGene, motifPosList); ++total; } } } if (regionPos != NULL) { slAddHead(®ionPosList, regionPos); } ++geneCount; } lineFileClose(&lf); /* Output sorted table of all motif hits. */ { struct genomePos *pos; slSort(&motifPosList, genomePosCmp); for (pos = motifPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->motif); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\t", pos->strand); fprintf(f, "%s\n", pos->name); } dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " gene varchar(255) not null,\n" " #Indices\n" " INDEX(gene(12)),\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", table); sqlRemakeTable(conn, table, dy->string); verbose(1, "%d genes, %d motifs, %d motifs in genes\n", geneCount, motifCount-1, total); hgLoadTabFile(conn, tmpDir, table, &f); // hgRemoveTabFile(tmpDir, table); verbose(1, "Loaded %s table\n", table); 
slFreeList(&motifPosList); } /* Now output sorted table of upstream regions. */ { FILE *f = hgCreateTabFile(tmpDir, regionTable); struct genomePos *pos; dyStringClear(dy); dyStringPrintf(dy, "CREATE TABLE %s (\n" " bin smallInt unsigned not null,\n" " chrom varChar(255) not null,\n" " chromStart int not null,\n" " chromEnd int not null,\n" " name varchar(255) not null,\n" " score int not null,\n" " strand char(1) not null,\n" " #Indices\n" " INDEX(name(16)),\n" " INDEX(chrom(8),bin)\n" ")\n", regionTable); sqlRemakeTable(conn, regionTable, dy->string); slSort(®ionPosList, genomePosCmp); for (pos = regionPosList; pos != NULL; pos = pos->next) { int start = pos->start; int end = pos->end; if (start < 0) start = 0; fprintf(f, "%d\t", binFromRange(start, end)); fprintf(f, "%s\t", pos->chrom); fprintf(f, "%d\t%d\t", start, end); fprintf(f, "%s\t", pos->name); fprintf(f, "%d\t", pos->score); fprintf(f, "%c\n", pos->strand); } hgLoadTabFile(conn, tmpDir, regionTable, &f); // hgRemoveTabFile(tmpDir, regionTable); } }
struct annoStreamer *annoStreamDbNew(char *db, char *table, struct annoAssembly *aa, struct asObject *asObj, int maxOutRows) /* Create an annoStreamer (subclass) object from a database table described by asObj. */ { struct sqlConnection *conn = hAllocConn(db); if (!sqlTableExists(conn, table)) errAbort("annoStreamDbNew: table '%s' doesn't exist in database '%s'", table, db); struct annoStreamDb *self = NULL; AllocVar(self); struct annoStreamer *streamer = &(self->streamer); int dbtLen = strlen(db) + strlen(table) + 2; char dbTable[dbtLen]; safef(dbTable, dbtLen, "%s.%s", db, table); annoStreamerInit(streamer, aa, asObj, dbTable); streamer->rowType = arWords; streamer->setRegion = asdSetRegion; streamer->nextRow = asdNextRow; streamer->close = asdClose; self->conn = conn; self->table = cloneString(table); char *asFirstColumnName = streamer->asObj->columnList->name; if (sqlFieldIndex(self->conn, self->table, "bin") == 0) { self->hasBin = 1; self->minFinestBin = binFromRange(0, 1); } if (self->hasBin && !sameString(asFirstColumnName, "bin")) self->omitBin = 1; if (!asdInitBed3Fields(self)) errAbort("annoStreamDbNew: can't figure out which fields of %s.%s to use as " "{chrom, chromStart, chromEnd}.", db, table); self->makeBaselineQuery = asdMakeBaselineQuery; // When a table has an index on endField, sometimes the query optimizer uses it // and that ruins the sorting. Fortunately most tables don't anymore. self->endFieldIndexName = sqlTableIndexOnField(self->conn, self->table, self->endField); self->notSorted = FALSE; // Special case: genbank-updated tables are not sorted because new mappings are // tacked on at the end. if (isIncrementallyUpdated(table)) self->notSorted = TRUE; self->mergeBins = FALSE; self->maxOutRows = maxOutRows; self->useMaxOutRows = (maxOutRows > 0); self->needQuery = TRUE; self->chromList = annoAssemblySeqNames(aa); if (slCount(self->chromList) > 1000) { // Assembly has many sequences (e.g. 
scaffold-based assembly) -- // don't break up into per-sequence queries. Take our chances // with mysql being unhappy about the sqlResult being open too long. self->doQuery = asdDoQuerySimple; self->nextRowRaw = nextRowFromSqlResult; } else { // All-chromosome assembly -- if table is large, perform a series of // chunked queries. self->doQuery = asdDoQueryChunking; self->nextRowRaw = nextRowFromBuffer; } return (struct annoStreamer *)self; }
void printOrphan(FILE *out, struct pslAli *paList, struct clone *clone, char *pslTable) { struct pslAli *pa; int best = 0, count = 0; for (pa = paList; pa != NULL; pa=pa->next) if (pa->score > best) best = pa->score; for (pa = paList; pa != NULL; pa=pa->next) if ((((pa->score)/best) > NEARTOP) && (pa->id >= MIN_ORPHAN_ID)) count++; for (pa = paList; pa != NULL; pa=pa->next) if ((((pa->score)/best) > NEARTOP) && (pa->id >= MIN_ORPHAN_ID)) { int bin = binFromRange(pa->psl->tStart,pa->psl->tEnd); int score = 1000; char *strand; int d1, genStart = 0, genEnd = 0; if (count != 1) score = 1500/count; if (hashLookup(leftNames,pa->psl->qName)) { if (pa->psl->strand[0] == '+') { genStart = pa->psl->tStart; genEnd = pa->psl->tEnd+(MIN/2); } else { genStart = pa->psl->tStart-(MIN/2); genEnd = pa->psl->tEnd; } } else { if (pa->psl->strand[0] == '-') { genStart = pa->psl->tStart-(MIN/2); genEnd = pa->psl->tEnd; } else { genStart = pa->psl->tStart; genEnd = pa->psl->tEnd+(MIN/2); } } if (genStart < 0) genStart = 0; if (((hashLookup(leftNames,pa->psl->qName)) && (pa->psl->strand[0] == '+')) || ((hashLookup(rightNames,pa->psl->qName)) && (pa->psl->strand[0] == '-'))) strand = "+"; else strand = "-"; d1 = pa->psl->tEnd - pa->psl->tStart; if (!NOBIN) fprintf(out, "%d\t",bin); fprintf(out, "%s\t%d\t%d\t%s\t%d\t%s\t%s\t1\t%d\t%d\t%s\n", pa->psl->tName,genStart,genEnd,clone->name, score, strand, pslTable, pa->psl->tStart, d1, pa->psl->qName); } }