void writeTabFile(struct itemAttr *itemAttrs, char *tabFile)
/* Walk the itemAttrs list and write each item to tabFile.  Every item is
 * preceded by a bin column computed from its chromStart/chromEnd; the rest
 * of the item is emitted by itemAttrTabOut.  The file is opened for writing
 * with mustOpen and closed with carefulClose. */
{
struct itemAttr *item = itemAttrs;
FILE *outFh = mustOpen(tabFile, "w");

while (item != NULL)
    {
    /* bin goes first, then the item's own columns */
    fprintf(outFh, "%u\t", binFromRange(item->chromStart, item->chromEnd));
    itemAttrTabOut(item, outFh);
    item = item->next;
    }
carefulClose(&outFh);
}
Ejemplo n.º 2
0
void printBed(FILE *out, struct pslPair *ppList, struct clone *clone, char *pslTable)
/* Write one line to out for each pair in ppList whose score, divided by the
 * best score in the list, is at least NEARTOP.  Pairs below that cutoff are
 * unlinked from the list and released with pslPairFree.
 *
 * Each line holds: bin (omitted when NOBIN is set), forward tName, forward
 * tStart, reverse tEnd, clone->name, score, strand, pslTable, the literal 2,
 * forward and reverse tStart, forward and reverse (tEnd - tStart), and the
 * forward and reverse qName.  score is 1000 when exactly one pair survives
 * and 1500/count otherwise; strand is "+" when the forward qName is found in
 * leftNames and "-" when it is not.
 *
 * Nothing is printed or freed when the list is empty or no pair has a
 * positive score, because there is then no usable best score to divide by.
 *
 * NOTE(review): ppList is passed by value, so when leading pairs are freed
 * the caller's head pointer refers to released memory -- confirm callers do
 * not reuse it. */
{
  struct pslPair *pp;
  struct pslPair **link;
  int count = 0, best = 0;
  int score = 1000;

  /* Find the best score in the list. */
  for (pp = ppList; pp != NULL; pp=pp->next)
    if (pp->score > best)
      best = pp->score;

  /* Empty list, or nothing scored above zero: dividing by best would be a
   * division by zero, so there is nothing sensible to filter or print. */
  if (best <= 0)
    return;

  /* Single filtering pass.  link always addresses the pointer that leads to
   * the current pair, so head and interior removals are handled alike and a
   * freed pair is never touched again. */
  link = &ppList;
  while ((pp = *link) != NULL)
    {
      if (((pp->score)/best) >= NEARTOP)
        {
          count++;
          link = &pp->next;
        }
      else
        {
          *link = pp->next;
          pslPairFree(&pp);
        }
    }

  /* The score is shared between all surviving pairs. */
  if (count > 1)
    score = 1500/count;

  for (pp = ppList; pp != NULL; pp=pp->next)
    {
      int bin = binFromRange(pp->f->psl->tStart,pp->r->psl->tEnd);
      char *strand = hashLookup(leftNames, pp->f->psl->qName) ? "+" : "-";
      /* Aligned target span of the forward and reverse ends. */
      int d1 = pp->f->psl->tEnd - pp->f->psl->tStart;
      int d2 = pp->r->psl->tEnd - pp->r->psl->tStart;

      if (!NOBIN)
        fprintf(out, "%d\t",bin);
      fprintf(out, "%s\t%d\t%d\t%s\t%d\t%s\t%s\t2\t%d,%d\t%d,%d\t%s,%s\n",
              pp->f->psl->tName,pp->f->psl->tStart,pp->r->psl->tEnd,clone->name,
              score, strand, pslTable, pp->f->psl->tStart, pp->r->psl->tStart, d1, d2,
              pp->f->psl->qName, pp->r->psl->qName);
    }
}
Ejemplo n.º 3
0
static void testOneBin(int start, int end, int expected)
/* Run binFromRange(start, end) and compare the result with expected.
 * A negative expected means the answer is not known, so no check is made.
 * A mismatch is reported at verbosity 2 and increments failureCount; the
 * computed bin is always logged at verbosity 3. */
{
int got = binFromRange(start, end);
int haveAnswer = (expected >= 0);

if (haveAnswer && got != expected)
    {
    verbose(2,"#\tERROR: expected: %d got: %d = binFromRange(%d, %d)\n",
	expected, got, start, end);
    failureCount += 1;
    }
verbose(3,"#\t%5d = binFromRange(%d, %d)\n", got, start, end);
}
Ejemplo n.º 4
0
void processFrameFile(FILE *sortFh, char *framesFile)
/* read records from one frame file, adding bin and write to pipe to sort */
{
struct lineFile *inLf = lineFileOpen(framesFile, TRUE);
struct mafFrames mf;
char *row[MAFFRAMES_NUM_COLS];

while (lineFileNextRowTab(inLf, row, MAFFRAMES_NUM_COLS))
    {
    mafFramesStaticLoad(row, &mf);
    fprintf(sortFh, "%d\t",  binFromRange(mf.chromStart, mf.chromEnd));
    mafFramesTabOut(&mf, sortFh);
    }
lineFileClose(&inLf);
}
Ejemplo n.º 5
0
static void createCcdsGene(struct sqlConnection *conn, char *ccdsGeneFile,
                           struct genomeInfo *genome, struct hash* ignoreTbl,
                           struct hash *gotCcds)
/* Build the ccdsGene genePreds from the ccds database and write them to
 * ccdsGeneFile as tab-separated rows.  When the loadDb flag is set each row
 * is prefixed with a bin column computed from txStart/txEnd.  The genePred
 * list is freed before returning. */
{
struct ccdsLocationsJoin *locations = loadLocations(conn, genome, ignoreTbl, gotCcds);
struct genePred *geneList = buildCcdsGene(&locations);
struct genePred *gene = geneList;
FILE *outFh = mustOpen(ccdsGeneFile, "w");

while (gene != NULL)
    {
    if (loadDb)
        fprintf(outFh, "%d\t", binFromRange(gene->txStart, gene->txEnd));
    genePredTabOut(gene, outFh);
    gene = gene->next;
    }
carefulClose(&outFh);
genePredFreeList(&geneList);
}
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table,
	struct hash *geneToModuleHash, struct hash *moduleAndMotifHash,
	struct hash *motifHash, struct hash *positionsHash,
	char *regionTable)
/* Load file which is a big matrix with genes for rows and motifs for
 * columns.  There is a semicolon-separated list of numbers in the matrix
 * where a gene has the motif, and an empty (tab separated) field
 * where there is no motif.  The numbers are relative to the
 * region associated with the gene in the positionsHash.
 * Only load bits of this where motif actually occurs in module associated
 * with gene.
 *
 * Two tables are (re)created and loaded through conn from tab files made in
 * tmpDir: 'table' gets one row per motif hit and 'regionTable' gets one row
 * per gene region that had at least one hit.  Both carry a leading bin
 * column computed with binFromRange.
 *
 * Lookups used:
 *   geneToModuleHash   - gene name -> module; genes missing here are warned
 *                        about and skipped.
 *   moduleAndMotifHash - queried with hashLookup2(module, motif) to decide
 *                        whether a cell is loaded.
 *   motifHash          - every motif label in the header must be present,
 *                        otherwise errAbort; its value is handed to
 *                        convertMotifPos.
 *   positionsHash      - gene name -> genomePos of the gene's region.
 *
 * NOTE(review): dy and the cloned motifNames entries are never freed in this
 * function. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
FILE *f = hgCreateTabFile(tmpDir, table);
/* Fixed-size word buffers: at most 32*1024 columns per line. */
char *motifNames[32*1024], *row[32*1024];
int motifCount, rowSize, i;
char *gene, *module;
int geneCount = 0, total = 0;
struct dyString *dy = dyStringNew(512);
struct genomePos *motifPosList = NULL, *motifPosForGene;
struct genomePos *regionPosList = NULL, *regionPos;

/* Read first line, which is labels.  Spaces are turned into underscores
 * before the line is chopped into words. */
if (!lineFileNextReal(lf, &line))
    errAbort("Empty file %s", fileName);
subChar(line, ' ', '_');
motifCount = chopLine(line, motifNames);
if (motifCount >= ArraySize(motifNames))
    errAbort("Too many motifs line 1 of %s", fileName);
lineFileExpectAtLeast(lf, 2, motifCount);
/* Column 0 lines up with the gene column of the data rows, so it carries no
 * motif name; motif columns start at index 1. */
motifNames[0] = NULL;
for (i=1; i<motifCount; ++i)
    {
    char name[64];
    /* Normalize the label via fixMotifName and keep a private copy. */
    motifNames[i] = cloneString(fixMotifName(motifNames[i],name,sizeof(name)));
    if (!hashLookup(motifHash, motifNames[i]))
        errAbort("Motif %s is in %s but not modules_motifs.gxm",
		motifNames[i], fileName);
    }

/* Read subsequent lines.  Each must have exactly motifCount fields. */
while ((rowSize = lineFileChopTab(lf, row)) != 0)
    {
    lineFileExpectWords(lf, motifCount, rowSize);
    gene = row[0];
    module = hashFindVal(geneToModuleHash, gene);
    if (module == NULL)
	{
        warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", 
		gene, lf->lineIx, lf->fileName);
	continue;
	}
    /* regionPos stays NULL unless at least one cell of this row is loaded. */
    regionPos = NULL;
    for (i=1; i<rowSize; ++i)
        {
	/* Non-empty cell: the gene has this motif. */
	if (row[i][0] != 0)
	    {
	    /* Only keep it when the motif belongs to the gene's module. */
	    if (hashLookup2(moduleAndMotifHash, module, motifNames[i]))
		{
		regionPos = hashFindVal(positionsHash, gene);
		if (regionPos == NULL)
		    {
		    warn("WARNING: %s in %s but not gene_positions.tab",
		    	gene, fileName);
		    /* Setting i to rowSize ends the column loop: the rest of
		     * this row is abandoned. */
		    i = rowSize; continue;
		    }
		
		/* Convert the cell's position list (relative to the gene's
		 * region) and prepend the result to the running list. */
		motifPosForGene = convertMotifPos(row[i], regionPos, 
			hashMustFindVal(motifHash, motifNames[i]), lf);
		motifPosList = slCat(motifPosForGene, motifPosList);
		++total;
		}
	    }
	}
    /* NOTE(review): regionPos is the value stored in positionsHash, so the
     * hash-owned element itself is linked into regionPosList; presumably each
     * gene appears on only one row -- confirm, since adding the same element
     * twice would corrupt the list. */
    if (regionPos != NULL)
        {
	slAddHead(&regionPosList, regionPos);
	}
    /* Counted for every row whose gene has a module, with or without hits. */
    ++geneCount;
    }
lineFileClose(&lf);

/* Output sorted table of all motif hits. */
    {
    struct genomePos *pos;
    slSort(&motifPosList, genomePosCmp);
    for (pos = motifPosList; pos != NULL; pos = pos->next)
	{
	int start = pos->start;
	int end = pos->end;
	/* Clamp negative starts to 0 before binning and printing. */
	if (start < 0) start = 0;
	fprintf(f, "%d\t", binFromRange(start, end));
	fprintf(f, "%s\t", pos->chrom);
	fprintf(f, "%d\t%d\t", start, end);
	fprintf(f, "%s\t", pos->motif);
	fprintf(f, "%d\t", pos->score);
	fprintf(f, "%c\t", pos->strand);
	fprintf(f, "%s\n", pos->name);
	}
    /* Column order of the rows written above must match this schema:
     * pos->motif fills 'name' and pos->name fills 'gene'. */
    dyStringPrintf(dy,
    "CREATE TABLE  %s (\n"
    "    bin smallInt unsigned not null,\n"
    "    chrom varChar(255) not null,\n"
    "    chromStart int not null,\n"
    "    chromEnd int not null,\n"
    "    name varchar(255) not null,\n"
    "    score int not null,\n"
    "    strand char(1) not null,\n"
    "    gene varchar(255) not null,\n"
    "              #Indices\n"
    "    INDEX(gene(12)),\n"
    "    INDEX(name(16)),\n"
    "    INDEX(chrom(8),bin)\n"
    ")\n",  table);
    sqlRemakeTable(conn, table, dy->string);
    /* motifCount includes the gene column, hence the -1. */
    verbose(1, "%d genes, %d motifs, %d motifs in genes\n",
	    geneCount, motifCount-1, total);
    hgLoadTabFile(conn, tmpDir, table, &f);
    /* The tab file is deliberately left in tmpDir (removal is disabled). */
    // hgRemoveTabFile(tmpDir, table);
    verbose(1, "Loaded %s table\n", table);
    slFreeList(&motifPosList);
    }

/* Now output sorted table of upstream regions. */
    {
    /* This f shadows the outer f used for the motif table. */
    FILE *f = hgCreateTabFile(tmpDir, regionTable);
    struct genomePos *pos;
    /* Reuse dy for the second CREATE statement. */
    dyStringClear(dy);
    dyStringPrintf(dy,
    "CREATE TABLE  %s (\n"
    "    bin smallInt unsigned not null,\n"
    "    chrom varChar(255) not null,\n"
    "    chromStart int not null,\n"
    "    chromEnd int not null,\n"
    "    name varchar(255) not null,\n"
    "    score int not null,\n"
    "    strand char(1) not null,\n"
    "              #Indices\n"
    "    INDEX(name(16)),\n"
    "    INDEX(chrom(8),bin)\n"
    ")\n",  regionTable);
    sqlRemakeTable(conn, regionTable, dy->string);
    slSort(&regionPosList, genomePosCmp);
    for (pos = regionPosList; pos != NULL; pos = pos->next)
	{
	int start = pos->start;
	int end = pos->end;
	/* Clamp negative starts to 0 before binning and printing. */
	if (start < 0) start = 0;
	fprintf(f, "%d\t", binFromRange(start, end));
	fprintf(f, "%s\t", pos->chrom);
	fprintf(f, "%d\t%d\t", start, end);
	fprintf(f, "%s\t", pos->name);
	fprintf(f, "%d\t", pos->score);
	fprintf(f, "%c\n", pos->strand);
	}
    hgLoadTabFile(conn, tmpDir, regionTable, &f);
    /* The tab file is deliberately left in tmpDir (removal is disabled). */
    // hgRemoveTabFile(tmpDir, regionTable);
    }
}
Ejemplo n.º 7
0
struct annoStreamer *annoStreamDbNew(char *db, char *table, struct annoAssembly *aa,
				     struct asObject *asObj, int maxOutRows)
/* Create an annoStreamer (subclass) object from a database table described by asObj. */
{
struct sqlConnection *conn = hAllocConn(db);
if (!sqlTableExists(conn, table))
    errAbort("annoStreamDbNew: table '%s' doesn't exist in database '%s'", table, db);
struct annoStreamDb *self = NULL;
AllocVar(self);
struct annoStreamer *streamer = &(self->streamer);
int dbtLen = strlen(db) + strlen(table) + 2;
char dbTable[dbtLen];
safef(dbTable, dbtLen, "%s.%s", db, table);
annoStreamerInit(streamer, aa, asObj, dbTable);
streamer->rowType = arWords;
streamer->setRegion = asdSetRegion;
streamer->nextRow = asdNextRow;
streamer->close = asdClose;
self->conn = conn;
self->table = cloneString(table);
char *asFirstColumnName = streamer->asObj->columnList->name;
if (sqlFieldIndex(self->conn, self->table, "bin") == 0)
    {
    self->hasBin = 1;
    self->minFinestBin = binFromRange(0, 1);
    }
if (self->hasBin && !sameString(asFirstColumnName, "bin"))
    self->omitBin = 1;
if (!asdInitBed3Fields(self))
    errAbort("annoStreamDbNew: can't figure out which fields of %s.%s to use as "
	     "{chrom, chromStart, chromEnd}.", db, table);
self->makeBaselineQuery = asdMakeBaselineQuery;
// When a table has an index on endField, sometimes the query optimizer uses it
// and that ruins the sorting.  Fortunately most tables don't anymore.
self->endFieldIndexName = sqlTableIndexOnField(self->conn, self->table, self->endField);
self->notSorted = FALSE;
// Special case: genbank-updated tables are not sorted because new mappings are
// tacked on at the end.
if (isIncrementallyUpdated(table))
    self->notSorted = TRUE;
self->mergeBins = FALSE;
self->maxOutRows = maxOutRows;
self->useMaxOutRows = (maxOutRows > 0);
self->needQuery = TRUE;
self->chromList = annoAssemblySeqNames(aa);
if (slCount(self->chromList) > 1000)
    {
    // Assembly has many sequences (e.g. scaffold-based assembly) --
    // don't break up into per-sequence queries.  Take our chances
    // with mysql being unhappy about the sqlResult being open too long.
    self->doQuery = asdDoQuerySimple;
    self->nextRowRaw = nextRowFromSqlResult;
    }
else
    {
    // All-chromosome assembly -- if table is large, perform a series of
    // chunked queries.
    self->doQuery = asdDoQueryChunking;
    self->nextRowRaw = nextRowFromBuffer;
    }
return (struct annoStreamer *)self;
}
Ejemplo n.º 8
0
void printOrphan(FILE *out, struct pslAli *paList, struct clone *clone, char *pslTable)
/* Write one line to out for each alignment in paList whose score, divided by
 * the best score in the list, is greater than NEARTOP and whose id is at
 * least MIN_ORPHAN_ID.  The list itself is not modified.
 *
 * Each line holds: bin (omitted when NOBIN is set), tName, genStart, genEnd,
 * clone->name, score, strand, pslTable, the literal 1, tStart,
 * (tEnd - tStart) and qName.  genStart/genEnd are the alignment's target
 * range widened by MIN/2 on one side, with genStart clamped at 0.  score is
 * 1000 when exactly one alignment qualifies and 1500/count otherwise.
 *
 * Nothing is printed when the list is empty or no alignment has a positive
 * score, because there is then no usable best score to divide by. */
{
  struct pslAli *pa;
  int best = 0, count = 0;

  for (pa = paList; pa != NULL; pa=pa->next)
    if (pa->score > best)
      best = pa->score;

  /* Without a positive best score the ratio below would divide by zero. */
  if (best <= 0)
    return;

  /* First pass: how many alignments qualify (this sets the shared score). */
  for (pa = paList; pa != NULL; pa=pa->next)
    if ((((pa->score)/best) > NEARTOP) && (pa->id >= MIN_ORPHAN_ID))
      count++;

  /* Second pass: print the qualifying alignments. */
  for (pa = paList; pa != NULL; pa=pa->next)
    {
      int bin, d1, genStart, genEnd;
      int score = 1000;
      int onLeft, extendEnd;
      char *strand;

      if (!((((pa->score)/best) > NEARTOP) && (pa->id >= MIN_ORPHAN_ID)))
        continue;

      bin = binFromRange(pa->psl->tStart,pa->psl->tEnd);
      if (count != 1)
        score = 1500/count;

      /* Which side is widened: for a query in leftNames the end grows only
       * on '+'; for any other query the end grows unless the strand is '-'. */
      onLeft = (hashLookup(leftNames,pa->psl->qName) != NULL);
      if (onLeft)
        extendEnd = (pa->psl->strand[0] == '+');
      else
        extendEnd = (pa->psl->strand[0] != '-');

      if (extendEnd)
        {
          genStart = pa->psl->tStart;
          genEnd = pa->psl->tEnd+(MIN/2);
        }
      else
        {
          genStart = pa->psl->tStart-(MIN/2);
          genEnd = pa->psl->tEnd;
        }
      if (genStart < 0)
        genStart = 0;

      if ((onLeft && (pa->psl->strand[0] == '+')) ||
          ((hashLookup(rightNames,pa->psl->qName)) && (pa->psl->strand[0] == '-')))
        strand = "+";
      else
        strand = "-";
      d1 = pa->psl->tEnd - pa->psl->tStart;

      if (!NOBIN)
        fprintf(out, "%d\t",bin);
      fprintf(out, "%s\t%d\t%d\t%s\t%d\t%s\t%s\t1\t%d\t%d\t%s\n",
              pa->psl->tName,genStart,genEnd,clone->name,
              score, strand, pslTable, pa->psl->tStart, d1, pa->psl->qName);
    }
}