Пример #1
0
void readStsInfo(struct lineFile *sif)
/* Read in current stsInfo file.
 *
 * Every tab-separated row of sif is parsed with stsInfo2Load().  The global
 * nextUcscId is raised so that it stays above every identNo seen, whatever
 * the organism.  Rows whose organism is "Homo sapiens" get an sts record
 * that is pushed onto the global sList and indexed in the global hashes
 * (all of which are created fresh here):
 *   stsHash     - identNo (decimal string) -> sts
 *   dbStsIdHash - identNo (decimal string) -> dbSTSid, when dbSTSid != 0
 *   ucscIdHash  - dbSTSid and each otherDbSTS id (decimal string) -> sts
 *   nameHash    - name, genbank accs, gdb ids and other names -> sts
 *   gbAccHash   - genbank acc -> gb record pointing back at the sts
 *   primerHash  - dbSTSid (decimal string) -> primer, when a left primer
 *                 is present
 * Rows for any other organism are freed and otherwise ignored. */
{
  struct stsInfo2 *si;
  struct sts *s;
  struct primer *p;
  struct gb *gb;
  char name[16], *words[52];
  int i;

  stsHash = newHash(20);
  dbStsIdHash = newHash(20);
  primerHash = newHash(20);
  nameHash = newHash(24);
  gbAccHash = newHash(20);
  ucscIdHash = newHash(20);

  /* Read in all rows */
  while (lineFileChopTab(sif, words))
    {
      si = stsInfo2Load(words);

      /* Determine next ucsc id to be used */
      if (si->identNo >= nextUcscId)
        nextUcscId = si->identNo + 1;

      /* Only human records are kept.  The organism literal had been mangled
       * to "H**o sapiens", which can never match a real record, so every
       * row was being discarded; it is restored here. */
      if (!sameString(si->organism, "Homo sapiens\0"))
        {
          stsInfo2Free(&si);
          continue;
        }

      /* Create sts struct; it takes over the parsed row. */
      AllocVar(s);
      s->next = NULL;
      s->si = si;
      s->fa = NULL;
      s->mapped = isMapped(si);
      s->dbstsIdExists = FALSE;
      slAddHead(&sList, s);
      safef(name, ArraySize(name), "%d", si->identNo);
      hashAdd(stsHash, name, s);

      if (si->dbSTSid)
        {
          /* name still holds identNo here: map it to the dbSTS id ... */
          hashAddInt(dbStsIdHash, name, si->dbSTSid);

          /* ... then reuse the buffer to key the sts by its dbSTS id. */
          safef(name, ArraySize(name), "%d", si->dbSTSid);
          hashAdd(ucscIdHash, name, s);
        }

      /* Additional dbSTS ids: the first record to claim an id keeps it. */
      for (i = 0; i < si->otherDbstsCount; i++)
        {
          safef(name, ArraySize(name), "%d", si->otherDbSTS[i]);
          if (!hashLookup(ucscIdHash, name))
            hashAdd(ucscIdHash, name, s);
        }

      /* Add names to name hash and genbank hash */
      hashAdd(nameHash, si->name, s);
      for (i = 0; i < si->gbCount; i++)
        {
          hashAdd(nameHash, si->genbank[i], s);
          AllocVar(gb);
          gb->next = NULL;
          gb->acc = cloneString(si->genbank[i]);
          gb->s = s;
          gb->gbSeq = FALSE;
          hashAdd(gbAccHash, gb->acc, gb);
        }
      for (i = 0; i < si->gdbCount; i++)
        hashAdd(nameHash, si->gdb[i], s);
      for (i = 0; i < si->nameCount; i++)
        hashAdd(nameHash, si->otherNames[i], s);

      /* Create primer info if available and add to hash.
       * ("\0" compares as the empty string.) */
      if (differentString(si->leftPrimer, "\0"))
        {
          AllocVar(p);
          p->next = NULL;
          p->dbStsId = si->dbSTSid;
          p->left = cloneString(si->leftPrimer);
          p->right = cloneString(si->rightPrimer);
          p->dist = cloneString(si->distance);
          p->ucscId = si->identNo;
          safef(name, ArraySize(name), "%d", p->dbStsId);
          hashAdd(primerHash, name, p);
        }
    }
}
Пример #2
0
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table,
	struct hash *geneToModuleHash, struct hash *moduleAndMotifHash,
	struct hash *motifHash, struct hash *positionsHash,
	char *regionTable)
/* Load a gene-by-motif matrix file into two database tables.
 *
 * The first real line of fileName holds the motif labels (column 0 is the
 * gene column and is ignored).  Each following tab-separated row starts with
 * a gene name; a non-empty cell holds the positions of that column's motif,
 * relative to the gene's region in positionsHash.  A cell is only used when
 * the motif belongs to the module that geneToModuleHash assigns to the gene
 * (checked through moduleAndMotifHash).
 *
 * All accepted motif hits are sorted and loaded into table; the regions of
 * the genes that contributed at least one hit are sorted and loaded into
 * regionTable.  Genes without a module, or without a region, are reported
 * with warn() and skipped. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *header;
FILE *motifTab = hgCreateTabFile(tmpDir, table);
char *motifNames[32*1024], *cells[32*1024];
int motifCount, cellCount, col;
int geneCount = 0, hitCount = 0;
struct dyString *sql = dyStringNew(512);
struct genomePos *motifHits = NULL, *regions = NULL;
struct genomePos *pos;
FILE *regionTab = NULL;

/* Header line: motif labels, with blanks turned into underscores. */
if (!lineFileNextReal(lf, &header))
    errAbort("Empty file %s", fileName);
subChar(header, ' ', '_');
motifCount = chopLine(header, motifNames);
if (motifCount >= ArraySize(motifNames))
    errAbort("Too many motifs line 1 of %s", fileName);
lineFileExpectAtLeast(lf, 2, motifCount);
motifNames[0] = NULL;	/* Column 0 is the gene column, not a motif. */
for (col = 1; col < motifCount; ++col)
    {
    char fixed[64];
    motifNames[col] = cloneString(fixMotifName(motifNames[col], fixed, sizeof(fixed)));
    if (hashLookup(motifHash, motifNames[col]) == NULL)
        errAbort("Motif %s is in %s but not modules_motifs.gxm",
		motifNames[col], fileName);
    }

/* Data rows: one gene per line. */
while ((cellCount = lineFileChopTab(lf, cells)) != 0)
    {
    char *gene, *module;
    struct genomePos *region = NULL;

    lineFileExpectWords(lf, motifCount, cellCount);
    gene = cells[0];
    module = hashFindVal(geneToModuleHash, gene);
    if (module == NULL)
	{
        warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", 
		gene, lf->lineIx, lf->fileName);
	continue;
	}

    for (col = 1; col < cellCount; ++col)
        {
	struct genomePos *hitsForGene;

	if (cells[col][0] == 0)
	    continue;		/* Empty cell: gene lacks this motif. */
	if (!hashLookup2(moduleAndMotifHash, module, motifNames[col]))
	    continue;		/* Motif is not part of the gene's module. */

	region = hashFindVal(positionsHash, gene);
	if (region == NULL)
	    {
	    warn("WARNING: %s in %s but not gene_positions.tab",
		gene, fileName);
	    break;		/* Nothing else in this row can be placed. */
	    }

	hitsForGene = convertMotifPos(cells[col], region,
		hashMustFindVal(motifHash, motifNames[col]), lf);
	motifHits = slCat(hitsForGene, motifHits);
	++hitCount;
	}

    /* Remember the region only when the gene produced at least one hit. */
    if (region != NULL)
        {
	slAddHead(&regions, region);
	}
    ++geneCount;
    }
lineFileClose(&lf);

/* Write the motif hits in sorted order, then (re)create and load the table. */
slSort(&motifHits, genomePosCmp);
for (pos = motifHits; pos != NULL; pos = pos->next)
    {
    int end = pos->end;
    int start = pos->start;
    if (start < 0)
	start = 0;
    fprintf(motifTab, "%d\t", binFromRange(start, end));
    fprintf(motifTab, "%s\t", pos->chrom);
    fprintf(motifTab, "%d\t%d\t", start, end);
    fprintf(motifTab, "%s\t", pos->motif);
    fprintf(motifTab, "%d\t", pos->score);
    fprintf(motifTab, "%c\t", pos->strand);
    fprintf(motifTab, "%s\n", pos->name);
    }
sqlDyStringPrintf(sql,
"CREATE TABLE  %s (\n"
"    bin smallInt unsigned not null,\n"
"    chrom varChar(255) not null,\n"
"    chromStart int not null,\n"
"    chromEnd int not null,\n"
"    name varchar(255) not null,\n"
"    score int not null,\n"
"    strand char(1) not null,\n"
"    gene varchar(255) not null,\n"
"              #Indices\n"
"    INDEX(gene(12)),\n"
"    INDEX(name(16)),\n"
"    INDEX(chrom(8),bin)\n"
")\n",  table);
sqlRemakeTable(conn, table, sql->string);
verbose(1, "%d genes, %d motifs, %d motifs in genes\n",
	geneCount, motifCount-1, hitCount);
hgLoadTabFile(conn, tmpDir, table, &motifTab);
/* hgRemoveTabFile(tmpDir, table) is not called here. */
verbose(1, "Loaded %s table\n", table);
slFreeList(&motifHits);

/* Same again for the upstream regions: create the table, sort, write, load. */
regionTab = hgCreateTabFile(tmpDir, regionTable);
dyStringClear(sql);
sqlDyStringPrintf(sql,
"CREATE TABLE  %s (\n"
"    bin smallInt unsigned not null,\n"
"    chrom varChar(255) not null,\n"
"    chromStart int not null,\n"
"    chromEnd int not null,\n"
"    name varchar(255) not null,\n"
"    score int not null,\n"
"    strand char(1) not null,\n"
"              #Indices\n"
"    INDEX(name(16)),\n"
"    INDEX(chrom(8),bin)\n"
")\n",  regionTable);
sqlRemakeTable(conn, regionTable, sql->string);
slSort(&regions, genomePosCmp);
for (pos = regions; pos != NULL; pos = pos->next)
    {
    int end = pos->end;
    int start = pos->start;
    if (start < 0)
	start = 0;
    fprintf(regionTab, "%d\t", binFromRange(start, end));
    fprintf(regionTab, "%s\t", pos->chrom);
    fprintf(regionTab, "%d\t%d\t", start, end);
    fprintf(regionTab, "%s\t", pos->name);
    fprintf(regionTab, "%d\t", pos->score);
    fprintf(regionTab, "%c\n", pos->strand);
    }
hgLoadTabFile(conn, tmpDir, regionTable, &regionTab);
/* hgRemoveTabFile(tmpDir, regionTable) is not called here. */
}
void readZfinMapping(struct lineFile *zmf)
/* Read in ZFIN IDs and mapping panel data.
 *
 * Each row of zmf must have six tab-separated fields:
 *   ZFIN id, marker name, panel, chromosome, position, units.
 * Markers are collected in the global zfinMarkerHash, keyed by the
 * upper-cased marker name; each zfin record holds one panel record per
 * entry of markerPanels, and each panel record stores up to NUMPOS
 * (chrom, pos) pairs.  zfinIdHash maps a ZFIN id to the marker name as it
 * was spelled in the file.  Both hashes are created fresh here.
 *
 * Rows with too few fields or with a panel that is not in markerPanels are
 * reported on stderr and skipped.  A row whose panel record is already
 * full is silently dropped. */
{
char *words[6], *name = NULL, *nameLower = NULL, *id = NULL, *panel = NULL;
char *chrom = NULL, *units = NULL;
double pos;
int wordCount, i, j, k, index;
struct zfin *zf = NULL;
struct panel *p = NULL;

zfinMarkerHash = newHash(16);
zfinIdHash = newHash(16);

while ((wordCount = lineFileChopTab(zmf, words)) != 0)
    {
    /* A short row would leave the trailing words[] entries stale. */
    if (wordCount < 6)
        {
        fprintf(stderr, "Skipping row with %d fields, expected 6\n", wordCount);
        continue;
        }

    /* Find index number for this panel.  The index is reset for every row
     * so that an unknown panel can neither reuse the previous row's index
     * nor index panelArray with -1. */
    panel = words[2];
    index = -1;
    for (j = 0; j < NUMPANELS; j++)
        {
        if (sameString(panel, markerPanels[j]))
           index = j;
        }
    if (index < 0)
        {
        fprintf(stderr, "Skipping marker %s: unknown panel %s\n", words[1], panel);
        continue;
        }

    /* Get ZFIN Id and name.  id and both copies of the name are cloned
     * because they are handed to the hashes below; panel, chrom and units
     * are only read within this iteration and are cloned where stored. */
    id = cloneString(words[0]);
    name = cloneString(words[1]);
    nameLower = cloneString(words[1]);	/* Name in its original case. */
    chrom = words[3];
    pos = sqlDouble(words[4]);
    units = words[5];

    /* check if this name exists already in hash */
    touppers(name);
    zf = hashFindVal(zfinMarkerHash, name);
    if (zf == NULL)
        {
        /* allocate memory for zfin struct if not in hash */
        AllocVar(zf);
        zf->panelArray = needMem(sizeof(struct panel*) * NUMPANELS);
        for (i = 0; i < NUMPANELS; i++)
            {
            zf->panelArray[i] = NULL;
            }

        zf->zfinId = cloneString(id);
        zf->name = cloneString(nameLower);
        zf->acc = NULL;
        zf->zfAlias = NULL;
        }

    p = zf->panelArray[index];
    if (p == NULL)
        {
        /* allocate memory for panel struct for panel data */
        AllocVar(p);
        /* initialize chrom and pos arrays */
        for (k = 0; k < NUMPOS; k++)
            {
            p->chrom[k] = NULL;
            p->pos[k] = -1;
            }

        p->name = cloneString(nameLower);
        p->panel = cloneString(panel);
        p->units = cloneString(units);
        zf->panelArray[index] = p;
        }

    /* Store the position in the first free slot, if there is one. */
    for (k = 0; k < NUMPOS; k++)
        {
        if (p->chrom[k] == NULL)
            break;
        }
    if (k < NUMPOS)
        {
        /* The panel record was created under a different spelling of the
         * name: report it once and leave the record untouched. */
        if (!sameString(p->name, nameLower))
            fprintf(stderr, "The entry in position %d does not match %s\n", index, nameLower);
        else
            {
            p->chrom[k] = cloneString(chrom);
            p->pos[k] = pos;
            }
        }

    /* name is already in upper case, add zfin to zfinMarkerHash */
    addHashElUnique(zfinMarkerHash, name, zf);
    /* if new ZFIN struct add the name to the zfinIdHash keyed by ZFIN ID */
    if (hashFindVal(zfinIdHash, id) == NULL)
        addHashElUnique(zfinIdHash, id, nameLower);
    }
}