struct clusterMember *loadClusterMembers()
/* Load the probe sets that are in our cluster of interest.
 * Reads the tab-separated file named by the "clusterFile" option.  A line with
 * three fields is taken as geneId, psName, desc; a line with two fields as
 * psName, desc (geneId is not assigned).  Any other field count aborts.
 * Returns the members as a list in file order. */
{
struct clusterMember *cmList = NULL, *cm = NULL;
char *words[3];
struct lineFile *lf = NULL;
char *inputFile = optionVal("clusterFile", NULL);
int wordCount = 0;

/* The file name is user input, so check it unconditionally: assert() is
 * compiled out when NDEBUG is defined and would let NULL reach lineFileOpen. */
if (inputFile == NULL)
    errAbort("Missing required option clusterFile");
lf = lineFileOpen(inputFile, TRUE);
while((wordCount = lineFileChopCharNext(lf, '\t', words, ArraySize(words))) != 0)
    {
    /* Reject a malformed line before allocating anything for it. */
    if(wordCount != 2 && wordCount != 3)
        errAbort("Got %d words at line %d", wordCount, lf->lineIx);
    AllocVar(cm);
    if(wordCount == 3)
        {
        cm->geneId = cloneString(words[0]);
        cm->psName = cloneString(words[1]);
        cm->desc = cloneString(words[2]);
        }
    else
        {
        /* Two-field form: no gene id column. */
        cm->psName = cloneString(words[0]);
        cm->desc = cloneString(words[1]);
        }
    slAddHead(&cmList, cm);
    }
lineFileClose(&lf);
/* Entries were pushed on the head, so flip the list back to file order. */
slReverse(&cmList);
return cmList;
}
Exemple #2
0
boolean lineFileNextCharRow(struct lineFile *lf, char sep, char *words[], int wordCount)
/* Fetch the next non-blank line that doesn't start with '#' and split it on
 * sep into words[].  Returns FALSE at EOF and TRUE otherwise.  A line that
 * yields fewer than wordCount words is handed to lineFileExpectWords, which
 * reports the error. */
{
int got = lineFileChopCharNext(lf, sep, words, wordCount);
if (got != 0)
    {
    /* Short line: let the shared checker complain about it. */
    if (got < wordCount)
        lineFileExpectWords(lf, wordCount, got);
    return TRUE;
    }
return FALSE;
}
boolean lineFileNextCharRow2(struct lineFile *lf, char sep, char *words[], int wordCount)
/* Fetch the next non-blank line that doesn't start with '#' and split it on
 * sep into words[].  Returns FALSE at EOF, TRUE otherwise.  The line must
 * contain either wordCount or wordCount-1 words; anything else aborts. */
{
int got = lineFileChopCharNext(lf, sep, words, wordCount);
boolean countOk;

if (got == 0)
    return FALSE;

/* Acceptable counts are exactly wordCount-1 and wordCount. */
countOk = (got <= wordCount) && (got >= wordCount-1);
if (!countOk)
    errAbort("Expecting %d or %d words line %d of %s got %d",
            wordCount, wordCount-1, lf->lineIx, lf->fileName, got);
return TRUE;
}
void readMarkers(struct lineFile *mkf)
/* Read in Sanger sts name, UniSTS ID and aliases */
/* All Sanger names in this file are found in the Clone marker file */
{
struct bac *b = NULL;
struct alias *a = NULL;
char *words[6], *sanger[NUMSANGER], *stsIdandAliases[NUMALIASES], *extName = NULL;
char *firstAlias = NULL, **aliases = NULL, *pr1 = NULL, *pr2 = NULL;
int sangerCount = 0, nameCount = 0, i, j, k;
char sep = '|';
boolean isId = TRUE;

/* Read in all rows */
while (lineFileChopCharNext(mkf, sep, words, 6))
    {
    sangerCount = chopByChar(words[1], ';', sanger, ArraySize(sanger));
    nameCount = chopByChar(words[2], ';', stsIdandAliases, ArraySize(stsIdandAliases));
    pr1 = cloneString(words[3]);
    pr2 = cloneString(words[4]);

    /* process each sanger name found */
    for (i = 0; i < sangerCount; i++)
        {
        /* use sanger name to find alias struct in hash */
        if ((a = hashFindVal(aliasHash, sanger[i])) != NULL)
            {
            /* if string is numeric, then it is an integer ID so do not add to array */
            k = 0;
            for (j = 0; j < nameCount; j++)
                {
                isId = stringIsAnInteger(stsIdandAliases[j]);
                if (!isId)
                    {
                    a->aliases[k] = cloneString(stsIdandAliases[j]);
                    k++;
                    }
                }
                
            /* store primer sequences */
            a->primer1 = cloneString(pr1);
            a->primer2 = cloneString(pr2);
            }
        else
            fprintf(stderr, "Can not find sanger name, %s, in aliasHash\n", sanger[i]);
        }
    }
}
void readPrimerInfo(struct lineFile *sf)
/* Read in primer info from all.primers file.
 * Each tab-separated row supplies, in order: dbSTS id, left primer, right
 * primer, size and UCSC id.  The size is either a single number, a "min-max"
 * range, or "-".  Every complete row becomes a struct sts added to stsList.
 * Rows whose size can be parsed also get minSize/maxSize filled in and are
 * added to stsHash keyed by dbSTS id.  Rows with fewer than five fields are
 * skipped. */
{
const int needed = 5;
int fieldCount, wordCount;
char *words[5];
char *dist1, *dist[2];
struct sts *sts;

stsHash = newHash(16);

while ((fieldCount = lineFileChopCharNext(sf, '\t', words, needed)) != 0)
    {
    /* Only the first fieldCount entries of words[] belong to this line; the
     * others hold whatever was there before and must not be read, so a NULL
     * test on them is not a valid completeness check. */
    if (fieldCount < needed)
        {
        verbose(2, "# line %d has only %d fields, skipping\n", sf->lineIx, fieldCount);
        continue;
        }
    verbose(2, "# line %d words1-4: '%s' '%s' '%s' '%s'\n", sf->lineIx, words[1], words[2], words[3], words[4]);

    AllocVar(sts);
    sts->dbstsId = cloneString(words[0]);
    sts->leftPrimer = cloneString(words[1]);
    sts->rightPrimer = cloneString(words[2]);
    sts->size = cloneString(words[3]);
    sts->ucscId = cloneString(words[4]);
    sts->found = FALSE;

    /* chopByChar splits in place, so work on a scratch copy and keep
     * sts->size intact.  NOTE(review): this copy is never released. */
    dist1 = cloneString(words[3]);
    if (sts->leftPrimer && dist1 && differentWord("-", dist1))
        {
        wordCount = chopByChar(dist1, '-', dist, ArraySize(dist));
        /* An empty size field chops into zero words and leaves dist[] unset;
         * treat it like "-" (no usable size) rather than parsing garbage. */
        if (wordCount > 0)
            {
            sts->minSize = sqlUnsigned(dist[0]);
            if (wordCount == 1)
                sts->maxSize = sqlUnsigned(dist[0]);
            else
                sts->maxSize = sqlUnsigned(dist[1]);
            /* A zero upper bound is replaced by a default of 1000. */
            if (sts->maxSize == 0)
                sts->maxSize = 1000;
            sts->next = NULL;
            sts->place = NULL;
            sts->epcr = NULL;
            hashAdd(stsHash, sts->dbstsId, sts);
            }
        }
    slAddHead(&stsList, sts);
    }
}
void readContigs(struct lineFile *cgf)
{
struct bac *b = NULL;
char *words[4], *name = NULL, *extName = NULL, *extName2 = NULL;
char sep = '|';
int i;

/* BAC structs keyed by external name */
bacHash = newHash(16);
/* external names keyed by internal names */
extNameHash = newHash(16);

while (lineFileChopCharNext(cgf, sep, words, 5))
    {
    name = cloneString(words[1]);
    extName = cloneString(words[2]);
    extName2 = cloneString(words[2]);
    if ((b = hashFindVal(bacHash, extName)) == NULL)
        {
        /* allocate memory for bac struct */
        AllocVar(b);
        /* add BAC info to struct */
        b->intName = cloneString(name);
        b->extName = cloneString(extName);
        AllocArray(b->chrom, (sizeof(char *) * NUMCHROMS));
        for (i = 0; i < NUMCHROMS; i++)
            {
            b->chrom[i] = NULL;
            }
        b->acc = NULL;
        hashAdd(bacHash, extName, b);
        hashAdd(extNameHash, name, extName2);
        }
    else
        fprintf(stderr, "The BAC clone %s is assigned to more than one contig\n", extName);
    }
}
struct rnaBinder *loadRnaBinders()
/* Load the probe sets that encode genes thought to
   bind rnas. Expected order is probeSet, geneName, pfamAcc, pfamName.
   Reads the tab-separated file named by the "rnaBindingFile" option and
   returns the entries as a list in file order.  A line with fewer than four
   fields is passed to lineFileExpectWords to be reported. */
{
struct rnaBinder *rbList = NULL, *rb = NULL;
char *words[4];
const int expected = ArraySize(words);
int wordCount = 0;
struct lineFile *lf = NULL;
char *inputFile = optionVal("rnaBindingFile", NULL);

assert(inputFile);
lf = lineFileOpen(inputFile, TRUE);
while((wordCount = lineFileChopCharNext(lf, '\t', words, expected)) != 0)
    {
    /* All four words are read below; on a short line the trailing entries of
     * words[] are not set for this line, so check the count first. */
    if (wordCount < expected)
        lineFileExpectWords(lf, expected, wordCount);
    AllocVar(rb);
    rb->psName = cloneString(words[0]);
    rb->geneName = cloneString(words[1]);
    rb->pfamAcc = cloneString(words[2]);
    rb->pfamName = cloneString(words[3]);
    slAddHead(&rbList, rb);
    }
lineFileClose(&lf);
/* Entries were pushed on the head, so flip the list back to file order. */
slReverse(&rbList);
return rbList;
}
void readCloneNames(struct lineFile *clf)
/* read internal BAC clone names and Sanger sts names.
 * Rows are '|'-separated: field 0 is the internal clone name, field 1 the
 * Sanger sts name, field 2 an optional relation number (3 is used when it is
 * empty or absent).  Fills two global hashes: aliasHash (struct alias keyed by
 * Sanger name) and sangerByExtNameHash (struct sanger keyed by external clone
 * name).  The external name comes from extNameHash when the internal name is
 * known there, otherwise from translateName.  Rows with fewer than two fields
 * abort. */
{
struct alias *a = NULL;
struct sanger *s = NULL;
/* Sized to match the limit handed to lineFileChopCharNext below; a smaller
 * array would be overrun by rows with five or more fields. */
char *words[5], *name = NULL, *sanger = NULL, *extName = NULL;
int wordCount, i, rel;
char sep = '|';
boolean found = FALSE, posFound = FALSE;

/* alias hash is keyed by Sanger sts name */
aliasHash = newHash(16);
/* hash of Sanger names keyed by external name */
sangerByExtNameHash = newHash(16);

/* Read in all rows */
while ((wordCount = lineFileChopCharNext(clf, sep, words, (int)(sizeof(words)/sizeof(words[0])))) != 0)
    {
    if (wordCount < 2)
        errAbort("Expecting at least 2 fields in clone name row starting with %s, got %d\n", words[0], wordCount);
    /* NOTE(review): name stays a private copy because it is handed to
     * translateName, whose treatment of its argument is not visible here. */
    name = cloneString(words[0]);
    /* sanger is only looked up and copied below, so the row buffer suffices. */
    sanger = words[1];
    /* words[2] is only valid when the row really has a third field */
    if (wordCount >= 3 && !sameString(words[2], ""))
        rel = sqlUnsigned(words[2]);
    else
        rel = 3;
    /* find external name for this internal name from the extNameHash */
    if ((extName = hashFindVal(extNameHash, name)) == NULL)
        {
        /* if not found in BAC hash, then need to use internal name to make extName */
        extName = translateName(name, FALSE);
        }
    if ((a = hashFindVal(aliasHash, sanger)) == NULL)
        {
        /* allocate memory for alias struct */
        AllocVar(a);
        /* allocate memory for UniSTS IDs, aliases, internal and external names and relations */
        /* and initialize the arrays */
        /* NOTE(review): AllocArray in kent's common.h takes an element count,
         * so these sizes look larger than needed by the sizeof factor; left
         * as is because other code may index these arrays - confirm. */
        AllocArray(a->uniStsId, (sizeof(char *) * NUMSANGER));
        AllocArray(a->aliases, (sizeof(char *) * NUMALIASES));
        AllocArray(a->extName, (sizeof(char *) * MAXSANGER));
        AllocArray(a->intName, (sizeof(char *) * MAXSANGER));
        AllocArray(a->relation, (sizeof(int) * MAXSANGER));

        for (i = 0; i < NUMSANGER; i++)
            {
            a->uniStsId[i] = NULL;
            }
        for (i = 0; i < MAXSANGER; i++)
            {
            a->extName[i] = NULL;
            a->intName[i] = NULL;
            a->relation[i] = -1;
            }
        for (i = 0; i < NUMALIASES; i++)
            {
            a->aliases[i] = NULL;
            }
        /* the name is the same for every row that maps to this struct, so it
         * is stored once here instead of being re-cloned on each row */
        a->sangerName = cloneString(sanger);
        }
    /* find empty slot in arrays to add external and internal names; these
     * arrays are initialised for MAXSANGER entries above, so that is the bound */
    posFound = FALSE;
    for (i = 0; i < MAXSANGER && (!posFound); i++)
        {
        if (a->extName[i] == NULL)
            {
            posFound = TRUE;
            a->extName[i] = cloneString(extName);
            if (a->intName[i] == NULL)
                a->intName[i] = cloneString(name);
            else
                errAbort("For marker %s, the empty slot in the intName array is not the same as that for the extName array in the alias struct\n", extName);
            if (a->relation[i] == -1)
                a->relation[i] = rel;
            else 
                errAbort("For marker %s, the empty slot in the relation array is not the same as that for the extName array in the alias struct\n", extName);
            }
        }
    if (!posFound)
        fprintf(stderr, "No free slot left to record clone %s for sanger name %s\n", extName, sanger);

    a->primer1 = NULL;
    a->primer2 = NULL;
    /* add this alias struct to the hash keyed by sanger name */
    /* NOTE(review): this runs for every row, including rows whose struct was
     * found in the hash above - confirm the repeated add is intended. */
    hashAdd(aliasHash, sanger, a);
    /* add sanger name to hash keyed by external name */
    if ((s = hashFindVal(sangerByExtNameHash, extName)) == NULL)
        {
        /* allocate memory for struct with array of Sanger names */
        AllocVar(s);
        /* initialize the array */
        for (i = 0; i < MAXSANGER; i++)
            {
            s->sangerName[i] = NULL;
            }
        }
    found = FALSE;
    for (i = 0; i < MAXSANGER && (!found); i++)
        {
        if (s->sangerName[i] == NULL)
            {
            found = TRUE;
            s->sangerName[i] = cloneString(sanger);
            }
        }
    if (!found)
        fprintf(stderr, "No free slot left to record sanger name %s for clone %s\n", sanger, extName);
    /* add this list of sanger names to a hash keyed by external name, extName */
    /* NOTE(review): as with aliasHash, this add is repeated on every row. */
    hashAdd(sangerByExtNameHash, extName, s);
    }
}
Exemple #9
0
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, 
	struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount, boolean tabSep)
/* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
 * collect max field sizes there too.
 * Returns one bbiChromUsage per chromosome, in the order the chromosomes appear
 * in the file, each with its id, its size from chromSizesHash and its item count.
 * Also returns through the ret pointers:
 *   retMinDiff  - smallest difference between the starts of consecutive items
 *                 on the same chromosome (BIGNUM if there is no such pair)
 *   retAveSize  - total of (end - start) divided by the number of items, or 0
 *   retBedCount - number of items read
 * Rows are split on tabs when tabSep is set, otherwise with lineFileChopNext.
 * Aborts if end precedes start, a chromosome is missing from chromSizesHash,
 * an end exceeds the chromosome size, or the file is not sorted. */
{
int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
/* row[0], row[1] and row[2] are always read below, so the row buffer must hold
 * at least three fields whatever eim reports.
 * NOTE(review): presumably eim's index fields normally lie beyond the first
 * three columns, making this a no-op in practice - confirm. */
if (maxRowSize < 3)
    maxRowSize = 3;
char *row[maxRowSize];
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = 0;

    if (tabSep)
        rowSize = lineFileChopCharNext(lf, '\t', row, maxRowSize);
    else
        rowSize = lineFileChopNext(lf, row, maxRowSize);
    if (rowSize == 0)
        break;
    lineFileExpectAtLeast(lf, maxRowSize, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (eim != NULL)
	bbExIndexMakerUpdateMaxFieldSize(eim, row);
    if (start > end)
        {
	    errAbort("end (%d) before start (%d) line %d of %s",
	    	end, start, lf->lineIx, lf->fileName);
	}
    ++bedCount;
    totalBases += (end - start);
    /* A change of chromosome name starts a new usage record. */
    if (usage == NULL || differentString(usage->name, chrom))
        {
	/* make sure chrom names are sorted in ASCII order */
	if ((usage != NULL) && strcmp(usage->name, chrom) > 0)
	    {
	    errAbort("%s is not case-sensitive sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" with LC_COLLATE=C,  or bedSort and try again.",
	    	lf->fileName, lf->lineIx);
	    }
	struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
	if (chromHashEl == NULL)
	    errAbort("%s is not found in chromosome sizes file", chrom);
	int chromSize = ptToInt(chromHashEl->val);
	AllocVar(usage);
	usage->name = cloneString(chrom);
	usage->id = id++;
	usage->size = chromSize;
	slAddHead(&usageList, usage);
	/* No previous item on this chromosome yet, so skip the start comparison. */
	lastStart = -1;
	}
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;
    if (lastStart >= 0)
        {
	int diff = start - lastStart;
	if (diff < minDiff)
	    {
	    /* A negative difference means starts went backwards within a chromosome. */
	    if (diff < 0)
		errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
		    lf->fileName, lf->lineIx);
	    minDiff = diff;
	    }
	}
    lastStart = start;
    }
/* Records were pushed on the head; restore file order. */
slReverse(&usageList);
double aveSize = 0;
if (bedCount > 0)
    aveSize = (double)totalBases/bedCount;
*retMinDiff = minDiff;
*retAveSize = aveSize;
*retBedCount = bedCount;
return usageList;
}