struct clusterMember *loadClusterMembers() /* Load the probe sets that are in our cluster of interest. */ { struct clusterMember *cmList = NULL, *cm = NULL; char *words[3]; struct lineFile *lf = NULL; char *inputFile = optionVal("clusterFile", NULL); int wordCount = 0; assert(inputFile); lf = lineFileOpen(inputFile, TRUE); while((wordCount = lineFileChopCharNext(lf, '\t', words, ArraySize(words))) != 0) { AllocVar(cm); if(wordCount == 3) { cm->geneId = cloneString(words[0]); cm->psName = cloneString(words[1]); cm->desc = cloneString(words[2]); } else if(wordCount == 2) { cm->psName = cloneString(words[0]); cm->desc = cloneString(words[1]); } else errAbort("Got %d words at line %d", wordCount, lf->lineIx); slAddHead(&cmList, cm); } lineFileClose(&lf); slReverse(&cmList); return cmList; }
boolean lineFileNextCharRow(struct lineFile *lf, char sep, char *words[], int wordCount)
/* Return next non-blank line that doesn't start with '#' chopped into words
 * delimited by sep. Returns FALSE at EOF. A row with fewer than wordCount
 * words is reported through lineFileExpectWords. */
{
    int got = lineFileChopCharNext(lf, sep, words, wordCount);

    if (got != 0)
    {
        if (got < wordCount)
            lineFileExpectWords(lf, wordCount, got);
        return TRUE;
    }
    return FALSE;
}
boolean lineFileNextCharRow2(struct lineFile *lf, char sep, char *words[], int wordCount)
/* Return next non-blank line that doesn't start with '#' chopped into words
 * delimited by sep. Returns FALSE at EOF. Aborts unless the line holds
 * exactly wordCount or wordCount-1 words. */
{
    int fewestAllowed = wordCount - 1;
    int got = lineFileChopCharNext(lf, sep, words, wordCount);

    if (got == 0)
        return FALSE;

    if (!(got <= wordCount && got >= fewestAllowed))
        errAbort("Expecting %d or %d words line %d of %s got %d",
                 wordCount, wordCount-1, lf->lineIx, lf->fileName, got);

    return TRUE;
}
void readMarkers(struct lineFile *mkf) /* Read in Sanger sts name, UniSTS ID and aliases */ /* All Sanger names in this file are found in the Clone marker file */ { struct bac *b = NULL; struct alias *a = NULL; char *words[6], *sanger[NUMSANGER], *stsIdandAliases[NUMALIASES], *extName = NULL; char *firstAlias = NULL, **aliases = NULL, *pr1 = NULL, *pr2 = NULL; int sangerCount = 0, nameCount = 0, i, j, k; char sep = '|'; boolean isId = TRUE; /* Read in all rows */ while (lineFileChopCharNext(mkf, sep, words, 6)) { sangerCount = chopByChar(words[1], ';', sanger, ArraySize(sanger)); nameCount = chopByChar(words[2], ';', stsIdandAliases, ArraySize(stsIdandAliases)); pr1 = cloneString(words[3]); pr2 = cloneString(words[4]); /* process each sanger name found */ for (i = 0; i < sangerCount; i++) { /* use sanger name to find alias struct in hash */ if ((a = hashFindVal(aliasHash, sanger[i])) != NULL) { /* if string is numeric, then it is an integer ID so do not add to array */ k = 0; for (j = 0; j < nameCount; j++) { isId = stringIsAnInteger(stsIdandAliases[j]); if (!isId) { a->aliases[k] = cloneString(stsIdandAliases[j]); k++; } } /* store primer sequences */ a->primer1 = cloneString(pr1); a->primer2 = cloneString(pr2); } else fprintf(stderr, "Can not find sanger name, %s, in aliasHash\n", sanger[i]); } } }
void readPrimerInfo(struct lineFile *sf) /* Read in primer info from all.primers file */ { int wordCount; char *words[5]; char *dist1, *dist[2]; struct sts *sts; stsHash = newHash(16); while (lineFileChopCharNext(sf, '\t', words, 5)) { verbose(2, "# line %d words1-4: '%s' '%s' '%s' '%s'\n", sf->lineIx, words[1], words[2], words[3], words[4]); if (words[1] && words[2] && words[3] && words[4]) { AllocVar(sts); sts->dbstsId = cloneString(words[0]); sts->leftPrimer = cloneString(words[1]); sts->rightPrimer = cloneString(words[2]); sts->size = cloneString(words[3]); sts->ucscId = cloneString(words[4]); sts->found = FALSE; dist1 = cloneString(words[3]); if (sts->leftPrimer && dist1 && differentWord("-", dist1)) { wordCount = chopByChar(dist1, '-', dist, ArraySize(dist)); sts->minSize = sqlUnsigned(dist[0]); if (wordCount == 1) sts->maxSize = sqlUnsigned(dist[0]); else sts->maxSize = sqlUnsigned(dist[1]); if (sts->maxSize == 0) sts->maxSize = 1000; sts->next = NULL; sts->place = NULL; sts->epcr = NULL; hashAdd(stsHash, sts->dbstsId, sts); } slAddHead(&stsList, sts); } } }
void readContigs(struct lineFile *cgf) { struct bac *b = NULL; char *words[4], *name = NULL, *extName = NULL, *extName2 = NULL; char sep = '|'; int i; /* BAC structs keyed by external name */ bacHash = newHash(16); /* external names keyed by internal names */ extNameHash = newHash(16); while (lineFileChopCharNext(cgf, sep, words, 5)) { name = cloneString(words[1]); extName = cloneString(words[2]); extName2 = cloneString(words[2]); if ((b = hashFindVal(bacHash, extName)) == NULL) { /* allocate memory for bac struct */ AllocVar(b); /* add BAC info to struct */ b->intName = cloneString(name); b->extName = cloneString(extName); AllocArray(b->chrom, (sizeof(char *) * NUMCHROMS)); for (i = 0; i < NUMCHROMS; i++) { b->chrom[i] = NULL; } b->acc = NULL; hashAdd(bacHash, extName, b); hashAdd(extNameHash, name, extName2); } else fprintf(stderr, "The BAC clone %s is assigned to more than one contig\n", extName); } }
struct rnaBinder *loadRnaBinders()
/* Load the probe sets that encode genes thought to bind rnas.
 * Expected order is probeSet, geneName, pfamAcc, pfamName.
 * The file named by the "rnaBindingFile" option is read as tab-separated
 * rows. Rows are fetched with lineFileNextCharRow so that a row with fewer
 * than four fields is reported instead of being read from unset slots.
 * Returns the list in file order. */
{
    struct rnaBinder *rbList = NULL, *rb = NULL;
    char *words[4];
    struct lineFile *lf = NULL;
    char *inputFile = optionVal("rnaBindingFile", NULL);

    assert(inputFile);
    lf = lineFileOpen(inputFile, TRUE);

    while (lineFileNextCharRow(lf, '\t', words, ArraySize(words)))
    {
        AllocVar(rb);
        rb->psName = cloneString(words[0]);
        rb->geneName = cloneString(words[1]);
        rb->pfamAcc = cloneString(words[2]);
        rb->pfamName = cloneString(words[3]);
        slAddHead(&rbList, rb);
    }

    lineFileClose(&lf);
    /* Rows were prepended; flip to restore file order. */
    slReverse(&rbList);
    return rbList;
}
void readCloneNames(struct lineFile *clf) /* read internal BAC clone names and Sanger sts names */ { struct alias *a = NULL; struct sanger *s = NULL; char *words[4], *name = NULL, *sanger = NULL, *extName = NULL; int i, rel; char sep = '|'; boolean found = FALSE, posFound = FALSE; /* alias hash is keyed by Sanger sts name */ aliasHash = newHash(16); /* hash of Sanger names keyed by external name */ sangerByExtNameHash = newHash(16); /* Read in all rows */ while (lineFileChopCharNext(clf, sep, words, 5)) { name = cloneString(words[0]); sanger = cloneString(words[1]); if (!sameString(words[2], "")) rel = sqlUnsigned(words[2]); else rel = 3; /* find external name for this internal name from the extNameHash */ if ((extName = hashFindVal(extNameHash, name)) == NULL) { /* if not found in BAC hash, then need to use internal name to make extName */ extName = translateName(name, FALSE); } if ((a = hashFindVal(aliasHash, sanger)) == NULL) { /* allocate memory for alias struct */ AllocVar(a); /* allocate memory for UniSTS IDs, aliases, internal and external names and relations */ /* and initialize the arrays */ AllocArray(a->uniStsId, (sizeof(char *) * NUMSANGER)); AllocArray(a->aliases, (sizeof(char *) * NUMALIASES)); AllocArray(a->extName, (sizeof(char *) * MAXSANGER)); AllocArray(a->intName, (sizeof(char *) * MAXSANGER)); AllocArray(a->relation, (sizeof(int) * MAXSANGER)); for (i = 0; i < NUMSANGER; i++) { a->uniStsId[i] = NULL; } for (i = 0; i < MAXSANGER; i++) { a->extName[i] = NULL; a->intName[i] = NULL; a->relation[i] = -1; } for (i = 0; i < NUMALIASES; i++) { a->aliases[i] = NULL; } } /* find empty slot in arrays to add external and internal names */ posFound = FALSE; for (i = 0; i < NUMALIASES && (!posFound); i++) { if (a->extName[i] == NULL) { posFound = TRUE; a->extName[i] = cloneString(extName); if (a->intName[i] == NULL) a->intName[i] = cloneString(name); else errAbort("For marker %s, the empty slot in the intName array is not the same as that for the extName 
array in the alias struct\n", extName); if (a->relation[i] == -1) a->relation[i] = rel; else errAbort("For marker %s, the empty slot in the relation array is not the same as that for the extName array in the alias struct\n", extName); } } a->sangerName = cloneString(sanger); a->primer1 = NULL; a->primer2 = NULL; /* add this alias struct to the hash keyed by sanger name */ hashAdd(aliasHash, sanger, a); /* add sanger name to hash keyed by external name */ if ((s = hashFindVal(sangerByExtNameHash, extName)) == NULL) { /* allocate memory for struct with array of Sanger names */ AllocVar(s); /* initialize the array */ for (i = 0; i < MAXSANGER; i++) { s->sangerName[i] = NULL; } } found = FALSE; for (i = 0; i < MAXSANGER && (!found); i++) { if (s->sangerName[i] == NULL) { found = TRUE; s->sangerName[i] = cloneString(sanger); } } /* add this list of sanger names to a hash keyed by external name, extName */ hashAdd(sangerByExtNameHash, extName, s); } }
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash,
        struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount,
        boolean tabSep)
/* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL
 * collect max field sizes there too.
 *
 * lf             - bed input; initial custom-track lines are removed first.
 * chromSizesHash - chromosome name -> size (stored as an int in the hash value).
 * eim            - optional; when non-NULL each row is passed to
 *                  bbExIndexMakerUpdateMaxFieldSize and rows need enough
 *                  fields to cover its max index field.
 * retMinDiff     - smallest difference between consecutive start positions
 *                  on the same chromosome; stays BIGNUM if none was computed.
 * retAveSize     - total bases (end - start) divided by bed count, 0 if empty.
 * retBedCount    - number of rows read.
 * tabSep         - TRUE to split rows on tabs only, FALSE to use lineFileChopNext.
 *
 * Returns one bbiChromUsage per chromosome, in file order. Aborts if end is
 * before start, a chromosome is missing from chromSizesHash, end exceeds the
 * chromosome size, or the input is not sorted by chromosome then start. */
{
    int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
    char *row[maxRowSize];
    struct bbiChromUsage *usage = NULL, *usageList = NULL;
    int lastStart = -1;
    bits32 id = 0;
    bits64 totalBases = 0, bedCount = 0;
    int minDiff = BIGNUM;

    lineFileRemoveInitialCustomTrackLines(lf);

    for (;;)
    {
        int rowSize = 0;
        if (tabSep)
            rowSize = lineFileChopCharNext(lf, '\t', row, maxRowSize);
        else
            rowSize = lineFileChopNext(lf, row, maxRowSize);
        if (rowSize == 0)
            break;
        lineFileExpectAtLeast(lf, maxRowSize, rowSize);

        char *chrom = row[0];
        int start = lineFileNeedNum(lf, row, 1);
        int end = lineFileNeedNum(lf, row, 2);
        if (eim != NULL)
            bbExIndexMakerUpdateMaxFieldSize(eim, row);
        if (start > end)
        {
            errAbort("end (%d) before start (%d) line %d of %s",
                     end, start, lf->lineIx, lf->fileName);
        }
        ++bedCount;
        totalBases += (end - start);

        /* A change of chromosome name starts a new usage record. */
        if (usage == NULL || differentString(usage->name, chrom))
        {
            /* make sure chrom names are sorted in ASCII order */
            if ((usage != NULL) && strcmp(usage->name, chrom) > 0)
            {
                /* Message kept as one logical string; split with adjacent
                 * literals so no literal spans a physical line. */
                errAbort("%s is not case-sensitive sorted at line %d. "
                         "Please use \"sort -k1,1 -k2,2n\" with LC_COLLATE=C, or bedSort and try again.",
                         lf->fileName, lf->lineIx);
            }
            struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
            if (chromHashEl == NULL)
                errAbort("%s is not found in chromosome sizes file", chrom);
            int chromSize = ptToInt(chromHashEl->val);
            AllocVar(usage);
            usage->name = cloneString(chrom);
            usage->id = id++;
            usage->size = chromSize;
            slAddHead(&usageList, usage);
            /* No previous start on the new chromosome yet. */
            lastStart = -1;
        }

        if (end > usage->size)
            errAbort("End coordinate %d bigger than %s size of %d line %d of %s",
                     end, usage->name, usage->size, lf->lineIx, lf->fileName);
        usage->itemCount += 1;

        if (lastStart >= 0)
        {
            int diff = start - lastStart;
            if (diff < minDiff)
            {
                if (diff < 0)
                    errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                             lf->fileName, lf->lineIx);
                minDiff = diff;
            }
        }
        lastStart = start;
    }

    /* Records were prepended; flip to restore file order. */
    slReverse(&usageList);

    double aveSize = 0;
    if (bedCount > 0)
        aveSize = (double)totalBases/bedCount;
    *retMinDiff = minDiff;
    *retAveSize = aveSize;
    *retBedCount = bedCount;
    return usageList;
}