Ejemplo n.º 1
0
boolean lineFileNextRowTab(struct lineFile *lf, char *words[], int wordCount)
/* Fetch the next non-blank line that doesn't start with '#', chopped at tabs
 * into words[] (which has room for wordCount entries).  Returns FALSE at EOF.
 * A row with fewer than wordCount fields is reported through
 * lineFileExpectWords().  Aborts on error. */
{
int got = lineFileChopNextTab(lf, words, wordCount);
boolean haveRow = (got != 0);
if (haveRow && got < wordCount)
    lineFileExpectWords(lf, wordCount, got);
return haveRow;
}
Ejemplo n.º 2
0
boolean lineFileNextCharRow(struct lineFile *lf, char sep, char *words[], int wordCount)
/* Fetch the next non-blank line that doesn't start with '#', chopped into
 * words[] at each occurrence of sep.  Returns FALSE at EOF.  A row with fewer
 * than wordCount fields is reported through lineFileExpectWords().
 * Aborts on error. */
{
int fieldsFound = lineFileChopCharNext(lf, sep, words, wordCount);
if (fieldsFound != 0)
    {
    if (fieldsFound < wordCount)
        lineFileExpectWords(lf, wordCount, fieldsFound);
    return TRUE;
    }
return FALSE;
}
Ejemplo n.º 3
0
struct vcfRecord *vcfNextRecord(struct vcfFile *vcff)
/* Chop the next line of vcff->lf into words and build a vcfRecord from them.
 * Returns NULL when no more words can be read (end of file).
 * Note: the record is returned only; it is not added to vcff->records. */
{
char *row[VCF_MAX_COLUMNS];
int rowSize = lineFileChop(vcff->lf, row);
if (rowSize <= 0)
    return NULL;
/* Without genotypes a row has 8 columns; with them, 9 plus one per genotype. */
int needed = (vcff->genotypeCount > 0) ? 9 + vcff->genotypeCount : 8;
lineFileExpectWords(vcff->lf, needed, rowSize);
return vcfRecordFromRow(vcff, row);
}
Ejemplo n.º 4
0
void parseA(struct lineFile *lf, struct block **retBlockList, 
	int *retScore)
/* Parse an alignment stanza into a block list.
 * Reads lines from lf until one starting with '}' or until end of file.
 * Lines starting with '#' and lines with no words are skipped.  A line whose
 * first word begins with 'l' must have six words and describes one block; a
 * line whose first word begins with 's' carries the score as its second word.
 * Any other line is ignored.
 * On return *retBlockList holds the blocks in file order (after being passed
 * through removeFrayedEnds) and *retScore holds the score.
 * Aborts if a block's query and target sizes differ, if a score line has no
 * value, or if the stanza has no score line at all. */
{
struct block *block, *blockList = NULL;
char *line, *words[6], typeChar;
int wordCount;
int score = -666;
boolean gotScore = FALSE;

while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] == '#')
        continue;
    if (line[0] == '}')
        break;
    wordCount = chopLine(line, words);
    if (wordCount == 0)
        continue;
    typeChar = words[0][0];
    if (typeChar == 'l')
        {
        lineFileExpectWords(lf, 6, wordCount);
        AllocVar(block);
        /* Starts are converted from one-based (file) to zero-based. */
        block->tStart = lineFileNeedNum(lf, words, 1) - 1;
        block->tEnd = lineFileNeedNum(lf, words, 3);
        block->qStart = lineFileNeedNum(lf, words, 2) - 1;
        block->qEnd = lineFileNeedNum(lf, words, 4);
        if (block->qEnd - block->qStart != block->tEnd - block->tStart)
            errAbort("Block size mismatch line %d of %s", lf->lineIx, lf->fileName);
        block->percentId = lineFileNeedNum(lf, words, 5);
        slAddHead(&blockList, block);
        }
    else if (typeChar == 's')
        {
        /* words[1] is only filled in when the line has a second word; without
         * this check a bare "s" line would hand a stale pointer to
         * lineFileNeedNum. */
        if (wordCount < 2)
            errAbort("Expecting score after 's' line %d of %s",
                lf->lineIx, lf->fileName);
        gotScore = TRUE;
        score = lineFileNeedNum(lf, words, 1);
        }
    }
if (!gotScore)
    {
    errAbort("'a' stanza missing score line %d of %s", 
        lf->lineIx, lf->fileName);
    }
/* Blocks were pushed on the head of the list, so restore file order. */
slReverse(&blockList);
blockList = removeFrayedEnds(blockList);
*retBlockList = blockList;
*retScore = score;
}
Ejemplo n.º 5
0
void dnaseHg38AddTreatments(char *inTab, char *outTab)
/* dnaseHg38AddTreatments - Add treatments to dnase hg38 metadata. */
{
struct sqlConnection *conn = sqlConnect("hgFixed");
struct lineFile *lf = lineFileOpen(inTab, TRUE);
FILE *f = mustOpen(outTab, "w");
char *line;
while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] == '#')
        fprintf(f, "%s\ttreatment\tlabel\n", line);
    else
        {
	char *inRow[5];
	int wordCount = chopByWhite(line, inRow, ArraySize(inRow));
	lineFileExpectWords(lf, 4, wordCount);
	char *acc = inRow[0];
	char *biosample = inRow[1];
	char query[512];
	sqlSafef(query, sizeof(query), "select expVars from encodeExp where accession = '%s'", acc);
	char varBuf[1024];
	char *treatment = "n/a";
	char *label = biosample;
	char labelBuf[256];
	char *vars = sqlQuickQuery(conn, query, varBuf, sizeof(varBuf));
	if (!isEmpty(vars))
	     {
	     treatment = vars + strlen("treatment=");
	     if (sameString(treatment, "4OHTAM_20nM_72hr"))
	         safef(labelBuf, sizeof(labelBuf), "%s 40HTAM", biosample);
	     else if (sameString(treatment, "diffProtA_14d"))
	         safef(labelBuf, sizeof(labelBuf), "%s diff 14d", biosample);
	     else if (sameString(treatment, "diffProtA_5d"))
		safef(labelBuf, sizeof(labelBuf), "%s diff 5d", biosample);
	     else if (sameString(treatment, "DIFF_4d"))
		safef(labelBuf, sizeof(labelBuf), "%s diff 4d", biosample);
	     else if (sameString(treatment, "Estradiol_100nM_1hr"))
	        safef(labelBuf, sizeof(labelBuf), "%s estradi 1h", biosample);
	     else if (sameString(treatment, "Estradiol_ctrl_0hr"))
	        safef(labelBuf, sizeof(labelBuf), "%s estradi 0h", biosample);
	     else
	        errAbort("Unknown treatment %s", treatment);
	     label = labelBuf;
	     }
	fprintf(f, "%s\t%s\t%s\t%s\t%s\t%s\n", inRow[0], inRow[1], inRow[2], inRow[3], treatment, label);
	}
    }
carefulClose(&f);
}
Ejemplo n.º 6
0
struct hash *hashTwoColumnFile(char *fileName)
/* Read a two column (key, value) file and return a hash mapping each key to a
 * copy of its value.  The copies are allocated from the hash's own local
 * memory (hash->lm).  Every line must have exactly two fields. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *pairs = hashNew(16);
char *row[3];
for (;;)
    {
    int rowSize = lineFileChop(lf, row);
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, 2, rowSize);
    char *valueCopy = lmCloneString(pairs->lm, row[1]);
    hashAdd(pairs, row[0], valueCopy);
    }
lineFileClose(&lf);
return pairs;
}
Ejemplo n.º 7
0
struct rgi *readRgi(char *inName)
/* Read inName, which must have four fields on every line, into a list of rgi
 * records kept in file order.  Each record is also echoed through uglyf(). */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct rgi *list = NULL;
char *row[8];
int rowSize;

for (rowSize = lineFileChop(lf, row); rowSize != 0; rowSize = lineFileChop(lf, row))
    {
    lineFileExpectWords(lf, 4, rowSize);
    struct rgi *item = rgiLoad(row);
    slAddHead(&list, item);
    uglyf("%s %s: min %d, max %d\n", item->a, item->b, item->minDistance, item->maxDistance);
    }
lineFileClose(&lf);
/* Items were pushed on the head, so flip the list back into file order. */
slReverse(&list);
return list;
}
Ejemplo n.º 8
0
Archivo: vcf.c Proyecto: bh0085/kent
static void vcfParseData(struct vcfFile *vcff, int maxRecords)
/* Given a vcfFile whose header has already been parsed and whose lineFile sits
 * at the first data row, parse data rows and store them in vcff->records in
 * file order.  A negative maxRecords means no limit; otherwise parsing stops
 * once maxRecords rows have been stored.  The lineFile is closed afterwards.
 * Does nothing when vcff is NULL. */
{
if (vcff == NULL)
    return;
/* Without genotypes a row has 8 columns; with them, 9 plus one per genotype. */
int expected = (vcff->genotypeCount > 0) ? 9 + vcff->genotypeCount : 8;
char *words[VCF_MAX_COLUMNS];
int wordCount;
int recCount;
for (recCount = 0;  (wordCount = lineFileChop(vcff->lf, words)) > 0;  recCount++)
    {
    if (maxRecords >= 0 && recCount >= maxRecords)
        break;
    lineFileExpectWords(vcff->lf, expected, wordCount);
    struct vcfRecord *rec;
    AllocVar(rec);
    rec->file = vcff;
    rec->chrom = vcfFilePooledStr(vcff, words[0]);
    /* The position column is one-based in the file; store it zero-based. */
    rec->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
    // parseRefAndAlt and parseInfoColumn may replace this provisional end.
    rec->chromEnd = rec->chromStart+1;
    rec->name = vcfFilePooledStr(vcff, words[2]);
    parseRefAndAlt(vcff, rec, words[3], words[4]);
    rec->qual = vcfFilePooledStr(vcff, words[5]);
    parseFilterColumn(vcff, rec, words[6]);
    parseInfoColumn(vcff, rec, words[7]);
    if (vcff->genotypeCount > 0)
        {
        rec->format = vcfFilePooledStr(vcff, words[8]);
        rec->genotypeUnparsedStrings = vcfFileAlloc(vcff,
                                                    vcff->genotypeCount * sizeof(char *));
        // Genotype columns are only copied here; parsing is deferred until needed.
        int gtIx = 0;
        while (gtIx < vcff->genotypeCount)
            {
            rec->genotypeUnparsedStrings[gtIx] = vcfFileCloneStr(vcff, words[9+gtIx]);
            gtIx++;
            }
        }
    slAddHead(&(vcff->records), rec);
    }
slReverse(&(vcff->records));
lineFileClose(&(vcff->lf));
}
void loadOneBed(struct lineFile *lf, int bedSize, struct bedStub **pList)
/* Load one bed file, requiring bedSize fields on every line, and push a
 * bedStub for each line onto the head of *pList (so the newest line is first).
 * Each stub keeps the chromosome, start, end and a copy of the line text.
 * Behaviour is steered by the hasBin, strictTab, noStrict and
 * allowStartEqualEnd settings, which are not declared in this function. */
{
char *line;
char *fields[64];

verbose(1, "Reading %s\n", lf->fileName);
while (lineFileNextReal(lf, &line))
    {
    if (hasBin)
        nextWord(&line);
    /* Copy taken before chopping; it is used in error messages and stored in
     * the stub. */
    char *lineCopy = cloneString(line);
    int nFields = strictTab ? chopTabs(line, fields) : chopLine(line, fields);
    /* nothing to load from an empty line */
    if (nFields == 0)
        continue;
    lineFileExpectWords(lf, bedSize, nFields);

    struct bedStub *stub;
    AllocVar(stub);
    stub->chrom = cloneString(fields[0]);
    stub->chromStart = lineFileNeedNum(lf, fields, 1);
    stub->chromEnd = lineFileNeedNum(lf, fields, 2);
    if (!noStrict)
        {
        if (stub->chromEnd < 1)
            errAbort("ERROR: line %d:'%s'\nchromEnd is less than 1\n",
                     lf->lineIx, lineCopy);
        if (!allowStartEqualEnd && stub->chromStart == stub->chromEnd)
            errAbort("ERROR: line %d:'%s'\nchromStart == chromEnd (%d) (zero-length item)\n"
                     "Use -allowStartEqualEnd if that is legit (e.g. for insertion point).\n",
                     lf->lineIx, lineCopy, stub->chromStart);
        if (stub->chromStart > stub->chromEnd)
            errAbort("ERROR: line %d:'%s'\nchromStart after chromEnd (%d > %d)\n",
                     lf->lineIx, lineCopy, stub->chromStart, stub->chromEnd);
        }
    stub->line = lineCopy;
    slAddHead(pList, stub);
    }
}
Ejemplo n.º 10
0
void flagMhcClones(char *mhcFile, char *gsDir)
/* flagMhcClones - Look for clones Stephan wants in MHC.
 * Each non-comment, non-blank line of mhcFile must have seven words; the first
 * is the clone name and the second the clone name with version.  For each clone
 * the fin, draft and predraft subdirectories of gsDir are searched for
 * fa/<clone>.fa.  A line is printed when the sequence found has a different
 * version, when it was found somewhere other than fin, or when it was not
 * found at all.
 * NOTE(review): lf and the sequences returned by faReadDna are not released. */
{
struct lineFile *lf = lineFileOpen(mhcFile, TRUE);
char *line, *words[16];
int wordCount, i;
char clonePath[512];
char *clone, *cloneVer;
static char *phases[3] = {"fin", "draft", "predraft",};
boolean found;

while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] == '#')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount == 0)
        continue;
    lineFileExpectWords(lf, 7, wordCount);
    clone = words[0];
    cloneVer = words[1];
    found = FALSE;
    for (i = 0; i < 3; ++i)
        {
        char *phase = phases[i];
        /* Bounded formatting: gsDir and clone come from outside and could
         * together exceed clonePath.  A path that does not fit cannot be
         * opened, so that phase is treated as not containing the clone. */
        int pathLen = snprintf(clonePath, sizeof(clonePath), "%s/%s/fa/%s.fa",
                               gsDir, phase, clone);
        if (pathLen < 0 || pathLen >= (int)sizeof(clonePath))
            continue;
        if (fileExists(clonePath))
            {
            struct dnaSeq *seq = faReadDna(clonePath);
            /* Cut the sequence name at the first '_' before comparing. */
            char *e = strchr(seq->name, '_');

            if (e != NULL) *e = 0;
            if (!sameString(seq->name, cloneVer))
                printf("%s\t(wrong version %s)\n", cloneVer, seq->name);
            else if (i != 0)
                printf("%s\t(not finished)\n", cloneVer);
            found = TRUE;
            }
        }
    if (!found)
        printf("%s\t(not found)\n", cloneVer);
    }
}
Ejemplo n.º 11
0
static struct psl *fileNext(struct pslReader* pr)
/* Read the next record from pr's file.  When pr->chrom is set, rows whose
 * column 13 differs from it are skipped.  Returns NULL once no more rows
 * can be read. */
{
char *row[PSLX_NUM_COLS];
int wantCols = pr->isPslx ? PSLX_NUM_COLS : PSL_NUM_COLS;

for (;;)
    {
    int gotCols = lineFileChopNextTab(pr->lf, row, PSLX_NUM_COLS);
    if (gotCols <= 0)
        break;
    lineFileExpectWords(pr->lf, wantCols, gotCols);
    if (pr->chrom != NULL && !sameString(row[13], pr->chrom))
        continue;
    if (pr->isPslx)
        return pslxLoad(row);
    else
        return pslLoad(row);
    }
return NULL;
}
Ejemplo n.º 12
0
struct tomRough *loadAllRough(char *fileName)
/* Load every line of fileName, each chopped at commas into exactly five
 * fields, as a tomRough record.  Returns the records in file order and prints
 * how many were loaded. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct tomRough *roughList = NULL;
char *line, *fields[16];
int lineSize;

while (lineFileNext(lf, &line, &lineSize))
    {
    int fieldCount = chopCommas(line, fields);
    lineFileExpectWords(lf, 5, fieldCount);
    struct tomRough *rough = tomRoughLoad(fields);
    slAddHead(&roughList, rough);
    }
/* Records were pushed on the head, so flip back into file order. */
slReverse(&roughList);
lineFileClose(&lf);
printf("Loaded %d rough lines\n", slCount(roughList));
return roughList;
}
Ejemplo n.º 13
0
void readBaseProbs(struct lineFile *lf, char **words,
	char *firstWord, float **pArray, int colCount)
/* Read the next line of lf, which must consist of firstWord followed by
 * colCount numbers, and return the numbers in a newly allocated array via
 * *pArray.  words is caller-supplied scratch space with room for colCount+1
 * entries.  Aborts if the line is missing, has the wrong number of words, or
 * starts with a different word. */
{
char *line;
int needed = colCount + 1;

lineFileNeedNext(lf, &line, NULL);
int got = chopByWhite(line, words, needed);
lineFileExpectWords(lf, needed, got);
if (!sameString(words[0], firstWord))
    errAbort("Expecting %s, got %s line %d of %s", firstWord, words[0], 
        lf->lineIx, lf->fileName);

float *probs;
AllocArray(probs, colCount);
/* The numbers start right after the leading label word. */
char **numberWords = words + 1;
int col;
for (col = 0; col < colCount; ++col)
    probs[col] = atof(numberWords[col]);
*pArray = probs;
}
struct clone *readTrans(char *fileName)
/* Read info in trans file.
 * Every line must have three words: a fragment name, an ffa name, and a third
 * field that splits on "(", ":" and ")" into two parts, the second of which
 * splits on "." into a start and an end.  Consecutive fragments that map to the
 * same clone name (via fragToCloneName) are gathered, in file order, on that
 * clone's fragList.  Returns the clones in file order.  Aborts on a badly
 * formatted line. */
{
    char cloneName[128], lastCloneName[128];
    struct clone *cloneList = NULL, *clone = NULL;
    struct frag *frag;
    struct lineFile *lf = lineFileOpen(fileName, TRUE);
    char *words[8], *parts[4], *subParts[3];
    int wordCount, partCount, subCount;

    strcpy(lastCloneName, "");
    while ((wordCount = lineFileChop(lf, words)) != 0)
    {
        lineFileExpectWords(lf, 3, wordCount);
        partCount = chopString(words[2], "(:)", parts, ArraySize(parts));
        if (partCount != 2)
            errAbort("Badly formatted third field line %d of %s",
                     lf->lineIx, lf->fileName);
        subCount = chopString(parts[1], ".", subParts, ArraySize(subParts));
        if (subCount != 2)
            errAbort("Badly formatted third field line %d of %s (expecting start..end)",
                     lf->lineIx, lf->fileName);
        fragToCloneName(words[0], cloneName);
        /* Also start a clone when none exists yet: if the very first clone
         * name came back empty it would equal the initial lastCloneName and
         * clone would still be NULL when the fragment is attached below. */
        if (clone == NULL || !sameString(cloneName, lastCloneName))
        {
            AllocVar(clone);
            clone->name = cloneString(cloneName);
            slAddHead(&cloneList, clone);
        }
        AllocVar(frag);
        frag->name = cloneString(words[0]);
        frag->ffaName = cloneString(words[1]);
        /* Start is converted from one-based (file) to zero-based. */
        frag->start = lineFileNeedNum(lf, subParts, 0) - 1;
        frag->end = lineFileNeedNum(lf, subParts, 1);
        slAddTail(&clone->fragList, frag);
        strcpy(lastCloneName, cloneName);
    }
    lineFileClose(&lf);
    slReverse(&cloneList);
    return cloneList;
}
Ejemplo n.º 15
0
struct agpFrag *readAgpFile(char *agpName)
/* Read agps from file.
 * Lines whose fifth field starts with 'N' are skipped; every other line must
 * have nine fields and is loaded as an agpFrag.  Returns the fragments in
 * file order. */
{
struct lineFile *lf = lineFileOpen(agpName, TRUE);
int wordCount;
char *words[16];
struct agpFrag *list = NULL, *el;

while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    /* words[4] is only filled in when the line has at least five fields, so
     * have lineFileExpectWords reject shorter lines before it is examined. */
    if (wordCount < 5)
        lineFileExpectWords(lf, 9, wordCount);
    if (words[4][0] != 'N')
        {
        lineFileExpectWords(lf, 9, wordCount);
        el = agpFragLoad(words);
        slAddHead(&list, el);
        }
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}
Ejemplo n.º 16
0
struct hash *hashGeneLevels(char *fileName, int cellCount)
/* Read fileName, where each line is a gene name followed by cellCount numbers,
 * and return a hash keyed by gene name whose values are newly allocated arrays
 * of cellCount doubles. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *levelHash = hashNew(16);
int fieldCount = cellCount+1;
char *row[fieldCount+1];
for (;;)
    {
    int rowSize = lineFileChop(lf, row);
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, fieldCount, rowSize);
    double *levels;
    AllocArray(levels, cellCount);
    /* The expression values follow the gene name in row[0]. */
    char **valueWords = row + 1;
    int cellIx;
    for (cellIx = 0; cellIx < cellCount; ++cellIx)
        levels[cellIx] = sqlDouble(valueWords[cellIx]);
    hashAdd(levelHash, row[0], levels);
    }
lineFileClose(&lf);
return levelHash;
}
void readGold(char *fileName, struct clonePos **retList, struct hash **retHash)
/* Read .agp/gold formatted file.
 * Lines whose fifth field starts with 'N' are skipped; every other line must
 * have nine fields.  Fragments are grouped by clone (sixth field with its
 * suffix chopped off).  For each clone the fragment sizes and the
 * size-weighted fragment starts are summed, and the clone's pos is set to the
 * weighted sum divided by the total size.
 * Returns the clones in order of first appearance via *retList and a hash of
 * clone name to clonePos via *retHash. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[12];
struct clonePos *cpList = NULL, *cp;
struct hash *hash = newHash(0);
int wordCount;

while ((wordCount = lineFileChop(lf, words)) != 0)
    {
    char *type, *clone;
    int fragStart, fragEnd;
    double fragSize;
    /* words[4] is only filled in when the line has at least five fields, so
     * have lineFileExpectWords reject shorter lines before it is examined. */
    if (wordCount < 5)
        lineFileExpectWords(lf, 9, wordCount);
    type = words[4];
    if (type[0] == 'N')
        continue;
    lineFileExpectWords(lf, 9, wordCount);
    clone = words[5];
    chopSuffix(clone);
    /* Start is converted from one-based (file) to zero-based. */
    fragStart = lineFileNeedNum(lf, words, 1)-1;
    fragEnd = lineFileNeedNum(lf, words, 2);
    fragSize = fragEnd - fragStart;
    if ((cp = hashFindVal(hash, clone)) == NULL)
        {
        AllocVar(cp);
        hashAddSaveName(hash, clone, cp, &cp->name);
        slAddHead(&cpList, cp);
        }
    cp->weightedPos += fragSize * fragStart;
    cp->totSize += fragSize;
    }
lineFileClose(&lf);
slReverse(&cpList);
for (cp = cpList; cp != NULL; cp = cp->next)
    cp->pos = cp->weightedPos/cp->totSize;
*retList = cpList;
*retHash = hash;
}
void addTpfToTabFile(char *chromName, char *tabFile, FILE *f)
/* Add one tpf FILE to tab-separated file */
{
struct lineFile *lf = lineFileOpen(tabFile, TRUE);
char *row[3];
int wordCount;
int ix = 0;
while ((wordCount = lineFileChop(lf, row)) != 0)
    {
    if (wordCount < 3)
        {
	if (wordCount < 2 || !sameWord("GAP", row[0]))
	    lineFileExpectWords(lf, 3, wordCount);
	row[2] = "?";
	}
    fprintf(f, "%s\t", chromName);
    fprintf(f, "%s\t", row[0]);
    fprintf(f, "%s\t", row[1]);
    fprintf(f, "%s\t", row[2]);
    fprintf(f, "%d\n", ix++);
    }
lineFileClose(&lf);
}
boolean readMotif(struct lineFile *lf, struct motif *m)
/* Read five lines of motif info into *m, which is zeroed first.
 * The first line must have at least six words with "@" as the second; words
 * 0, 2, 4 and 5 give the score, position, position deviation and consensus.
 * The next four lines each hold a label followed by one number per motif
 * column; the first of them fixes the column count (m->size) and the other
 * three must match it.
 * Returns FALSE if lf is already at end of file, TRUE otherwise.  Aborts on a
 * malformed first line, a mismatched profile line, or a premature end of file. */
{
char *line;
char *words[maxMotifSize+1];
int wordCount;
int i,j;
int colCount = 0;

/* Get first line and parse it. */
ZeroVar(m);
if (!lineFileNext(lf, &line, NULL))
    return FALSE;
wordCount = chopLine(line, words);
if (wordCount < 6 || !sameString(words[1], "@"))
    errAbort("Bad line %d of %s", lf->lineIx, lf->fileName);
m->score = atof(words[0]);
m->pos = atof(words[2]);
m->posSd = atof(words[4]);
/* strncpy does not terminate the destination when the source fills it, so
 * copy one byte less than the buffer holds and terminate explicitly. */
strncpy(m->consensus, words[5], sizeof(m->consensus) - 1);
m->consensus[sizeof(m->consensus) - 1] = 0;

/* Get next lines with columns. */
for (i=0; i<4; ++i)
    {
    if (!lineFileNext(lf, &line, NULL))
        errAbort("Unexpected end of file in %s", lf->fileName);
    wordCount = chopLine(line, words);
    if (i == 0)
        m->size = colCount = wordCount - 1;
    else
        lineFileExpectWords(lf, colCount+1, wordCount);
    /* words[0] is the row label; the values start at words[1]. */
    for (j=0; j<colCount; ++j)
        m->profile[i][j] = atof(words[j+1]);
    }
return TRUE;
}
Ejemplo n.º 20
0
void readProbeList(char *fileName, struct probe **retList, struct hash **retHash)
/* Read a two-field (name, size) file into a list of probes in file order and
 * a hash from name to probe.  Each probe's name points at the name stored in
 * the hash.  Only name and size are filled in here, so aliSize keeps the zero
 * it presumably gets from AllocVar. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *probeHash = newHash(0);
struct probe *probeList = NULL;
char *row[4];
int rowSize;

for (;;)
    {
    rowSize = lineFileChop(lf, row);
    if (rowSize <= 0)
        break;
    lineFileExpectWords(lf, 2, rowSize);
    struct probe *probe;
    AllocVar(probe);
    struct hashEl *entry = hashAdd(probeHash, row[0], probe);
    probe->name = entry->name;
    probe->size = atoi(row[1]);
    slAddHead(&probeList, probe);
    }
slReverse(&probeList);
lineFileClose(&lf);
*retList = probeList;
*retHash = probeHash;
}
Ejemplo n.º 21
0
void cloneSpan(char *fileName)
/* cloneSpan - List clones and the amount they span by looking at .gl file.
 * Each non-comment, non-blank line must have at least three words: a fragment
 * name (its suffix is chopped off to get the clone name), a start and an end.
 * For every clone the smallest start, largest end and summed fragment length
 * are tracked.  Prints the total bases, the total span and the density
 * (bases as a percentage of span) for the whole file; the density is reported
 * as 0 when the span is 0 so that no division by zero takes place. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int wordCount;
char *words[16], *line;
struct hash *hash = newHash(0);
struct hashEl *hel;
char *cloneName;
int start, end;
struct clone *cloneList = NULL, *clone;
int totalSpan = 0, totalBases = 0;
double density;

while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] == '#')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount == 0)
        continue;
    if (wordCount < 3)
       lineFileExpectWords(lf, 3, wordCount);
    cloneName = words[0];
    chopSuffix(cloneName);
    start = sqlUnsigned(words[1]);
    end = sqlUnsigned(words[2]);
    clone = hashFindVal(hash, cloneName);
    if (clone == NULL)
        {
        AllocVar(clone);
        hel = hashAdd(hash, cloneName, clone);
        clone->name = hel->name;
        clone->start = start;
        clone->end = end;
        slAddHead(&cloneList, clone);
        }
    else
        {
        /* Widen the clone's extent to take in this fragment. */
        if (clone->start > start)
            clone->start = start;
        if (clone->end < end)
            clone->end = end;
        }
    clone->baseCount += end-start;
    }
lineFileClose(&lf);
slReverse(&cloneList);

for (clone = cloneList; clone != NULL; clone = clone->next)
    {
    int span = clone->end - clone->start;
#ifdef SOMETIMES
    printf("clone %s, bases %d, spans %d, density %4.2f%%\n",
        clone->name, clone->baseCount, span,
        (span != 0) ? 100.0 * (double)clone->baseCount/(double)span : 0.0);
#endif
    totalSpan += span;
    totalBases += clone->baseCount;
    }
/* An empty file (or only zero-length spans) leaves totalSpan at 0. */
density = (totalSpan != 0) ? 100.0 * (double)totalBases/(double)totalSpan : 0.0;
printf("%s bases %d, spans %d, density %4.2f%%\n",
    fileName, totalBases, totalSpan, density);
}
struct genScanFeature *parseGenscanLine(struct lineFile *lf, char *line)
/* Parse a single line into a newly allocated genScanFeature.
 * The second word is the feature type: "PlyA" and "Prom" lines must have 7
 * words; "Init", "Intr", "Term" and "Sngl" lines must have 13 and also carry
 * frame, phase, iac, dot, codRg, p and tScore.  The first word must look like
 * N.NN (the gene part may also start with 'S') and gives the gene and feature
 * ids.  For '-' strand features the begin and end columns are swapped so that
 * start is always below end; start is stored zero-based.
 * lf is used for line numbers in error messages and by lineFileNeedNum.
 * Aborts on any malformed line. */
{
char *words[16], *parts[3];
int wordCount, partCount;
char *type;
struct genScanFeature *gsf;
boolean isLong = FALSE;
int size;

wordCount = chopLine(line, words);
if (wordCount < 2)
    errAbort("Expecting at least 2 words line %d of %s", lf->lineIx, lf->fileName);
type = words[1];
if (sameString(type, "PlyA") || sameString(type, "Prom"))
    {
    lineFileExpectWords(lf, 7, wordCount);
    }
else if (sameString(type, "Init") || sameString(type, "Intr") || sameString(type, "Term") || sameString(type, "Sngl"))
    {
    lineFileExpectWords(lf, 13, wordCount);
    isLong = TRUE;
    }
else
    {
    errAbort("Unrecognized type %s line %d of %s", type, lf->lineIx, lf->fileName);
    }
AllocVar(gsf);
/* Clone the name before chopString splits words[0] in place. */
gsf->name = cloneString(words[0]);
partCount = chopString(words[0], ".", parts, ArraySize(parts));
/* isdigit() is only defined for values representable as unsigned char (or
 * EOF), so plain char input is cast before the call. */
if (partCount != 2
    || (parts[0][0] != 'S' && !isdigit((unsigned char)parts[0][0]))
    || !isdigit((unsigned char)parts[1][0]))
    errAbort("Expecting N.NN field 1 line %d of %s", lf->lineIx, lf->fileName);
gsf->geneId = atoi(parts[0]);
gsf->featId = atoi(parts[1]);
gsf->type = cloneString(type);
gsf->strand = words[2][0];
if (gsf->strand == '-')
    {
    gsf->start = lineFileNeedNum(lf, words, 4) - 1;
    gsf->end = lineFileNeedNum(lf, words, 3);
    }
else
    {
    gsf->start = lineFileNeedNum(lf, words, 3) - 1;
    gsf->end = lineFileNeedNum(lf, words, 4);
    }
size = lineFileNeedNum(lf, words, 5);
if (size != gsf->end - gsf->start)
    errAbort("Len doesn't match Begin to End line %d of %s", lf->lineIx, lf->fileName);

if (isLong)
    {
    gsf->frame = lineFileNeedNum(lf, words, 6);
    gsf->phase = lineFileNeedNum(lf, words, 7);
    gsf->iac = lineFileNeedNum(lf, words, 8);
    gsf->dot = lineFileNeedNum(lf, words, 9);
    gsf->codRg = lineFileNeedNum(lf, words, 10);
    gsf->p = atof(words[11]);
    gsf->tScore = atof(words[12]);
    }
else
    gsf->tScore = atof(words[6]);
return gsf;
}
Ejemplo n.º 23
0
void regCompanionEnhProCellSpecificPairs(char *enhBed, char *cellDescriptions, 
	char *geneLevels, char *pairsIn, char *outDir)
/* regCompanionEnhProCellSpecificPairs - Select enh/pro pairs that are seen in a given cell 
 * lines.
 * Writes one <description>.bed file per cell into outDir.  A pair from pairsIn
 * is copied to a cell's file when, for that cell, the enhancer's score is at
 * least minAct and the gene's level is at least minExp (both thresholds are
 * declared outside this function).  The name field of each pair must be of the
 * form enhancer->gene.  A trailing '/' on outDir is removed in place. */
{
/* Cell descriptions, both as a list and as an array indexed by cell. */
struct expRecord *cellList = expRecordLoadAll(cellDescriptions);
struct expRecord *cell;
int cellCount = slCount(cellList);
struct expRecord **cellArray;
AllocArray(cellArray, cellCount);
int i = 0;
for (cell = cellList; i < cellCount; cell = cell->next)
    cellArray[i++] = cell;
verbose(2, "Got %d cells in %s\n", cellCount, cellDescriptions);

/* Enhancers, hashed by name; each must carry one score per cell. */
struct bed *enh, *enhList;
int enhFieldCount;
bedLoadAllReturnFieldCount(enhBed, &enhList, &enhFieldCount);
if (enhFieldCount != 15)
   errAbort("Expecting bed 15 format in %s", enhBed);
struct hash *enhHash = hashNew(16);
for (enh = enhList; enh != NULL; enh = enh->next)
    {
    if (enh->expCount != cellCount)
        errAbort("Inconsistent input: %d cells in %s, but %d in %s\n", 
                cellCount, cellDescriptions, enh->expCount, enhBed);
    hashAddUnique(enhHash, enh->name, enh);
    }
verbose(2, "Got %d enhancers in %s\n", enhHash->elCount, enhBed);

/* Gene expression levels: gene name -> array of one value per cell. */
struct hash *geneHash = hashGeneLevels(geneLevels, cellCount);
verbose(2, "Got %d genes in %s\n", geneHash->elCount, geneLevels);

/* Open the pairs file early so a missing input is noticed before any output. */
struct lineFile *lf = lineFileOpen(pairsIn, TRUE);

/* Drop a trailing slash from the output directory name. */
if (lastChar(outDir) == '/')
    outDir[strlen(outDir)-1] = 0;

/* Create the output directory and one output file per cell. */
makeDirsOnPath(outDir);
FILE *outFiles[cellCount];
for (i = 0; i < cellCount; ++i)
    {
    char path[PATH_LEN];
    safef(path, sizeof(path), "%s/%s.bed", outDir, cellArray[i]->description);
    outFiles[i] = mustOpen(path, "w");
    }

/* Stream through the pairs, copying each to every cell that qualifies. */
char *words[bedKnownFields*2];	// Somewhat larger than any known bed
int wordCount;
int fieldsPerRow = 0;           // Set from the first row, enforced afterwards
char *separator = "->";
int separatorSize = strlen(separator);
int pairCount = 0;
for (;;)
    {
    wordCount = lineFileChop(lf, words);
    if (wordCount == 0)
        break;

    /* First row: at least 4 fields.  Later rows: same count as the first. */
    if (fieldsPerRow != 0)
        lineFileExpectWords(lf, fieldsPerRow, wordCount);
    else
        {
        fieldsPerRow = wordCount;
        lineFileExpectAtLeast(lf, 4, wordCount);
        }
    ++pairCount;

    /* Split the name field into enhancer and gene names. */
    char *name = words[3];
    char *sepPos = stringIn(separator, name);
    if (sepPos == NULL)
        errAbort("Expecting %s in %s line %d of %s", separator, name, lf->lineIx, lf->fileName);
    char *enhName = cloneStringZ(name, sepPos-name);
    char *geneName = sepPos + separatorSize;

    /* Look both up; either lookup aborts when the name is unknown. */
    enh = hashMustFindVal(enhHash, enhName);
    double *levels = hashMustFindVal(geneHash, geneName);
    freez(&enhName);

    /* Copy the row to each cell where both levels reach their minimum. */
    for (i = 0; i < cellCount; ++i)
        {
        double enhLevel = enh->expScores[i];
        double geneLevel = levels[i];
        if (!(enhLevel >= minAct && geneLevel >= minExp))
            continue;
        FILE *f = outFiles[i];
        int j;
        fprintf(f, "%s", words[0]);
        for (j = 1; j < wordCount; ++j)
            fprintf(f, "\t%s", words[j]);
        fprintf(f, "\n");
        }
    }
verbose(2, "Got %d pairs in %s\n", pairCount, pairsIn);

/* Clean up. */
lineFileClose(&lf);
for (i = 0; i < cellCount; ++i)
    carefulClose(&outFiles[i]);
}
Ejemplo n.º 24
0
static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, 
	int itemsPerSlot, struct bbiBoundsArray *bounds, 
	int sectionCount, boolean doCompress, FILE *f, 
	int resTryCount, int resScales[], int resSizes[], 
	struct bbExIndexMaker *eim,  int bedCount,
	bits16 fieldCount, bits32 *retMaxBlockSize)
/* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
 * to boundsArray.
 *
 * Each input line is chopped (at tabs when tabSep is set, otherwise with chopLine),
 * checked to have exactly fieldCount words and loaded into a scratch bed with
 * loadAndValidateBed.  Items accumulate in a dyString; the accumulated block is written
 * to f when input ends, when the chromosome changes, or when itemsPerSlot items have
 * been gathered.  tabSep and bedN are not declared in this function.
 *   usageList - chromosome usage list; advanced one element each time the input moves
 *               to a different chromosome (asserted to match the new name)
 *   as - passed through to loadAndValidateBed
 *   bounds - array that receives file offset, chromosome id, start and end per block
 *   sectionCount - asserted to equal the number of blocks written
 *   doCompress - when set, each block goes through zCompress before being written
 *   resTryCount, resScales, resSizes - for each of the resTryCount scales, resSizes[i]
 *               is incremented as items are seen (the "zoom counting" below)
 *   eim - may be NULL; otherwise it is given every row plus the offset and size of
 *         every block
 *   bedCount - not referenced in this function
 *   retMaxBlockSize - set to the largest uncompressed block size seen */
{
int maxBlockSize = 0;
struct bbiChromUsage *usage = usageList;
char *line, *row[fieldCount+1];
int lastField = fieldCount-1;
int itemIx = 0, sectionIx = 0;
bits64 blockStartOffset = 0;
int startPos = 0, endPos = 0;
bits32 chromId = 0;
struct dyString *stream = dyStringNew(0);

/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
/* A single scratch bed that is refilled for every input line. */
struct bed *bed;
AllocVar(bed);

/* Help keep track of which beds are in current chunk so as to write out
 * namedChunks to eim if need be. */
long sectionStartIx = 0, sectionEndIx = 0;

for (;;)
    {
    /* Get next line of input if any. */
    if (lineFileNextReal(lf, &line))
	{
	/* Chop up line and make sure the word count is right. */
	int wordCount;
	if (tabSep)
	    wordCount = chopTabs(line, row);
	else
	    wordCount = chopLine(line, row);
	lineFileExpectWords(lf, fieldCount, wordCount);

	loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE);

	chrom = bed->chrom;
	start = bed->chromStart;
	end = bed->chromEnd;

	sameChrom = sameString(chrom, usage->name);
	}
    else  /* No next line */
	{
	atEnd = TRUE;
	}


    /* Check conditions that would end block and save block info and advance to next if need be. */
    if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
        {
	/* Save stream to file, compressing if need be. */
	if (stream->stringSize > maxBlockSize)
	    maxBlockSize = stream->stringSize;
	if (doCompress)
            {
	    size_t maxCompSize = zCompBufSize(stream->stringSize);

            // keep around an area of scratch memory
            // (static, so the buffer persists across calls and is only ever regrown)
            static int compBufSize = 0;
            static char *compBuf = NULL;
            // check to see if buffer needed for compression is big enough
            if (compBufSize < maxCompSize)
                {
                // free up the old not-big-enough piece
                freez(&compBuf); // freez knows bout NULL

                // get new scratch area
                compBufSize = maxCompSize;
                compBuf = needLargeMem(compBufSize);
                }

	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
	    mustWrite(f, compBuf, compSize);
	    }
	else
	    mustWrite(f, stream->string, stream->stringSize);
	dyStringClear(stream);

	/* Save block offset and size for all named chunks in this section. */
	if (eim != NULL)
	    {
	    bits64 blockEndOffset = ftell(f);
	    bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
		sectionStartIx, sectionEndIx);
	    sectionStartIx = sectionEndIx;
	    }

	/* Save info on existing block. */
	struct bbiBoundsArray *b = &bounds[sectionIx];
	b->offset = blockStartOffset;
	b->range.chromIx = chromId;
	b->range.start = startPos;
	b->range.end = endPos;
	++sectionIx;
	itemIx = 0;

	/* The last block has been written; leave before the code below, which
	 * would otherwise reuse row/start/end from the previous line. */
	if (atEnd)
	    break;
	}

    /* Advance to next chromosome if need be and get chromosome id. */
    if (!sameChrom)
        {
	usage = usage->next;
	assert(usage != NULL);
	assert(sameString(chrom, usage->name));
	for (resTry = 0; resTry < resTryCount; ++resTry)
	    resEnds[resTry] = 0;
	}
    chromId = usage->id;

    /* At start of block we save a lot of info. */
    if (itemIx == 0)
        {
	blockStartOffset = ftell(f);
	startPos = start;
	endPos = end;
	}
    /* Otherwise just update end. */
    /* NOTE(review): no `else` precedes this brace block, so it also runs when
     * itemIx == 0; that is harmless because endPos was just set to end. */
        {
	if (endPos < end)
	    endPos = end;
	/* No need to update startPos since list is sorted. */
	}

    /* Save name into namedOffset if need be. */
    if (eim != NULL)
	{
	bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx);
	sectionEndIx += 1;
	}

    /* Write out data. */
    dyStringWriteOne(stream, chromId);
    dyStringWriteOne(stream, start);
    dyStringWriteOne(stream, end);
    if (fieldCount > 3)
        {
	int i;
	/* Write 3rd through next to last field and a tab separator. */
	for (i=3; i<lastField; ++i)
	    {
	    char *s = row[i];
	    dyStringAppend(stream, s);
	    dyStringAppendC(stream, '\t');
	    }
	/* Write last field and terminal zero */
	char *s = row[lastField];
	dyStringAppend(stream, s);
	}
    dyStringAppendC(stream, 0);

    itemIx += 1;

    /* Do zoom counting. */
    /* For each scale, resEnds[resTry] is the end of the last counted bin; one
     * count is added per further bin of width resScales[resTry] that this
     * item reaches into. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
	bits32 resEnd = resEnds[resTry];
	if (start >= resEnd)
	    {
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = start + resScales[resTry];
	    }
	while (end > resEnd)
	    {
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = resEnd + resScales[resTry];
	    }
	}
    }
assert(sectionIx == sectionCount);
freez(&bed);
*retMaxBlockSize = maxBlockSize;
}
void loadGeneToMotif(struct sqlConnection *conn, char *fileName, char *table,
        struct hash *geneToModuleHash, struct hash *moduleAndMotifHash,
        struct hash *motifHash, struct hash *positionsHash,
        char *regionTable)
/* Read the gene-by-motif matrix in fileName and load two tables through conn.
 * Line one holds the motif labels (the first label is ignored).  Every later
 * tab-separated line starts with a gene name followed by one cell per motif;
 * an empty cell means the gene lacks that motif, a non-empty cell holds a
 * semicolon-separated list of numbers relative to the gene's region in
 * positionsHash.  A cell is only converted when moduleAndMotifHash pairs the
 * motif with the module that geneToModuleHash assigns to the gene.
 * Motif hits are written to 'table'; the regions of genes with at least one
 * converted hit are written to 'regionTable'. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
FILE *motifTab = hgCreateTabFile(tmpDir, table);
char *header;
char *labels[32*1024], *cells[32*1024];
int labelCount, cellCount, col;
int genesLoaded = 0, hitCells = 0;
struct dyString *sql = dyStringNew(512);
struct genomePos *hitList = NULL, *regionList = NULL;
struct genomePos *pos;

/* The header line supplies the motif label for each column. */
if (!lineFileNextReal(lf, &header))
    errAbort("Empty file %s", fileName);
subChar(header, ' ', '_');
labelCount = chopLine(header, labels);
if (labelCount >= ArraySize(labels))
    errAbort("Too many motifs line 1 of %s", fileName);
lineFileExpectAtLeast(lf, 2, labelCount);
labels[0] = NULL;       /* column 0 is the gene column, not a motif */
col = 1;
while (col < labelCount)
    {
    char fixed[64];
    char *motif = cloneString(fixMotifName(labels[col], fixed, sizeof(fixed)));
    labels[col] = motif;
    if (hashLookup(motifHash, motif) == NULL)
        errAbort("Motif %s is in %s but not modules_motifs.gxm",
                motif, fileName);
    ++col;
    }

/* One gene per remaining line. */
for (;;)
    {
    char *gene, *module;
    struct genomePos *region = NULL;

    cellCount = lineFileChopTab(lf, cells);
    if (cellCount == 0)
        break;
    lineFileExpectWords(lf, labelCount, cellCount);

    gene = cells[0];
    module = hashFindVal(geneToModuleHash, gene);
    if (module == NULL)
        {
        warn("WARNING: Gene %s in line %d of %s but not module_assignments.tab", 
                gene, lf->lineIx, lf->fileName);
        continue;
        }

    for (col = 1; col < cellCount; ++col)
        {
        struct genomePos *hits;

        if (cells[col][0] == 0)
            continue;           /* gene does not have this motif */
        if (!hashLookup2(moduleAndMotifHash, module, labels[col]))
            continue;           /* motif is not part of the gene's module */

        region = hashFindVal(positionsHash, gene);
        if (region == NULL)
            {
            /* Without a region nothing on this line can be converted. */
            warn("WARNING: %s in %s but not gene_positions.tab",
                    gene, fileName);
            break;
            }

        hits = convertMotifPos(cells[col], region,
                hashMustFindVal(motifHash, labels[col]), lf);
        hitList = slCat(hits, hitList);
        ++hitCells;
        }

    if (region != NULL)
        slAddHead(&regionList, region);
    ++genesLoaded;
    }
lineFileClose(&lf);

/* Sorted motif hits: write the tab file, (re)create the table, load it. */
slSort(&hitList, genomePosCmp);
pos = hitList;
while (pos != NULL)
    {
    int end = pos->end;
    int start = (pos->start < 0) ? 0 : pos->start;
    fprintf(motifTab, "%d\t", binFromRange(start, end));
    fprintf(motifTab, "%s\t", pos->chrom);
    fprintf(motifTab, "%d\t%d\t", start, end);
    fprintf(motifTab, "%s\t", pos->motif);
    fprintf(motifTab, "%d\t", pos->score);
    fprintf(motifTab, "%c\t", pos->strand);
    fprintf(motifTab, "%s\n", pos->name);
    pos = pos->next;
    }
dyStringPrintf(sql,
"CREATE TABLE  %s (\n"
"    bin smallInt unsigned not null,\n"
"    chrom varChar(255) not null,\n"
"    chromStart int not null,\n"
"    chromEnd int not null,\n"
"    name varchar(255) not null,\n"
"    score int not null,\n"
"    strand char(1) not null,\n"
"    gene varchar(255) not null,\n"
"              #Indices\n"
"    INDEX(gene(12)),\n"
"    INDEX(name(16)),\n"
"    INDEX(chrom(8),bin)\n"
")\n",  table);
sqlRemakeTable(conn, table, sql->string);
verbose(1, "%d genes, %d motifs, %d motifs in genes\n",
        genesLoaded, labelCount-1, hitCells);
hgLoadTabFile(conn, tmpDir, table, &motifTab);
/* The tab file is not removed from tmpDir here. */
verbose(1, "Loaded %s table\n", table);
slFreeList(&hitList);

/* Sorted upstream regions: same steps for regionTable. */
FILE *regionTab = hgCreateTabFile(tmpDir, regionTable);
dyStringClear(sql);
dyStringPrintf(sql,
"CREATE TABLE  %s (\n"
"    bin smallInt unsigned not null,\n"
"    chrom varChar(255) not null,\n"
"    chromStart int not null,\n"
"    chromEnd int not null,\n"
"    name varchar(255) not null,\n"
"    score int not null,\n"
"    strand char(1) not null,\n"
"              #Indices\n"
"    INDEX(name(16)),\n"
"    INDEX(chrom(8),bin)\n"
")\n",  regionTable);
sqlRemakeTable(conn, regionTable, sql->string);
slSort(&regionList, genomePosCmp);
pos = regionList;
while (pos != NULL)
    {
    int end = pos->end;
    int start = (pos->start < 0) ? 0 : pos->start;
    fprintf(regionTab, "%d\t", binFromRange(start, end));
    fprintf(regionTab, "%s\t", pos->chrom);
    fprintf(regionTab, "%d\t%d\t", start, end);
    fprintf(regionTab, "%s\t", pos->name);
    fprintf(regionTab, "%d\t", pos->score);
    fprintf(regionTab, "%c\n", pos->strand);
    pos = pos->next;
    }
hgLoadTabFile(conn, tmpDir, regionTable, &regionTab);
/* As above, the region tab file is left in tmpDir. */
}
void dumpHapmapPhaseIIISummary()
/* Read .bed files, accumulate info, and aggregate into hapmapPhaseIIISummary file. */
{
int i;
char inFile[256];
struct lineFile *lf = NULL;
int wordCount;
char *words[13];
char key[128];
struct summary *sum, *sumList = NULL;
struct hash *hash = hashNew(24);

for (i = 0;  i < HAP_PHASEIII_POPCOUNT;  i++)
    {
    struct hapmapSnps hs;
    safef(inFile, sizeof(inFile), "hapmapSnps%s.bed", hapmapPhaseIIIPops[i]);
    lf = lineFileOpen(inFile, TRUE);
    while ((wordCount = lineFileChopTab(lf, words)) > 0)
	{
	lineFileExpectWords(lf, 12, wordCount);
	hapmapSnpsStaticLoad(words, &hs);
	// Key by chrom as well as name because the pseudoautosomal regions (PAR)
	// of chrX and chrY have independent (but identical) SNP items.
	safef(key, sizeof(key), "%s:%s", hs.chrom, hs.name);
	sum = hashFindVal(hash, key);
	if (sum == NULL)
	    {
	    sum = summaryNew(&hs, i);
	    hashAdd(hash, key, sum);
	    slAddHead(&sumList, sum);
	    }
	else
	    addSnpToSum(sum, &hs, i);
	}
    lineFileClose(&lf);
    }
for (i = 0;  i < HAP_ORTHO_COUNT;  i++)
    {
    struct hapmapAllelesOrtho ho;
    safef(inFile, sizeof(inFile), "hapmapAlleles%s.bed", hapmapOrthoSpecies[i]);
    lf = lineFileOpen(inFile, TRUE);
    while ((wordCount = lineFileChopTab(lf, words)) > 0)
	{
	lineFileExpectWords(lf, 13, wordCount);
	hapmapAllelesOrthoStaticLoad(words, &ho);
	safef(key, sizeof(key), "%s:%s", ho.chrom, ho.name);
	sum = hashFindVal(hash, key);
	if (sum == NULL)
	    errAbort("Ortho SNP '%s' doesn't match any HapMap SNPs!", ho.name);
	addOrthoToSum(sum, &ho, i);
	}
    lineFileClose(&lf);
    }
slReverse(&sumList);
// That leaves it mostly sorted, but not all!  Leave final sorting up to hgLoadBed.
FILE *f = mustOpen("hapmapPhaseIIISummary.bed", "w");
for (sum = sumList;  sum != NULL;  sum = sum->next)
    {
    struct hapmapPhaseIIISummary *fs = sum->finalSum;
    // Convert fs->score (heterozygosity * 1000) from total into average:
    fs->score = (int)((float)fs->score / fs->popCount + 0.5);
    // Determine whether the overall{Major,Minor}Alleles are indeed the same
    // as the first population encountered:
    char firstPopMajorAl = fs->overallMajorAllele;
    char firstPopMinorAl = fs->overallMinorAllele;
    int firstPopYea = 0, firstPopNay = 0;
    for (i = 0;  i < HAP_PHASEIII_POPCOUNT;  i++)
	{
	if (fs->foundInPop[i])
	    {
	    if (sum->popMajorAlleles[i] == firstPopMajorAl)
		firstPopYea++;
	    else
		firstPopNay++;
	    }
	}
    if (firstPopNay > firstPopYea)
	{
	fs->overallMajorAllele = firstPopMinorAl;
	fs->overallMinorAllele = firstPopMajorAl;
	}
    hapmapPhaseIIISummaryTabOut(fs, f);
    }
carefulClose(&f);
// All done -- no need to waste time freeing hash and sumList.
}
Ejemplo n.º 27
0
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf, 
        int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
        int resTryCount, int resScales[], int resSizes[], 
        boolean doCompress, bits32 *retMaxSectionSize)
/* Read the bedGraph lines in lf and write them to f in sections of at most
 * itemsPerSlot items, starting a new section whenever the chromosome changes.
 *   usageList  - chromosomes in the order they appear in lf
 *   bounds     - array of sectionCount entries; entry i receives the file
 *                offset and chromosome range of section i
 *   resTryCount, resScales, resSizes - for each candidate zoom resolution,
 *                resSizes[i] is incremented once per window of resScales[i]
 *                bases that an item starts in or extends into
 *   doCompress - compress each section with zCompress before writing
 *   retMaxSectionSize - receives the largest uncompressed section size
 * Aborts if the input is unsorted, has overlapping items, or does not follow
 * the chromosome order of usageList. */
{
int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
struct sectionItem items[itemsPerSlot];
struct sectionItem *lastB = NULL;
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
struct dyString *stream = dyStringNew(0);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any.  The row has one slot more than the
     * four expected fields; presumably so lineFileExpectWords can report
     * lines with extra fields. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* Figure out whether need to output section. */
    boolean sameChrom = FALSE;
    if (rowSize > 0)
        sameChrom = sameString(row[0], usage->name);
    if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom)
        {
        /* Only emit a section when items are buffered.  On the very first
         * pass (empty input, or a first row on an unexpected chromosome)
         * itemIx is 0; reading items[itemIx-1] would then be out of bounds
         * and, for empty input, usage may be NULL. */
        if (itemIx > 0)
            {
            /* Figure out section position. */
            bits32 chromId = usage->id;
            bits32 sectionStart = items[0].start;
            bits32 sectionEnd = items[itemIx-1].end;

            /* Save section info for indexing. */
            assert(sectionIx < sectionCount);
            struct bbiBoundsArray *section = &bounds[sectionIx++];
            section->offset = ftell(f);
            section->range.chromIx = chromId;
            section->range.start = sectionStart;
            section->range.end = sectionEnd;

            /* Output section header to stream. */
            dyStringClear(stream);
            UBYTE type = bwgTypeBedGraph;
            /* NOTE(review): if bits16 is 16 bits wide, an itemsPerSlot above
             * 65535 would be truncated here - confirm callers cap it. */
            bits16 itemCount = itemIx;
            dyStringWriteOne(stream, chromId);          // chromId
            dyStringWriteOne(stream, sectionStart);     // start
            dyStringWriteOne(stream, sectionEnd);       // end
            dyStringWriteOne(stream, reserved32);       // itemStep
            dyStringWriteOne(stream, reserved32);       // itemSpan
            dyStringWriteOne(stream, type);             // type
            dyStringWriteOne(stream, reserved8);        // reserved
            dyStringWriteOne(stream, itemCount);        // itemCount

            /* Output each item in section to stream. */
            int i;
            for (i=0; i<itemIx; ++i)
                {
                struct sectionItem *item = &items[i];
                dyStringWriteOne(stream, item->start);
                dyStringWriteOne(stream, item->end);
                dyStringWriteOne(stream, item->val);
                }

            /* Save stream to file, compressing if need be.  The maximum is
             * tracked on the uncompressed size. */
            if (stream->stringSize > maxSectionSize)
                maxSectionSize = stream->stringSize;
            if (doCompress)
                {
                size_t maxCompSize = zCompBufSize(stream->stringSize);
                char compBuf[maxCompSize];
                int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
                mustWrite(f, compBuf, compSize);
                }
            else
                mustWrite(f, stream->string, stream->stringSize);
            }

        /* If at end of input we are done. */
        if (rowSize == 0)
            break;

        /* Set up for next section. */
        itemIx = 0;

        if (!sameChrom)
            {
            /* The new chromosome must be the next one in usageList. */
            usage = usage->next;
            assert(usage != NULL);
            if (!sameString(row[0], usage->name))
                errAbort("read %s, expecting %s on line %d in file %s\n", 
                    row[0], usage->name, lf->lineIx, lf->fileName);
            assert(sameString(row[0], usage->name));
            /* Sort checks and zoom windows restart on each chromosome. */
            lastB = NULL;
            for (resTry = 0; resTry < resTryCount; ++resTry)
                resEnds[resTry] = 0;
            }
        }

    /* Parse out input. */
    lineFileExpectWords(lf, 4, rowSize);
    bits32 start = lineFileNeedNum(lf, row, 1);
    bits32 end = lineFileNeedNum(lf, row, 2);
    float val = lineFileNeedDouble(lf, row, 3);

    /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */
    if (start > end)
        errAbort("Start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName);
    if (lastB != NULL)
        {
        if (lastB->start > start)
            errAbort("BedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName);
        if (lastB->end > start)
            errAbort("Overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName);
        }

    /* Do zoom counting: for each resolution count one more window when the
     * item starts at or past the end of the current window, and one more
     * for every further window the item extends into. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        bits32 resEnd = resEnds[resTry];
        if (start >= resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = start + resScales[resTry];
            }
        while (end > resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = resEnd + resScales[resTry];
            }
        }

    /* Save values in output array. */
    struct sectionItem *b = &items[itemIx];
    b->start = start;
    b->end = end;
    b->val = val;
    lastB = b;
    itemIx += 1;
    }
assert(sectionIx == sectionCount);

*retMaxSectionSize = maxSectionSize;
}
Ejemplo n.º 28
0
void outputUniqueOnSharedKey(char *inTab, struct asObject *as, struct asColumn *keyCol,
    struct slPair *fieldList, char *outTab, char *outErr)
/* Copy the columns named in fieldList from the tab-separated file inTab to
 * outTab, emitting a single row per value of the keyCol column.  When a
 * later row with an already-seen key would produce different output, every
 * conflicting row (including the first one) is written to outErr instead,
 * and a warning is issued at the end; unless mergeOk is set the program
 * then aborts. */
{
/* Files are opened in the same order the caller names them. */
struct lineFile *lf = lineFileOpen(inTab, TRUE);
FILE *out = mustOpen(outTab, "w");
FILE *errOut = mustOpen(outErr, "w");

/* Ask for a few more input columns than the schema declares so that
 * over-long rows are reported rather than silently truncated. */
int schemaCols = slCount(as->columnList);
int outCols = slCount(fieldList);
int colAlloc = schemaCols+10;
char *cols[colAlloc];

/* srcIx[i] is the input column that supplies output column i. */
int *srcIx = makeNewToOldArray(as, fieldList);
int keyIx = slIxFromElement(as->columnList, keyCol);

struct hash *firstRowForKey = hashNew(18); 
struct hash *reportedKeys = hashNew(0);
struct dyString *rowText = dyStringNew(1024);

for (;;)
    {
    int got = lineFileChopNextTab(lf, cols, colAlloc);
    if (got <= 0)
        break;
    lineFileExpectWords(lf, schemaCols, got);

    /* Build the candidate output line. */
    int j = 0;
    dyStringClear(rowText);
    dyStringPrintf(rowText, "%s", cols[srcIx[j]]);
    while (++j < outCols)
        dyStringPrintf(rowText,  "\t%s", cols[srcIx[j]]);
    dyStringPrintf(rowText, "\n");

    char *key = cols[keyIx];
    char *firstRow = hashFindVal(firstRowForKey, key);

    /* First sighting of this key: remember the row and output it. */
    if (firstRow == NULL)
        {
        hashAdd(firstRowForKey, key, cloneString(rowText->string));
        fputs(rowText->string, out);
        continue;
        }

    /* An exact repeat is harmless. */
    if (sameString(firstRow, rowText->string))
        continue;

    /* Conflict.  The original row for the key goes to the error file only
     * once; each conflicting row is added after it. */
    if (hashLookup(reportedKeys, key) == NULL)
        {
        hashAdd(reportedKeys, key, NULL);
        fputs(firstRow, errOut);
        }
    fputs(rowText->string, errOut);
    }

/* Summarize conflicts, if any. */
if (reportedKeys->elCount > 0)
    {
    warn("Warning: %d shared keys have multiple values in table 2. See %s.\n"
         "Only first row for each key put in %s" , reportedKeys->elCount, outErr, outTab);
    if (!mergeOk)
        noWarnAbort();
    }

/* Release what this function explicitly cleans up. */
freez(&srcIx);
carefulClose(&errOut);
carefulClose(&out);
lineFileClose(&lf);
}
Ejemplo n.º 29
0
static void parseBedGraphSection(struct lineFile *lf, boolean clipDontDie, 
        struct hash *chromSizeHash, struct lm *lm, 
        int itemsPerSlot, struct bwgSection **pSectionList)
/* Parse bedGraph lines from lf until a line that stepTypeLine() recognizes
 * (that line is pushed back with lineFileReuse) or end of file, then add the
 * items to *pSectionList as sections of at most itemsPerSlot items each.
 *   clipDontDie   - when an item ends past its chromosome's size, warn and
 *                   drop the item instead of aborting
 *   chromSizeHash - chromosome sizes; if NULL every chromosome gets BIGNUM
 *   lm            - local memory pool that items and sections are taken from
 * Sections are pushed onto the head of *pSectionList.  Aborts on malformed
 * lines, start > end, or overlapping items within a chromosome. */
{
/* Set up hash and list to store chromosomes. */
struct hash *chromHash = hashNew(0);
struct bedGraphChrom *chrom, *chromList = NULL;

/* Collect lines in items on appropriate chromosomes. */
struct bwgBedGraphItem *item;
char *line;
while (lineFileNextReal(lf, &line))
    {
    /* Check for end of section. */
    if (stepTypeLine(line))
        {
        lineFileReuse(lf);
        break;
        }

    /* Parse out our line and make sure it has exactly 4 columns. */
    char *words[5];
    int wordCount = chopLine(line, words);
    lineFileExpectWords(lf, 4, wordCount);

    /* Get chromosome, creating its record the first time it is seen. */
    char *chromName = words[0];
    chrom = hashFindVal(chromHash, chromName);
    if (chrom == NULL)
        {
        lmAllocVar(chromHash->lm, chrom);
        hashAddSaveName(chromHash, chromName, chrom, &chrom->name);
        chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM);
        slAddHead(&chromList, chrom);
        }

    /* Convert to item and add to chromosome list. */
    lmAllocVar(lm, item);
    item->start = lineFileNeedNum(lf, words, 1);
    item->end = lineFileNeedNum(lf, words, 2);
    item->val = lineFileNeedDouble(lf, words, 3);

    /* Do sanity checking on coordinates. */
    if (item->start > item->end)
        errAbort("bedGraph error: start (%u) after end (%u) line %d of %s.", 
                item->start, item->end, lf->lineIx, lf->fileName);
    if (item->end > chrom->size)
        {
        warn("bedGraph error line %d of %s: chromosome %s has size %u but item ends at %u",
                lf->lineIx, lf->fileName, chrom->name, chrom->size, item->end);
        if (!clipDontDie)
            noWarnAbort();
        /* With clipDontDie the offending item is simply left off the list. */
        }
    else
        {
        slAddHead(&chrom->itemList, item);
        }
    }
slSort(&chromList, bedGraphChromCmpName);

/* Loop through each chromosome and output the item list, broken into sections
 * for that chrom. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* With clipDontDie every item of a chromosome may have been dropped,
     * leaving an empty list; there is then nothing to check or output, and
     * the overlap check below must not dereference a NULL first item. */
    if (chrom->itemList == NULL)
        continue;

    slSort(&chrom->itemList, bwgBedGraphItemCmp);

    /* Check to make sure no overlap between items. */
    struct bwgBedGraphItem *prevItem = chrom->itemList, *nextItem;
    for (nextItem = prevItem->next; nextItem != NULL; nextItem = nextItem->next)
        {
        if (prevItem->end > nextItem->start)
            errAbort("Overlap between %s %u %u and %s %u %u.\nPlease remove overlaps and try again",
                chrom->name, prevItem->start, prevItem->end, chrom->name, nextItem->start, nextItem->end);
        prevItem = nextItem;
        }

    /* Break up into sections of no more than items-per-slot size. */
    struct bwgBedGraphItem *startItem, *endItem, *nextStartItem = chrom->itemList;
    for (startItem = chrom->itemList; startItem != NULL; startItem = nextStartItem)
        {
        /* Find end item of this section, and start item for next section.
         * Terminate list at end item. */
        int sectionSize = 0;
        int i;
        endItem = startItem;
        for (i=0; i<itemsPerSlot; ++i)
            {
            if (nextStartItem == NULL)
                break;
            endItem = nextStartItem;
            nextStartItem = nextStartItem->next;
            ++sectionSize;
            }
        endItem->next = NULL;

        /* Fill in section and add it to section list. */
        struct bwgSection *section;
        lmAllocVar(lm, section);
        section->chrom = cloneString(chrom->name);
        section->start = startItem->start;
        section->end = endItem->end;
        section->type = bwgTypeBedGraph;
        section->items.bedGraphList = startItem;
        section->itemCount = sectionSize;
        slAddHead(pSectionList, section);
        }
    }

/* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in 
 * hash's memory. */
hashFree(&chromHash);
chromList = NULL;
}
Ejemplo n.º 30
0
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, 
        struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
/* Go through bed file and collect chromosomes and statistics.
 *   lf             - input; initial custom-track lines are skipped
 *   chromSizesHash - maps chromosome name to size; every chromosome in the
 *                    input must be present
 *   retMinDiff     - receives the smallest distance between the starts of
 *                    consecutive items on one chromosome (BIGNUM if no such
 *                    pair exists)
 *   retAveSize     - receives the mean item length (0 for empty input)
 *   retBedCount    - receives the number of items read
 * Returns one bbiChromUsage per chromosome, in input order, with ids
 * numbered from 0.  Aborts if the input is not sorted by chromosome and
 * start, if an item has end before start, or if it ends past the
 * chromosome size. */
{
char *row[3];
struct hash *uniqHash = hashNew(0);
struct bbiChromUsage *usage = NULL, *usageList = NULL;
int lastStart = -1;
bits32 id = 0;
bits64 totalBases = 0, bedCount = 0;
int minDiff = BIGNUM;

lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
    if (rowSize == 0)
        break;
    lineFileExpectWords(lf, 3, rowSize);
    char *chrom = row[0];
    int start = lineFileNeedNum(lf, row, 1);
    int end = lineFileNeedNum(lf, row, 2);
    if (start > end)
        {
            errAbort("end (%d) before start (%d) line %d of %s",
                end, start, lf->lineIx, lf->fileName);
        }
    ++bedCount;
    totalBases += (end - start);

    /* Start a new usage record whenever the chromosome changes. */
    if (usage == NULL || differentString(usage->name, chrom))
        {
        /* Seeing a chromosome again after moving on means the file is not
         * grouped by chromosome. */
        if (hashLookup(uniqHash, chrom))
            {
            errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                lf->fileName, lf->lineIx);
            }
        hashAdd(uniqHash, chrom, NULL);
        struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
        if (chromHashEl == NULL)
            errAbort("%s is not found in chromosome sizes file", chrom);
        int chromSize = ptToInt(chromHashEl->val);
        AllocVar(usage);
        usage->name = cloneString(chrom);
        usage->id = id++;
        usage->size = chromSize;
        slAddHead(&usageList, usage);
        lastStart = -1;         /* no previous item on this chromosome yet */
        }
    if (end > usage->size)
        errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName);
    usage->itemCount += 1;

    /* Track the smallest start-to-start distance; a negative distance
     * means the starts are out of order. */
    if (lastStart >= 0)
        {
        int diff = start - lastStart;
        if (diff < minDiff)
            {
            if (diff < 0)
                errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
                    lf->fileName, lf->lineIx);
            minDiff = diff;
            }
        }
    lastStart = start;
    }
slReverse(&usageList);

/* With no items the plain quotient would be 0/0; report an average of 0
 * instead of handing NaN to the caller. */
double aveSize = 0;
if (bedCount > 0)
    aveSize = (double)totalBases/bedCount;

*retMinDiff = minDiff;
*retAveSize = aveSize;
*retBedCount = bedCount;
freeHash(&uniqHash);
return usageList;
}