void pslToBed(char *pslFile, char *bedFile, struct hash *cdsHash, bool doPosName) /* pslToBed -- tranform a psl format file to a bed format file */ { struct lineFile *pslLf = pslFileOpen(pslFile); FILE *bedFh = mustOpen(bedFile, "w"); struct psl *psl; while ((psl = pslNext(pslLf)) != NULL) { struct bed *bed = bedFromPsl(psl); if (doPosName) { char *newName = needMem(512); safef(newName, 512, "%s:%d-%d", psl->qName, psl->qStart, psl->qEnd); freeMem(bed->name); bed->name = newName; } if (cdsHash) { struct cds *cds = hashFindVal(cdsHash, psl->qName); if (cds == NULL) bed->thickStart = bed->thickEnd = bed->chromStart; else setThick(psl, bed, cds); } bedTabOutN(bed, 12, bedFh); bedFree(&bed); pslFree(&psl); } carefulClose(&bedFh); lineFileClose(&pslLf); }
struct bed *orthoBedFromPsl(struct sqlConnection *conn, char *db, char *orthoDb, char *netTable, struct psl *psl)
/** Produce a bed on the orthologous genome from the original psl.
 *
 * The psl is first converted to a bed with bedFromPsl(); that bed is handed
 * to orthoBedFromBed() together with conn, db, orthoDb and netTable, and then
 * freed.  Returns whatever orthoBedFromBed() returned.  The psl itself is
 * not freed here. */
{
struct bed *bed = bedFromPsl(psl);
struct bed *orthoBed = orthoBedFromBed(conn, db, orthoDb, netTable, bed);

/* The intermediate bed is only needed for the mapping call. */
bedFree(&bed);
return orthoBed;
}
struct bed *createBedsFromPsls(char *pslFile, int expCount) /** creates a list of beds from a pslfile, allocates memory for arrays as determined by expCount */ { struct psl *pslList = NULL, *psl = NULL; struct bed *bedList = NULL, *bed = NULL; pslList = pslLoadAll(pslFile); for(psl = pslList; psl != NULL; psl = psl->next) { bed = bedFromPsl(psl); freez(&bed->name); bed->name=parseNameFromHgc(psl->qName); bed->score = 0; bed->expCount = 0; bed->expIds = needMem(sizeof(int)*expCount); bed->expScores = needMem(sizeof(float)*expCount); slAddHead(&bedList,bed); } slReverse(&bedList); pslFreeList(&pslList); return bedList; }
void affyPslAndAtlasToBedNew(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line, *name; int i, wordCount, expCount; char **row; double *data, median; double invMedian, ratio, logRatio; char *affyId; struct hash *hash = newHash(17); struct psl *psl; struct bed *bed; FILE *f = NULL; int dataCount = 0, pslCount = 0, bedCount = 0; int minExpVal = 20; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExp(line+1, expRecOut); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = mustOpen(bedOut, "w"); /* Build up a hash keyed by affyID with an int array of data * for value. Do output in short case. */ AllocArray(row, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } AllocArray(data, expCount); for (i=0; i<expCount; ++i) { data[i] = atof(row[i]); if (data[i] < minExpVal) data[i] = minExpVal; } median = findPositiveMedian(data, expCount, minExpVal); if (median >= 0) { invMedian = 1.0/median; for (i=0; i<expCount; ++i) { double val = data[i]; val = safeLog2(invMedian*val); data[i] = val; } if (shortOut) shortDataOut(f, affyId, expCount, data); else hashAdd(hash, affyId, data); } data = NULL; ++dataCount; } lineFileClose(&lf); warn("%d rows of expression data\n", dataCount); /* Stream through psl file, converting it to bed with expression data. 
*/ if (!shortOut) { lf = pslFileOpen(pslFile); while ((psl = pslNext(lf)) != NULL) { ++pslCount; /* get probe id from sequence name */ name=parseNameFromHgc(psl->qName); data = hashFindVal(hash, name); if (data != NULL) { struct bed *bed = bedFromPsl(psl); bed->expCount = expCount; AllocArray(bed->expIds, expCount); AllocArray(bed->expScores, expCount); for (i=0; i<expCount; ++i) { bed->expScores[i] = data[i]; bed->expIds[i] = i; } bedTabOutN(bed, 15, f); ++bedCount; bedFree(&bed); } pslFree(&psl); } warn("%d records in %s", pslCount, pslFile); warn("%d records written to %s", bedCount, bedOut); } lineFileClose(&lf); carefulClose(&f); }
void outputBedsFromPsls(struct hash *pslHash,char *bedOutName, char *expRecordOutName, char *affyFileName, char *expFileName) /** For each set of entries in affyFile find matching psl and create a bed. */ { struct bed *bed = NULL, *b=NULL; struct psl *pslList = NULL, *psl = NULL; struct hash *expHash = NULL; int numExps = 0; int expCount = 0; int i =0; char *probeSet = NULL; char *row[4]; char key[128]; struct slName *expNames = NULL, *name = NULL; FILE *bedOut = NULL; FILE *expRecordOut = NULL; char *toDiffFileName = optionVal("toDiffFile", NULL); FILE *toDiffOut = NULL; struct lineFile *lf = NULL; fillInExpHash(expFileName, &expHash, &expNames, &expCount); lf = lineFileOpen(affyFileName, TRUE); bedOut = mustOpen(bedOutName, "w"); if(toDiffFileName != NULL) toDiffOut = mustOpen(toDiffFileName, "w"); /* Loop through either adding experiments to beds or if new probeset create bed from psl and start over. */ while(lineFileChopNextTab(lf, row, sizeof(row))) { /* Do we have to make a new bed? */ if(probeSet == NULL || differentWord(probeSet, row[0])) { occassionalDot(); numExps = 0; /* If we have probeset print out the current beds. */ if(probeSet != NULL) { for(b = bed; b != NULL; b = b->next) { int avgCount = 0; for(i = 0; i < b->expCount; i++) if(b->expScores[i] != -10000) avgCount++; if(avgCount != 0 && b->score > 0) b->score = log(b->score / avgCount) * 100; else b->score = 0; bedTabOutN(b, 15, bedOut); if(toDiffOut != NULL) outputToDiffRecord(b, expNames, toDiffOut); } } bedFreeList(&bed); /* Lookup key in pslHash to find list of psl. */ safef(key, sizeof(key), "%s", row[0]); pslList = hashFindVal(pslHash, key); /* Can have multiple psls. */ for(psl = pslList; psl != NULL; psl = psl->next) { b = bedFromPsl(psl); AllocArray(b->expIds, expCount ); AllocArray(b->expScores, expCount); b->expCount = expCount; initBedScores(b, expCount); slAddHead(&bed, b); } } if(bed != NULL) { /* Allocate larger arrays if necessary. 
*/ if(numExps > expCount) { errAbort("Supposed to be %d experiments but probeset %s has at least %d", expCount, bed->name, numExps); } for(b = bed; b != NULL; b = b->next) { int exp = hashIntVal(expHash, row[1]); if(differentWord(row[3], "NaN")) b->expScores[exp] = atof(row[3]); if(differentWord(row[2], "NaN")) b->score += atof(row[2]); } numExps++; } freez(&probeSet); probeSet = cloneString(row[0]); } expRecordOut = mustOpen(expRecordOutName, "w"); i = 0; for(name = expNames; name != NULL; name = name->next) { subChar(name->name, ',', '_'); subChar(name->name, ' ', '_'); fprintf(expRecordOut, "%d\t%s\tuclaExp\tuclaExp\tuclaExp\tuclaExp\t1\t%s,\n", i++, name->name, name->name); } hashFree(&expHash); slFreeList(&expNames); carefulClose(&expRecordOut); carefulClose(&bedOut); lineFileClose(&lf); }