void affyPslAndAtlasToBedNew(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line, *name; int i, wordCount, expCount; char **row; double *data, median; double invMedian, ratio, logRatio; char *affyId; struct hash *hash = newHash(17); struct psl *psl; struct bed *bed; FILE *f = NULL; int dataCount = 0, pslCount = 0, bedCount = 0; int minExpVal = 20; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExp(line+1, expRecOut); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = mustOpen(bedOut, "w"); /* Build up a hash keyed by affyID with an int array of data * for value. Do output in short case. */ AllocArray(row, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } AllocArray(data, expCount); for (i=0; i<expCount; ++i) { data[i] = atof(row[i]); if (data[i] < minExpVal) data[i] = minExpVal; } median = findPositiveMedian(data, expCount, minExpVal); if (median >= 0) { invMedian = 1.0/median; for (i=0; i<expCount; ++i) { double val = data[i]; val = safeLog2(invMedian*val); data[i] = val; } if (shortOut) shortDataOut(f, affyId, expCount, data); else hashAdd(hash, affyId, data); } data = NULL; ++dataCount; } lineFileClose(&lf); warn("%d rows of expression data\n", dataCount); /* Stream through psl file, converting it to bed with expression data. */ if (!shortOut) { lf = pslFileOpen(pslFile); while ((psl = pslNext(lf)) != NULL) { ++pslCount; /* get probe id from sequence name */ name=parseNameFromHgc(psl->qName); data = hashFindVal(hash, name); if (data != NULL) { struct bed *bed = bedFromPsl(psl); bed->expCount = expCount; AllocArray(bed->expIds, expCount); AllocArray(bed->expScores, expCount); for (i=0; i<expCount; ++i) { bed->expScores[i] = data[i]; bed->expIds[i] = i; } bedTabOutN(bed, 15, f); ++bedCount; bedFree(&bed); } pslFree(&psl); } warn("%d records in %s", pslCount, pslFile); warn("%d records written to %s", bedCount, bedOut); } lineFileClose(&lf); carefulClose(&f); }
void hgGnfMicroarray(char *expTable, char *dataTable, char *atlasFile) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line; int i, wordCount, expCount; char **row; float *data; char *affyId; struct hash *hash = newHash(17); FILE *f = NULL; int dataCount = 0; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (startsWith("Gene Name", line)) line += 9; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExpTable(line+1, expTable); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = hgCreateTabFile(tabDir, dataTable); AllocArray(row, expCount); AllocArray(data, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (chopName != NULL) { char *e = stringIn(chopName, affyId); if (e != NULL) *e = 0; } if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } for (i=0; i<expCount; ++i) { data[i] = sqlFloat(row[i]); } shortDataOut(f, affyId, expCount, data); ++dataCount; if (limit != 0 && dataCount >= limit) break; } lineFileClose(&lf); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); expDataCreateTable(conn, dataTable); hgLoadTabFile(conn, tabDir, dataTable, &f); hgRemoveTabFile(tabDir, dataTable); sqlDisconnect(&conn); } }