struct bed *bigBedGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, int *retFieldCount) /* Get list of beds from bigBed, in all regions, that pass filtering. */ { /* Connect to big bed and get metadata and filter. */ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpen(fileName); struct asObject *as = bigBedAsOrDefault(bbi); struct asFilter *filter = asFilterFromCart(cart, db, table, as); /* Get beds a region at a time. */ struct bed *bedList = NULL; struct region *region; for (region = regionList; region != NULL; region = region->next) addFilteredBedsOnRegion(bbi, region, table, filter, lm, &bedList); slReverse(&bedList); /* Clean up and return. */ if (retFieldCount != NULL) *retFieldCount = bbi->definedFieldCount; bbiFileClose(&bbi); freeMem(fileName); return bedList; }
struct bed *bamGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, int *retFieldCount) /* Get list of beds from BAM, in all regions, that pass filtering. */ { int maxOut = bigFileMaxOutput(); /* Figure out bam file name get column info and filter. */ struct asObject *as = bamAsObj(); struct asFilter *filter = asFilterFromCart(cart, db, table, as); struct hash *idHash = identifierHash(db, table); /* Get beds a region at a time. */ struct bed *bedList = NULL; struct region *region; for (region = regionList; region != NULL; region = region->next) { char *fileName = bamFileName(table, conn, region->chrom); addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut); freeMem(fileName); if (maxOut <= 0) { warn("Reached output limit of %d data values, please make region smaller,\n" "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); break; } } slReverse(&bedList); return bedList; }
void bigBedTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) /* Print out selected fields from Big Bed. If fields is NULL, then print out all fields. */ { if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); struct slName *bb, *bbList = bigBedGetFields(table, conn); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) hashAddInt(fieldHash, bb->name, i); // If bigBed has name column, look up pasted/uploaded identifiers if any: struct hash *idHash = NULL; if (slCount(bbList) >= 4) idHash = identifierHash(db, table); /* Create an array of column indexes corresponding to the selected field list. */ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i<fieldCount; ++i) { columnArray[i] = hashIntVal(fieldHash, fieldArray[i]); } /* Output row of labels */ fprintf(f, "#%s", fieldArray[0]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", fieldArray[i]); fprintf(f, "\n"); /* Open up bigBed file. 
*/ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpen(fileName); struct asObject *as = bigBedAsOrDefault(bbi); struct asFilter *filter = NULL; if (anyFilter()) { filter = asFilterFromCart(cart, db, table, as); if (filter) { fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); } } /* Loop through outputting each region */ struct region *region, *regionList = getRegions(); for (region = regionList; region != NULL; region = region->next) { struct lm *lm = lmInit(0); struct bigBedInterval *iv, *ivList = bigBedIntervalQuery(bbi, region->chrom, region->start, region->end, 0, lm); char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; for (iv = ivList; iv != NULL; iv = iv->next) { bigBedIntervalToRow(iv, region->chrom, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { if ((idHash != NULL) && (hashLookup(idHash, row[3]) == NULL)) continue; int i; fprintf(f, "%s", row[columnArray[0]]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", row[columnArray[i]]); fprintf(f, "\n"); } } lmCleanup(&lm); } /* Clean up and exit. */ bbiFileClose(&bbi); hashFree(&fieldHash); freeMem(fieldArray); freeMem(columnArray); }
void bamTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) /* Print out selected fields from BAM. If fields is NULL, then print out all fields. */ { struct hTableInfo *hti = NULL; hti = getHti(db, table, conn); struct hash *idHash = NULL; char *idField = getIdField(db, curTrack, table, hti); int idFieldNum = 0; /* if we know what field to use for the identifiers, get the hash of names */ if (idField != NULL) idHash = identifierHash(db, table); if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); struct slName *bb, *bbList = bamGetFields(); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) { /* if we know the field for identifiers, save it away */ if ((idField != NULL) && sameString(idField, bb->name)) idFieldNum = i; hashAddInt(fieldHash, bb->name, i); } /* Create an array of column indexes corresponding to the selected field list. 
*/ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i<fieldCount; ++i) { columnArray[i] = hashIntVal(fieldHash, fieldArray[i]); } /* Output row of labels */ fprintf(f, "#%s", fieldArray[0]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", fieldArray[i]); fprintf(f, "\n"); struct asObject *as = bamAsObj(); struct asFilter *filter = NULL; if (anyFilter()) { filter = asFilterFromCart(cart, db, table, as); if (filter) { fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); } } /* Loop through outputting each region */ struct region *region, *regionList = getRegions(); int maxOut = bigFileMaxOutput(); for (region = regionList; region != NULL && (maxOut > 0); region = region->next) { struct lm *lm = lmInit(0); char *fileName = bamFileName(table, conn, region->chrom); struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom, region->start, region->end, lm); char *row[SAMALIGNMENT_NUM_COLS]; char numBuf[BAM_NUM_BUF_SIZE]; for (sam = samList; sam != NULL && (maxOut > 0); sam = sam->next) { samAlignmentToRow(sam, numBuf, row); if (asFilterOnRow(filter, row)) { /* if we're looking for identifiers, check if this matches */ if ((idHash != NULL)&&(hashLookup(idHash, row[idFieldNum]) == NULL)) continue; int i; fprintf(f, "%s", row[columnArray[0]]); for (i=1; i<fieldCount; ++i) fprintf(f, "\t%s", row[columnArray[i]]); fprintf(f, "\n"); maxOut --; } } freeMem(fileName); lmCleanup(&lm); } if (maxOut == 0) warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); /* Clean up and exit. */ hashFree(&fieldHash); freeMem(fieldArray); freeMem(columnArray); }
void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f,
               boolean isTabix)
/* Write the chosen VCF fields as tab-separated text, one line per record that
 * passes the cart filter (and the identifier list, if one applies), stopping once
 * bigFileMaxOutput() lines have been written.  The VCF header of the first file
 * opened is echoed before any data.
 *   db, table - identify the track; used for the filter and identifier lookups
 *   conn      - used to find table info and the per-chrom VCF file name
 *   fields    - comma separated list of field names to output
 *   f         - output stream; stdout is used when NULL
 *   isTabix   - open files with vcfTabixFileMayOpen rather than vcfFileMayOpen
 * NOTE(review): older documentation said a NULL fields prints all fields, but
 * fields goes straight to chopByChar with no NULL handling - confirm callers. */
{
    int ix;

    /* Work out which field (if any) carries identifiers, and fetch the
     * identifier hash only when such a field is known. */
    struct hTableInfo *hti = getHti(db, table, conn);
    char *idField = getIdField(db, curTrack, table, hti);
    int idFieldNum = 0;
    struct hash *idHash = NULL;
    if (idField != NULL)
        idHash = identifierHash(db, table);

    if (f == NULL)
        f = stdout;

    /* Split the comma separated field list into an array of names. */
    int fieldCount = chopByChar(fields, ',', NULL, 0);
    char **fieldArray;
    AllocArray(fieldArray, fieldCount);
    chopByChar(fields, ',', fieldArray, fieldCount);

    /* Map every VCF field name to its column index, remembering the index of
     * the identifier field along the way. */
    struct hash *fieldHash = hashNew(0);
    struct slName *vcfFieldList = vcfGetFields();
    struct slName *vcfField;
    for (vcfField = vcfFieldList, ix = 0; vcfField != NULL; vcfField = vcfField->next, ++ix)
        {
        if ((idField != NULL) && sameString(idField, vcfField->name))
            idFieldNum = ix;
        hashAddInt(fieldHash, vcfField->name, ix);
        }

    /* Translate the selected field names into column indexes. */
    int *columnArray;
    AllocArray(columnArray, fieldCount);
    for (ix = 0; ix < fieldCount; ++ix)
        columnArray[ix] = hashIntVal(fieldHash, fieldArray[ix]);

    /* If we are outputting a subset of fields, invalidate the VCF header. */
    boolean allFields = (fieldCount == VCFDATALINE_NUM_COLS);
    if (!allFields)
        fprintf(f, "# Only selected columns are included below; output is not valid VCF.\n");

    struct asObject *as = vcfAsObj();
    struct asFilter *filter = NULL;
    if (anyFilter())
        filter = asFilterFromCart(cart, db, table, as);

    struct region *regionList = getRegions();
    struct region *region;
    int maxOut = bigFileMaxOutput();
    struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);

    /* Include the header, absolutely necessary for VCF parsing. */
    boolean printedHeader = FALSE;

    /* Temporary storage for row-ification, reused for every record. */
    struct dyString *dyAlt = newDyString(1024);
    struct dyString *dyFilter = newDyString(1024);
    struct dyString *dyInfo = newDyString(1024);
    struct dyString *dyGt = newDyString(1024);

    /* Output region by region until the regions or the line budget run out. */
    for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
        {
        char *fileName = vcfFileName(tdb, conn, table, region->chrom);
        struct vcfFile *vcff = isTabix
            ? vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
                                  100, maxOut)
            : vcfFileMayOpen(fileName, region->chrom, region->start, region->end,
                             100, maxOut, TRUE);
        if (vcff == NULL)
            noWarnAbort();

        /* If we are outputting all fields, but this VCF has no genotype info,
         * omit the genotype columns from output.  Note this narrows fieldCount
         * for the rest of the function, not just this region. */
        if (allFields && vcff->genotypeCount == 0)
            fieldCount = VCFDATALINE_NUM_COLS - 2;

        if (!printedHeader)
            {
            fprintf(f, "%s", vcff->headerString);
            if (filter)
                fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
            if (!allFields)
                {
                fprintf(f, "#%s", fieldArray[0]);
                for (ix = 1; ix < fieldCount; ++ix)
                    fprintf(f, "\t%s", fieldArray[ix]);
                fprintf(f, "\n");
                }
            printedHeader = TRUE;
            }

        char *row[VCFDATALINE_NUM_COLS];
        char numBuf[VCF_NUM_BUF_SIZE];
        struct vcfRecord *rec;
        for (rec = vcff->records; rec != NULL && (maxOut > 0); rec = rec->next)
            {
            vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
            if (!asFilterOnRow(filter, row))
                continue;
            /* When an identifier list applies, skip rows whose id is not in it. */
            if ((idHash != NULL) && (hashLookup(idHash, row[idFieldNum]) == NULL))
                continue;
            /* All fields output: after asFilter'ing, preserve original VCF chrom */
            if (allFields && !sameString(rec->chrom, region->chrom))
                row[0] = rec->chrom;
            fprintf(f, "%s", row[columnArray[0]]);
            for (ix = 1; ix < fieldCount; ++ix)
                fprintf(f, "\t%s", row[columnArray[ix]]);
            fprintf(f, "\n");
            maxOut--;
            }
        vcfFileFree(&vcff);
        freeMem(fileName);
        }

    if (maxOut == 0)
        warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());

    /* Release scratch buffers and lookup structures. */
    dyStringFree(&dyAlt);
    dyStringFree(&dyFilter);
    dyStringFree(&dyInfo);
    dyStringFree(&dyGt);
    hashFree(&fieldHash);
    freeMem(fieldArray);
    freeMem(columnArray);
}