struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
                                    int maxErr, int maxRecords)
/* Open a tabix-compressed-and-indexed VCF file, parse its header into a vcfFile
 * object, and optionally parse the rows that fall in a position range.
 *   fileOrUrl  - file or URL handed to lineFileTabixMayOpen()
 *   chrom      - sequence name of the range; NULL or "" means header only
 *   start, end - range bounds; when start == end no rows are parsed
 *   maxErr     - passed to the header parser.  If not zero, parsing continues
 *                until this number of errors has been reached; a value less
 *                than zero does not stop and reports all errors.
 *   maxRecords - passed through to vcfParseData()
 * Returns NULL only when vcfFileHeaderFromLineFile() returns NULL.  Otherwise
 * the vcfFile is returned, even if the region could not be set. */
{
/* NOTE(review): tabixLf may be NULL if the open failed; this relies on
 * vcfFileHeaderFromLineFile() coping with a NULL lineFile -- confirm. */
struct lineFile *tabixLf = lineFileTabixMayOpen(fileOrUrl, TRUE);
struct vcfFile *vcff = vcfFileHeaderFromLineFile(tabixLf, maxErr);
if (vcff == NULL)
    return NULL;

/* No usable range: the caller gets the header-only object. */
if (!isNotEmpty(chrom) || start == end)
    return vcff;

/* Rows are parsed only if the tabix index accepts the region. */
if (!lineFileSetTabixRegion(tabixLf, chrom, start, end))
    return vcff;

vcfParseData(vcff, maxRecords);
return vcff;
}
struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords) /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. If chrom is non-NULL, * seek to the position range and parse all lines in range into * vcff->records. If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */ { struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (vcff == NULL) return NULL; if (isNotEmpty(chrom) && start != end) { if (lineFileSetTabixRegion(lf, chrom, start, end)) { vcff->records = vcfParseData(vcff, maxRecords); lineFileClose(&(vcff->lf)); // Not sure why it is closed. Angie? } } return vcff; }
struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count, boolean isTabix)
/* Return some semi-random IDs from a VCF file.
 *   table   - table name; looked up in fullTableToTdbHash to find its trackDb
 *   conn    - connection handed to vcfFileName() to resolve the file name
 *   count   - number of IDs wanted
 *   isTabix - TRUE to open the file with lineFileTabixMayOpen(), otherwise
 *             lineFileMayOpen() is used
 * Returns a shuffled slName list trimmed to count names.  The trim loop always
 * keeps the first name, so a non-empty list is never trimmed below one entry.
 * Calls noWarnAbort() if the file cannot be opened. */
{
/* Read at least 100 lines from the vcf file, or if they ask for a big list,
 * then 4x what they ask for. */
struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database));
struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE)
                              : lineFileMayOpen(fileName, TRUE);
if (lf == NULL)
    noWarnAbort();
int orderedCount = count * 4;
if (orderedCount < 100)
    orderedCount = 100;
struct slName *idList = NULL;
char *words[4];
int wordCount;
int i;
for (i = 0;  i < orderedCount && (wordCount = lineFileChop(lf, words)) != 0;  i++)
    {
    /* The ID is taken from the third word.  A line with fewer than three
     * words never fills words[2], so skip it instead of reading an unset or
     * stale pointer. */
    if (wordCount < 3)
        continue;
    /* Compress runs of identical ID, in case most are placeholder.  The head
     * of idList is the most recently added ID; test idList rather than i so
     * that skipped short lines cannot lead to a NULL dereference. */
    if (idList == NULL || !sameString(words[2], idList->name))
        slAddHead(&idList, slNameNew(words[2]));
    }
lineFileClose(&lf);

/* Shuffle list and trim it to count if necessary. */
shuffleList(&idList);
struct slName *sl;
for (sl = idList, i = 0;  sl != NULL;  sl = sl->next, i++)
    {
    if (i+1 >= count)
        {
        /* sl is the last name to keep; drop everything after it. */
        slNameFreeList(&(sl->next));
        break;
        }
    }
freez(&fileName);
return idList;
}
void showSchemaVcf(char *table, struct trackDb *tdb, boolean isTabix)
/* Show schema on vcf.
 *   table   - table name, printed and used to resolve the VCF file name
 *   tdb     - track settings, used for the file name and for printTrackHtml()
 *   isTabix - TRUE to read the file with lineFileTabixMayOpen(), otherwise
 *             lineFileMayOpen() is used
 * Prints the database/table/file heading, a table of the field names and
 * descriptions from vcfAsObj(), then a "Sample Rows" section with up to 10
 * rows read from the file, and finally the track's HTML.
 * Calls noWarnAbort() if the file cannot be opened. */
{
struct sqlConnection *conn = hAllocConn(database);
char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database));
struct asObject *as = vcfAsObj();

hPrintf("<B>Database:</B> %s", database);
hPrintf(" <B>Primary Table:</B> %s<br>", table);
hPrintf("<B>VCF File:</B> %s", fileName);
hPrintf("<BR>\n");
hPrintf("<B>Format description:</B> %s<BR>", as->comment);
hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for more details<BR>\n",
        "http://www.1000genomes.org/wiki/analysis/vcf4.0");

/* Put up table that describes fields. */
hTableStart();
hPrintf("<TR><TH>field</TH>");
hPrintf("<TH>description</TH> ");
puts("</TR>\n");
struct asColumn *col;
for (col = as->columnList;  col != NULL;  col = col->next)
    {
    hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
    hPrintf("<TD>%s</TD></TR>", col->comment);
    }
hTableEnd();

/* Put up another section with sample rows. */
webNewSection("Sample Rows");
hTableStart();

/* Fetch sample rows. */
struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE)
                              : lineFileMayOpen(fileName, TRUE);
if (lf == NULL)
    noWarnAbort();
char *row[VCF_MAX_SCHEMA_COLS];
int i;
for (i = 0;  i < 10;  i++)
    {
    int colCount = lineFileChop(lf, row);
    /* A zero word count means there are no more rows to show; stop here
     * rather than emit empty <TR></TR> rows for the remaining iterations. */
    if (colCount == 0)
        break;
    int colIx;
    if (i == 0)
        {
        /* Print field names as column headers, using colCount to compute
         * genotype span. */
        hPrintf("<TR>");
        for (colIx = 0, col = as->columnList;  col != NULL && colIx < colCount;
             colIx++, col = col->next)
            {
            if (sameString("genotypes", col->name) && colCount > colIx+1)
                hPrintf("<TH colspan=%d>%s</TH>", colCount - colIx, col->name);
            else
                hPrintf("<TH>%s</TH>", col->name);
            }
        hPrintf("</TR>\n");
        }
    hPrintf("<TR>");
    for (colIx = 0;  colIx < colCount;  ++colIx)
        {
        /* When the row has more than VCFDATALINE_NUM_COLS columns, the last
         * chopped column is shown as "..." instead of its contents. */
        if (colCount > VCFDATALINE_NUM_COLS && colIx == colCount - 1)
            hPrintf("<TD>...</TD>");
        else
            writeHtmlCell(row[colIx]);
        }
    hPrintf("</TR>\n");
    }
hTableEnd();
printTrackHtml(tdb);

/* Clean up and go home. */
lineFileClose(&lf);
freeMem(fileName);
hFreeConn(&conn);
}