Example #1
0
File: vcf.c Project: bh0085/kent
struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
				    int maxErr, int maxRecords)
/* Parse header and rows within the given position range from a VCF file that has been
 * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has
 * no items in range.
 * If maxErr not zero, then continue to parse until this number of error have been reached.
 * A maxErr less than zero does not stop and reports all errors. */
{
struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE);
struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
if (vcff == NULL)
    return NULL;
if (isNotEmpty(chrom) && start != end)
    {
    if (lineFileSetTabixRegion(lf, chrom, start, end))
	vcfParseData(vcff, maxRecords);
    }
return vcff;
}
Example #2
0
struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
				    int maxErr, int maxRecords)
/* Open a VCF file that has been compressed and indexed by tabix and
 * parse VCF header, or return NULL if unable.  If chrom is non-NULL,
 * seek to the position range and parse all lines in range into
 * vcff->records.  If maxErr >= zero, then continue to parse until
 * there are maxErr+1 errors.  A maxErr less than zero does not stop
 * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
{
struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE);
struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
if (vcff == NULL)
    return NULL;
if (isNotEmpty(chrom) && start != end)
    {
    if (lineFileSetTabixRegion(lf, chrom, start, end))
        {
        vcff->records = vcfParseData(vcff, maxRecords);
        lineFileClose(&(vcff->lf)); // Not sure why it is closed.  Angie?
        }
    }
return vcff;
}
Example #3
0
struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count, boolean isTabix)
/* Return some semi-random IDs from a VCF file. */
{
/* Read 10000 items from vcf file,  or if they ask for a big list, then 4x what they ask for. */
struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database));
struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) :
				lineFileMayOpen(fileName, TRUE);
if (lf == NULL)
    noWarnAbort();
int orderedCount = count * 4;
if (orderedCount < 100)
    orderedCount = 100;
struct slName *idList = NULL;
char *words[4];
int i;
for (i = 0;  i < orderedCount && lineFileChop(lf, words); i++)
    {
    // compress runs of identical ID, in case most are placeholder
    if (i == 0 || !sameString(words[2], idList->name))
	slAddHead(&idList, slNameNew(words[2]));
    }
lineFileClose(&lf);
/* Shuffle list and trim it to count if necessary. */
shuffleList(&idList);
struct slName *sl;
for (sl = idList, i = 0; sl != NULL; sl = sl->next, i++)
    {
    if (i+1 >= count)
	{
	slNameFreeList(&(sl->next));
	break;
	}
    }
freez(&fileName);
return idList;
}
Example #4
0
void showSchemaVcf(char *table, struct trackDb *tdb, boolean isTabix)
/* Show schema on vcf. */
{
struct sqlConnection *conn = hAllocConn(database);
char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database));

struct asObject *as = vcfAsObj();
hPrintf("<B>Database:</B> %s", database);
hPrintf("&nbsp;&nbsp;&nbsp;&nbsp;<B>Primary Table:</B> %s<br>", table);
hPrintf("<B>VCF File:</B> %s", fileName);
hPrintf("<BR>\n");
hPrintf("<B>Format description:</B> %s<BR>", as->comment);
hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for  more details<BR>\n",
	"http://www.1000genomes.org/wiki/analysis/vcf4.0");

/* Put up table that describes fields. */
hTableStart();
hPrintf("<TR><TH>field</TH>");
hPrintf("<TH>description</TH> ");
puts("</TR>\n");
struct asColumn *col;
int colCount = 0;
for (col = as->columnList; col != NULL; col = col->next)
    {
    hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
    hPrintf("<TD>%s</TD></TR>", col->comment);
    ++colCount;
    }
hTableEnd();

/* Put up another section with sample rows. */
webNewSection("Sample Rows");
hTableStart();

/* Fetch sample rows. */
struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) :
				lineFileMayOpen(fileName, TRUE);
if (lf == NULL)
    noWarnAbort();
char *row[VCF_MAX_SCHEMA_COLS];
int i;
for (i = 0;  i < 10;  i++)
    {
    int colCount = lineFileChop(lf, row);
    int colIx;
    if (i == 0)
	{
	// Print field names as column headers, using colCount to compute genotype span
	hPrintf("<TR>");
	for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount;
	     colIx++, col = col->next)
	    {

	    if (sameString("genotypes", col->name) && colCount > colIx+1)
		hPrintf("<TH colspan=%d>%s</TH>", colCount - colIx, col->name);
	    else
		hPrintf("<TH>%s</TH>", col->name);
	    }
	hPrintf("</TR>\n");
	}
    hPrintf("<TR>");
    for (colIx=0; colIx < colCount; ++colIx)
	{
	if (colCount > VCFDATALINE_NUM_COLS && colIx == colCount - 1)
	    hPrintf("<TD>...</TD>");
	else
	    writeHtmlCell(row[colIx]);
	}
    hPrintf("</TR>\n");
    }
hTableEnd();
printTrackHtml(tdb);

/* Clean up and go home. */
lineFileClose(&lf);
freeMem(fileName);
hFreeConn(&conn);
}