Beispiel #1
0
void initTitleHelper(struct titleHelper *th, char *track, int startIx, int centerIx, int endIx,
		     int nRecords, struct vcfFile *vcff)
/* Set up info including arrays of ref & alt alleles for cluster mouseover. */
{
th->track = track;
th->startIx = startIx;
th->centerIx = centerIx;
th->endIx = endIx;
th->nRecords = nRecords;
int len = endIx - startIx;
AllocArray(th->refs, len);
AllocArray(th->alts, len);
struct vcfRecord *rec;
int i;
for (rec = vcff->records, i = 0;  rec != NULL && i < endIx;  rec = rec->next, i++)
    {
    if (i < startIx)
	continue;
    char refAl[16];
    abbrevAndHandleRC(refAl, sizeof(refAl), rec->alleles[0]);
    th->refs[i-startIx] = vcfFilePooledStr(vcff, refAl);
    char altAl1[16];
    abbrevAndHandleRC(altAl1, sizeof(altAl1), rec->alleles[1]);
    tolowers(altAl1);
    th->alts[i-startIx] = vcfFilePooledStr(vcff, altAl1);
    }
}
Beispiel #2
0
struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words)
/* Parse words from a VCF data line into a VCF record structure. */
{
struct vcfRecord *record = vcfFileAlloc(vcff, sizeof(struct vcfRecord));
record->file = vcff;
record->chrom = vcfFilePooledStr(vcff, words[0]);
record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
// chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn.
record->chromEnd = record->chromStart+1;
record->name = vcfFilePooledStr(vcff, words[2]);
parseRefAndAlt(vcff, record, words[3], words[4]);
record->qual = vcfFilePooledStr(vcff, words[5]);
parseFilterColumn(vcff, record, words[6]);
// ADDED BY BO PENG to get whole INFO column
record->unparsedInfoElements = vcfFilePooledStr(vcff, words[7]);
parseInfoColumn(vcff, record, words[7]);
if (vcff->genotypeCount > 0)
    {
    record->format = vcfFilePooledStr(vcff, words[8]);
    record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
						   vcff->genotypeCount * sizeof(char *));
    int i;
    // Don't bother actually parsing all these until & unless we need the info:
    for (i = 0;  i < vcff->genotypeCount;  i++)
	record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
    }
return record;
}
Beispiel #3
0
static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string)
/* Translate string into array of vcfInfoElement. */
{
if (sameString(string, "."))
    {
    record->infoCount = 0;
    return;
    }
char *elWords[VCF_MAX_INFO];
record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords));
if (record->infoCount >= VCF_MAX_INFO)
    vcfFileErr(vcff, "INFO column contains at least %d elements; "
	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
record->infoElements = vcfFileAlloc(vcff, record->infoCount * sizeof(struct vcfInfoElement));
char *emptyString = vcfFilePooledStr(vcff, "");
int i;
for (i = 0;  i < record->infoCount;  i++)
    {
    char *elStr = elWords[i];
    char *eq = strchr(elStr, '=');
    struct vcfInfoElement *el = &(record->infoElements[i]);
    if (eq == NULL)
	{
	el->key = vcfFilePooledStr(vcff, elStr);
	enum vcfInfoType type = typeForInfoKey(vcff, el->key);
	if (type != vcfInfoFlag)
	    {
	    vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)",
		       elStr, type);
	    if (type == vcfInfoString)
		{
		el->values = vcfFileAlloc(vcff, sizeof(union vcfDatum));
		el->values[0].datString = emptyString;
		}
	    }
	continue;
	}
    *eq = '\0';
    el->key = vcfFilePooledStr(vcff, elStr);
    enum vcfInfoType type = typeForInfoKey(vcff, el->key);
    char *valStr = eq+1;
    el->count = parseInfoValue(record, el->key, type, valStr, &(el->values), &(el->missingData));
    if (el->count >= VCF_MAX_INFO)
	vcfFileErr(vcff, "A single element of the INFO column has at least %d values; "
	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
    }
}
Beispiel #4
0
static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt)
/* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns.
 * Use the length of the reference sequence to set record->chromEnd.
 * Note: this trashes the alt argument, since this is expected to be its last use. */
{
char *altAlleles[VCF_MAX_INFO];
int altCount = chopCommas(alt, altAlleles);
record->alleleCount = 1 + altCount;
record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0]));
record->alleles[0] = vcfFilePooledStr(vcff, ref);
int i;
for (i = 0;  i < altCount;  i++)
    record->alleles[1+i] = vcfFilePooledStr(vcff, altAlleles[i]);
int refLen = strlen(ref);
if (refLen == dnaFilteredSize(ref))
    record->chromEnd = record->chromStart + refLen;
}
Beispiel #5
0
static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type,
			  char *valStr, union vcfDatum **pData, bool **pMissingData)
/* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
{
char *valWords[VCF_MAX_INFO];
int count = chopCommas(valStr, valWords);
struct vcfFile *vcff = record->file;
union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum));
bool *missingData = vcfFileAlloc(vcff, count * sizeof(*missingData));
int j;
for (j = 0;  j < count;  j++)
    {
    if (type != vcfInfoString && type != vcfInfoCharacter && sameString(valWords[j], "."))
	missingData[j] = TRUE;
    switch (type)
	{
	case vcfInfoInteger:
	    data[j].datInt = atoi(valWords[j]);
	    break;
	case vcfInfoFloat:
	    data[j].datFloat = atof(valWords[j]);
	    break;
	case vcfInfoFlag:
	    // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1
	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
	    break;
	case vcfInfoCharacter:
	    data[j].datChar = valWords[j][0];
	    break;
	case vcfInfoString:
	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
	    break;
	default:
	    errAbort("invalid vcfInfoType (uninitialized?) %d", type);
	    break;
	}
    }
// If END is given, use it as chromEnd:
if (sameString(infoKey, vcfInfoEnd))
    record->chromEnd = data[0].datInt;
*pData = data;
*pMissingData = missingData;
return count;
}
Beispiel #6
0
Datei: vcf.c Projekt: bh0085/kent
static void vcfParseData(struct vcfFile *vcff, int maxRecords)
/* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
 * at the beginning of a data row, parse and store all data rows from lineFile. */
{
if (vcff == NULL)
    return;
int recCount = 0, expected = 8;
if (vcff->genotypeCount > 0)
    expected = 9 + vcff->genotypeCount;
char *words[VCF_MAX_COLUMNS];
int wordCount;
while ((wordCount = lineFileChop(vcff->lf, words)) > 0)
    {
    if (maxRecords >= 0 && recCount >= maxRecords)
	break;
    lineFileExpectWords(vcff->lf, expected, wordCount);
    struct vcfRecord *record;
    AllocVar(record);
    record->file = vcff;
    record->chrom = vcfFilePooledStr(vcff, words[0]);
    record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
    // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn.
    record->chromEnd = record->chromStart+1;
    record->name = vcfFilePooledStr(vcff, words[2]);
    parseRefAndAlt(vcff, record, words[3], words[4]);
    record->qual = vcfFilePooledStr(vcff, words[5]);
    parseFilterColumn(vcff, record, words[6]);
    parseInfoColumn(vcff, record, words[7]);
    if (vcff->genotypeCount > 0)
	{
	record->format = vcfFilePooledStr(vcff, words[8]);
	record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
						       vcff->genotypeCount * sizeof(char *));
	int i;
	// Don't bother actually parsing all these until & unless we need the info:
	for (i = 0;  i < vcff->genotypeCount;  i++)
	    record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
	}
    slAddHead(&(vcff->records), record);
    recCount++;
    }
slReverse(&(vcff->records));
lineFileClose(&(vcff->lf));
}
Beispiel #7
0
unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec)
/* For indels, VCF includes the left neighboring base; for example, if the alleles are
 * AA/- following a G base, then the VCF record will start one base to the left and have
 * "GAA" and "G" as the alleles.  That is not nice for display for two reasons:
 * 1. Indels appear one base wider than their dbSNP entries.
 * 2. In pgSnp display mode, the two alleles are always the same color.
 * However, for hgTracks' mapBox we need the correct chromStart for identifying the
 * record in hgc -- so return the original chromStart. */
{
unsigned int chromStartOrig = rec->chromStart;
struct vcfFile *vcff = rec->file;
if (rec->alleleCount > 1)
    {
    boolean allSameFirstBase = TRUE;
    char firstBase = rec->alleles[0][0];
    int i;
    for (i = 1;  i < rec->alleleCount;  i++)
	if (rec->alleles[i][0] != firstBase)
	    {
	    allSameFirstBase = FALSE;
	    break;
	    }
    if (allSameFirstBase)
	{
	rec->chromStart++;
	for (i = 0;  i < rec->alleleCount;  i++)
	    {
	    if (rec->alleles[i][1] == '\0')
		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
	    else
		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
	    }
	}
    }
return chromStartOrig;
}
Beispiel #8
0
static void parseFilterColumn(struct vcfFile *vcff, struct vcfRecord *record, char *filterStr)
/* Transform ;-separated filter codes into count + string array. */
{
// We don't want to modify something allocated with vcfFilePooledStr because that uses
// hash element names for storage!  So don't make a vcfFilePooledStr copy of filterStr and
// chop that; instead, chop a temp string and pool the words separately.
static struct dyString *tmp = NULL;
if (tmp == NULL)
    tmp = dyStringNew(0);
dyStringClear(tmp);
dyStringAppend(tmp, filterStr);
record->filterCount = countChars(filterStr, ';') + 1;
record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **));
(void)chopByChar(tmp->string, ';', record->filters, record->filterCount);
int i;
for (i = 0;  i < record->filterCount;  i++)
    record->filters[i] = vcfFilePooledStr(vcff, record->filters[i]);
}
Beispiel #9
0
void vcfParseGenotypes(struct vcfRecord *record)
/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
 * This destroys genotypesUnparsedStrings. */
{
if (record->genotypeUnparsedStrings == NULL)
    return;
struct vcfFile *vcff = record->file;
record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype));
char format[VCF_MAX_FORMAT_LEN];
safecpy(format, sizeof(format), record->format);
char *formatWords[VCF_MAX_FORMAT];
int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords));
if (formatWordCount >= VCF_MAX_FORMAT)
    {
    vcfFileErr(vcff, "The FORMAT column has at least %d words; "
	       "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT);
    formatWordCount = VCF_MAX_FORMAT;
    }
if (differentString(formatWords[0], vcfGtGenotype))
    vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"",
	       vcfGtGenotype, formatWords[0]);
int i;
// Store the pooled format word pointers and associated types for use in inner loop below.
enum vcfInfoType formatTypes[VCF_MAX_FORMAT];
for (i = 0;  i < formatWordCount;  i++)
    {
    formatTypes[i] = typeForGtFormat(vcff, formatWords[i]);
    formatWords[i] = vcfFilePooledStr(vcff, formatWords[i]);
    }
for (i = 0;  i < vcff->genotypeCount;  i++)
    {
    char *string = record->genotypeUnparsedStrings[i];
    struct vcfGenotype *gt = &(record->genotypes[i]);
    // Each genotype can have multiple :-separated info elements:
    char *gtWords[VCF_MAX_FORMAT];
    int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords));
    if (gtWordCount != formatWordCount)
	vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s "
		   "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount);
    if (gtWordCount > formatWordCount)
	gtWordCount = formatWordCount;
    gt->id = vcff->genotypeIds[i];
    gt->infoCount = gtWordCount;
    gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement));
    int j;
    for (j = 0;  j < gtWordCount;  j++)
	{
	// Special parsing of genotype:
	if (sameString(formatWords[j], vcfGtGenotype))
	    {
	    char *genotype = gtWords[j];
	    char *sep = strchr(genotype, '|');
	    if (sep != NULL)
		gt->isPhased = TRUE;
	    else
		sep = strchr(genotype, '/');
	    if (genotype[0] == '.')
		gt->hapIxA = -1;
	    else
		gt->hapIxA = atoi(genotype);
	    if (sep == NULL)
		gt->isHaploid = TRUE;
	    else if (sep[1] == '.')
		gt->hapIxB = -1;
	    else
		gt->hapIxB = atoi(sep+1);
	    }
	struct vcfInfoElement *el = &(gt->infoElements[j]);
	el->key = formatWords[j];
	el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j],
				   &(el->values), &(el->missingData));
	if (el->count >= VCF_MAX_INFO)
	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
		       "has at least %d values; "
		       "VCF_MAX_INFO may need to be increased in vcf.c!",
		       gt->id, VCF_MAX_INFO);
	}
    }
record->genotypeUnparsedStrings = NULL;
}