struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words) /* Parse words from a VCF data line into a VCF record structure. */ { struct vcfRecord *record = vcfFileAlloc(vcff, sizeof(struct vcfRecord)); record->file = vcff; record->chrom = vcfFilePooledStr(vcff, words[0]); record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1; // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn. record->chromEnd = record->chromStart+1; record->name = vcfFilePooledStr(vcff, words[2]); parseRefAndAlt(vcff, record, words[3], words[4]); record->qual = vcfFilePooledStr(vcff, words[5]); parseFilterColumn(vcff, record, words[6]); // ADDED BY BO PENG to get whole INFO column record->unparsedInfoElements = vcfFilePooledStr(vcff, words[7]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } return record; }
static void parseColumnHeaderRow(struct vcfFile *vcff, char *line)
/* Make sure column names are as we expect, and store genotype sample IDs if any are given.
 * line is chopped into words starting just after its leading '#'.
 * - If line does not begin with '#', an error is reported through vcfFileErr, the
 *   line is handed back via lineFileReuse, and nothing else is done.
 * - If fewer than the 8 fixed column names are found, an error is reported and the
 *   function returns without checking any names.
 * - If there are more than 8 columns, column 8 must be FORMAT and every column after
 *   it is cloned into vcff->genotypeIds, with vcff->genotypeCount set to match. */
{
if (line[0] != '#')
    {
    vcfFileErr(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), "
               "not \"%s\"", line);
    lineFileReuse(vcff->lf);
    return;
    }
char *words[VCF_MAX_COLUMNS];
int wordCount = chopLine(line+1, words);
if (wordCount >= VCF_MAX_COLUMNS)
    vcfFileErr(vcff, "header contains at least %d columns; "
               "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS);
if (wordCount < 8)
    {
    // words[] is an uninitialized local and entries at or beyond wordCount are
    // presumably never written by chopLine, so the fixed-column checks below
    // would read indeterminate pointers.  Report the short header and stop.
    vcfFileErr(vcff, "Expected at least 8 column names (\"#CHROM POS ...\") "
               "but found only %d", wordCount);
    return;
    }
expectColumnName(vcff, "CHROM", words, 0);
expectColumnName(vcff, "POS", words, 1);
expectColumnName(vcff, "ID", words, 2);
expectColumnName(vcff, "REF", words, 3);
expectColumnName(vcff, "ALT", words, 4);
// Column 5 may be named either QUAL or PROB.
expectColumnName2(vcff, "QUAL", "PROB", words, 5);
expectColumnName(vcff, "FILTER", words, 6);
expectColumnName(vcff, "INFO", words, 7);
if (wordCount > 8)
    {
    expectColumnName(vcff, "FORMAT", words, 8);
    if (wordCount < 10)
        vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
    // Everything after FORMAT (index 8) is a sample ID.
    vcff->genotypeCount = (wordCount - 9);
    vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
    int i;
    for (i = 9;  i < wordCount;  i++)
        vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
    }
}
static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string) /* Translate string into array of vcfInfoElement. */ { if (sameString(string, ".")) { record->infoCount = 0; return; } char *elWords[VCF_MAX_INFO]; record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords)); if (record->infoCount >= VCF_MAX_INFO) vcfFileErr(vcff, "INFO column contains at least %d elements; " "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO); record->infoElements = vcfFileAlloc(vcff, record->infoCount * sizeof(struct vcfInfoElement)); char *emptyString = vcfFilePooledStr(vcff, ""); int i; for (i = 0; i < record->infoCount; i++) { char *elStr = elWords[i]; char *eq = strchr(elStr, '='); struct vcfInfoElement *el = &(record->infoElements[i]); if (eq == NULL) { el->key = vcfFilePooledStr(vcff, elStr); enum vcfInfoType type = typeForInfoKey(vcff, el->key); if (type != vcfInfoFlag) { vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)", elStr, type); if (type == vcfInfoString) { el->values = vcfFileAlloc(vcff, sizeof(union vcfDatum)); el->values[0].datString = emptyString; } } continue; } *eq = '\0'; el->key = vcfFilePooledStr(vcff, elStr); enum vcfInfoType type = typeForInfoKey(vcff, el->key); char *valStr = eq+1; el->count = parseInfoValue(record, el->key, type, valStr, &(el->values), &(el->missingData)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the INFO column has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO); } }
static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr, union vcfDatum **pData, bool **pMissingData) /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */ { char *valWords[VCF_MAX_INFO]; int count = chopCommas(valStr, valWords); struct vcfFile *vcff = record->file; union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum)); bool *missingData = vcfFileAlloc(vcff, count * sizeof(*missingData)); int j; for (j = 0; j < count; j++) { if (type != vcfInfoString && type != vcfInfoCharacter && sameString(valWords[j], ".")) missingData[j] = TRUE; switch (type) { case vcfInfoInteger: data[j].datInt = atoi(valWords[j]); break; case vcfInfoFloat: data[j].datFloat = atof(valWords[j]); break; case vcfInfoFlag: // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1 data[j].datString = vcfFilePooledStr(vcff, valWords[j]); break; case vcfInfoCharacter: data[j].datChar = valWords[j][0]; break; case vcfInfoString: data[j].datString = vcfFilePooledStr(vcff, valWords[j]); break; default: errAbort("invalid vcfInfoType (uninitialized?) %d", type); break; } } // If END is given, use it as chromEnd: if (sameString(infoKey, vcfInfoEnd)) record->chromEnd = data[0].datInt; *pData = data; *pMissingData = missingData; return count; }
static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt)
/* Fill in record->alleles[] (reference allele first, then each allele from the
 * comma-separated ALT column) and record->alleleCount.
 * When the length of ref equals dnaFilteredSize(ref), record->chromEnd is set to
 * record->chromStart plus that length.
 * Note: alt is trashed by the chopping, since this is expected to be its last use. */
{
char *altWords[VCF_MAX_INFO];
int altCount = chopCommas(alt, altWords);
record->alleleCount = altCount + 1;
record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0]));

// Slot 0 holds the reference allele; the alternates follow in column order.
record->alleles[0] = vcfFilePooledStr(vcff, ref);
int altIx = 0;
while (altIx < altCount)
    {
    record->alleles[altIx + 1] = vcfFilePooledStr(vcff, altWords[altIx]);
    altIx++;
    }

int refLen = strlen(ref);
if (dnaFilteredSize(ref) == refLen)
    record->chromEnd = record->chromStart + refLen;
}
static void parseFilterColumn(struct vcfFile *vcff, struct vcfRecord *record, char *filterStr) /* Transform ;-separated filter codes into count + string array. */ { // We don't want to modify something allocated with vcfFilePooledStr because that uses // hash element names for storage! So don't make a vcfFilePooledStr copy of filterStr and // chop that; instead, chop a temp string and pool the words separately. static struct dyString *tmp = NULL; if (tmp == NULL) tmp = dyStringNew(0); dyStringClear(tmp); dyStringAppend(tmp, filterStr); record->filterCount = countChars(filterStr, ';') + 1; record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **)); (void)chopByChar(tmp->string, ';', record->filters, record->filterCount); int i; for (i = 0; i < record->filterCount; i++) record->filters[i] = vcfFilePooledStr(vcff, record->filters[i]); }
static void vcfParseData(struct vcfFile *vcff, int maxRecords) /* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned * at the beginning of a data row, parse and store all data rows from lineFile. */ { if (vcff == NULL) return; int recCount = 0, expected = 8; if (vcff->genotypeCount > 0) expected = 9 + vcff->genotypeCount; char *words[VCF_MAX_COLUMNS]; int wordCount; while ((wordCount = lineFileChop(vcff->lf, words)) > 0) { if (maxRecords >= 0 && recCount >= maxRecords) break; lineFileExpectWords(vcff->lf, expected, wordCount); struct vcfRecord *record; AllocVar(record); record->file = vcff; record->chrom = vcfFilePooledStr(vcff, words[0]); record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1; // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn. record->chromEnd = record->chromStart+1; record->name = vcfFilePooledStr(vcff, words[2]); parseRefAndAlt(vcff, record, words[3], words[4]); record->qual = vcfFilePooledStr(vcff, words[5]); parseFilterColumn(vcff, record, words[6]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } slAddHead(&(vcff->records), record); recCount++; } slReverse(&(vcff->records)); lineFileClose(&(vcff->lf)); }
void vcfParseGenotypes(struct vcfRecord *record) /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */ { if (record->genotypeUnparsedStrings == NULL) return; struct vcfFile *vcff = record->file; record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype)); char format[VCF_MAX_FORMAT_LEN]; safecpy(format, sizeof(format), record->format); char *formatWords[VCF_MAX_FORMAT]; int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords)); if (formatWordCount >= VCF_MAX_FORMAT) { vcfFileErr(vcff, "The FORMAT column has at least %d words; " "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT); formatWordCount = VCF_MAX_FORMAT; } if (differentString(formatWords[0], vcfGtGenotype)) vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"", vcfGtGenotype, formatWords[0]); int i; // Store the pooled format word pointers and associated types for use in inner loop below. 
enum vcfInfoType formatTypes[VCF_MAX_FORMAT]; for (i = 0; i < formatWordCount; i++) { formatTypes[i] = typeForGtFormat(vcff, formatWords[i]); formatWords[i] = vcfFilePooledStr(vcff, formatWords[i]); } for (i = 0; i < vcff->genotypeCount; i++) { char *string = record->genotypeUnparsedStrings[i]; struct vcfGenotype *gt = &(record->genotypes[i]); // Each genotype can have multiple :-separated info elements: char *gtWords[VCF_MAX_FORMAT]; int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords)); if (gtWordCount != formatWordCount) vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s " "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount); if (gtWordCount > formatWordCount) gtWordCount = formatWordCount; gt->id = vcff->genotypeIds[i]; gt->infoCount = gtWordCount; gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement)); int j; for (j = 0; j < gtWordCount; j++) { // Special parsing of genotype: if (sameString(formatWords[j], vcfGtGenotype)) { char *genotype = gtWords[j]; char *sep = strchr(genotype, '|'); if (sep != NULL) gt->isPhased = TRUE; else sep = strchr(genotype, '/'); if (genotype[0] == '.') gt->hapIxA = -1; else gt->hapIxA = atoi(genotype); if (sep == NULL) gt->isHaploid = TRUE; else if (sep[1] == '.') gt->hapIxB = -1; else gt->hapIxB = atoi(sep+1); } struct vcfInfoElement *el = &(gt->infoElements[j]); el->key = formatWords[j]; el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j], &(el->values), &(el->missingData)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the genotype column for \"%s\" " "has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", gt->id, VCF_MAX_INFO); } } record->genotypeUnparsedStrings = NULL; }
static void parseMetadataLine(struct vcfFile *vcff, char *line) /* Parse a VCF header line beginning with "##" that defines a metadata. */ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); ptr += 2; char *firstEq = strchr(ptr, '='); if (firstEq == NULL) { vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line); return; } regmatch_t substrs[8]; // Some of the metadata lines are crucial for parsing the rest of the file: if (startsWith("##fileformat=", line) || startsWith("##format", line)) { if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs))) { // substrs[2] is major version #, substrs[3] is set only if there is a minor version, // and substrs[4] is the minor version #. vcff->majorVersion = atoi(line + substrs[2].rm_so); if (substrs[3].rm_so != -1) vcff->minorVersion = atoi(line + substrs[4].rm_so); } else vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"", fileformatRegex, line); } else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line)) { boolean isInfo = startsWith("##INFO=", line); nonAsciiWorkaround(line); if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)) || regexMatchSubstr(line, infoOrFormatRegex3_3, substrs, ArraySize(substrs))) // substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description. { struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); char *number = vcfFileCloneSubstr(vcff, line, substrs[3]); if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G")) // A is #alts which varies line-to-line; "G" is #genotypes which we haven't // yet seen. Why is there a G here -- shouldn't such attributes go in the // genotype columns? 
def->fieldCount = -1; else def->fieldCount = atoi(number); def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]); // greedy regex pulls in end quote, trim if found: if (line[substrs[5].rm_eo-1] == '"') line[substrs[5].rm_eo-1] = '\0'; def->description = vcfFileCloneSubstr(vcff, line, substrs[5]); slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def); } else vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"", (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line); } else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line)) { boolean isFilter = startsWith("##FILTER", line); if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) || regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs))) { // substrs[2] is ID/key, substrs[4] is Description. struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); def->description = vcfFileCloneSubstr(vcff, line, substrs[4]); slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def); } else { if (isFilter) vcfFileErr(vcff, "##FILTER line does not match expected pattern /%s/ or /%s/: \"%s\"", filterOrAltRegex, filterRegex3_3, line); else vcfFileErr(vcff, "##ALT line does not match expected pattern /%s/: \"%s\"", filterOrAltRegex, line); } } }