/**
 * Get genotype probabilities by parsing the GP token from a VCF line.
 *
 * vcf_info   - supplies n_sample; exactly n_sample * 3 probabilities are
 *              expected on the line.
 * geno_probs - output array, must have room for n_sample * 3 floats. For
 *              each sample the homozygous-ref, heterozygous and
 *              homozygous-alt probabilities are stored consecutively.
 * cur        - the tab-delimited per-sample portion of the line. It is
 *              tokenized in place by strsep(), i.e. it is modified.
 * gp_idx     - zero-based position of the GP field within each
 *              ':'-delimited genotype string.
 *
 * Errors (unparseable GP field, too many or too few samples) are reported
 * through my_err().
 */
void vcf_parse_gp(VCFInfo *vcf_info, float *geno_probs, char *cur, long gp_idx)
{
  char delim[] = "\t";
  char inner_delim[] = ":";
  char *tok, *inner_tok, *inner_cur;
  char gtype[VCF_MAX_FORMAT];
  long i, n, n_geno_probs, expect_geno_probs;
  float prob_homo_ref, prob_het, prob_homo_alt, prob_sum;

  expect_geno_probs = vcf_info->n_sample * 3;
  n_geno_probs = 0;

  while((tok = strsep(&cur, delim)) != NULL) {
    /* each genotype string is delimited by ':'
     * each GP portion is delimited by ','
     * work on a copy so the inner tokenizer does not touch the line buffer
     */
    util_strncpy(gtype, tok, sizeof(gtype));

    i = 0;
    inner_cur = gtype;
    while((i <= gp_idx) &&
          (inner_tok = strsep(&inner_cur, inner_delim)) != NULL) {
      if(i == gp_idx) {
        n = sscanf(inner_tok, "%g,%g,%g", &prob_homo_ref, &prob_het,
                   &prob_homo_alt);

        if(n != 3) {
          if(strcmp(inner_tok, ".") == 0) {
            /* '.' indicates missing data
             * set all probabilities to 0.333
             */
            prob_homo_ref = prob_het = prob_homo_alt = 0.333;
          } else {
            my_err("%s:%d: failed to parse genotype probabilities from "
                   "string '%s'", __FILE__, __LINE__, inner_tok);
          }
        }

        /* refuse to write past the end of the caller's array when the
         * line holds more samples than the header declared
         */
        if((n_geno_probs + 3) > expect_geno_probs) {
          my_err("%s:%d: more genotype probabilities per line than expected",
                 __FILE__, __LINE__);
        }

        /* check that probs sum to 1.0, normalize if they don't
         * NOTE(review): a field of "0,0,0" gives prob_sum == 0 and the
         * division below then produces NaN -- confirm desired handling.
         */
        prob_sum = prob_homo_ref + prob_het + prob_homo_alt;
        if((prob_sum > 1.001) || (prob_sum < 0.999)) {
          prob_homo_ref = prob_homo_ref / prob_sum;
          prob_het = prob_het / prob_sum;
          prob_homo_alt = prob_homo_alt / prob_sum;
        }

        geno_probs[n_geno_probs] = prob_homo_ref;
        geno_probs[n_geno_probs + 1] = prob_het;
        geno_probs[n_geno_probs + 2] = prob_homo_alt;

        n_geno_probs += 3;
      }

      i++;
    }
  }

  if(n_geno_probs != expect_geno_probs) {
    my_err("%s:%d: expected %ld genotype probabilities per line, but got "
           "%ld", __FILE__, __LINE__, expect_geno_probs, n_geno_probs);
  }
}
/*
 * Store TLS session data for the peer address `addr` in `cache`.
 *
 * If an entry with the same address already exists it is overwritten;
 * otherwise the slot at cache->index is reused and the index advances
 * cyclically through the cache.
 *
 * Returns SOCK_OK on success, or SOCK_TLS_CACHE_ERROR when the session
 * data is larger than TLS_CLIENT_MAX_SESSION_DATA_SIZE or the cache has
 * no slots.
 */
static int tls_client_cache_set(tls_client_cache_t *cache, char *addr,
                                gnutls_datum_t data)
{
    unsigned i = 0;
    int found = 0;

    if (data.size > TLS_CLIENT_MAX_SESSION_DATA_SIZE) {
        return SOCK_TLS_CACHE_ERROR;
    }

    /* An empty cache cannot hold anything; this also avoids the modulo
     * by zero below. */
    if (cache->size == 0) {
        return SOCK_TLS_CACHE_ERROR;
    }

    for (i = 0; i < cache->size; i++) {
        if (strcmp(addr, cache->element[i].addr) == 0) {
            found = 1;
            break;
        }
    }

    if (!found) {
        /* no entry for this address yet: take the next slot in turn */
        i = cache->index++;
        cache->index %= cache->size;
    }

    util_strncpy(cache->element[i].addr, addr, INET6_ADDRSTRLEN);
    memcpy(cache->element[i].session_data, data.data, data.size);
    cache->element[i].session_data_size = data.size;

    return SOCK_OK;
}
/* Open a file expanding home directories.
 *
 * A leading "~/" in `filename` is replaced by the value of the HOME
 * environment variable. When HOME is not set the name is used as given.
 * `flags` is passed to fopen() unchanged.
 *
 * Returns the stream from fopen(), or NULL when the file cannot be
 * opened or the expanded name does not fit into MAX_FILE_NAME bytes. */
FILE *util_fopen( char *filename, char *flags )
{
	char fname[MAX_FILE_NAME];
	const char *home = NULL;

	if (filename[0] == '~' && filename[1] == '/')
		home = getenv("HOME");

	if (home != NULL)
	{
		int n = snprintf(fname, sizeof(fname), "%s/%s", home, &filename[2]);

		/* A truncated path would name a different file; refuse it */
		if (n < 0 || (size_t)n >= sizeof(fname))
			return NULL;
	}
	else
		util_strncpy(fname, filename, sizeof(fname));

	return fopen(fname, flags);
} /* End of 'util_fopen' function */
/*
 * Copy the text of the current input into `buffer`.
 *
 * With no current input, `buffer` is set to the empty string and 0 is
 * returned. Otherwise the input's character buffer (if it has one and it
 * is not `buffer` itself) is copied with a limit of input->maxlen, and
 * input->curfocus is returned.
 */
u_int8 gmenu_input_getstr(u_int8 *buffer)
{
    /* Guard: nothing is being edited */
    if (!input) {
        buffer[0] = '\0';
        return 0;
    }

    /* Skip the copy when there is no source or when the caller handed us
     * the very same buffer */
    if (input->charbuffer) {
        if (buffer != input->charbuffer) {
            util_strncpy(buffer, input->charbuffer, input->maxlen);
        }
    }

    return input->curfocus;
}
/* Convert file name to the one with escaped special symbols.
 *
 * Every character of `in` for which UTIL_FNAME_IS_SPECIAL is true is
 * written to `out` preceded by a backslash. The input is first copied to
 * a local buffer, so it is cut to what fits in MAX_FILE_NAME bytes.
 * `out` must have room for twice the (possibly cut) input length plus
 * the terminator.
 *
 * Returns `out`. */
char *util_escape_fname( char *out, char *in )
{
	int i, j, len;
	char in_name[MAX_FILE_NAME];

	util_strncpy(in_name, in, sizeof(in_name));
	in_name[sizeof(in_name) - 1] = '\0';

	/* Measure the local copy, not `in`: the copy may be shorter, and the
	 * loop below must not read past the end of in_name */
	len = (int)strlen(in_name);

	/* i == len is included on purpose so the terminator is copied too */
	for ( i = j = 0; i <= len; i ++ )
	{
		if (UTIL_FNAME_IS_SPECIAL(in_name[i]))
			out[j ++] = '\\';
		out[j ++] = in_name[i];
	}
	return out;
} /* End of 'util_escape_fname' function */
/*
 * When given a string like "a/b/c/d/e/file"
 * this function will handle the creation of
 * the directory structure, included nested
 * directories.
 *
 * Only the directory components are created; the last path element is
 * taken to be the file name and is skipped. Creation stops at the first
 * directory for which fs_dir_make() reports failure. Entries are limited
 * to 56 bytes including the terminator.
 */
static void pak_tree_build(const char *entry) {
    enum { PAK_TREE_NAME_MAX = 56 };

    /* A 56 byte name cannot be split into more than 56 tokens */
    char  *elements[PAK_TREE_NAME_MAX];
    char  *directory; /* owning pointer, handed back to mem_d() untouched */
    char  *cursor;    /* tokenizer position, advanced by pak_tree_sep()   */
    char  *pathsplit;
    char  *token;
    size_t itr = 0;
    size_t jtr;

    pathsplit = (char *)mem_a(PAK_TREE_NAME_MAX);
    directory = (char *)mem_a(PAK_TREE_NAME_MAX);
    if (!pathsplit || !directory)
        goto cleanup;

    memset(pathsplit, 0, PAK_TREE_NAME_MAX);

    util_strncpy(directory, entry, PAK_TREE_NAME_MAX);
    directory[PAK_TREE_NAME_MAX - 1] = '\0';

    /*
     * Tokenize through a separate cursor: the separator function moves
     * the pointer it is given, and the original allocation must still be
     * available for release afterwards.
     */
    cursor = directory;
    while ((token = pak_tree_sep(&cursor, "/")) != NULL) {
        if (itr >= sizeof(elements) / sizeof(elements[0]))
            goto cleanup;
        elements[itr++] = token;
    }

    /* `jtr + 1 < itr` rather than `jtr < itr - 1`: no wrap when itr == 0 */
    for (jtr = 0; jtr + 1 < itr; jtr++) {
        util_strcat(pathsplit, elements[jtr]);
        util_strcat(pathsplit, "/");

        if (fs_dir_make(pathsplit)) {
            /* TODO: undo on fail */
            goto cleanup;
        }
    }

cleanup:
    if (pathsplit)
        mem_d(pathsplit);
    if (directory)
        mem_d(directory);
}
/*
 * Parse an INI style file line by line.
 *
 * filehandle  - file to read; lines are fetched with fs_file_getline()
 *               until it reports FS_FILE_EOF.
 * loadhandle  - callback invoked as
 *               loadhandle(section, name, value, parse_file) for every
 *               name/value pair; its return value is stored in
 *               *errorhandle and a non-NULL result counts as an error.
 *               For a bare entry inside the "includes" section it is
 *               called with the entry as both name and value.
 * errorhandle - receives the most recent callback result.
 * parse_file  - passed through to the callback untouched.
 *
 * Returns 0 when no error was seen, otherwise the 1-based number of the
 * line an error was recorded for (the first such line, except for the
 * "invalid use of include" case which assigns unconditionally).
 *
 * NOTE(review): the exact scanning rules of opts_ini_next(),
 * opts_ini_lskip() and opts_ini_rstrip() are defined elsewhere; the
 * comments below describe only how their results are used here.
 */
static size_t opts_ini_parse (
    fs_file_t *filehandle,
    char *(*loadhandle)(const char *, const char *, const char *, char **),
    char **errorhandle,
    char **parse_file
) {
    size_t linesize;
    size_t lineno             = 1;
    size_t error              = 0;
    char  *line               = NULL;
    char   section_data[2048] = "";  /* name of the section currently in effect */
    char   oldname_data[2048] = "";  /* last name seen; only written in this function */

    /* parsing and reading variables */
    char *parse_beg;
    char *parse_end;
    char *read_name;
    char *read_value;

    while (fs_file_getline(&line, &linesize, filehandle) != FS_FILE_EOF) {
        parse_beg = line;

        /* handle BOM: skip a UTF-8 byte order mark at the start of line 1 */
        if (lineno == 1 && (
                (unsigned char)parse_beg[0] == 0xEF &&
                (unsigned char)parse_beg[1] == 0xBB &&
                (unsigned char)parse_beg[2] == 0xBF
            )
        ) {
            parse_beg ++; /* 0xEF */
            parse_beg ++; /* 0xBB */
            parse_beg ++; /* 0xBF */
        }

        /* trim the line on both ends, then dispatch on its first character */
        if (*(parse_beg = opts_ini_lskip(opts_ini_rstrip(parse_beg))) == ';' || *parse_beg == '#') {
            /* comment line, ignored; '#' is a perl extension */
        } else if (*parse_beg == '[') {
            /* section found */
            if (*(parse_end = opts_ini_next(parse_beg + 1, ']')) == ']') {
                * parse_end = '\0'; /* cut the line at the closing bracket */
                util_strncpy(section_data, parse_beg + 1, sizeof(section_data));
                section_data[sizeof(section_data) - 1] = '\0';
                *oldname_data = '\0';
            } else if (!error) {
                /* no closing bracket: set error to the current line number */
                error = lineno;
            }
        } else if (*parse_beg && *parse_beg != ';') {
            /* not a comment, must be a name value pair; '=' is tried first, then ':' */
            if (*(parse_end = opts_ini_next(parse_beg, '=')) != '=')
                parse_end = opts_ini_next(parse_beg, ':');

            if (*parse_end == '=' || *parse_end == ':') {
                *parse_end = '\0'; /* split the line into name and value */
                read_name  = opts_ini_rstrip(parse_beg);
                read_value = opts_ini_lskip(parse_end + 1);
                /* cut the value where the scan stops on a ';'
                 * (presumably an inline comment -- confirm against opts_ini_next) */
                if (*(parse_end = opts_ini_next(read_value, '\0')) == ';')
                    * parse_end = '\0';
                opts_ini_rstrip(read_value);

                /* valid name value pair, lets call down to handler */
                util_strncpy(oldname_data, read_name, sizeof(oldname_data));
                oldname_data[sizeof(oldname_data) - 1] ='\0';

                if ((*errorhandle = loadhandle(section_data, read_name, read_value, parse_file)) && !error)
                    error = lineno;
            } else if (!strcmp(section_data, "includes")) {
                /* Includes are special: a bare entry with no separator is allowed here */
                /* NOTE(review): this branch is reached only after the same two
                 * searches above found neither '=' nor ':', so the check below
                 * looks like it can never succeed -- confirm. */
                if (*(parse_end = opts_ini_next(parse_beg, '=')) == '=' ||
                    *(parse_end = opts_ini_next(parse_beg, ':')) == ':') {
                    static const char *invalid_include = "invalid use of include";
                    vec_append(*errorhandle, strlen(invalid_include), invalid_include);
                    error = lineno;
                } else {
                    read_name = opts_ini_rstrip(parse_beg);
                    if ((*errorhandle = loadhandle(section_data, read_name, read_name, parse_file)) && !error)
                        error = lineno;
                }
            } else if (!error) {
                /* no separator outside of "includes": set error to the current line number */
                error = lineno;
            }
        }
        lineno++;
    }
    mem_d(line);
    return error;
}
/*
 * Writes the complete C function for one action (callback) to
 * genCodeInfo->code_file.
 *
 * The function is emitted in stages -- function start, top user segment,
 * function body, bottom user segment, function end -- and a flag per
 * stage records whether it has already been written, because some action
 * kinds (drag/drop library callbacks, ToolTalk and session callbacks)
 * emit several stages in one go.
 *
 * A debugging statement of the form  printf("action: NAME()\n");  is
 * placed into the first user segment that accepts content, unless the
 * library text already contains it.
 *
 * Returns return_value, which is 0 unless one of the return_if_err() /
 * return_code() macros changed it.
 * NOTE(review): those macros are defined elsewhere; they presumably set
 * return_value and jump to the epilogue label -- confirm.
 */
int
abmfP_write_action_function(
    GenCodeInfo genCodeInfo,
    ABObj       action
)
{
    int         rc = 0;         /* return code */
    BOOL        isTTCB = FALSE;
    BOOL        ss_cb = FALSE;
    File        codeFile = genCodeInfo->code_file;
    BOOL        topUserSegWritten = FALSE;
    BOOL        bottomUserSegWritten = FALSE;
    BOOL        funcBodyWritten = FALSE;
    BOOL        funcEndWritten = FALSE;
    BOOL        actionPrintfWritten = FALSE;
    int         return_value = 0;
    ABObj       fromObj = obj_get_from(action);
    ABObj       actualFromObj = NULL;
    ABObj       toObj = obj_get_to(action);
    ABObj       module = NULL;
    char        actionName[1024];
    /*
     * The printf statement wraps the action name in roughly two dozen
     * characters of fixed text, so its buffer must be larger than the
     * name buffer; snprintf bounds the write in any case.
     */
    char        actionPrintf[sizeof(actionName) + 32];

    abmfP_gencode_enter_func(genCodeInfo);
    abmfP_ip_obj(genCodeInfo) = obj_get_to(action);
    util_strncpy(actionName, abmfP_get_action_name(action), 1024);
    snprintf(actionPrintf, sizeof(actionPrintf),
        "printf(\"action: %s()\\n\");\n", actionName);

    /***
     *** START OF FUNCTION
     ***/

    switch (obj_get_when(action))
    {
    case AB_WHEN_AFTER_CREATED:
        /*
         * post-create procs have the signature of an Xt Callback,
         * although they are called as conventional functions.
         */
        fromObj = obj_get_from(action);
        actualFromObj = get_actual_from_obj(action);
        abmfP_write_xm_callback_begin(genCodeInfo, FALSE, actionName);
        write_instance_ptr_var(genCodeInfo, actualFromObj,
            get_from_var_name(), "callData", TRUE, NULL);
        abio_puts(genCodeInfo->code_file, nlstr);
        break;

    case AB_WHEN_DRAGGED_FROM:
    {
        abio_printf(genCodeInfo->code_file,
            abmfP_lib_default_dragCB->def,      /* this is a format string */
            actionName,actionName,actionName,
            actionName,actionName, actionName);
        abio_puts(genCodeInfo->code_file, "\n\n");

        /* these are all in the "library" definition */
        topUserSegWritten = TRUE;
        bottomUserSegWritten = TRUE;
        funcBodyWritten = TRUE;
        funcEndWritten = TRUE;
        actionPrintfWritten = TRUE;
    }
    break;

    case AB_WHEN_DROPPED_ON:
    {
        abio_printf(genCodeInfo->code_file,
            abmfP_lib_default_dropCB->def,      /* this is a format string */
            actionName,actionName,actionName,actionName);
        abio_puts(genCodeInfo->code_file, "\n\n");

        /* these are all in the "library" definition */
        topUserSegWritten = TRUE;
        bottomUserSegWritten = TRUE;
        funcBodyWritten = TRUE;
        funcEndWritten = TRUE;
        actionPrintfWritten = TRUE;
    }
    break;

    case AB_WHEN_TOOLTALK_QUIT:
    case AB_WHEN_TOOLTALK_DO_COMMAND:
    case AB_WHEN_TOOLTALK_GET_STATUS:
    case AB_WHEN_TOOLTALK_PAUSE_RESUME:
        isTTCB = TRUE;
        abio_printf(codeFile, begin_tt_callback_body, actionName);
        abmfP_write_c_block_begin(genCodeInfo);
        write_tooltalk_cb_vars(genCodeInfo, action);
        break;

    case AB_WHEN_SESSION_RESTORE:
        ss_cb = TRUE;
        abio_printf(codeFile, begin_ss_restore_callback_body,
            abmfP_get_action_name(action));
        abmfP_write_c_block_begin(genCodeInfo);
        write_ss_cb_vars(genCodeInfo, action);
        break;

    case AB_WHEN_SESSION_SAVE:
        ss_cb = TRUE;
        abio_printf(codeFile, begin_ss_save_callback_body,
            abmfP_get_action_name(action));
        abmfP_write_c_block_begin(genCodeInfo);
        write_ss_cb_vars(genCodeInfo, action);
        break;

    default:
        abmfP_write_xm_callback_begin(genCodeInfo, FALSE, actionName);
        break;
    } /* switch obj_get_when() */

    /*****
     ***** TOP USER SEGMENT
     *****/

    if (!topUserSegWritten)
    {
        /* only ToolTalk callbacks get the debugging printf up here */
        STRING  contents =
            (actionPrintfWritten? NULL:(isTTCB? actionPrintf:NULL));
        abmfP_write_user_var_and_code_seg(genCodeInfo, contents);
        abio_puts(codeFile, nlstr);
        topUserSegWritten = TRUE;
        if (contents != NULL)
        {
            actionPrintfWritten = TRUE;
        }
    }

    /***
     *** FUNCTION BODY
     ***/

    if (isTTCB)
    {
        write_tooltalk_cb_body1(genCodeInfo, action);
        abmfP_write_user_code_seg(genCodeInfo, NULL);
        write_tooltalk_cb_body2(genCodeInfo, action);
        funcBodyWritten = TRUE;
        bottomUserSegWritten = TRUE;
    }
    else if (ss_cb)
    {
        write_ss_cb_body1(genCodeInfo, action);
        abmfP_write_user_code_seg(genCodeInfo, NULL);
        write_ss_cb_body2(genCodeInfo, action);
        funcBodyWritten = TRUE;
        bottomUserSegWritten = TRUE;
    }
    else if (!funcBodyWritten) switch (obj_get_func_type(action))
    {
    case AB_FUNC_BUILTIN:
        rc = abmfP_write_builtin_action(genCodeInfo, action, TRUE);
        return_if_err(rc,rc);
        funcBodyWritten = TRUE;
        break;

    case AB_FUNC_USER_DEF:
        abmfP_write_user_start_comment(genCodeInfo, "vvv Add C code below vvv");
        abmfP_write_user_end_comment(genCodeInfo, "^^^ Add C code above ^^^");
        bottomUserSegWritten = TRUE;
        funcBodyWritten = TRUE;
        break;

    case AB_FUNC_CODE_FRAG:
        abio_puts(codeFile, obj_get_func_code(action));
        funcBodyWritten = TRUE;
        break;

    case AB_FUNC_ON_ITEM_HELP:
        abio_printf(codeFile, "dtb_do_onitem_help();\n");
        funcBodyWritten = TRUE;
        break;

    case AB_FUNC_HELP_VOLUME:
        abio_printf(codeFile,
            "dtb_show_help_volume_info(\"%s\", \"%s\");\n",
            istr_string(action->info.action.volume_id),
            istr_string(action->info.action.location));
        funcBodyWritten = TRUE;
        break;

    default:
    {
        char *obj_name_string = obj_get_name(fromObj);
        util_printf_err(catgets(Dtb_project_catd, 1, 78,
            "unknown function type for action from object, %s"),
            obj_name_string);
        return_code(ERR);
    }
    break;
    }

    /*****
     ***** BOTTOM USER SEGMENT
     *****/

    if (!bottomUserSegWritten)
    {
        /* last chance to emit the debugging printf */
        STRING  contents = (actionPrintfWritten? NULL:actionPrintf);
        abmfP_write_user_code_seg(genCodeInfo, contents);
        bottomUserSegWritten = TRUE;
        if (contents != NULL)
        {
            actionPrintfWritten = TRUE;
        }
    }

    /*****
     ***** FUNCTION END
     *****/

    if (!funcEndWritten)
    {
        abmfP_write_c_func_end(genCodeInfo, NULL);
        funcEndWritten = TRUE;
    }

epilogue:
    abmfP_gencode_exit_func(genCodeInfo);
    return return_value;
}
/** * Gets next line of IMPUTE file and parses it into ImputeInfo datastructure. * * IMPUTE files are described here: * http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html * * example line: * --- rs149201999 16050408 T C 0.966 0.034 0 0.395 0.467 .... * * If geno_probs array is non-null genotype probabilities are parsed and * stored in the provided array. The array must be of length * n_samples*3. * * If haplotypes array is non-null phased genotypes are parsed and * stored in the provided array. The array must be of length * n_samples*2. * * IMPUTE files contain EITHER haplotypes OR genotypes so only * one of geno_probs or haplotypes should be non-null (at most). * * Returns 0 on success, -1 if at EOF. */ int impute_read_line(gzFile fh, ImputeInfo *impute_info, SNP *snp, float *geno_probs, char *haplotypes) { char *cur, *token; int n_fix_header, ref_len, alt_len; size_t tok_num; const char delim[] = " \t"; /* read a line */ if(util_gzgetline(fh, &impute_info->buf, &impute_info->buf_size) == -1) { return -1; } cur = impute_info->buf; tok_num = 0; /* SNP name, often just set to "---" */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } /* SNP identifier (rs_id) */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } util_strncpy(snp->name, token, sizeof(snp->name)); /* pos */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } snp->pos = util_parse_long(token); /* ref allele */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } util_strncpy(snp->allele1, token, sizeof(snp->allele1)); /* alt allele */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } alt_len = util_strncpy(snp->allele2, token, sizeof(snp->allele2)); 
/* now parse haplotypes and/or genotype likelihoods */ if(geno_probs && haplotypes) { my_err("impute2 files contain EITHER genotypes or haplotypes, but " "both requested\n"); } else if(geno_probs) { impute_parse_geno_probs(geno_probs, cur, impute_info->n_samples); } else if(haplotypes) { impute_parse_haplotypes(haplotypes, cur, impute_info->n_samples); } }
/*
 * Locates the object described by the given name pieces, creating
 * undefined placeholder objects for every piece that does not exist yet.
 *
 * When `module` is a project, a file name is required: it is taken from
 * interface_name, or from parent_name when interface_name is NULL. The
 * interface (module) name is that file name with a trailing ".G" or ".P"
 * removed. If neither is given, ERR_WANT_FULL_NAME is reported and NULL
 * is returned.
 *
 * The lookup then walks project -> module -> parent -> object -> item;
 * a level whose name is empty is skipped. Returns the item when an item
 * label was given, otherwise the object.
 *
 * Assumes that obj may not be in the object tree yet, and may return it
 * as the target.
 */
static ABObj
find_or_create_target(
    ABObj   obj,
    ABObj   module,
    ISTRING interface_name,
    ISTRING parent_name,
    ISTRING obj_name,
    ISTRING item_label
)
{
    char            file_buf[MAXPATHLEN+1];
    char            iface_buf[GIL_MAX_NAME_SIZE];
    char            parent_buf[GIL_MAX_NAME_SIZE];
    char            name_buf[GIL_MAX_NAME_SIZE];
    char            label_buf[GIL_MAX_NAME_SIZE];
    ABObj           project = NULL;
    ABObj           owner = NULL;       /* module that holds the target */
    ABObj           parent = NULL;
    ABObj           found_obj = NULL;
    ABObj           found_item = NULL;
    AB_TRAVERSAL    trav;

    file_buf[0] = 0;
    iface_buf[0] = 0;
    parent_buf[0] = 0;
    name_buf[0] = 0;
    label_buf[0] = 0;

    /* must have object name */
    if (debugging())
    {
        assert(   (obj_name != NULL)
               && (obj_is_project(module) || obj_is_module(module)));
    }

    /*
     * Break the name down into file / parent pieces.
     */
    if (!obj_is_project(module))
    {
        if (parent_name != NULL)
        {
            /* we have parent name and object name */
            util_strncpy(parent_buf, istr_string(parent_name),
                         GIL_MAX_NAME_SIZE);
        }
    }
    else
    {
        int len;

        /* The first string must be an interface file name */
        if (interface_name != NULL)
        {
            util_strncpy(file_buf, istr_string(interface_name),
                         GIL_MAX_NAME_SIZE);
            if (parent_name != NULL)
            {
                util_strncpy(parent_buf, istr_string(parent_name),
                             GIL_MAX_NAME_SIZE);
            }
        }
        else if (parent_name != NULL)
        {
            util_strncpy(file_buf, istr_string(parent_name),
                         GIL_MAX_NAME_SIZE);
        }
        else
        {
            abil_print_load_err(ERR_WANT_FULL_NAME);
            return NULL;
        }

        /* derive the name from the file name */
        strcpy(iface_buf, file_buf);
        len = strlen(iface_buf);
        if (len >= 2)
        {
            char *suffix = &(iface_buf[len-2]);

            if (util_streq(suffix, ".G") || util_streq(suffix, ".P"))
            {
                *suffix = 0;
            }
        }
    }

    /* object name and item label are handled alike in both cases */
    util_strncpy(name_buf, istr_string(obj_name), GIL_MAX_NAME_SIZE);
    if (item_label != NULL)
    {
        util_strncpy(label_buf, istr_string(item_label), GIL_MAX_NAME_SIZE);
    }

    /*
     * Target project
     */
    project = obj_get_project(module);

    /*
     * Target module
     */
    if (util_strempty(iface_buf))
    {
        owner = module;
    }
    else
    {
        /* look for the module that was loaded from the given file */
        trav_open(&trav, project, AB_TRAV_MODULES);
        while ((owner = trav_next(&trav)) != NULL)
        {
            if (owner == project)
            {
                continue;
            }
            if (util_streq(obj_get_file(owner), file_buf))
            {
                break;
            }
        }
        trav_close(&trav);

        if (owner == NULL)
        {
            owner = obj_create(AB_TYPE_MODULE, project);
            obj_set_is_defined(owner, FALSE);
            obj_set_file(owner, file_buf);
            obj_set_name(owner, iface_buf);
        }
    }

    /*
     * Target parent
     */
    if (util_strempty(parent_buf))
    {
        parent = owner;
    }
    else
    {
        trav_open(&trav, owner, AB_TRAV_ALL | AB_TRAV_MOD_PARENTS_FIRST);
        while ((parent = trav_next(&trav)) != NULL)
        {
            if (parent == owner)
            {
                continue;
            }
            if (util_streq(obj_get_name(parent), parent_buf))
            {
                break;
            }
        }
        trav_close(&trav);

        if (parent == NULL)
        {
            parent = obj_create(AB_TYPE_UNDEF, owner);
            obj_set_is_defined(parent, FALSE);
            obj_set_file(parent, file_buf);
            obj_set_name(parent, parent_buf);
        }
    }

    /*
     * Target object
     */
    trav_open(&trav, parent, AB_TRAV_ALL | AB_TRAV_MOD_PARENTS_FIRST);
    while ((found_obj = trav_next(&trav)) != NULL)
    {
        if (found_obj == parent)
        {
            continue;
        }
        if (util_streq(name_buf, obj_get_name(found_obj)))
        {
            break;
        }
    }
    trav_close(&trav);

    if (found_obj == NULL)
    {
        found_obj = obj_create(AB_TYPE_UNDEF, parent);
        obj_set_is_defined(found_obj, FALSE);
        obj_set_file(found_obj, file_buf);
        obj_set_name(found_obj, name_buf);
    }

    /*
     * Target item (only when a label was supplied)
     */
    if (util_strempty(label_buf))
    {
        found_item = NULL;
    }
    else
    {
        trav_open(&trav, found_obj, AB_TRAV_ITEMS);
        while ((found_item = trav_next(&trav)) != NULL)
        {
            if (found_item == found_obj)
            {
                continue;
            }
            if (util_streq(obj_get_label(found_item), label_buf))
            {
                break;
            }
        }
        trav_close(&trav);

        if (found_item == NULL)
        {
            found_item = obj_create(AB_TYPE_ITEM, found_obj);
            obj_set_is_defined(found_item, FALSE);
            obj_set_file(found_item, file_buf);
            obj_set_label(found_item, label_buf);
            obj_set_name_from_label(found_item,
                obj_get_name(obj_get_parent(found_item)));
        }
    }

    /* the item wins over the object when there is one */
    return (found_item != NULL) ? found_item : found_obj;
}
/*
 * Insertion functions (the opposite of extraction). Yes for generating
 * PAKs.
 *
 * pak_insert_one reads `file` from disk, appends its contents to the PAK
 * handle and records a directory entry (name, position, length) in
 * pak->directories.
 *
 * Returns true on success. Returns false when an argument is NULL, the
 * pak is not open for insertion, the file already exists in the pak, the
 * name is 56 bytes or longer, or the file cannot be opened, measured or
 * buffered.
 */
static bool pak_insert_one(pak_file_t *pak, const char *file) {
    pak_directory_t dir;
    unsigned char  *dat;
    long            len;
    FILE           *fp;

    /*
     * We don't allow insertion on files that already exist within the
     * pak file, since overwriting an entry in place is not supported.
     * We also don't allow insertion if the pak isn't opened in write
     * mode.
     */
    if (!pak || !file || !pak->insert || pak_exists(pak, file, NULL))
        return false;

    if (!(fp = fs_file_open(file, "rb")))
        return false;

    /*
     * Calculate the total file length, since it will be wrote to
     * the directory entry, and the actual contents of the file
     * to the PAK file itself.
     */
    if (fs_file_seek(fp, 0, SEEK_END) != 0 || ((len = fs_file_tell(fp)) < 0))
        goto err;
    if (fs_file_seek(fp, 0, SEEK_SET) != 0)
        goto err;

    /*
     * Start from an all-zero entry so that the unused tail of the name
     * field and any padding hold defined bytes when the entry is written
     * out later.
     */
    memset(&dir, 0, sizeof(dir));

    dir.len = len;
    dir.pos = fs_file_tell(pak->handle);

    /*
     * We're limited to 56 bytes for a file name string, that INCLUDES
     * the directory and '/' seperators.
     */
    if (strlen(file) >= 56)
        goto err;

    /* + 1 so the terminator is part of the copy */
    util_strncpy(dir.name, file, strlen(file) + 1);

    /*
     * Allocate some memory for loading in the data that will be
     * redirected into the PAK file.
     */
    if (!(dat = (unsigned char *)mem_a(dir.len)))
        goto err;

    fs_file_read (dat, dir.len, 1, fp);
    fs_file_close(fp);
    fs_file_write(dat, dir.len, 1, pak->handle);

    /* the staging buffer is no longer needed once it has been written */
    mem_d(dat);

    /*
     * Now add the directory to the directories vector, so pak_close
     * can actually write it.
     */
    vec_push(pak->directories, dir);

    return true;

err:
    fs_file_close(fp);
    return false;
}
/**
 * Returns the next delimiter-separated field of a VCF line, advancing
 * *cur past it. Reports an error through my_err() when the line has
 * fewer than the fixed number of header fields.
 */
static char *vcf_next_field(char **cur, const char *delim, int n_fix_header)
{
  char *token = strsep(cur, delim);

  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", n_fix_header);
  }
  return token;
}

/**
 * Gets next line of VCF file and parses it into VCFInfo datastructure.
 *
 * The fixed fields are consumed in order: chrom (read but not stored),
 * pos, ID, ref, alt, qual, filter, info and format. The line buffer held
 * in vcf_info is tokenized in place.
 *
 * If geno_probs array is non-null genotype likelihoods are parsed and
 * stored in the provided array. The array must be of length
 * n_sample*3.
 *
 * If haplotypes array is non-null phased genotypes are parsed and
 * stored in the provided array. The array must be of length
 * n_sample*2.
 *
 * Returns 0 on success, -1 if at EOF.
 */
int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp,
                  float *geno_probs, char *haplotypes)
{
  char *cur, *token;
  int n_fix_header;

  /* Used to allow space or tab delimiters here but now only allow
   * tab. This is because VCF specification indicates that fields
   * should be tab-delimited, and occasionally some fields contain
   * spaces.
   */
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  /* read a line */
  if(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) == -1) {
    return -1;
  }

  cur = vcf_info->buf;

  /* chrom
   * we don't bother to store chromosome since we store
   * SNPs from each chromosome in their own table
   */
  (void)vcf_next_field(&cur, delim, n_fix_header);

  /* pos */
  token = vcf_next_field(&cur, delim, n_fix_header);
  snp->pos = util_parse_long(token);

  /* ID */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(snp->name, token, sizeof(snp->name));

  /* ref
   * used to warn about truncations, but makes program too
   * chatty if there are a lot of them
   */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(snp->allele1, token, sizeof(snp->allele1));
  vcf_info->ref_len = 0;

  /* alt */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(snp->allele2, token, sizeof(snp->allele2));
  vcf_info->alt_len = 0;

  /* qual */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->qual, token, sizeof(vcf_info->qual));

  /* filter */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->filter, token, sizeof(vcf_info->filter));

  /* info */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->info, token, sizeof(vcf_info->info));

  /* format */
  token = vcf_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->format, token, sizeof(vcf_info->format));

  /* now parse haplotypes and/or genotype likelihoods */
  if(geno_probs && haplotypes) {
    char *cur_copy = NULL;

    /* Both genotype probs and haplotypes requested.
     * Need to copy string because it is modified
     * by the tokenizing in the parsing functions.
     *
     * cur is NULL when the format field was the last one on the line;
     * in that case there is nothing to copy and NULL is handed on, just
     * as the single-output branches below already do.
     *
     * This could be made more efficient by doing the parsing
     * of both types of data at same time
     */
    if(cur != NULL) {
      cur_copy = my_malloc(strlen(cur)+1);
      strcpy(cur_copy, cur);
    }

    vcf_parse_geno_probs(vcf_info, geno_probs, cur_copy);

    if(cur_copy != NULL) {
      my_free(cur_copy);
    }

    vcf_parse_haplotypes(vcf_info, haplotypes, cur);
  }
  else if(geno_probs) {
    vcf_parse_geno_probs(vcf_info, geno_probs, cur);
  }
  else if(haplotypes) {
    vcf_parse_haplotypes(vcf_info, haplotypes, cur);
  }

  return 0;
}
/**
 * Converts the genotype likelihoods (GL) of a VCF line into genotype
 * probabilities.
 *
 * cur holds the tab-delimited per-sample columns and is tokenized in
 * place. Within each column the ':'-delimited field at position gl_idx
 * is read as three comma-separated log10 likelihoods (homozygous ref,
 * heterozygous, homozygous alt). They are exponentiated, scaled to sum
 * to 1 and appended to geno_probs, which must hold n_sample * 3 floats.
 *
 * Problems (unparseable field, too many or too few samples) are reported
 * through my_err().
 */
void vcf_parse_gl(VCFInfo *vcf_info, float *geno_probs,
                  char *cur, long gl_idx)
{
  char sample_sep[] = "\t";
  char field_sep[] = ":";
  char sample[VCF_MAX_FORMAT];
  char *column, *field, *rest;
  long field_no, n_parsed, n_stored, n_wanted;
  float like[3];   /* log10 likelihoods: hom-ref, het, hom-alt */
  float prob[3];   /* the same three as probabilities */
  float total;
  int k;

  n_wanted = vcf_info->n_sample * 3;
  n_stored = 0;

  for(column = strsep(&cur, sample_sep); column != NULL;
      column = strsep(&cur, sample_sep)) {
    /* tokenize a private copy of the sample column */
    util_strncpy(sample, column, sizeof(sample));
    rest = sample;

    for(field_no = 0; field_no <= gl_idx; field_no++) {
      field = strsep(&rest, field_sep);
      if(field == NULL) {
        break;
      }
      if(field_no != gl_idx) {
        continue;
      }

      n_parsed = sscanf(field, "%g,%g,%g", &like[0], &like[1], &like[2]);

      if(n_parsed != 3) {
        if(strcmp(field, ".") != 0) {
          my_err("%s:%d: failed to parse genotype likelihoods from "
                 "string '%s'", __FILE__, __LINE__, field);
        } else {
          /* a lone '.' means the data is missing:
           * use log(0.333) = -0.477 for all three
           */
          like[0] = like[1] = like[2] = -0.477;
        }
      }

      /* log10(prob) -> prob */
      for(k = 0; k < 3; k++) {
        prob[k] = pow(10.0, like[k]);
      }

      if((n_stored + 3) > n_wanted) {
        my_err("%s:%d: more genotype likelihoods per line than expected",
               __FILE__, __LINE__);
      }

      /* The three values usually add up to 1.0 but not always, so they
       * are always rescaled. This amounts to taking the posterior under
       * a uniform prior.
       */
      total = prob[0] + prob[1] + prob[2];
      for(k = 0; k < 3; k++) {
        prob[k] = prob[k] / total;
        geno_probs[n_stored + k] = prob[k];
      }

      n_stored += 3;
    }
  }

  if(n_stored != n_wanted) {
    my_err("%s:%d: expected %ld genotype likelihoods per line, but got "
           "%ld", __FILE__, __LINE__, n_wanted, n_stored);
  }
}
/**
 * Parses the GT field of every sample on a VCF line into haplotypes.
 *
 * vcf_info   - supplies the format string (used to locate the GT field)
 *              and n_sample.
 * haplotypes - output array, must have room for n_sample * 2 values; two
 *              consecutive entries are written per sample.
 * cur        - the tab-delimited per-sample portion of the line. It is
 *              tokenized in place by strsep(), i.e. it is modified.
 *
 * Alleles may be separated by '|' (phased) or '/' (unphased). Genotypes
 * that cannot be parsed, or that hold a value other than 0 or 1, are
 * stored as VCF_GTYPE_MISSING. Unphased and unparseable genotypes are
 * each warned about only once per program run.
 *
 * A missing GT format token and a wrong number of samples are reported
 * through my_err().
 */
void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes,
                          char *cur)
{
  int gt_idx, hap1, hap2, i, n;
  static int warn_phase = TRUE;
  static int warn_parse = TRUE;
  long expect_haps, n_haps;
  char gt_str[VCF_MAX_FORMAT];

  char delim[] = "\t";
  char inner_delim[] = ":";
  char *inner_cur, *tok, *inner_tok;

  /* get index of GT token in format string*/
  gt_idx = get_format_index(vcf_info->format, "GT");

  if(gt_idx == -1) {
    my_err("%s:%d: VCF format string does not specify GT token "
           "so cannot obtain haplotypes. Format string: '%s'.\n"
           "To use this file, you must run snp2h5 without "
           "the --haplotype option.",
           __FILE__, __LINE__, vcf_info->format);
  }

  expect_haps = vcf_info->n_sample * 2;

  n_haps = 0;

  while((tok = strsep(&cur, delim)) != NULL) {
    /* Each genotype string is delimited by ':'
     * The GT portions of the string are delimited by '/' or '|'
     * '|' indicates phased, '/' indicates unphased.
     */
    util_strncpy(gt_str, tok, sizeof(gt_str));

    i = 0;
    inner_cur = gt_str;
    while((i <= gt_idx) &&
          (inner_tok = strsep(&inner_cur, inner_delim)) != NULL) {
      if(i == gt_idx) {
        n = sscanf(inner_tok, "%d|%d", &hap1, &hap2);
        if(n != 2) {
          /* try with '/' separator instead */
          n = sscanf(inner_tok, "%d/%d", &hap1, &hap2);

          if(n == 2) {
            if(warn_phase) {
              /* this message has only two conversions, so only two
               * arguments are passed
               */
              my_warn("%s:%d: some genotypes are unphased (delimited "
                      "with '/' instead of '|')\n", __FILE__, __LINE__);
              warn_phase = FALSE;
            }
          } else {
            if(warn_parse) {
              my_warn("%s:%d: could not parse some genotype "
                      "strings that look like: '%s'\n", __FILE__, __LINE__,
                      inner_tok);
              warn_parse = FALSE;
            }
            hap1 = VCF_GTYPE_MISSING;
            hap2 = VCF_GTYPE_MISSING;
          }
        }

        if((hap1 != VCF_GTYPE_MISSING && hap1 != 0 && hap1 != 1) ||
           (hap2 != VCF_GTYPE_MISSING && hap2 != 0 && hap2 != 1)) {
          /* Copy number polymorphisms and multi-allelic SNPs
           * can have values other than 0 and 1 (e.g. 3, 4, ...).
           * Combined haplotype test does not currently deal with
           * these. Set the genotypes to MISSING (-1)
           */
          hap1 = VCF_GTYPE_MISSING;
          hap2 = VCF_GTYPE_MISSING;
        }

        if((n_haps + 2) > expect_haps) {
          my_err("%s:%d: more genotypes per line than expected",
                 __FILE__, __LINE__);
        }
        haplotypes[n_haps] = hap1;
        haplotypes[n_haps+1] = hap2;

        n_haps += 2;
      }

      i++;
    }
  }

  if(n_haps != expect_haps) {
    my_err("%s:%d: expected %ld genotype values per line, but got "
           "%ld", __FILE__, __LINE__, expect_haps, n_haps);
  }
}