int32_t load_clusters(char* fname, uintptr_t unfiltered_indiv_ct, uintptr_t* indiv_exclude, uintptr_t indiv_ct, char* person_ids, uintptr_t max_person_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr) { unsigned char* wkspace_mark = wkspace_base; FILE* infile = NULL; uintptr_t indiv_ctl = (indiv_ct + (BITCT - 1)) / BITCT; uintptr_t topsize = 0; int32_t retval = 0; char* idbuf = &(tbuf[MAXLINELEN]); uintptr_t max_cluster_id_len = 0; uintptr_t assigned_ct = 0; uintptr_t cluster_ct = 0; Ll_str* cluster_names = NULL; uintptr_t* already_seen; char* cluster_ids; uint32_t* cluster_map; uint32_t* cluster_starts; uint32_t* tmp_cluster_starts; uintptr_t topsize_bak; Ll_str* llptr; char* sorted_ids; uint32_t* id_map; char* fam_id; char* indiv_id; char* cluster_name_ptr; uintptr_t ulii; int32_t sorted_idx; uint32_t indiv_uidx; uint32_t slen; uint32_t uii; sorted_ids = (char*)top_alloc(&topsize, indiv_ct * max_person_id_len); if (!sorted_ids) { goto load_clusters_ret_NOMEM; } id_map = (uint32_t*)top_alloc(&topsize, indiv_ct * sizeof(int32_t)); if (!id_map) { goto load_clusters_ret_NOMEM; } topsize_bak = topsize; already_seen = (uintptr_t*)top_alloc(&topsize, indiv_ctl * sizeof(intptr_t)); if (!already_seen) { goto load_clusters_ret_NOMEM; } fill_ulong_zero(already_seen, indiv_ctl); memcpy(sorted_ids, person_ids, indiv_ct * max_person_id_len); wkspace_left -= topsize; retval = sort_item_ids_noalloc(sorted_ids, id_map, unfiltered_indiv_ct, indiv_exclude, indiv_ct, person_ids, max_person_id_len, 0, 0, strcmp_deref); wkspace_left += topsize; if (retval) { goto load_clusters_ret_1; } // two-pass load // 1. load cluster names, track longest length, validate format, verify no // individual ID appears multiple times // intermission. sort cluster names, purge duplicates, allocate memory for // return values // 2. populate return name arrays if (fopen_checked(&infile, fname, "r")) { goto load_clusters_ret_OPEN_FAIL; } tbuf[MAXLINELEN - 1] = ' '; if (!mwithin_col) { mwithin_col = 1; } while (fgets(tbuf, MAXLINELEN, infile)) { if (!tbuf[MAXLINELEN - 1]) { logprint("Error: Pathologically long line in --within file.\n"); goto load_clusters_ret_INVALID_FORMAT; } fam_id = skip_initial_spaces(tbuf); if (is_eoln_kns(*fam_id)) { continue; } indiv_id = next_item(fam_id); cluster_name_ptr = next_item_mult(indiv_id, mwithin_col); if (no_more_items_kns(cluster_name_ptr)) { logprint("Error: Fewer entries than expected in --within file line.\n"); goto load_clusters_ret_INVALID_FORMAT; } sorted_idx = bsearch_fam_indiv(idbuf, sorted_ids, max_person_id_len, indiv_ct, fam_id, indiv_id); if (sorted_idx == -1) { continue; } if (is_set(already_seen, sorted_idx)) { idbuf[strlen_se(fam_id)] = ' '; sprintf(logbuf, "Error: Duplicate individual %s in --within file.\n", idbuf); logprintb(); goto load_clusters_ret_INVALID_FORMAT; } set_bit_noct(already_seen, sorted_idx); slen = strlen_se(cluster_name_ptr); if ((!keep_na) && (slen == 2) && (!memcmp(cluster_name_ptr, "NA", 2))) { // postponed to here because, even without 'keep-NA', we do not want to // ignore cluster=NA lines for the purpose of detecting duplicate indivs continue; } if (slen >= max_cluster_id_len) { max_cluster_id_len = slen + 1; } llptr = top_alloc_llstr(&topsize, slen + 1); llptr->next = cluster_names; memcpyx(llptr->ss, cluster_name_ptr, slen, '\0'); cluster_names = llptr; assigned_ct++; } if (!feof(infile)) { goto load_clusters_ret_READ_FAIL; } if (cluster_names) { *max_cluster_id_len_ptr = max_cluster_id_len; wkspace_left -= topsize; if (wkspace_alloc_c_checked(cluster_ids_ptr, assigned_ct * max_cluster_id_len)) { goto load_clusters_ret_NOMEM2; } cluster_ids = *cluster_ids_ptr; for (ulii = 0; ulii < assigned_ct; ulii++) { strcpy(&(cluster_ids[ulii * max_cluster_id_len]), cluster_names->ss); cluster_names = cluster_names->next; } // deallocate cluster ID linked list and duplicate indiv ID detector from // top of stack, allocate cluster size tracker wkspace_left += topsize; topsize = topsize_bak; tmp_cluster_starts = (uint32_t*)top_alloc(&topsize, assigned_ct * sizeof(int32_t)); if (!tmp_cluster_starts) { goto load_clusters_ret_NOMEM; } wkspace_left -= topsize; // may as well use natural sort of cluster names qsort(cluster_ids, assigned_ct, max_cluster_id_len, strcmp_natural); cluster_ct = collapse_duplicate_ids(cluster_ids, assigned_ct, max_cluster_id_len, tmp_cluster_starts); *cluster_ct_ptr = cluster_ct; // shrink wkspace_reset(cluster_ids); wkspace_alloc_c_checked(cluster_ids_ptr, cluster_ct * max_cluster_id_len); if (wkspace_alloc_ui_checked(cluster_map_ptr, assigned_ct * sizeof(int32_t)) || wkspace_alloc_ui_checked(cluster_starts_ptr, (cluster_ct + 1) * sizeof(int32_t))) { goto load_clusters_ret_NOMEM2; } wkspace_left += topsize; cluster_map = *cluster_map_ptr; cluster_starts = *cluster_starts_ptr; memcpy(cluster_starts, tmp_cluster_starts, cluster_ct * sizeof(int32_t)); cluster_starts[cluster_ct] = assigned_ct; rewind(infile); // second pass while (fgets(tbuf, MAXLINELEN, infile)) { fam_id = skip_initial_spaces(tbuf); if (is_eoln_kns(*fam_id)) { continue; } indiv_id = next_item(fam_id); cluster_name_ptr = next_item_mult(indiv_id, mwithin_col); slen = strlen_se(cluster_name_ptr); if ((!keep_na) && (slen == 2) && (!memcmp(cluster_name_ptr, "NA", 2))) { continue; } sorted_idx = bsearch_fam_indiv(idbuf, sorted_ids, max_person_id_len, indiv_ct, fam_id, indiv_id); if (sorted_idx == -1) { continue; } indiv_uidx = id_map[(uint32_t)sorted_idx]; cluster_name_ptr[slen] = '\0'; sorted_idx = bsearch_str_natural(cluster_name_ptr, cluster_ids, max_cluster_id_len, 0, cluster_ct - 1); uii = tmp_cluster_starts[(uint32_t)sorted_idx]; tmp_cluster_starts[(uint32_t)sorted_idx] += 1; cluster_map[uii] = indiv_uidx; } if (!feof(infile)) { goto load_clusters_ret_READ_FAIL; } for (ulii = 0; ulii < cluster_ct; ulii++) { if (cluster_starts[ulii + 1] - cluster_starts[ulii] > 1) { #ifdef __cplusplus std::sort(&(cluster_map[cluster_starts[ulii]]), &(cluster_map[cluster_starts[ulii + 1]])); #else qsort(&(cluster_map[cluster_starts[ulii]]), cluster_starts[ulii + 1] - cluster_starts[ulii], sizeof(int32_t), intcmp); #endif } } sprintf(logbuf, "--within: %" PRIuPTR " cluster%s loaded, covering a total of %" PRIuPTR " %s.\n", cluster_ct, (cluster_ct == 1)? "" : "s", assigned_ct, species_str(assigned_ct)); logprintb(); } else { logprint("Warning: No individuals named in --within file remain in the current analysis.\n"); } while (0) { load_clusters_ret_NOMEM2: wkspace_left += topsize; load_clusters_ret_NOMEM: retval = RET_NOMEM; break; load_clusters_ret_OPEN_FAIL: retval = RET_OPEN_FAIL; break; load_clusters_ret_READ_FAIL: retval = RET_READ_FAIL; break; load_clusters_ret_INVALID_FORMAT: retval = RET_INVALID_FORMAT; break; } load_clusters_ret_1: if (retval) { wkspace_reset(wkspace_mark); } fclose_cond(infile); return retval; }
int32_t scan_column_widths(FILE* infile, uintptr_t column_sep, uintptr_t** col_widths_ptr, uintptr_t* col_ct_ptr, unsigned char** spacebuf_ptr, unsigned char** rjustify_buf_ptr) { uintptr_t malloc_size = INITIAL_COLS * sizeof(intptr_t); uintptr_t* col_widths = (uintptr_t*)malloc(malloc_size); uintptr_t col_ct = 0; uintptr_t max_col_ct = INITIAL_COLS; // not a hard limit // actually a one-based index, to simplify distinguishing between // beginning-of-line-and-not-in-column from beginning-of-line-and-in-column // first element of col_widths[] is essentially unused as a result uintptr_t cur_col_idx = 0; uintptr_t cur_col_width = 0; uintptr_t line_idx = 1; int32_t retval = 0; unsigned char* readptr; unsigned char* line_end; unsigned char* readbuf_end; unsigned char* token_end; uintptr_t* new_col_widths; uintptr_t cur_read; if (!col_widths) { goto scan_column_widths_ret_NOMEM; } fill_ulong_zero(col_widths, max_col_ct); cur_read = fread(g_readbuf, 1, BUFSIZE, infile); if (ferror(infile)) { goto scan_column_widths_ret_READ_FAIL; } if (!cur_read) { goto scan_column_widths_ret_INVALID_FORMAT; } readptr = g_readbuf; readbuf_end = &(g_readbuf[cur_read]); while (1) { line_end = (unsigned char*)memchr(readptr, '\n', (uintptr_t)(readbuf_end - readptr)); if (!line_end) { line_end = readbuf_end; } while (readptr < line_end) { if (!cur_col_width) { if (skip_spaces_ck(&readptr, line_end)) { break; } if (++cur_col_idx == max_col_ct) { malloc_size *= 2; new_col_widths = realloc(col_widths, malloc_size); if (!new_col_widths) { goto scan_column_widths_ret_READ_FAIL; } col_widths = new_col_widths; fill_ulong_zero(&(col_widths[max_col_ct]), max_col_ct); max_col_ct *= 2; } } token_end = get_token_end_ck(readptr, line_end); cur_col_width += (uintptr_t)(token_end - readptr); if (token_end == line_end) { break; } if (cur_col_width > col_widths[cur_col_idx]) { col_widths[cur_col_idx] = cur_col_width; } cur_col_width = 0; readptr = token_end; } if ((line_end < readbuf_end) || (!cur_read)) { handle_last_column(col_widths, cur_col_idx, cur_col_width, &col_ct); if (line_end == readbuf_end) { // EOF break; } readptr = &(line_end[1]); line_idx++; cur_col_idx = 0; cur_col_width = 0; continue; } // in middle of line cur_read = fread(g_readbuf, 1, BUFSIZE, infile); if (ferror(infile)) { goto scan_column_widths_ret_READ_FAIL; } readptr = g_readbuf; readbuf_end = &(g_readbuf[cur_read]); } cur_col_width = 0; for (cur_col_idx = 1; cur_col_idx <= col_ct; cur_col_idx++) { if (col_widths[cur_col_idx] > cur_col_width) { cur_col_width = col_widths[cur_col_idx]; } } *spacebuf_ptr = (unsigned char*)malloc(cur_col_width + column_sep); if (!(*spacebuf_ptr)) { goto scan_column_widths_ret_NOMEM; } memset(*spacebuf_ptr, 32, cur_col_width + column_sep); if (rjustify_buf_ptr) { *rjustify_buf_ptr = (unsigned char*)malloc(cur_col_width); if (!(*rjustify_buf_ptr)) { goto scan_column_widths_ret_NOMEM; } } while (0) { scan_column_widths_ret_NOMEM: retval = RET_NOMEM; break; scan_column_widths_ret_READ_FAIL: retval = RET_READ_FAIL; break; scan_column_widths_ret_INVALID_FORMAT: fputs("Error: Empty input file.\n", stderr); retval = RET_INVALID_FORMAT; break; } *col_widths_ptr = col_widths; *col_ct_ptr = col_ct; return retval; }