Example #1
0
int32_t load_clusters(char* fname, uintptr_t unfiltered_indiv_ct, uintptr_t* indiv_exclude, uintptr_t indiv_ct, char* person_ids, uintptr_t max_person_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr) {
  unsigned char* wkspace_mark = wkspace_base;
  FILE* infile = NULL;
  uintptr_t indiv_ctl = (indiv_ct + (BITCT - 1)) / BITCT;
  uintptr_t topsize = 0;
  int32_t retval = 0;
  char* idbuf = &(tbuf[MAXLINELEN]);
  uintptr_t max_cluster_id_len = 0;
  uintptr_t assigned_ct = 0;
  uintptr_t cluster_ct = 0;
  Ll_str* cluster_names = NULL;
  uintptr_t* already_seen;
  char* cluster_ids;
  uint32_t* cluster_map;
  uint32_t* cluster_starts;
  uint32_t* tmp_cluster_starts;
  uintptr_t topsize_bak;
  Ll_str* llptr;
  char* sorted_ids;
  uint32_t* id_map;
  char* fam_id;
  char* indiv_id;
  char* cluster_name_ptr;
  uintptr_t ulii;
  int32_t sorted_idx;
  uint32_t indiv_uidx;
  uint32_t slen;
  uint32_t uii;

  sorted_ids = (char*)top_alloc(&topsize, indiv_ct * max_person_id_len);
  if (!sorted_ids) {
    goto load_clusters_ret_NOMEM;
  }
  id_map = (uint32_t*)top_alloc(&topsize, indiv_ct * sizeof(int32_t));
  if (!id_map) {
    goto load_clusters_ret_NOMEM;
  }
  topsize_bak = topsize;
  already_seen = (uintptr_t*)top_alloc(&topsize, indiv_ctl * sizeof(intptr_t));
  if (!already_seen) {
    goto load_clusters_ret_NOMEM;
  }
  fill_ulong_zero(already_seen, indiv_ctl);
  memcpy(sorted_ids, person_ids, indiv_ct * max_person_id_len);
  wkspace_left -= topsize;
  retval = sort_item_ids_noalloc(sorted_ids, id_map, unfiltered_indiv_ct, indiv_exclude, indiv_ct, person_ids, max_person_id_len, 0, 0, strcmp_deref);
  wkspace_left += topsize;
  if (retval) {
    goto load_clusters_ret_1;
  }

  // two-pass load
  // 1. load cluster names, track longest length, validate format, verify no
  //    individual ID appears multiple times
  // intermission. sort cluster names, purge duplicates, allocate memory for
  //               return values
  // 2. populate return name arrays
  if (fopen_checked(&infile, fname, "r")) {
    goto load_clusters_ret_OPEN_FAIL;
  }
  tbuf[MAXLINELEN - 1] = ' ';
  if (!mwithin_col) {
    mwithin_col = 1;
  }
  while (fgets(tbuf, MAXLINELEN, infile)) {
    if (!tbuf[MAXLINELEN - 1]) {
      logprint("Error: Pathologically long line in --within file.\n");
      goto load_clusters_ret_INVALID_FORMAT;
    }
    fam_id = skip_initial_spaces(tbuf);
    if (is_eoln_kns(*fam_id)) {
      continue;
    }
    indiv_id = next_item(fam_id);
    cluster_name_ptr = next_item_mult(indiv_id, mwithin_col);
    if (no_more_items_kns(cluster_name_ptr)) {
      logprint("Error: Fewer entries than expected in --within file line.\n");
      goto load_clusters_ret_INVALID_FORMAT;
    }
    sorted_idx = bsearch_fam_indiv(idbuf, sorted_ids, max_person_id_len, indiv_ct, fam_id, indiv_id);
    if (sorted_idx == -1) {
      continue;
    }
    if (is_set(already_seen, sorted_idx)) {
      idbuf[strlen_se(fam_id)] = ' ';
      sprintf(logbuf, "Error: Duplicate individual %s in --within file.\n", idbuf);
      logprintb();
      goto load_clusters_ret_INVALID_FORMAT;
    }
    set_bit_noct(already_seen, sorted_idx);
    slen = strlen_se(cluster_name_ptr);
    if ((!keep_na) && (slen == 2) && (!memcmp(cluster_name_ptr, "NA", 2))) {
      // postponed to here because, even without 'keep-NA', we do not want to
      // ignore cluster=NA lines for the purpose of detecting duplicate indivs
      continue;
    }
    if (slen >= max_cluster_id_len) {
      max_cluster_id_len = slen + 1;
    }
    llptr = top_alloc_llstr(&topsize, slen + 1);
    llptr->next = cluster_names;
    memcpyx(llptr->ss, cluster_name_ptr, slen, '\0');
    cluster_names = llptr;
    assigned_ct++;
  }
  if (!feof(infile)) {
    goto load_clusters_ret_READ_FAIL;
  }
  if (cluster_names) {
    *max_cluster_id_len_ptr = max_cluster_id_len;
    wkspace_left -= topsize;
    if (wkspace_alloc_c_checked(cluster_ids_ptr, assigned_ct * max_cluster_id_len)) {
      goto load_clusters_ret_NOMEM2;
    }
    cluster_ids = *cluster_ids_ptr;
    for (ulii = 0; ulii < assigned_ct; ulii++) {
      strcpy(&(cluster_ids[ulii * max_cluster_id_len]), cluster_names->ss);
      cluster_names = cluster_names->next;
    }
    // deallocate cluster ID linked list and duplicate indiv ID detector from
    // top of stack, allocate cluster size tracker
    wkspace_left += topsize;
    topsize = topsize_bak;
    tmp_cluster_starts = (uint32_t*)top_alloc(&topsize, assigned_ct * sizeof(int32_t));
    if (!tmp_cluster_starts) {
      goto load_clusters_ret_NOMEM;
    }
    wkspace_left -= topsize;
    // may as well use natural sort of cluster names
    qsort(cluster_ids, assigned_ct, max_cluster_id_len, strcmp_natural);
    cluster_ct = collapse_duplicate_ids(cluster_ids, assigned_ct, max_cluster_id_len, tmp_cluster_starts);
    *cluster_ct_ptr = cluster_ct;
    // shrink
    wkspace_reset(cluster_ids);
    wkspace_alloc_c_checked(cluster_ids_ptr, cluster_ct * max_cluster_id_len);
    if (wkspace_alloc_ui_checked(cluster_map_ptr, assigned_ct * sizeof(int32_t)) ||
        wkspace_alloc_ui_checked(cluster_starts_ptr, (cluster_ct + 1) * sizeof(int32_t))) {
      goto load_clusters_ret_NOMEM2;
    }
    wkspace_left += topsize;
    cluster_map = *cluster_map_ptr;
    cluster_starts = *cluster_starts_ptr;
    memcpy(cluster_starts, tmp_cluster_starts, cluster_ct * sizeof(int32_t));
    cluster_starts[cluster_ct] = assigned_ct;
    rewind(infile);
    // second pass
    while (fgets(tbuf, MAXLINELEN, infile)) {
      fam_id = skip_initial_spaces(tbuf);
      if (is_eoln_kns(*fam_id)) {
	continue;
      }
      indiv_id = next_item(fam_id);
      cluster_name_ptr = next_item_mult(indiv_id, mwithin_col);
      slen = strlen_se(cluster_name_ptr);
      if ((!keep_na) && (slen == 2) && (!memcmp(cluster_name_ptr, "NA", 2))) {
	continue;
      }
      sorted_idx = bsearch_fam_indiv(idbuf, sorted_ids, max_person_id_len, indiv_ct, fam_id, indiv_id);
      if (sorted_idx == -1) {
	continue;
      }
      indiv_uidx = id_map[(uint32_t)sorted_idx];
      cluster_name_ptr[slen] = '\0';
      sorted_idx = bsearch_str_natural(cluster_name_ptr, cluster_ids, max_cluster_id_len, 0, cluster_ct - 1);
      uii = tmp_cluster_starts[(uint32_t)sorted_idx];
      tmp_cluster_starts[(uint32_t)sorted_idx] += 1;
      cluster_map[uii] = indiv_uidx;
    }
    if (!feof(infile)) {
      goto load_clusters_ret_READ_FAIL;
    }
    for (ulii = 0; ulii < cluster_ct; ulii++) {
      if (cluster_starts[ulii + 1] - cluster_starts[ulii] > 1) {
#ifdef __cplusplus
	std::sort(&(cluster_map[cluster_starts[ulii]]), &(cluster_map[cluster_starts[ulii + 1]]));
#else
	qsort(&(cluster_map[cluster_starts[ulii]]), cluster_starts[ulii + 1] - cluster_starts[ulii], sizeof(int32_t), intcmp);
#endif
      }
    }
    sprintf(logbuf, "--within: %" PRIuPTR " cluster%s loaded, covering a total of %" PRIuPTR " %s.\n", cluster_ct, (cluster_ct == 1)? "" : "s", assigned_ct, species_str(assigned_ct));
    logprintb();
  } else {
    logprint("Warning: No individuals named in --within file remain in the current analysis.\n");
  }

  while (0) {
  load_clusters_ret_NOMEM2:
    wkspace_left += topsize;
  load_clusters_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  load_clusters_ret_OPEN_FAIL:
    retval = RET_OPEN_FAIL;
    break;
  load_clusters_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  load_clusters_ret_INVALID_FORMAT:
    retval = RET_INVALID_FORMAT;
    break;
  }
 load_clusters_ret_1:
  if (retval) {
    wkspace_reset(wkspace_mark);
  }
  fclose_cond(infile);
  return retval;
}
Example #2
0
int32_t scan_column_widths(FILE* infile, uintptr_t column_sep, uintptr_t** col_widths_ptr, uintptr_t* col_ct_ptr, unsigned char** spacebuf_ptr, unsigned char** rjustify_buf_ptr) {
  uintptr_t malloc_size = INITIAL_COLS * sizeof(intptr_t);
  uintptr_t* col_widths = (uintptr_t*)malloc(malloc_size);
  uintptr_t col_ct = 0;
  uintptr_t max_col_ct = INITIAL_COLS; // not a hard limit

  // actually a one-based index, to simplify distinguishing between
  // beginning-of-line-and-not-in-column from beginning-of-line-and-in-column
  // first element of col_widths[] is essentially unused as a result
  uintptr_t cur_col_idx = 0;

  uintptr_t cur_col_width = 0;
  uintptr_t line_idx = 1;
  int32_t retval = 0;
  unsigned char* readptr;
  unsigned char* line_end;
  unsigned char* readbuf_end;
  unsigned char* token_end;
  uintptr_t* new_col_widths;
  uintptr_t cur_read;
  if (!col_widths) {
    goto scan_column_widths_ret_NOMEM;
  }
  fill_ulong_zero(col_widths, max_col_ct);
  cur_read = fread(g_readbuf, 1, BUFSIZE, infile);
  if (ferror(infile)) {
    goto scan_column_widths_ret_READ_FAIL;
  }
  if (!cur_read) {
    goto scan_column_widths_ret_INVALID_FORMAT;
  }
  readptr = g_readbuf;
  readbuf_end = &(g_readbuf[cur_read]);
  while (1) {
    line_end = (unsigned char*)memchr(readptr, '\n', (uintptr_t)(readbuf_end - readptr));
    if (!line_end) {
      line_end = readbuf_end;
    }
    while (readptr < line_end) {
      if (!cur_col_width) {
	if (skip_spaces_ck(&readptr, line_end)) {
	  break;
	}
	if (++cur_col_idx == max_col_ct) {
	  malloc_size *= 2;
	  new_col_widths = realloc(col_widths, malloc_size);
	  if (!new_col_widths) {
	    goto scan_column_widths_ret_READ_FAIL;
	  }
          col_widths = new_col_widths;
	  fill_ulong_zero(&(col_widths[max_col_ct]), max_col_ct);
	  max_col_ct *= 2;
	}
      }
      token_end = get_token_end_ck(readptr, line_end);
      cur_col_width += (uintptr_t)(token_end - readptr);
      if (token_end == line_end) {
	break;
      }
      if (cur_col_width > col_widths[cur_col_idx]) {
	col_widths[cur_col_idx] = cur_col_width;
      }
      cur_col_width = 0;
      readptr = token_end;
    }
    if ((line_end < readbuf_end) || (!cur_read)) {
      handle_last_column(col_widths, cur_col_idx, cur_col_width, &col_ct);
      if (line_end == readbuf_end) {
	// EOF
	break;
      }
      readptr = &(line_end[1]);
      line_idx++;
      cur_col_idx = 0;
      cur_col_width = 0;
      continue;
    }
    // in middle of line
    cur_read = fread(g_readbuf, 1, BUFSIZE, infile);
    if (ferror(infile)) {
      goto scan_column_widths_ret_READ_FAIL;
    }
    readptr = g_readbuf;
    readbuf_end = &(g_readbuf[cur_read]);
  }
  cur_col_width = 0;
  for (cur_col_idx = 1; cur_col_idx <= col_ct; cur_col_idx++) {
    if (col_widths[cur_col_idx] > cur_col_width) {
      cur_col_width = col_widths[cur_col_idx];
    }
  }
  *spacebuf_ptr = (unsigned char*)malloc(cur_col_width + column_sep);
  if (!(*spacebuf_ptr)) {
    goto scan_column_widths_ret_NOMEM;
  }
  memset(*spacebuf_ptr, 32, cur_col_width + column_sep);
  if (rjustify_buf_ptr) {
    *rjustify_buf_ptr = (unsigned char*)malloc(cur_col_width);
    if (!(*rjustify_buf_ptr)) {
      goto scan_column_widths_ret_NOMEM;
    }
  }
  while (0) {
  scan_column_widths_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  scan_column_widths_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  scan_column_widths_ret_INVALID_FORMAT:
    fputs("Error: Empty input file.\n", stderr);
    retval = RET_INVALID_FORMAT;
    break;
  }
  *col_widths_ptr = col_widths;
  *col_ct_ptr = col_ct;
  return retval;
}