/*
 * Report every entry of one value-label set to ctx->value_label_handler.
 *
 * value_labels / label_count: the entries to report.
 * value_type: READSTAT_TYPE_DOUBLE means each 8-byte raw value is decoded as
 *             a double (byte-swapped when ctx->bswap is set); any other type
 *             means the 8 raw bytes are run through readstat_convert() and
 *             reported as a string.
 *
 * The label set name passed to the handler is SAV_LABEL_NAME_PREFIX followed
 * by ctx->value_labels_count.
 *
 * Returns READSTAT_OK, or the first error returned by readstat_convert()
 * (in which case the remaining entries are not reported).
 */
static readstat_error_t sav_submit_value_labels(value_label_t *value_labels, int32_t label_count,
        readstat_type_t value_type, sav_ctx_t *ctx) {
    char label_name_buf[256];
    /* Declared at function scope on purpose: value.v.string_value points into
     * this buffer and is still read by the handler call at the bottom of the
     * loop body, so the buffer must outlive the else-branch that fills it. */
    char unpadded_val[8*4+1];
    readstat_error_t retval = READSTAT_OK;
    int32_t i;

    snprintf(label_name_buf, sizeof(label_name_buf),
            SAV_LABEL_NAME_PREFIX "%d", ctx->value_labels_count);

    for (i=0; i<label_count; i++) {
        value_label_t *vlabel = &value_labels[i];
        readstat_value_t value = { .type = value_type };

        if (value_type == READSTAT_TYPE_DOUBLE) {
            double val_d = 0.0;
            /* memcpy rather than a pointer cast: the raw bytes carry no
             * alignment or aliasing guarantee. */
            memcpy(&val_d, vlabel->value, 8);
            if (ctx->bswap)
                val_d = byteswap_double(val_d);
            value.v.double_value = val_d;
            sav_tag_missing_double(&value, ctx);
        } else {
            retval = readstat_convert(unpadded_val, sizeof(unpadded_val),
                    vlabel->value, 8, ctx->converter);
            if (retval != READSTAT_OK)
                break;
            value.v.string_value = unpadded_val;
        }

        ctx->value_label_handler(label_name_buf, value, vlabel->label, ctx->user_ctx);
    }

    return retval;
}
/*
 * Allocate and initialise a SAV parsing context from the file header record.
 *
 * The context starts zero-filled. Byte swapping is enabled unless the
 * header's layout_code is 2 or 3; the case count, weight index and bias are
 * read through that setting. A bias other than 100.0 is rejected.
 *
 * Returns the new context, or NULL on allocation failure or bias mismatch.
 * Any partially built context is released with sav_ctx_free() before
 * returning NULL.
 */
sav_ctx_t *sav_ctx_init(sav_file_header_record_t *header, readstat_io_t *io) {
    sav_ctx_t *new_ctx = calloc(1, sizeof(sav_ctx_t));
    double bias;

    if (new_ctx == NULL)
        return NULL;

    new_ctx->bswap = (header->layout_code != 2 && header->layout_code != 3);
    new_ctx->data_is_compressed = (header->compressed != 0);

    new_ctx->record_count = new_ctx->bswap ? byteswap4(header->ncases) : header->ncases;
    new_ctx->fweight_index = new_ctx->bswap ? byteswap4(header->weight_index) : header->weight_index;

    new_ctx->missing_double = SAV_MISSING_DOUBLE;
    new_ctx->lowest_double = SAV_LOWEST_DOUBLE;
    new_ctx->highest_double = SAV_HIGHEST_DOUBLE;

    bias = new_ctx->bswap ? byteswap_double(header->bias) : header->bias;
    if (bias != 100.0)
        goto fail;

    new_ctx->varinfo_capacity = SAV_VARINFO_INITIAL_CAPACITY;
    new_ctx->varinfo = calloc(new_ctx->varinfo_capacity, sizeof(spss_varinfo_t));
    if (new_ctx->varinfo == NULL)
        goto fail;

    new_ctx->io = io;
    return new_ctx;

fail:
    sav_ctx_free(new_ctx);
    return NULL;
}
/*
 * Read one real, integer or logical vector from the stream and hand it to
 * ctx->column_handler (when one is installed).
 *
 * header: the already-parsed S-expression header; header.type selects the
 *         element size, header.attributes says whether attributes follow.
 * name:   column name forwarded to the handler.
 *
 * Integer and logical vectors are widened to double before delivery, with
 * INT32_MIN mapped to NAN. Real vectors are delivered as read; the format
 * string "%ts" is passed when the attribute pass set ctx->class_is_posixct.
 *
 * Returns READSTAT_OK on success, or
 *   READSTAT_ERROR_PARSE      unsupported header.type or negative length,
 *   READSTAT_ERROR_MALLOC     allocation failure,
 *   READSTAT_ERROR_READ       short read of the payload,
 *   READSTAT_ERROR_USER_ABORT handler returned non-zero,
 * or whatever read_length() / read_attributes() report.
 *
 * All buffers allocated here are released before returning; the handler is
 * expected to copy anything it wants to keep.
 */
static readstat_error_t read_value_vector(rdata_sexptype_header_t header, const char *name, rdata_ctx_t *ctx) {
    readstat_error_t retval = READSTAT_OK;
    int32_t length;
    size_t n_elems = 0;
    size_t input_elem_size = 0;
    void *vals = NULL;
    double *real_vals = NULL;
    size_t buf_len = 0;
    int output_data_type = READSTAT_TYPE_DOUBLE;
    size_t i;

    switch (header.type) {
        case RDATA_SEXPTYPE_REAL_VECTOR:
            input_elem_size = sizeof(double);
            output_data_type = READSTAT_TYPE_DOUBLE;
            break;
        case RDATA_SEXPTYPE_INTEGER_VECTOR:
            input_elem_size = sizeof(int32_t);
            output_data_type = READSTAT_TYPE_DOUBLE;
            break;
        case RDATA_SEXPTYPE_LOGICAL_VECTOR:
            input_elem_size = sizeof(int32_t);
            output_data_type = READSTAT_TYPE_DOUBLE;
            break;
        default:
            retval = READSTAT_ERROR_PARSE;
            break;
    }
    if (retval != READSTAT_OK)
        goto cleanup;

    if ((retval = read_length(&length, ctx)) != READSTAT_OK)
        goto cleanup;

    /* The length comes from the file: a negative value would turn into an
     * enormous size_t below. */
    if (length < 0) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }
    n_elems = (size_t)length;

    /* Guard the size multiplications (only reachable where size_t is narrow). */
    if (n_elems > SIZE_MAX / sizeof(double)) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }
    buf_len = n_elems * input_elem_size;

    /* malloc(0) may legitimately return NULL; ask for at least one byte so an
     * empty vector is not misreported as an allocation failure. */
    vals = malloc(buf_len ? buf_len : 1);
    if (vals == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    if (read_st(ctx, vals, buf_len) != buf_len) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    if (ctx->machine_needs_byteswap) {
        if (input_elem_size == sizeof(double)) {
            double *d_vals = (double *)vals;
            for (i=0; i<n_elems; i++) {
                d_vals[i] = byteswap_double(d_vals[i]);
            }
        } else {
            uint32_t *u_vals = (uint32_t *)vals;
            for (i=0; i<n_elems; i++) {
                u_vals[i] = byteswap4(u_vals[i]);
            }
        }
    }

    /* Reset before the attribute pass, which may set it again for this vector. */
    ctx->class_is_posixct = 0;
    if (header.attributes) {
        if ((retval = read_attributes(&handle_vector_attribute, ctx)) != READSTAT_OK)
            goto cleanup;
    }

    if (ctx->column_handler) {
        if (header.type == RDATA_SEXPTYPE_LOGICAL_VECTOR ||
                header.type == RDATA_SEXPTYPE_INTEGER_VECTOR) {
            int32_t *i_vals = (int32_t *)vals;

            real_vals = malloc(n_elems ? n_elems * sizeof(double) : 1);
            if (real_vals == NULL) {
                retval = READSTAT_ERROR_MALLOC;
                goto cleanup;
            }
            for (i=0; i<n_elems; i++) {
                if (i_vals[i] == INT32_MIN) {
                    real_vals[i] = NAN;
                } else {
                    real_vals[i] = i_vals[i];
                }
            }
            if (ctx->column_handler(name, output_data_type, NULL, real_vals, length, ctx->user_ctx)) {
                retval = READSTAT_ERROR_USER_ABORT;
                goto cleanup;
            }
        } else {
            if (ctx->column_handler(name, output_data_type,
                        ctx->class_is_posixct ? "%ts" : NULL, vals, length, ctx->user_ctx)) {
                retval = READSTAT_ERROR_USER_ABORT;
                goto cleanup;
            }
        }
    }

cleanup:
    /* free(NULL) is a no-op, so both calls are safe on every path. */
    free(real_vals);
    free(vals);

    return retval;
}
/*
 * Decode one uncompressed row and emit each value through ctx->value_handler.
 *
 * buffer / buffer_len: the raw row; it is consumed in 8-byte chunks.
 *
 * Two indices walk ctx->varinfo in parallel: `col` is the physical column
 * currently being consumed and `var_index` is the variable that will be
 * reported. They diverge for strings, where one variable may span several
 * columns ("segments").
 *
 * Returns READSTAT_OK, READSTAT_ERROR_PARSE if a string column runs past 32
 * chunks, READSTAT_ERROR_USER_ABORT if the handler returns non-zero, or an
 * error from readstat_convert(). ctx->current_row is advanced only on
 * success.
 */
static readstat_error_t sav_process_row(unsigned char *buffer, size_t buffer_len, sav_ctx_t *ctx) {
    readstat_error_t retval = READSTAT_OK;
    double fp_value;
    int offset = 0;
    readstat_off_t data_offset = 0;
    size_t raw_str_used = 0;
    int segment_offset = 0;
    int var_index = 0, col = 0;

    while (data_offset < buffer_len && col < ctx->var_index) {
        spss_varinfo_t *col_info = &ctx->varinfo[col];
        spss_varinfo_t *var_info = &ctx->varinfo[var_index];
        readstat_value_t value = { .type = var_info->type };
        if (offset > 31) {
            retval = READSTAT_ERROR_PARSE;
            goto done;
        }
        if (var_info->type == READSTAT_TYPE_STRING) {
            /* Accumulate the chunk only while it still fits; excess chunks
             * are skipped, not treated as an error. */
            if (raw_str_used + 8 <= ctx->raw_string_len) {
                memcpy(ctx->raw_string + raw_str_used, &buffer[data_offset], 8);
                raw_str_used += 8;
            }
            if (++offset == col_info->width) {
                /* End of this column. For every segment but the last, the
                 * final byte just copied is dropped.
                 * NOTE(review): presumably non-final segments carry one
                 * padding byte; confirm against the SAV format. */
                if (++segment_offset < var_info->n_segments) {
                    raw_str_used--;
                }
                offset = 0;
                col++;
            }
            if (segment_offset == var_info->n_segments) {
                retval = readstat_convert(ctx->utf8_string, ctx->utf8_string_len,
                        ctx->raw_string, raw_str_used, ctx->converter);
                if (retval != READSTAT_OK)
                    goto done;
                value.v.string_value = ctx->utf8_string;
                if (ctx->value_handler(ctx->current_row, ctx->variables[var_info->index],
                            value, ctx->user_ctx)) {
                    retval = READSTAT_ERROR_USER_ABORT;
                    goto done;
                }
                raw_str_used = 0;
                segment_offset = 0;
                var_index += var_info->n_segments;
            }
        } else if (var_info->type == READSTAT_TYPE_DOUBLE) {
            memcpy(&fp_value, &buffer[data_offset], 8);
            if (ctx->bswap) {
                fp_value = byteswap_double(fp_value);
            }
            value.v.double_value = fp_value;
            sav_tag_missing_double(&value, ctx);
            if (ctx->value_handler(ctx->current_row, ctx->variables[var_info->index],
                        value, ctx->user_ctx)) {
                retval = READSTAT_ERROR_USER_ABORT;
                goto done;
            }
            var_index += var_info->n_segments;
            col++;
        }
        data_offset += 8;
    }
    ctx->current_row++;

done:
    return retval;
}

/*
 * Read uncompressed rows until ctx->row_limit rows have been processed.
 *
 * Each row occupies ctx->var_offset * 8 bytes. Progress is reported through
 * sav_update_progress() before every row. A short read ends the loop without
 * setting an error, so a file that ends early yields the rows read so far.
 *
 * Returns READSTAT_OK, READSTAT_ERROR_MALLOC if the row buffer cannot be
 * allocated, or the first error from sav_update_progress() /
 * sav_process_row().
 */
static readstat_error_t sav_read_uncompressed_data(sav_ctx_t *ctx) {
    readstat_error_t retval = READSTAT_OK;
    readstat_io_t *io = ctx->io;
    unsigned char *buffer = NULL;
    size_t bytes_read = 0;
    size_t buffer_len = ctx->var_offset * 8;

    buffer = malloc(buffer_len);
    /* A NULL result for a zero-byte request is allowed by the C standard and
     * is not an error; only a failed non-empty allocation is. */
    if (buffer == NULL && buffer_len > 0) {
        retval = READSTAT_ERROR_MALLOC;
        goto done;
    }

    while (ctx->current_row < ctx->row_limit) {
        retval = sav_update_progress(ctx);
        if (retval != READSTAT_OK)
            goto done;

        if ((bytes_read = io->read(buffer, buffer_len, io->io_ctx)) != buffer_len)
            goto done;

        retval = sav_process_row(buffer, buffer_len, ctx);
        if (retval != READSTAT_OK)
            goto done;
    }

done:
    free(buffer);

    return retval;
}
/*
 * Read one variable record from ctx->io and append it to ctx->varinfo.
 *
 * A record whose type is negative does not create a new entry: it widens the
 * previous entry by one and advances ctx->var_offset. Otherwise a fresh
 * entry is filled in with the name, print/write formats, optional label and
 * optional missing-value list, and ctx->var_index / ctx->var_offset are
 * advanced.
 *
 * Returns READSTAT_OK on success, or
 *   READSTAT_ERROR_MALLOC  allocation failure (ctx->varinfo stays valid),
 *   READSTAT_ERROR_READ    short read,
 *   READSTAT_ERROR_PARSE   negative-type record with no preceding variable,
 *                          label length out of range, or more than three
 *                          missing values,
 * or an error from readstat_convert().
 *
 * On failure the entry being built is left without a label (it is freed
 * here) and ctx->var_index is not advanced.
 */
static readstat_error_t sav_read_variable_record(sav_ctx_t *ctx) {
    readstat_io_t *io = ctx->io;
    sav_variable_record_t variable;
    readstat_error_t retval = READSTAT_OK;
    spss_varinfo_t *info = NULL;
    char *label_buf = NULL;
    int i;

    if (ctx->var_index == ctx->varinfo_capacity) {
        /* Grow through a temporary so a failed realloc neither leaks the old
         * array nor leaves varinfo_capacity describing memory we don't own. */
        spss_varinfo_t *grown = realloc(ctx->varinfo,
                (ctx->varinfo_capacity * 2) * sizeof(spss_varinfo_t));
        if (grown == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            goto cleanup;
        }
        ctx->varinfo = grown;
        ctx->varinfo_capacity *= 2;
    }

    /* `!=` rather than `<`: also catches a negative return should the read
     * callback report errors that way. */
    if (io->read(&variable, sizeof(sav_variable_record_t), io->io_ctx) != sizeof(sav_variable_record_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    variable.print = ctx->bswap ? byteswap4(variable.print) : variable.print;
    variable.write = ctx->bswap ? byteswap4(variable.write) : variable.write;

    readstat_type_t dta_type = READSTAT_TYPE_DOUBLE;
    int32_t type = ctx->bswap ? byteswap4(variable.type) : variable.type;

    if (type < 0) {
        /* Continuation record: extends the previous variable. */
        if (ctx->var_index == 0) {
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }
        ctx->var_offset++;
        ctx->varinfo[ctx->var_index-1].width++;
        goto cleanup;
    }
    if (type > 0) {
        /* A positive type marks a string variable. */
        dta_type = READSTAT_TYPE_STRING;
    }

    info = &ctx->varinfo[ctx->var_index];
    memset(info, 0, sizeof(spss_varinfo_t));
    info->width = 1;
    info->n_segments = 1;
    info->index = ctx->var_index;
    info->offset = ctx->var_offset;
    info->type = dta_type;

    retval = readstat_convert(info->name, sizeof(info->name),
            variable.name, sizeof(variable.name), ctx->converter);
    if (retval != READSTAT_OK)
        goto cleanup;

    /* The long name starts out identical to the short name. */
    retval = readstat_convert(info->longname, sizeof(info->longname),
            variable.name, sizeof(variable.name), ctx->converter);
    if (retval != READSTAT_OK)
        goto cleanup;

    /* Formats are packed one field per byte: decimals, width, type. */
    info->print_format.decimal_places = (variable.print & 0x000000FF);
    info->print_format.width = (variable.print & 0x0000FF00) >> 8;
    info->print_format.type = (variable.print & 0x00FF0000) >> 16;

    info->write_format.decimal_places = (variable.write & 0x000000FF);
    info->write_format.width = (variable.write & 0x0000FF00) >> 8;
    info->write_format.type = (variable.write & 0x00FF0000) >> 16;

    if (variable.has_var_label) {
        int32_t label_len;
        if (io->read(&label_len, sizeof(int32_t), io->io_ctx) != sizeof(int32_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        label_len = ctx->bswap ? byteswap4(label_len) : label_len;

        /* The length is file-controlled. Reject values that are negative or
         * large enough to overflow the size arithmetic below. */
        if (label_len < 0 || label_len > (INT32_MAX - 1) / 4) {
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }

        /* On disk the label is padded to a multiple of four bytes. */
        int32_t label_capacity = (label_len + 3) / 4 * 4;
        size_t out_label_len = (size_t)label_len * 4 + 1;

        /* Request at least one byte: malloc(0) may return NULL, which must
         * not be mistaken for an allocation failure on an empty label. */
        label_buf = malloc(label_capacity ? (size_t)label_capacity : 1);
        info->label = malloc(out_label_len);
        if (label_buf == NULL || info->label == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            goto cleanup;
        }

        if (io->read(label_buf, label_capacity, io->io_ctx) != label_capacity) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }

        retval = readstat_convert(info->label, out_label_len, label_buf, label_len, ctx->converter);
        if (retval != READSTAT_OK)
            goto cleanup;
    }

    info->labels_index = -1;

    if (variable.n_missing_values) {
        info->n_missing_values = ctx->bswap ? byteswap4(variable.n_missing_values) : variable.n_missing_values;

        /* Validate before negating so an extreme negative count can neither
         * overflow nor slip past the upper bound. */
        if (info->n_missing_values < -3 || info->n_missing_values > 3) {
            info->n_missing_values = 0;
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }
        if (info->n_missing_values < 0) {
            /* A negative count flags a range. */
            info->missing_range = 1;
            info->n_missing_values = -info->n_missing_values;
        } else {
            info->missing_range = 0;
        }

        if (io->read(info->missing_values, info->n_missing_values * sizeof(double), io->io_ctx)
                != info->n_missing_values * sizeof(double)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        for (i=0; i<info->n_missing_values; i++) {
            if (ctx->bswap) {
                info->missing_values[i] = byteswap_double(info->missing_values[i]);
            }

            /* Compare bit patterns, not floating-point values, against the
             * context's sentinel encodings. */
            uint64_t long_value = 0;
            memcpy(&long_value, &info->missing_values[i], 8);
            if (long_value == ctx->missing_double)
                info->missing_values[i] = NAN;
            if (long_value == ctx->lowest_double)
                info->missing_values[i] = -HUGE_VAL;
            if (long_value == ctx->highest_double)
                info->missing_values[i] = HUGE_VAL;
        }
    }

    ctx->var_index++;
    ctx->var_offset++;

cleanup:
    free(label_buf);
    if (retval != READSTAT_OK && info != NULL) {
        /* The entry was not committed; don't leave a label behind on it. */
        free(info->label);
        info->label = NULL;
    }

    return retval;
}
/*
 * Parse the header of a SAS data or catalog file into *hinfo and leave the
 * stream positioned at hinfo->header_size (the end of the header).
 *
 * io:            I/O callbacks used for every read and seek.
 * hinfo:         receives alignment flags, endianness, encoding name, file
 *                label, timestamps, header/page sizes, page count, release
 *                version and vendor.
 *                NOTE(review): the encoding lookup relies on hinfo->encoding
 *                being NULL on entry, and pad1/u64 are only ever set, never
 *                cleared; callers presumably pass a zeroed struct.
 * error_handler: optional; receives a message for an unsupported character
 *                set and for the two seeks that report failures.
 * user_ctx:      forwarded to error_handler.
 *
 * Returns READSTAT_OK on success, or
 *   READSTAT_ERROR_READ                 short read,
 *   READSTAT_ERROR_SEEK                 failed seek,
 *   READSTAT_ERROR_PARSE                bad magic number, unknown endian
 *                                       flag, or header size below 1024,
 *   READSTAT_ERROR_UNSUPPORTED_CHARSET  encoding code not in _charset_table.
 */
readstat_error_t sas_read_header(readstat_io_t *io, sas_header_info_t *hinfo,
        readstat_error_handler error_handler, void *user_ctx) {
    sas_header_start_t  header_start;
    sas_header_end_t    header_end;
    int retval = READSTAT_OK;
    char error_buf[1024];
    /* Timestamps in the file are offsets from 1 January 1960 (tm_year 60,
     * tm_mon 0, tm_mday 1). mktime() interprets this as local time. */
    struct tm epoch_tm = { .tm_year = 60, .tm_mday = 1 };
    time_t epoch = mktime(&epoch_tm);

    /* Reads use `!=` rather than `<` so that a negative return from the
     * callback (if it reports errors that way) is also treated as failure. */
    if (io->read(&header_start, sizeof(sas_header_start_t), io->io_ctx) != sizeof(sas_header_start_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (memcmp(header_start.magic, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0 &&
            memcmp(header_start.magic, sas7bcat_magic_number, sizeof(sas7bcat_magic_number)) != 0) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }
    if (header_start.a1 == SAS_ALIGNMENT_OFFSET_4) {
        hinfo->pad1 = 4;
    }
    if (header_start.a2 == SAS_ALIGNMENT_OFFSET_4) {
        hinfo->u64 = 1;
    }

    int bswap = 0;
    if (header_start.endian == SAS_ENDIAN_BIG) {
        bswap = machine_is_little_endian();
        hinfo->little_endian = 0;
    } else if (header_start.endian == SAS_ENDIAN_LITTLE) {
        bswap = !machine_is_little_endian();
        hinfo->little_endian = 1;
    } else {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    size_t i;
    for (i=0; i<sizeof(_charset_table)/sizeof(_charset_table[0]); i++) {
        if (header_start.encoding == _charset_table[i].code) {
            hinfo->encoding = _charset_table[i].name;
            break;
        }
    }
    if (hinfo->encoding == NULL) {
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf),
                    "Unsupported character set code: %d\n", header_start.encoding);
            error_handler(error_buf, user_ctx);
        }
        retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
        goto cleanup;
    }

    memcpy(hinfo->file_label, header_start.file_label, sizeof(header_start.file_label));

    if (io->seek(hinfo->pad1, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    double creation_time, modification_time;
    if (io->read(&creation_time, sizeof(double), io->io_ctx) != sizeof(double)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (io->read(&modification_time, sizeof(double), io->io_ctx) != sizeof(double)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (bswap) {
        creation_time = byteswap_double(creation_time);
        modification_time = byteswap_double(modification_time);
    }
    hinfo->creation_time = creation_time + epoch;
    /* Derived from the modification stamp read above, not the creation one. */
    hinfo->modification_time = modification_time + epoch;

    if (io->seek(16, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    uint32_t header_size, page_size;

    if (io->read(&header_size, sizeof(uint32_t), io->io_ctx) != sizeof(uint32_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (io->read(&page_size, sizeof(uint32_t), io->io_ctx) != sizeof(uint32_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    hinfo->header_size = bswap ? byteswap4(header_size) : header_size;
    hinfo->page_size = bswap ? byteswap4(page_size) : page_size;

    if (hinfo->header_size < 1024) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    /* The page count is 8 bytes wide when the u64 flag is set, 4 otherwise. */
    if (hinfo->u64) {
        uint64_t page_count;
        if (io->read(&page_count, sizeof(uint64_t), io->io_ctx) != sizeof(uint64_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        hinfo->page_count = bswap ? byteswap8(page_count) : page_count;
    } else {
        uint32_t page_count;
        if (io->read(&page_count, sizeof(uint32_t), io->io_ctx) != sizeof(uint32_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        hinfo->page_count = bswap ? byteswap4(page_count) : page_count;
    }

    if (io->seek(8, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek forward by %d\n", 8);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }

    if (io->read(&header_end, sizeof(sas_header_end_t), io->io_ctx) != sizeof(sas_header_end_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    /* The release field is raw file bytes with no guaranteed terminator, so
     * parse a NUL-terminated copy instead of the field itself. */
    char release[sizeof(header_end.release) + 1];
    memcpy(release, header_end.release, sizeof(header_end.release));
    release[sizeof(header_end.release)] = '\0';

    int major = 0, minor = 0, revision = 0;
    int have_version = (sscanf(release, "%1d.%04dM%1d", &major, &minor, &revision) == 3);
    if (have_version) {
        hinfo->major_version = major;
        hinfo->minor_version = minor;
        hinfo->revision = revision;
    }
    /* Only trust the numbers when all three fields were parsed. */
    if (have_version && major == 9 && minor == 0 && revision == 0) {
        /* A bit of a hack, but most SAS installations are running a minor update */
        hinfo->vendor = READSTAT_VENDOR_STAT_TRANSFER;
    } else {
        hinfo->vendor = READSTAT_VENDOR_SAS;
    }

    if (io->seek(hinfo->header_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf),
                    "ReadStat: Failed to seek to position %" PRId64 "\n", hinfo->header_size);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }

cleanup:
    return retval;
}