/*
 * Decode the raw 8-byte (v, o) pair in vo_bytes into strl->v and strl->o.
 *
 * ctx->strl_v_len selects the layout:
 *   2 -> a 2-byte v followed by a 6-byte o, assembled byte by byte in the
 *        file's byte order;
 *   4 -> two 32-bit integers, copied out and byte-swapped when
 *        ctx->machine_needs_byte_swap is set.
 * Any other strl_v_len leaves *strl untouched.
 *
 * vo_bytes must point to at least 8 readable bytes.
 */
static void dta_interpret_strl_vo_bytes(dta_ctx_t *ctx, unsigned char *vo_bytes, dta_strl_t *strl) {
    int file_is_big_endian = (!machine_is_little_endian() ^ ctx->machine_needs_byte_swap);

    if (ctx->strl_v_len == 2) {
        /*
         * Every byte of `o` is widened to uint64_t before shifting. A plain
         * `byte << 24` is computed as a signed int: for bytes >= 0x80 it
         * overflows and then sign-extends into the upper 32 bits of the sum.
         */
        if (file_is_big_endian) {
            strl->v = (vo_bytes[0] << 8) + vo_bytes[1];
            strl->o = (((uint64_t)vo_bytes[2] << 40)
                    + ((uint64_t)vo_bytes[3] << 32)
                    + ((uint64_t)vo_bytes[4] << 24)
                    + ((uint64_t)vo_bytes[5] << 16)
                    + ((uint64_t)vo_bytes[6] << 8)
                    +  (uint64_t)vo_bytes[7]);
        } else {
            strl->v = vo_bytes[0] + (vo_bytes[1] << 8);
            strl->o = ( (uint64_t)vo_bytes[2]
                    + ((uint64_t)vo_bytes[3] << 8)
                    + ((uint64_t)vo_bytes[4] << 16)
                    + ((uint64_t)vo_bytes[5] << 24)
                    + ((uint64_t)vo_bytes[6] << 32)
                    + ((uint64_t)vo_bytes[7] << 40));
        }
    } else if (ctx->strl_v_len == 4) {
        uint32_t v, o;

        /* memcpy avoids unaligned / aliasing access into the byte buffer. */
        memcpy(&v, &vo_bytes[0], sizeof(uint32_t));
        memcpy(&o, &vo_bytes[4], sizeof(uint32_t));

        strl->v = ctx->machine_needs_byte_swap ? byteswap4(v) : v;
        strl->o = ctx->machine_needs_byte_swap ? byteswap4(o) : o;
    }
}
/*
 * Read `label_count` label records and apply each one to the variable it
 * names, then skip the rest of the record and read the observation header.
 *
 * Each record starts with three 16-bit fields (variable index, name length,
 * label length) that are byte-swapped on little-endian hosts, followed by the
 * name bytes and the label bytes. Both strings are passed through
 * readstat_convert() into variable->name and variable->label.
 *
 * Returns READSTAT_ERROR_READ on a short read, READSTAT_ERROR_PARSE when the
 * index is not below ctx->var_count, or the first error reported by
 * readstat_convert() or the trailing record helpers.
 */
static readstat_error_t xport_read_labels_v8(xport_ctx_t *ctx, int label_count) {
    readstat_error_t retval = READSTAT_OK;
    uint16_t labeldef[3];
    int i;

    for (i=0; i<label_count; i++) {
        int index, name_len, label_len;

        if (read_bytes(ctx, labeldef, sizeof(labeldef)) != sizeof(labeldef)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }

        if (machine_is_little_endian()) {
            index = byteswap2(labeldef[0]);
            name_len = byteswap2(labeldef[1]);
            label_len = byteswap2(labeldef[2]);
        } else {
            index = labeldef[0];
            name_len = labeldef[1];
            label_len = labeldef[2];
        }

        if (index >= ctx->var_count) {
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }

        /*
         * The lengths come straight from the file and may be zero. A
         * zero-length variable-length array is undefined behaviour, so each
         * buffer is given one spare byte; only the first *_len bytes are
         * ever read into or converted.
         */
        char name[name_len + 1];
        char label[label_len + 1];
        readstat_variable_t *variable = ctx->variables[index];

        if (read_bytes(ctx, name, name_len) != name_len ||
                read_bytes(ctx, label, label_len) != label_len) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }

        retval = readstat_convert(variable->name, sizeof(variable->name),
                name, name_len, NULL);
        if (retval != READSTAT_OK)
            goto cleanup;

        retval = readstat_convert(variable->label, sizeof(variable->label),
                label, label_len, NULL);
        if (retval != READSTAT_OK)
            goto cleanup;
    }

    retval = xport_skip_rest_of_record(ctx);
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = xport_read_obs_header_record(ctx);
    if (retval != READSTAT_OK)
        goto cleanup;

cleanup:
    return retval;
}
static readstat_error_t dta_begin_data(void *writer_ctx) { readstat_writer_t *writer = (readstat_writer_t *)writer_ctx; readstat_error_t error = READSTAT_OK; dta_ctx_t *ctx = dta_ctx_alloc(NULL); dta_header_t header; memset(&header, 0, sizeof(dta_header_t)); header.ds_format = 111; header.byteorder = machine_is_little_endian() ? DTA_LOHI : DTA_HILO; header.filetype = 0x01; header.unused = 0x00; header.nvar = writer->variables_count; header.nobs = writer->row_count; error = readstat_write_bytes(writer, &header, sizeof(dta_header_t)); if (error != READSTAT_OK) goto cleanup; error = dta_ctx_init(ctx, header.nvar, header.nobs, header.byteorder, header.ds_format); if (error != READSTAT_OK) goto cleanup; error = dta_emit_header_data_label(writer); if (error != READSTAT_OK) goto cleanup; error = dta_emit_header_time_stamp(writer); if (error != READSTAT_OK) goto cleanup; error = dta_emit_descriptors(writer, ctx); if (error != READSTAT_OK) goto cleanup; error = dta_emit_variable_labels(writer, ctx); if (error != READSTAT_OK) goto cleanup; error = dta_emit_expansion_fields(writer, ctx); if (error != READSTAT_OK) goto cleanup; cleanup: if (error != READSTAT_OK) { dta_ctx_free(ctx); } else { writer->module_ctx = ctx; } return error; }
/*
 * Byte-swap the multi-byte integer fields of a NAMESTR record in place.
 *
 * Does nothing on big-endian hosts. On little-endian hosts every 16-bit field
 * below and the 32-bit npos field are swapped, so calling it twice restores
 * the original values.
 */
void xport_namestr_bswap(xport_namestr_t *namestr) {
    if (!machine_is_little_endian())
        return;

    namestr->ntype  = byteswap2(namestr->ntype);
    namestr->nhfun  = byteswap2(namestr->nhfun);
    namestr->nlng   = byteswap2(namestr->nlng);
    /* Swap nvar0 from its own value; it was previously derived from nlng. */
    namestr->nvar0  = byteswap2(namestr->nvar0);

    namestr->nfl    = byteswap2(namestr->nfl);
    namestr->nfd    = byteswap2(namestr->nfd);
    namestr->nfj    = byteswap2(namestr->nfj);

    namestr->nifl   = byteswap2(namestr->nifl);
    namestr->nifd   = byteswap2(namestr->nifd);

    namestr->npos   = byteswap4(namestr->npos);

    namestr->labeln = byteswap2(namestr->labeln);
}
rdata_ctx_t *init_rdata_ctx(const char *filename) { int fd = readstat_open(filename); if (fd == -1) { return NULL; } rdata_ctx_t *ctx = calloc(1, sizeof(rdata_ctx_t)); rdata_atom_table_t *atom_table = malloc(sizeof(rdata_atom_table_t)); atom_table->count = 0; atom_table->data = NULL; ctx->atom_table = atom_table; ctx->machine_needs_byteswap = 0; if (machine_is_little_endian()) { ctx->machine_needs_byteswap = 1; } ctx->fd = fd; return ctx; }
/*
 * Fill in a dta_ctx_t for a file with the given variable/observation counts,
 * byte order and format version.
 *
 * The version selects the width of every fixed-size field (format list
 * entries, names, labels, value-label tables, ...), the numeric maxima, and
 * whether tagged missing values are supported. When output_encoding is given
 * an iconv converter is opened; the source encoding is input_encoding if set,
 * otherwise "WINDOWS-1252" for versions below 118, otherwise "UTF-8" unless
 * the output is already "UTF-8". Finally the sort list and, when nvar > 0,
 * the per-variable lists are allocated.
 *
 * Returns READSTAT_ERROR_UNSUPPORTED_FILE_FORMAT_VERSION for a version outside
 * [DTA_MIN_VERSION, DTA_MAX_VERSION], READSTAT_ERROR_UNSUPPORTED_CHARSET when
 * iconv_open fails, READSTAT_ERROR_MALLOC on allocation failure, and
 * READSTAT_OK otherwise. ctx->initialized is set only on success.
 */
readstat_error_t dta_ctx_init(dta_ctx_t *ctx, uint32_t nvar, uint64_t nobs,
        unsigned char byteorder, unsigned char ds_format,
        const char *input_encoding, const char *output_encoding) {
    if (ds_format < DTA_MIN_VERSION || ds_format > DTA_MAX_VERSION)
        return READSTAT_ERROR_UNSUPPORTED_FILE_FORMAT_VERSION;

    int host_byteorder = DTA_HILO;
    if (machine_is_little_endian())
        host_byteorder = DTA_LOHI;

    ctx->bswap = (byteorder != host_byteorder);
    ctx->ds_format = ds_format;
    if (byteorder == DTA_LOHI) {
        ctx->endianness = READSTAT_ENDIAN_LITTLE;
    } else {
        ctx->endianness = READSTAT_ENDIAN_BIG;
    }

    ctx->nvar = nvar;
    ctx->nobs = nobs;

    if (ctx->nvar) {
        ctx->variables = readstat_calloc(ctx->nvar, sizeof(readstat_variable_t *));
        if (ctx->variables == NULL)
            return READSTAT_ERROR_MALLOC;
    }

    ctx->machine_is_twos_complement = READSTAT_MACHINE_IS_TWOS_COMPLEMENT;

    /* Width of one format-list entry. */
    if (ds_format >= 118) {
        ctx->fmtlist_entry_len = 57;
    } else if (ds_format >= 114) {
        ctx->fmtlist_entry_len = 49;
    } else if (ds_format >= 105) {
        ctx->fmtlist_entry_len = 12;
    } else {
        ctx->fmtlist_entry_len = 7;
    }

    /* Which type-code table applies. */
    if (ds_format < 111) {
        ctx->typlist_version = 0;
    } else if (ds_format < 117) {
        ctx->typlist_version = 111;
    } else {
        ctx->typlist_version = 117;
    }

    /* Data-label length prefix and strL (v, o) widths; unset below 117. */
    if (ds_format == 117) {
        ctx->data_label_len_len = 1;
        ctx->strl_v_len = 4;
        ctx->strl_o_len = 4;
    } else if (ds_format >= 118) {
        ctx->data_label_len_len = 2;
        ctx->strl_v_len = 2;
        ctx->strl_o_len = 6;
    }

    if (ds_format >= 110) {
        ctx->expansion_len_len = 4;
    } else if (ds_format >= 105) {
        ctx->expansion_len_len = 2;
    } else {
        ctx->expansion_len_len = 0;
    }

    /* Label-list entries, variable names and characteristic metadata all
     * share one width per version. */
    int name_field_len;
    if (ds_format < 110) {
        name_field_len = 9;
    } else if (ds_format < 118) {
        name_field_len = 33;
    } else {
        name_field_len = 129;
    }
    ctx->lbllist_entry_len = name_field_len;
    ctx->variable_name_len = name_field_len;
    ctx->ch_metadata_len = name_field_len;

    /* Variable labels and the data label share one width per version. */
    int label_field_len;
    if (ds_format < 108) {
        label_field_len = 32;
    } else if (ds_format < 118) {
        label_field_len = 81;
    } else {
        label_field_len = 321;
    }
    ctx->variable_labels_entry_len = label_field_len;
    ctx->data_label_len = label_field_len;

    if (ds_format >= 105) {
        ctx->timestamp_len = 18;
        ctx->value_label_table_len_len = 4;
        if (ds_format >= 118) {
            ctx->value_label_table_labname_len = 129;
        } else {
            ctx->value_label_table_labname_len = 33;
        }
        ctx->value_label_table_padding_len = 3;
    } else {
        ctx->timestamp_len = 0;
        ctx->value_label_table_len_len = 2;
        ctx->value_label_table_labname_len = 12;
        ctx->value_label_table_padding_len = 2;
    }

    if (ds_format >= 117) {
        ctx->typlist_entry_len = 2;
        ctx->file_is_xmlish = 1;
    } else {
        ctx->typlist_entry_len = 1;
        ctx->file_is_xmlish = 0;
    }

    if (ds_format >= 113) {
        ctx->max_int8 = DTA_113_MAX_INT8;
        ctx->max_int16 = DTA_113_MAX_INT16;
        ctx->max_int32 = DTA_113_MAX_INT32;
        ctx->max_float = DTA_113_MAX_FLOAT;
        ctx->max_double = DTA_113_MAX_DOUBLE;

        ctx->supports_tagged_missing = 1;
    } else {
        ctx->max_int8 = DTA_OLD_MAX_INT8;
        ctx->max_int16 = DTA_OLD_MAX_INT16;
        ctx->max_int32 = DTA_OLD_MAX_INT32;
        ctx->max_float = DTA_OLD_MAX_FLOAT;
        ctx->max_double = DTA_OLD_MAX_DOUBLE;
    }

    if (output_encoding) {
        /* Pick the source encoding; NULL means no converter is opened. */
        const char *source_encoding = NULL;
        if (input_encoding) {
            source_encoding = input_encoding;
        } else if (ds_format < 118) {
            source_encoding = "WINDOWS-1252";
        } else if (strcmp(output_encoding, "UTF-8") != 0) {
            source_encoding = "UTF-8";
        }

        if (source_encoding != NULL)
            ctx->converter = iconv_open(output_encoding, source_encoding);

        if (ctx->converter == (iconv_t)-1) {
            ctx->converter = NULL;
            return READSTAT_ERROR_UNSUPPORTED_CHARSET;
        }
    }

    ctx->srtlist_len = (ctx->nvar + 1) * sizeof(int16_t);
    ctx->srtlist = readstat_malloc(ctx->srtlist_len);
    if (ctx->srtlist == NULL)
        return READSTAT_ERROR_MALLOC;

    if (ctx->nvar > 0) {
        ctx->typlist_len = ctx->nvar * sizeof(uint16_t);
        ctx->varlist_len = ctx->variable_name_len * ctx->nvar * sizeof(char);
        ctx->fmtlist_len = ctx->fmtlist_entry_len * ctx->nvar * sizeof(char);
        ctx->lbllist_len = ctx->lbllist_entry_len * ctx->nvar * sizeof(char);
        ctx->variable_labels_len = ctx->variable_labels_entry_len * ctx->nvar * sizeof(char);

        ctx->typlist = readstat_malloc(ctx->typlist_len);
        if (ctx->typlist == NULL)
            return READSTAT_ERROR_MALLOC;

        ctx->varlist = readstat_malloc(ctx->varlist_len);
        if (ctx->varlist == NULL)
            return READSTAT_ERROR_MALLOC;

        ctx->fmtlist = readstat_malloc(ctx->fmtlist_len);
        if (ctx->fmtlist == NULL)
            return READSTAT_ERROR_MALLOC;

        ctx->lbllist = readstat_malloc(ctx->lbllist_len);
        if (ctx->lbllist == NULL)
            return READSTAT_ERROR_MALLOC;

        ctx->variable_labels = readstat_malloc(ctx->variable_labels_len);
        if (ctx->variable_labels == NULL)
            return READSTAT_ERROR_MALLOC;
    }

    ctx->initialized = 1;

    return READSTAT_OK;
}
/*
 * Parse the value-label area that starts at value_start and report each
 * (value, label) pair to ctx->value_label_handler.
 *
 * A label-set name beginning with '$' selects string keys; otherwise keys are
 * doubles.
 *
 * Pass 1 walks label_count_capacity entries, advancing by 6 + lbp1[2] bytes
 * per entry. For the first label_count_used entries it reads a 32-bit
 * position at offset 10 + ctx->pad1, rejects positions that are not below
 * label_count_used, and records the entry's offset in value_offset[position].
 * The label text area (lbp2) begins where pass 1 stops.
 *
 * Pass 2 visits the entries in position order. A string key is the last 16
 * bytes of its entry, run through readstat_convert(). A numeric key is the
 * 8 bytes at offset 22, read with a byte swap on little-endian hosts; when
 * its top 16 bits are zero and its low 40 bits are all ones it is reported as
 * missing (tagged with bits 40-47 when those are non-zero, system-missing
 * otherwise), else it is reinterpreted as a double and negated. Each label is
 * preceded by a 16-bit length at lbp2[8] and starts at lbp2[10].
 *
 * Errors set in the visible code: READSTAT_ERROR_PARSE when an entry or label
 * header would extend past value_labels_len or a position is out of range,
 * READSTAT_ERROR_USER_ABORT when the handler does not return
 * READSTAT_HANDLER_OK, or whatever readstat_convert() reports.
 *
 * NOTE(review): this listing stops after the second loop; the `cleanup` label
 * and the return statement are not visible here.
 * NOTE(review): the calloc() result is used without a NULL check, and lbp1[2]
 * is read through a plain `char`, so its signedness is platform-dependent.
 */
static readstat_error_t sas7bcat_parse_value_labels(const char *value_start, size_t value_labels_len, int label_count_used, int label_count_capacity, const char *name, sas7bcat_ctx_t *ctx) { readstat_error_t retval = READSTAT_OK; int i; const char *lbp1 = value_start; uint32_t *value_offset = calloc(label_count_used, sizeof(uint32_t)); /* Doubles appear to be stored as big-endian, always */ int bswap_doubles = machine_is_little_endian(); int is_string = (name[0] == '$'); /* Pass 1 -- find out the offset of the labels */ for (i=0; i<label_count_capacity; i++) { if (&lbp1[2] - value_start > value_labels_len) { retval = READSTAT_ERROR_PARSE; goto cleanup; } if (i<label_count_used) { uint32_t label_pos = sas_read4(&lbp1[10+ctx->pad1], ctx->bswap); if (label_pos >= label_count_used) { retval = READSTAT_ERROR_PARSE; goto cleanup; } value_offset[label_pos] = lbp1 - value_start; } lbp1 += 6 + lbp1[2]; } const char *lbp2 = lbp1; /* Pass 2 -- parse pairs of values & labels */ for (i=0; i<label_count_used && i<label_count_capacity; i++) { lbp1 = value_start + value_offset[i]; if (&lbp1[30] - value_start > value_labels_len || &lbp2[10] - value_start > value_labels_len) { retval = READSTAT_ERROR_PARSE; goto cleanup; } size_t label_len = sas_read2(&lbp2[8], ctx->bswap); size_t value_entry_len = 6 + lbp1[2]; const char *label = &lbp2[10]; readstat_value_t value = { .type = is_string ? 
READSTAT_TYPE_STRING : READSTAT_TYPE_DOUBLE }; if (is_string) { char val[4*16+1]; retval = readstat_convert(val, sizeof(val), &lbp1[value_entry_len-16], 16, ctx->converter); if (retval != READSTAT_OK) goto cleanup; value.v.string_value = val; } else { uint64_t val = sas_read8(&lbp1[22], bswap_doubles); double dval = NAN; if ((val | 0xFF0000000000) == 0xFFFFFFFFFFFF) { value.tag = (val >> 40); if (value.tag) { value.is_tagged_missing = 1; } else { value.is_system_missing = 1; } } else { memcpy(&dval, &val, 8); dval *= -1.0; } value.v.double_value = dval; } if (ctx->value_label_handler) { if (ctx->value_label_handler(name, value, label, ctx->user_ctx) != READSTAT_HANDLER_OK) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } } lbp2 += 8 + 2 + label_len + 1; }
/*
 * Parse the SAS catalog file at `path` through parser->io, reporting file
 * metadata and value labels to the parser's handlers.
 *
 * Steps:
 *   1. Open the file and read the header (sas_read_header). Files flagged
 *      u64 are rejected with READSTAT_ERROR_PARSE.
 *   2. Open an iconv converter when input and output encodings are both known
 *      and differ; the input encoding defaults to the one in the header.
 *   3. If a metadata handler is set, pass it the converted file label, the
 *      modification time and 10000 * major + minor as the version.
 *   4. Read the first index page, then scan the remaining pages for "XLSR"
 *      entries to extend the block index; sort and de-duplicate the index.
 *   5. For every block pointer, size, read and parse the block.
 *
 * Returns READSTAT_OK or the first error encountered. All buffers, the
 * context and the header info are released before returning.
 */
readstat_error_t readstat_parse_sas7bcat(readstat_parser_t *parser, const char *path, void *user_ctx) {
    readstat_error_t retval = READSTAT_OK;
    readstat_io_t *io = parser->io;
    int64_t i;
    char *page = NULL;
    char *buffer = NULL;

    sas7bcat_ctx_t *ctx = calloc(1, sizeof(sas7bcat_ctx_t));
    sas_header_info_t *hinfo = calloc(1, sizeof(sas_header_info_t));
    if (ctx == NULL || hinfo == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup_unopened;
    }

    ctx->block_pointers_capacity = 200;
    ctx->block_pointers = malloc(ctx->block_pointers_capacity * sizeof(uint64_t));
    if (ctx->block_pointers == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup_unopened;
    }

    ctx->value_label_handler = parser->value_label_handler;
    ctx->metadata_handler = parser->metadata_handler;
    ctx->input_encoding = parser->input_encoding;
    ctx->output_encoding = parser->output_encoding;
    ctx->user_ctx = user_ctx;
    ctx->io = io;

    if (io->open(path, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_OPEN;
        goto cleanup;
    }

    if ((retval = sas_read_header(io, hinfo, parser->error_handler, user_ctx)) != READSTAT_OK) {
        goto cleanup;
    }

    ctx->u64 = hinfo->u64;
    ctx->pad1 = hinfo->pad1;
    /* Swap is needed exactly when host and file byte orders differ. */
    ctx->bswap = machine_is_little_endian() ^ hinfo->little_endian;
    ctx->header_size = hinfo->header_size;
    ctx->page_count = hinfo->page_count;
    ctx->page_size = hinfo->page_size;
    if (ctx->input_encoding == NULL) {
        ctx->input_encoding = hinfo->encoding;
    }

    if (ctx->u64) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    if (ctx->input_encoding && ctx->output_encoding &&
            strcmp(ctx->input_encoding, ctx->output_encoding) != 0) {
        iconv_t converter = iconv_open(ctx->output_encoding, ctx->input_encoding);
        if (converter == (iconv_t)-1) {
            retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
            goto cleanup;
        }
        ctx->converter = converter;
    }

    if (parser->metadata_handler) {
        char file_label[4*64+1];
        retval = readstat_convert(file_label, sizeof(file_label),
                hinfo->file_label, sizeof(hinfo->file_label), ctx->converter);
        if (retval != READSTAT_OK)
            goto cleanup;

        if (ctx->metadata_handler(file_label, hinfo->modification_time,
                    10000 * hinfo->major_version + hinfo->minor_version,
                    ctx->user_ctx) != READSTAT_HANDLER_OK) {
            retval = READSTAT_ERROR_USER_ABORT;
            goto cleanup;
        }
    }

    if ((page = malloc(ctx->page_size)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }
    if (io->seek(ctx->header_size+SAS_CATALOG_FIRST_INDEX_PAGE*ctx->page_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }
    /* `!=` also catches a negative error return, which `<` can miss when the
     * two operands end up compared as unsigned values. */
    if (io->read(page, ctx->page_size, io->io_ctx) != ctx->page_size) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    sas7bcat_augment_index(&page[856+2*ctx->pad1], ctx->page_size - 856 - 2*ctx->pad1, ctx);

    // Pass 1 -- find the XLSR entries
    for (i=SAS_CATALOG_USELESS_PAGES; i<ctx->page_count; i++) {
        if (io->seek(ctx->header_size+i*ctx->page_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
            retval = READSTAT_ERROR_SEEK;
            goto cleanup;
        }
        if (io->read(page, ctx->page_size, io->io_ctx) != ctx->page_size) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        if (memcmp(&page[16], "XLSR", sizeof("XLSR")-1) == 0) {
            sas7bcat_augment_index(&page[16], ctx->page_size - 16, ctx);
        }
    }

    sas7bcat_sort_index(ctx);
    sas7bcat_uniq_index(ctx);

    // Pass 2 -- look up the individual block pointers
    for (i=0; i<ctx->block_pointers_used; i++) {
        int start_page = ctx->block_pointers[i] >> 32;
        int start_page_pos = (ctx->block_pointers[i]) & 0xFFFF;

        int buffer_len = sas7bcat_block_size(start_page, start_page_pos, ctx, &retval);
        if (buffer_len == -1) {
            goto cleanup;
        } else if (buffer_len == 0) {
            continue;
        }

        /* Keep the old buffer reachable so it is still freed if realloc fails. */
        char *new_buffer = realloc(buffer, buffer_len);
        if (new_buffer == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            goto cleanup;
        }
        buffer = new_buffer;

        if ((retval = sas7bcat_read_block(buffer, buffer_len, start_page, start_page_pos, ctx)) != READSTAT_OK)
            goto cleanup;
        if ((retval = sas7bcat_parse_block(buffer, buffer_len, ctx)) != READSTAT_OK)
            goto cleanup;
    }

cleanup:
    io->close(io->io_ctx);

cleanup_unopened:
    /* Reached directly when setup failed before io->open was attempted. */
    free(page);
    free(buffer);
    if (ctx)
        sas7bcat_ctx_free(ctx);
    free(hinfo);

    return retval;
}
/*
 * Build the in-memory catalog block for one label set.
 *
 * Layout produced (offsets into block->data):
 *   - bytes 0..105: fixed header; the label count is stored at 38 and 42, and
 *     the name (space padded, at most 8 bytes) at 8;
 *   - when the name is longer than 8 bytes: byte 2 is set to 0x80 and a
 *     32-byte space-padded long name follows at 106;
 *   - one 30-byte value entry per label (byte 2 = 24, 32-bit index at 10,
 *     then either a 16-byte space-padded string key at 14 or an 8-byte
 *     negated, big-endian double at 22);
 *   - one label record per label: 16-bit length at 8, text at 10, one
 *     trailing byte.
 *
 * Names longer than 32 bytes and string keys longer than 16 bytes are
 * truncated. Multi-byte integers other than the double key are written in
 * host byte order.
 *
 * Returns a calloc'ed block (block->len is the data length) or NULL if the
 * allocation fails.
 */
static sas7bcat_block_t *sas7bcat_block_for_label_set(readstat_label_set_t *r_label_set) {
    size_t len = 0;
    size_t name_len = strlen(r_label_set->name);
    int j;
    char name[32];

    len += 106;

    if (name_len > 8) {
        len += 32; // long name

        if (name_len > 32) {
            name_len = 32;
        }
    }

    memcpy(&name[0], r_label_set->name, name_len);

    /* First pass: total size of value entries plus label records. */
    for (j=0; j<r_label_set->value_labels_count; j++) {
        readstat_value_label_t *value_label = readstat_get_value_label(r_label_set, j);
        len += 30; // Value: 14-byte header + 16-byte padded value
        len += 8 + 2 + value_label->label_len + 1;
    }

    sas7bcat_block_t *block = calloc(1, sizeof(sas7bcat_block_t) + len);
    if (block == NULL)
        return NULL;
    block->len = len;

    off_t begin = 106;
    int32_t count = r_label_set->value_labels_count;
    memcpy(&block->data[38], &count, sizeof(int32_t));
    memcpy(&block->data[42], &count, sizeof(int32_t));
    if (name_len > 8) {
        block->data[2] = (char)0x80;
        memcpy(&block->data[8], name, 8);

        memset(&block->data[106], ' ', 32);
        memcpy(&block->data[106], name, name_len);

        begin += 32;
    } else {
        memset(&block->data[8], ' ', 8);
        memcpy(&block->data[8], name, name_len);
    }

    /* Value entries come first; label records start right after them. */
    char *lbp1 = &block->data[begin];
    char *lbp2 = &block->data[begin+r_label_set->value_labels_count*30];

    for (j=0; j<r_label_set->value_labels_count; j++) {
        readstat_value_label_t *value_label = readstat_get_value_label(r_label_set, j);
        lbp1[2] = 24; // size - 6
        int32_t index = j;
        memcpy(&lbp1[10], &index, sizeof(int32_t));
        if (r_label_set->type == READSTAT_TYPE_STRING) {
            size_t string_len = value_label->string_key_len;
            if (string_len > 16)
                string_len = 16;
            memset(&lbp1[14], ' ', 16);
            memcpy(&lbp1[14], value_label->string_key, string_len);
        } else {
            uint64_t big_endian_value;
            double double_value = -1.0 * value_label->double_key;
            memcpy(&big_endian_value, &double_value, sizeof(double));
            if (machine_is_little_endian()) {
                big_endian_value = byteswap8(big_endian_value);
            }
            memcpy(&lbp1[22], &big_endian_value, sizeof(uint64_t));
        }

        /* NOTE(review): label_len is narrowed to int16_t here while the
         * buffer was sized from the un-narrowed value; labels longer than
         * INT16_MAX bytes would need separate handling -- confirm limits
         * with the callers. */
        int16_t label_len = value_label->label_len;
        memcpy(&lbp2[8], &label_len, sizeof(int16_t));
        memcpy(&lbp2[10], value_label->label, label_len);

        lbp1 += 30;
        lbp2 += 8 + 2 + value_label->label_len + 1;
    }

    return block;
}
/*
 * This line holds three pieces:
 *
 *   1. The closing statements of a value-label writing loop and the
 *      `return block;` of its function. The start of that function is not on
 *      this line.
 *
 *   2. sas7bcat_emit_header(writer, hinfo): fills a sas_header_start_t with
 *      alignment byte a2 chosen from hinfo->u64, a1 fixed to
 *      SAS_ALIGNMENT_OFFSET_0, the host's endianness, file format
 *      SAS_FILE_FORMAT_UNIX, encoding code 20, file type "SAS FILE" and file
 *      info "CATALOG ". It then copies in sas7bcat_magic_number and
 *      writer->file_label and returns the result of sas_write_header().
 *      The label is copied with strncpy() bounded by the field size, so the
 *      field is not NUL-terminated when the label fills it.
 *
 *   3. The first two statements of sas7bcat_begin_data(); the rest of its
 *      body is not on this line.
 */
int16_t label_len = value_label->label_len; memcpy(&lbp2[8], &label_len, sizeof(int16_t)); memcpy(&lbp2[10], value_label->label, label_len); lbp1 += 30; lbp2 += 8 + 2 + value_label->label_len + 1; } return block; } static readstat_error_t sas7bcat_emit_header(readstat_writer_t *writer, sas_header_info_t *hinfo) { sas_header_start_t header_start = { .a2 = hinfo->u64 ? SAS_ALIGNMENT_OFFSET_4 : SAS_ALIGNMENT_OFFSET_0, .a1 = SAS_ALIGNMENT_OFFSET_0, .endian = machine_is_little_endian() ? SAS_ENDIAN_LITTLE : SAS_ENDIAN_BIG, .file_format = SAS_FILE_FORMAT_UNIX, .encoding = 20, /* UTF-8 */ .file_type = "SAS FILE", .file_info = "CATALOG " }; memcpy(&header_start.magic, sas7bcat_magic_number, sizeof(header_start.magic)); strncpy(header_start.file_label, writer->file_label, sizeof(header_start.file_label)); return sas_write_header(writer, hinfo, header_start); } static readstat_error_t sas7bcat_begin_data(void *writer_ctx) { readstat_writer_t *writer = (readstat_writer_t *)writer_ctx; readstat_error_t retval = READSTAT_OK;
/*
 * Read and validate the header of a SAS data or catalog file from `fd`,
 * filling in *ctx.
 *
 * Populates pad1 (4 when the a1 alignment byte asks for it), u64,
 * little_endian, encoding (looked up in _charset_table), header_size,
 * page_size, page_count and vendor, and leaves the descriptor positioned at
 * ctx->header_size.
 *
 * ctx->pad1, ctx->u64 and ctx->encoding are only ever set here, never
 * cleared, so the caller is expected to pass a zeroed structure.
 *
 * Returns READSTAT_ERROR_READ on a failed or short read,
 * READSTAT_ERROR_PARSE on a bad magic number, unknown endianness byte or a
 * header smaller than 1024 bytes, READSTAT_ERROR_UNSUPPORTED_CHARSET for an
 * unknown encoding code, READSTAT_ERROR_SEEK when repositioning fails, and
 * READSTAT_OK otherwise. error_handler, when non-NULL, receives a message for
 * charset and seek failures.
 */
readstat_error_t sas_read_header(int fd, sas_header_info_t *ctx, readstat_error_handler error_handler, void *user_ctx) {
    sas_header_start_t header_start;
    sas_header_end_t header_end;
    readstat_error_t retval = READSTAT_OK;
    char error_buf[1024];

    /*
     * read() returns a signed count (-1 on error). Comparing it with an
     * unsigned sizeof using `<` converts -1 to a huge value and lets errors
     * through, so every read below is checked for an exact signed match.
     */
    if (read(fd, &header_start, sizeof(sas_header_start_t)) != (ssize_t)sizeof(sas_header_start_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (memcmp(header_start.magic, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0 &&
            memcmp(header_start.magic, sas7bcat_magic_number, sizeof(sas7bcat_magic_number)) != 0) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }
    if (header_start.a1 == SAS_ALIGNMENT_OFFSET_4) {
        ctx->pad1 = 4;
    }
    if (header_start.a2 == SAS_ALIGNMENT_OFFSET_4) {
        ctx->u64 = 1;
    }

    /* bswap is set when the file's byte order differs from the host's. */
    int bswap = 0;
    if (header_start.endian == SAS_ENDIAN_BIG) {
        bswap = machine_is_little_endian();
        ctx->little_endian = 0;
    } else if (header_start.endian == SAS_ENDIAN_LITTLE) {
        bswap = !machine_is_little_endian();
        ctx->little_endian = 1;
    } else {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    size_t i;
    for (i=0; i<sizeof(_charset_table)/sizeof(_charset_table[0]); i++) {
        if (header_start.encoding == _charset_table[i].code) {
            ctx->encoding = _charset_table[i].name;
            break;
        }
    }
    if (ctx->encoding == NULL) {
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "Unsupported character set code: %d\n", header_start.encoding);
            error_handler(error_buf, user_ctx);
        }
        retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
        goto cleanup;
    }

    if (readstat_lseek(fd, 196 + ctx->pad1, SEEK_SET) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek to position %d\n", 196 + ctx->pad1);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }

    uint32_t header_size, page_size;

    if (read(fd, &header_size, sizeof(uint32_t)) != (ssize_t)sizeof(uint32_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (read(fd, &page_size, sizeof(uint32_t)) != (ssize_t)sizeof(uint32_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    ctx->header_size = bswap ? byteswap4(header_size) : header_size;
    ctx->page_size = bswap ? byteswap4(page_size) : page_size;

    if (ctx->header_size < 1024) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    /* The page count is 64 bits wide in u64 files and 32 bits otherwise. */
    if (ctx->u64) {
        uint64_t page_count;
        if (read(fd, &page_count, sizeof(uint64_t)) != (ssize_t)sizeof(uint64_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        ctx->page_count = bswap ? byteswap8(page_count) : page_count;
    } else {
        uint32_t page_count;
        if (read(fd, &page_count, sizeof(uint32_t)) != (ssize_t)sizeof(uint32_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        ctx->page_count = bswap ? byteswap4(page_count) : page_count;
    }

    if (readstat_lseek(fd, 8, SEEK_CUR) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek forward by %d\n", 8);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }
    if (read(fd, &header_end, sizeof(sas_header_end_t)) != (ssize_t)sizeof(sas_header_end_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (strncmp(header_end.release, "9.0000M0", sizeof(header_end.release)) == 0) {
        /* A bit of a hack, but most SAS installations are running a minor update */
        ctx->vendor = READSTAT_VENDOR_STAT_TRANSFER;
    } else {
        ctx->vendor = READSTAT_VENDOR_SAS;
    }
    if (readstat_lseek(fd, ctx->header_size, SEEK_SET) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            /* Cast so the argument always matches %lld, whatever integer
             * type header_size is declared with. */
            snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek to position %lld\n",
                    (long long)ctx->header_size);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }

cleanup:
    return retval;
}
/*
 * Write one NAMESTR record per variable, then (version 8 only) the
 * long-label / long-format records.
 *
 * For each variable the record carries its index, width, running byte
 * offset, type, padded name and label, and -- when the variable has a format
 * string -- the format name, width and decimals for both the output and input
 * format fields. In version 8 the long name and the label length are stored
 * as well.
 *
 * A variable needs a long record when its format name exceeds 8 characters
 * or (version 8) its label exceeds 40 characters. If any exist in a
 * version 8 file, a "LABELV8" header is written ("LABELV9" when any format
 * name is long), followed by one definition per such variable. The 16-bit
 * definition fields are byte-swapped on little-endian hosts.
 *
 * Returns READSTAT_ERROR_BAD_FORMAT_STRING when a non-empty format string
 * contains no format name, or the first error reported by a write helper.
 */
static readstat_error_t xport_write_variables(readstat_writer_t *writer) {
    readstat_error_t retval = READSTAT_OK;
    int i;
    long offset = 0;
    int num_long_labels = 0;
    int any_has_long_format = 0;

    for (i=0; i<writer->variables_count; i++) {
        int needs_long_record = 0;
        readstat_variable_t *variable = readstat_get_variable(writer, i);
        size_t width = xport_variable_width(variable->type, variable->user_width);
        xport_namestr_t namestr = {
            .nvar0 = i,
            .nlng = width,
            .npos = offset
        };

        if (readstat_variable_get_type_class(variable) == READSTAT_TYPE_CLASS_STRING) {
            namestr.ntype = SAS_COLUMN_TYPE_CHR;
        } else {
            namestr.ntype = SAS_COLUMN_TYPE_NUM;
        }

        copypad(namestr.nname, sizeof(namestr.nname), variable->name);
        copypad(namestr.nlabel, sizeof(namestr.nlabel), variable->label);

        if (variable->format[0]) {
            int decimals = 0;
            int format_width = 0;
            char name[24];
            memset(name, 0, sizeof(name));

            /* %23s bounds the copy to the buffer; a plain %s would overflow
             * it on a long format name. */
            int matches = sscanf(variable->format, "%23s%d.%d",
                    name, &format_width, &decimals);
            if (matches < 1) {
                retval = READSTAT_ERROR_BAD_FORMAT_STRING;
                goto cleanup;
            }

            copypad(namestr.nform, sizeof(namestr.nform), name);
            namestr.nfl = format_width;
            namestr.nfd = decimals;

            copypad(namestr.niform, sizeof(namestr.niform), name);
            namestr.nifl = format_width;
            namestr.nifd = decimals;

            if (strlen(name) > 8) {
                any_has_long_format = 1;
                needs_long_record = 1;
            }
        }

        namestr.nfj = (variable->alignment == READSTAT_ALIGNMENT_RIGHT);

        if (writer->version == 8) {
            copypad(namestr.longname, sizeof(namestr.longname), variable->name);

            size_t label_len = strlen(variable->label);
            if (label_len > 40) {
                needs_long_record = 1;
            }
            namestr.labeln = label_len;
        }

        if (needs_long_record) {
            num_long_labels++;
        }

        offset += width;

        xport_namestr_bswap(&namestr);

        retval = xport_write_bytes(writer, &namestr, sizeof(xport_namestr_t));
        if (retval != READSTAT_OK)
            goto cleanup;
    }

    retval = xport_finish_record(writer);
    if (retval != READSTAT_OK)
        goto cleanup;

    if (writer->version == 8 && num_long_labels) {
        xport_header_record_t header = {
            .name = "LABELV8",
            .num1 = num_long_labels
        };
        if (any_has_long_format) {
            strcpy(header.name, "LABELV9");
        }
        retval = xport_write_header_record_v8(writer, &header);
        if (retval != READSTAT_OK)
            goto cleanup;

        for (i=0; i<writer->variables_count; i++) {
            readstat_variable_t *variable = readstat_get_variable(writer, i);
            size_t label_len = strlen(variable->label);
            size_t name_len = strlen(variable->name);
            int has_long_label = 0;
            int has_long_format = 0;
            int format_len = 0;
            char format_name[24];

            memset(format_name, 0, sizeof(format_name));

            has_long_label = (label_len > 40);

            if (variable->format[0]) {
                int decimals = 2;
                int width = 8;
                /* Same bounded scan as in the NAMESTR loop above. */
                int matches = sscanf(variable->format, "%23s%d.%d",
                        format_name, &width, &decimals);
                if (matches < 1) {
                    retval = READSTAT_ERROR_BAD_FORMAT_STRING;
                    goto cleanup;
                }
                format_len = strlen(format_name);
                if (format_len > 8) {
                    has_long_format = 1;
                }
            }

            if (has_long_format) {
                /* index, name length, format length (written twice, once
                 * per format string below), label length */
                uint16_t labeldef[5] = { i, name_len, format_len, format_len, label_len };

                if (machine_is_little_endian()) {
                    labeldef[0] = byteswap2(labeldef[0]);
                    labeldef[1] = byteswap2(labeldef[1]);
                    labeldef[2] = byteswap2(labeldef[2]);
                    labeldef[3] = byteswap2(labeldef[3]);
                    labeldef[4] = byteswap2(labeldef[4]);
                }

                retval = readstat_write_bytes(writer, labeldef, sizeof(labeldef));
                if (retval != READSTAT_OK)
                    goto cleanup;

                retval = readstat_write_string(writer, variable->name);
                if (retval != READSTAT_OK)
                    goto cleanup;

                retval = readstat_write_string(writer, format_name);
                if (retval != READSTAT_OK)
                    goto cleanup;

                retval = readstat_write_string(writer, format_name);
                if (retval != READSTAT_OK)
                    goto cleanup;

                retval = readstat_write_string(writer, variable->label);
                if (retval != READSTAT_OK)
                    goto cleanup;

            } else if (has_long_label) {
                uint16_t labeldef[3] = { i, name_len, label_len };

                if (machine_is_little_endian()) {
                    labeldef[0] = byteswap2(labeldef[0]);
                    labeldef[1] = byteswap2(labeldef[1]);
                    labeldef[2] = byteswap2(labeldef[2]);
                }

                retval = readstat_write_bytes(writer, labeldef, sizeof(labeldef));
                if (retval != READSTAT_OK)
                    goto cleanup;

                retval = readstat_write_string(writer, variable->name);
                if (retval != READSTAT_OK)
                    goto cleanup;

                retval = readstat_write_string(writer, variable->label);
                if (retval != READSTAT_OK)
                    goto cleanup;
            }
        }

        retval = xport_finish_record(writer);
        if (retval != READSTAT_OK)
            goto cleanup;
    }

cleanup:
    return retval;
}
/*
 * Parse the SAS catalog file `filename`, reporting value labels to
 * parser->value_label_handler.
 *
 * Steps:
 *   1. Open the file and read the header (sas_read_header). Files flagged
 *      u64 are rejected with READSTAT_ERROR_PARSE.
 *   2. Unless the header encoding is "UTF-8" or "US-ASCII", open an iconv
 *      converter from that encoding to "UTF-8".
 *   3. Read the first index page, then scan the remaining pages for "XLSR"
 *      entries to extend the block index.
 *   4. For every block pointer, size, read and parse the block.
 *
 * Returns READSTAT_OK or the first error encountered. The page and block
 * buffers, the context and the header info are released before returning.
 */
readstat_error_t readstat_parse_sas7bcat(readstat_parser_t *parser, const char *filename, void *user_ctx) {
    readstat_error_t retval = READSTAT_OK;
    int64_t i;
    char *page = NULL;
    char *buffer = NULL;
    sas_header_info_t *hinfo = NULL;

    sas_catalog_ctx_t *ctx = calloc(1, sizeof(sas_catalog_ctx_t));
    if (ctx == NULL)
        return READSTAT_ERROR_MALLOC;
    /* Same "not open" value the context carries after a failed
     * readstat_open(), so early failures reach cleanup in a known state. */
    ctx->fd = -1;

    hinfo = calloc(1, sizeof(sas_header_info_t));
    if (hinfo == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    ctx->block_pointers_capacity = 200;
    ctx->block_pointers = malloc(ctx->block_pointers_capacity * sizeof(uint64_t));
    if (ctx->block_pointers == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    ctx->value_label_handler = parser->value_label_handler;
    ctx->user_ctx = user_ctx;

    if ((ctx->fd = readstat_open(filename)) == -1) {
        retval = READSTAT_ERROR_OPEN;
        goto cleanup;
    }

    if ((retval = sas_read_header(ctx->fd, hinfo, parser->error_handler, user_ctx)) != READSTAT_OK) {
        goto cleanup;
    }

    ctx->u64 = hinfo->u64;
    ctx->pad1 = hinfo->pad1;
    /* Swap is needed exactly when host and file byte orders differ. */
    ctx->bswap = machine_is_little_endian() ^ hinfo->little_endian;
    ctx->header_size = hinfo->header_size;
    ctx->page_count = hinfo->page_count;
    ctx->page_size = hinfo->page_size;

    if (ctx->u64) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    if (strcmp(hinfo->encoding, "UTF-8") != 0 &&
            strcmp(hinfo->encoding, "US-ASCII") != 0) {
        iconv_t converter = iconv_open("UTF-8", hinfo->encoding);
        if (converter == (iconv_t)-1) {
            retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
            goto cleanup;
        }
        ctx->converter = converter;
    }

    if ((page = malloc(ctx->page_size)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }
    if (readstat_lseek(ctx->fd, ctx->header_size+SAS_CATALOG_FIRST_INDEX_PAGE*ctx->page_size, SEEK_SET) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }
    /* `!=` also catches read()'s -1 error return, which `<` can miss when
     * the two operands end up compared as unsigned values. */
    if (read(ctx->fd, page, ctx->page_size) != ctx->page_size) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    sas_catalog_augment_index(&page[856+2*ctx->pad1], ctx->page_size - 856 - 2*ctx->pad1, ctx);

    // Pass 1 -- find the XLSR entries
    for (i=SAS_CATALOG_USELESS_PAGES; i<ctx->page_count; i++) {
        if (readstat_lseek(ctx->fd, ctx->header_size+i*ctx->page_size, SEEK_SET) == -1) {
            retval = READSTAT_ERROR_SEEK;
            goto cleanup;
        }
        if (read(ctx->fd, page, ctx->page_size) != ctx->page_size) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        if (memcmp(&page[16], "XLSR", sizeof("XLSR")-1) == 0) {
            sas_catalog_augment_index(&page[16], ctx->page_size - 16, ctx);
        }
    }

    // Pass 2 -- look up the individual block pointers
    for (i=0; i<ctx->block_pointers_used; i++) {
        int start_page = ctx->block_pointers[i] >> 32;
        int start_page_pos = (ctx->block_pointers[i]) & 0xFFFF;

        int buffer_len = sas_catalog_block_size(start_page, start_page_pos, ctx, &retval);
        if (buffer_len == -1) {
            goto cleanup;
        } else if (buffer_len == 0) {
            continue;
        }

        /* Keep the old buffer reachable so it is still freed if realloc fails. */
        char *new_buffer = realloc(buffer, buffer_len);
        if (new_buffer == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            goto cleanup;
        }
        buffer = new_buffer;

        if ((retval = sas_catalog_read_block(buffer, buffer_len, start_page, start_page_pos, ctx)) != READSTAT_OK)
            goto cleanup;
        if ((retval = sas_catalog_parse_block(buffer, buffer_len, ctx)) != READSTAT_OK)
            goto cleanup;
    }

cleanup:
    free(page);
    free(buffer);
    if (ctx)
        sas_catalog_ctx_free(ctx);
    free(hinfo);

    return retval;
}
/*
 * Read and validate the header of a SAS data or catalog file through `io`,
 * filling in *hinfo.
 *
 * Populates pad1 (4 when the a1 alignment byte asks for it), u64,
 * little_endian, encoding (looked up in _charset_table), file_label,
 * creation_time and modification_time (the stored doubles plus the value
 * mktime() returns for 1 January 1960), header_size, page_size, page_count,
 * the release numbers parsed from header_end.release, and vendor. The stream
 * is left positioned at hinfo->header_size.
 *
 * hinfo->pad1, hinfo->u64 and hinfo->encoding are only ever set here, never
 * cleared, so the caller is expected to pass a zeroed structure.
 *
 * Returns READSTAT_ERROR_READ on a failed or short read,
 * READSTAT_ERROR_PARSE on a bad magic number, unknown endianness byte or a
 * header smaller than 1024 bytes, READSTAT_ERROR_UNSUPPORTED_CHARSET for an
 * unknown encoding code, READSTAT_ERROR_SEEK when repositioning fails, and
 * READSTAT_OK otherwise.
 */
readstat_error_t sas_read_header(readstat_io_t *io, sas_header_info_t *hinfo, readstat_error_handler error_handler, void *user_ctx) {
    sas_header_start_t header_start;
    sas_header_end_t header_end;
    int retval = READSTAT_OK;
    char error_buf[1024];
    struct tm epoch_tm = { .tm_year = 60, .tm_mday = 1 };
    time_t epoch = mktime(&epoch_tm);

    /*
     * Reads are checked with `!=` rather than `<`: if the handler reports an
     * error as a negative count, a `<` comparison against an unsigned size
     * would convert it to a huge value and miss it.
     */
    if (io->read(&header_start, sizeof(sas_header_start_t), io->io_ctx) != sizeof(sas_header_start_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (memcmp(header_start.magic, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0 &&
            memcmp(header_start.magic, sas7bcat_magic_number, sizeof(sas7bcat_magic_number)) != 0) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }
    if (header_start.a1 == SAS_ALIGNMENT_OFFSET_4) {
        hinfo->pad1 = 4;
    }
    if (header_start.a2 == SAS_ALIGNMENT_OFFSET_4) {
        hinfo->u64 = 1;
    }

    /* bswap is set when the file's byte order differs from the host's. */
    int bswap = 0;
    if (header_start.endian == SAS_ENDIAN_BIG) {
        bswap = machine_is_little_endian();
        hinfo->little_endian = 0;
    } else if (header_start.endian == SAS_ENDIAN_LITTLE) {
        bswap = !machine_is_little_endian();
        hinfo->little_endian = 1;
    } else {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    size_t i;
    for (i=0; i<sizeof(_charset_table)/sizeof(_charset_table[0]); i++) {
        if (header_start.encoding == _charset_table[i].code) {
            hinfo->encoding = _charset_table[i].name;
            break;
        }
    }
    if (hinfo->encoding == NULL) {
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "Unsupported character set code: %d\n", header_start.encoding);
            error_handler(error_buf, user_ctx);
        }
        retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
        goto cleanup;
    }

    memcpy(hinfo->file_label, header_start.file_label, sizeof(header_start.file_label));

    if (io->seek(hinfo->pad1, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    double creation_time, modification_time;

    if (io->read(&creation_time, sizeof(double), io->io_ctx) != sizeof(double)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (io->read(&modification_time, sizeof(double), io->io_ctx) != sizeof(double)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    if (bswap) {
        creation_time = byteswap_double(creation_time);
        modification_time = byteswap_double(modification_time);
    }
    hinfo->creation_time = creation_time + epoch;
    /* Derived from the modification stamp itself, not from creation_time. */
    hinfo->modification_time = modification_time + epoch;

    if (io->seek(16, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    uint32_t header_size, page_size;

    if (io->read(&header_size, sizeof(uint32_t), io->io_ctx) != sizeof(uint32_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    if (io->read(&page_size, sizeof(uint32_t), io->io_ctx) != sizeof(uint32_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    hinfo->header_size = bswap ? byteswap4(header_size) : header_size;
    hinfo->page_size = bswap ? byteswap4(page_size) : page_size;

    if (hinfo->header_size < 1024) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    /* The page count is 64 bits wide in u64 files and 32 bits otherwise. */
    if (hinfo->u64) {
        uint64_t page_count;
        if (io->read(&page_count, sizeof(uint64_t), io->io_ctx) != sizeof(uint64_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        hinfo->page_count = bswap ? byteswap8(page_count) : page_count;
    } else {
        uint32_t page_count;
        if (io->read(&page_count, sizeof(uint32_t), io->io_ctx) != sizeof(uint32_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        hinfo->page_count = bswap ? byteswap4(page_count) : page_count;
    }

    if (io->seek(8, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek forward by %d\n", 8);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }
    if (io->read(&header_end, sizeof(sas_header_end_t), io->io_ctx) != sizeof(sas_header_end_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    /*
     * The release numbers are only meaningful when all three fields parse.
     * Track that explicitly so the vendor test below never reads
     * uninitialised values.
     */
    int major = 0, minor = 0, revision = 0;
    int release_parsed =
        (sscanf(header_end.release, "%1d.%04dM%1d", &major, &minor, &revision) == 3);
    if (release_parsed) {
        hinfo->major_version = major;
        hinfo->minor_version = minor;
        hinfo->revision = revision;
    }
    if (release_parsed && major == 9 && minor == 0 && revision == 0) {
        /* A bit of a hack, but most SAS installations are running a minor update */
        hinfo->vendor = READSTAT_VENDOR_STAT_TRANSFER;
    } else {
        hinfo->vendor = READSTAT_VENDOR_SAS;
    }

    if (io->seek(hinfo->header_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (error_handler) {
            snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek to position %" PRId64 "\n", hinfo->header_size);
            error_handler(error_buf, user_ctx);
        }
        goto cleanup;
    }

cleanup:
    return retval;
}