/*
 * Read the <map> section of a tagged (XML-ish) file and remember the
 * offsets it carries.
 *
 * The section body is read as 14 consecutive 64-bit values. Entries 9, 10
 * and 11 are stored in ctx->data_offset, ctx->strls_offset and
 * ctx->value_labels_offset respectively, byte-swapped first when
 * ctx->machine_needs_byte_swap is set. The remaining entries are ignored.
 *
 * Returns 0 immediately (without touching the file) when the file is not
 * XML-ish; otherwise READSTAT_OK, READSTAT_ERROR_READ on a short read, or
 * whatever dta_read_tag() reports for the opening/closing tag.
 */
static readstat_error_t dta_read_map(int fd, dta_ctx_t *ctx) {
    uint64_t offsets[14];
    readstat_error_t status;

    if (!ctx->file_is_xmlish) {
        return 0;
    }

    status = dta_read_tag(fd, ctx, "<map>");
    if (status != READSTAT_OK) {
        return status;
    }

    if (read(fd, offsets, sizeof(offsets)) != sizeof(offsets)) {
        return READSTAT_ERROR_READ;
    }

    if (ctx->machine_needs_byte_swap) {
        ctx->data_offset = byteswap8(offsets[9]);
        ctx->strls_offset = byteswap8(offsets[10]);
        ctx->value_labels_offset = byteswap8(offsets[11]);
    } else {
        ctx->data_offset = offsets[9];
        ctx->strls_offset = offsets[10];
        ctx->value_labels_offset = offsets[11];
    }

    /* The closing tag is the last step, so its status is the final result. */
    return dta_read_tag(fd, ctx, "</map>");
}
/*
 * Read one tag-delimited chunk: start_tag, then exactly dst_len bytes into
 * dst, then end_tag.
 *
 * Tags are consumed with dta_read_tag(); the payload is read through the
 * ctx->io read callback.
 *
 * Returns READSTAT_OK on success, READSTAT_ERROR_READ when the payload read
 * does not deliver dst_len bytes, or the status reported by dta_read_tag()
 * for either tag. Processing stops at the first failure.
 */
static readstat_error_t dta_read_chunk(
        dta_ctx_t *ctx,
        const char *start_tag,
        void *dst, size_t dst_len,
        const char *end_tag) {
    readstat_io_t *io = ctx->io;
    char *payload = (char *)dst;
    readstat_error_t status = dta_read_tag(ctx, start_tag);

    if (status == READSTAT_OK) {
        if (io->read(payload, dst_len, io->io_ctx) == dst_len) {
            status = dta_read_tag(ctx, end_tag);
        } else {
            status = READSTAT_ERROR_READ;
        }
    }

    return status;
}
readstat_error_t readstat_parse_dta(readstat_parser_t *parser, const char *filename, void *user_ctx) { readstat_error_t retval = READSTAT_OK; int i; size_t record_len = 0; int fd = -1; char *buf = NULL; dta_header_t header; dta_ctx_t *ctx = NULL; char str_buf[2048]; char *long_string = NULL; size_t file_size = 0; if ((fd = readstat_open(filename)) == -1) { retval = READSTAT_ERROR_OPEN; goto cleanup; } char magic[4]; if (read(fd, magic, 4) != 4) { retval = READSTAT_ERROR_READ; goto cleanup; } file_size = readstat_lseek(fd, 0, SEEK_END); if (file_size == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if (readstat_lseek(fd, 0, SEEK_SET) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if (strncmp(magic, "<sta", 4) == 0) { retval = dta_read_xmlish_preamble(fd, ctx, &header); } else { if (read(fd, &header, sizeof(header)) != sizeof(header)) { retval = READSTAT_ERROR_READ; goto cleanup; } } if ((ctx = dta_ctx_init(header.nvar, header.nobs, header.byteorder, header.ds_format)) == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } ctx->user_ctx = user_ctx; ctx->file_size = file_size; ctx->progress_handler = parser->progress_handler; retval = dta_update_progress(fd, ctx); if (retval != READSTAT_OK) goto cleanup; if (parser->info_handler) { if (parser->info_handler(ctx->nobs, ctx->nvar, user_ctx)) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } } if (ctx->file_is_xmlish) { uint16_t label_len = 0; unsigned char timestamp_len; if ((retval = dta_read_tag(fd, ctx, "<label>")) != READSTAT_OK) { goto cleanup; } if (ctx->data_label_len_len == 2) { if (read(fd, &label_len, sizeof(uint16_t)) != sizeof(uint16_t)) { retval = READSTAT_ERROR_READ; goto cleanup; } label_len = ctx->machine_needs_byte_swap ? 
byteswap2(label_len) : label_len; } else if (ctx->data_label_len_len == 1) { unsigned char label_len_char; if (read(fd, &label_len_char, sizeof(unsigned char)) != sizeof(unsigned char)) { retval = READSTAT_ERROR_READ; goto cleanup; } label_len = label_len_char; } if (readstat_lseek(fd, label_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "</label>")) != READSTAT_OK) { goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "<timestamp>")) != READSTAT_OK) { goto cleanup; } if (read(fd, ×tamp_len, 1) != 1) { retval = READSTAT_ERROR_READ; goto cleanup; } if (readstat_lseek(fd, timestamp_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "</timestamp>")) != READSTAT_OK) { goto cleanup; } } else { if (readstat_lseek(fd, ctx->data_label_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if (ctx->time_stamp_len) { if (readstat_lseek(fd, ctx->time_stamp_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } } } if ((retval = dta_read_tag(fd, ctx, "</header>")) != READSTAT_OK) { goto cleanup; } if (dta_read_map(fd, ctx) != READSTAT_OK) { retval = READSTAT_ERROR_READ; goto cleanup; } if (dta_read_descriptors(fd, ctx) != READSTAT_OK) { retval = READSTAT_ERROR_READ; goto cleanup; } for (i=0; i<ctx->nvar; i++) { size_t max_len; readstat_types_t type = dta_type_info(ctx->typlist[i], &max_len, ctx); record_len += max_len; if (type == READSTAT_TYPE_STRING) max_len++; /* might append NULL */ if (parser->variable_handler) { readstat_variable_t *variable = dta_init_variable(ctx, i, type); const char *value_labels = NULL; if (ctx->lbllist[ctx->lbllist_entry_len*i]) value_labels = &ctx->lbllist[ctx->lbllist_entry_len*i]; int cb_retval = parser->variable_handler(i, variable, value_labels, user_ctx); free(variable); if (cb_retval) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } } } if ((retval = dta_skip_expansion_fields(fd, ctx)) != 
READSTAT_OK) { goto cleanup; } if (record_len == 0) { retval = READSTAT_ERROR_PARSE; goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "<data>")) != READSTAT_OK) { goto cleanup; } if ((retval = dta_update_progress(fd, ctx)) != READSTAT_OK) { goto cleanup; } if ((buf = malloc(record_len)) == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } for (i=0; i<ctx->nobs; i++) { if (read(fd, buf, record_len) != record_len) { retval = READSTAT_ERROR_READ; goto cleanup; } int j; off_t offset = 0; for (j=0; j<ctx->nvar; j++) { size_t max_len; readstat_value_t value; memset(&value, 0, sizeof(readstat_value_t)); value.type = dta_type_info(ctx->typlist[j], &max_len, ctx); if (value.type == READSTAT_TYPE_STRING) { readstat_convert(str_buf, sizeof(str_buf), &buf[offset], max_len, ctx->converter); value.v.string_value = str_buf; } else if (value.type == READSTAT_TYPE_LONG_STRING) { uint32_t v, o; v = *((uint32_t *)&buf[offset]); o = *((uint32_t *)&buf[offset+4]); if (ctx->machine_needs_byte_swap) { v = byteswap4(v); o = byteswap4(o); } if (v > 0 && o > 0) { off_t cur_pos = readstat_lseek(fd, 0, SEEK_CUR); if (cur_pos == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } retval = dta_read_long_string(fd, ctx, v, o, &long_string); if (retval != READSTAT_OK) { goto cleanup; } value.v.string_value = long_string; if (readstat_lseek(fd, cur_pos, SEEK_SET) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } } } else if (value.type == READSTAT_TYPE_CHAR) { char byte = buf[offset]; if (ctx->machine_is_twos_complement) { byte = ones_to_twos_complement1(byte); } if (byte > DTA_MAX_CHAR) { value.is_system_missing = 1; if (byte > DTA_MISSING_CHAR) { value.tag = 'a' + (byte - DTA_MISSING_CHAR_A); } } value.v.char_value = byte; } else if (value.type == READSTAT_TYPE_INT16) { int16_t num = *((int16_t *)&buf[offset]); if (ctx->machine_needs_byte_swap) { num = byteswap2(num); } if (ctx->machine_is_twos_complement) { num = ones_to_twos_complement2(num); } if (num > DTA_MAX_INT16) { 
value.is_system_missing = 1; if (num > DTA_MISSING_INT16) { value.tag = 'a' + (num - DTA_MISSING_INT16_A); } } value.v.i16_value = num; } else if (value.type == READSTAT_TYPE_INT32) { int32_t num = *((int32_t *)&buf[offset]); if (ctx->machine_needs_byte_swap) { num = byteswap4(num); } if (ctx->machine_is_twos_complement) { num = ones_to_twos_complement4(num); } if (num > DTA_MAX_INT32) { value.is_system_missing = 1; if (num > DTA_MISSING_INT32) { value.tag = 'a' + (num - DTA_MISSING_INT32_A); } } value.v.i32_value = num; } else if (value.type == READSTAT_TYPE_FLOAT) { uint32_t num = *((uint32_t *)&buf[offset]); float f_num = NAN; if (ctx->machine_needs_byte_swap) { num = byteswap4(num); } if (num > DTA_MAX_FLOAT) { value.is_system_missing = 1; if (num > DTA_MISSING_FLOAT) { value.tag = 'a' + ((num - DTA_MISSING_FLOAT_A) >> 11); } } else {
/*
 * Parse the opening part of a tagged (XML-ish) file and fill in *header.
 *
 * Consumes, in order: <stata_dta>, <header>, <release>…</release>,
 * <byteorder>…</byteorder>, <K>…</K> and <N>…</N>.
 *
 *   - ds_format is decoded from three ASCII decimal digits.
 *   - byteorder is DTA_HILO for "MSF" and DTA_LOHI for "LSF"; anything else
 *     is a parse error.
 *   - nvar (2 bytes) and nobs (4 bytes) are stored exactly as read; no byte
 *     swapping is applied in this function.
 *
 * Returns READSTAT_OK on success, READSTAT_ERROR_READ / READSTAT_ERROR_SEEK /
 * READSTAT_ERROR_PARSE on the corresponding failure, or the status reported
 * by dta_read_tag(). Fields of *header assigned before a failure keep the
 * values already written.
 */
readstat_error_t dta_read_xmlish_preamble(int fd, dta_ctx_t *ctx, dta_header_t *header) {
    readstat_error_t status;
    char release[3];
    char order[3];

    status = dta_read_tag(fd, ctx, "<stata_dta>");
    if (status != READSTAT_OK)
        return status;

    status = dta_read_tag(fd, ctx, "<header>");
    if (status != READSTAT_OK)
        return status;

    /* Release number: three ASCII digits. */
    status = dta_read_tag(fd, ctx, "<release>");
    if (status != READSTAT_OK)
        return status;

    if (read(fd, release, sizeof(release)) != sizeof(release))
        return READSTAT_ERROR_READ;

    header->ds_format = 100 * (release[0] - '0') + 10 * (release[1] - '0') + (release[2] - '0');

    status = dta_read_tag(fd, ctx, "</release>");
    if (status != READSTAT_OK)
        return status;

    /* Byte order marker: "MSF" or "LSF". */
    status = dta_read_tag(fd, ctx, "<byteorder>");
    if (status != READSTAT_OK)
        return status;

    if (read(fd, order, sizeof(order)) != sizeof(order))
        return READSTAT_ERROR_READ;

    if (memcmp(order, "MSF", 3) == 0) {
        header->byteorder = DTA_HILO;
    } else if (memcmp(order, "LSF", 3) == 0) {
        header->byteorder = DTA_LOHI;
    } else {
        return READSTAT_ERROR_PARSE;
    }

    status = dta_read_tag(fd, ctx, "</byteorder>");
    if (status != READSTAT_OK)
        return status;

    /* Variable count. */
    status = dta_read_tag(fd, ctx, "<K>");
    if (status != READSTAT_OK)
        return status;

    if (read(fd, &header->nvar, sizeof(int16_t)) != sizeof(int16_t))
        return READSTAT_ERROR_READ;

    status = dta_read_tag(fd, ctx, "</K>");
    if (status != READSTAT_OK)
        return status;

    /* Observation count. */
    status = dta_read_tag(fd, ctx, "<N>");
    if (status != READSTAT_OK)
        return status;

    if (read(fd, &header->nobs, sizeof(int32_t)) != sizeof(int32_t))
        return READSTAT_ERROR_READ;

    if (header->ds_format >= 118) {
        /*
         * Only support files < 4 billion rows for now: four more bytes
         * follow the ones just read. For DTA_HILO files they replace the
         * value read above; otherwise they are skipped.
         */
        if (header->byteorder == DTA_HILO) {
            if (read(fd, &header->nobs, sizeof(int32_t)) != sizeof(int32_t))
                return READSTAT_ERROR_READ;
        } else if (readstat_lseek(fd, 4, SEEK_CUR) == -1) {
            return READSTAT_ERROR_SEEK;
        }
    }

    return dta_read_tag(fd, ctx, "</N>");
}
/*
 * Look up the long string identified by (v, o) in the <strls> section.
 *
 * Seeks to ctx->strls_offset, consumes the "<strls>" tag and then walks the
 * GSO records one by one until a record whose header matches both v and o
 * is found.
 *
 *   - DTA_GSO_TYPE_ASCII:  *long_string_out receives a freshly malloc'ed
 *     copy of the payload (header.len bytes, which must end in '\0'). The
 *     buffer is not freed here; the caller must release it with free().
 *   - DTA_GSO_TYPE_BINARY: *long_string_out is set to NULL.
 *   - any other type is a parse error.
 *
 * On any error *long_string_out is left untouched.
 *
 * The file position is changed by this function and is NOT restored; the
 * caller has to save/restore it if it needs to continue where it was.
 *
 * Returns READSTAT_OK, READSTAT_ERROR_SEEK, READSTAT_ERROR_READ,
 * READSTAT_ERROR_PARSE, READSTAT_ERROR_MALLOC, or the status reported by
 * dta_read_tag().
 *
 * NOTE(review): the header fields (v, o, len) are used exactly as read from
 * the file; no byte swapping is applied here even when
 * ctx->machine_needs_byte_swap is set. Confirm whether that is intended.
 */
static readstat_error_t dta_read_long_string(int fd, dta_ctx_t *ctx, int v, int o, char **long_string_out) {
    readstat_error_t retval = READSTAT_OK;
    dta_gso_header_t header;
    char *string_buf = NULL;

    if (readstat_lseek(fd, ctx->strls_offset, SEEK_SET) != ctx->strls_offset) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    retval = dta_read_tag(fd, ctx, "<strls>");
    if (retval != READSTAT_OK)
        goto cleanup;

    while (1) {
        if (read(fd, &header, sizeof(dta_gso_header_t)) != sizeof(dta_gso_header_t)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }

        /* Every record must start with the "GSO" marker. */
        if (strncmp(header.gso, "GSO", sizeof("GSO")-1) != 0) {
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }

        if (header.len <= 0) {
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }

        if (header.v == v && header.o == o) {
            if (header.t == DTA_GSO_TYPE_BINARY) {
                *long_string_out = NULL;
            } else if (header.t == DTA_GSO_TYPE_ASCII) {
                string_buf = malloc(header.len);
                if (string_buf == NULL) {
                    /* Report the allocation failure instead of reading into NULL. */
                    retval = READSTAT_ERROR_MALLOC;
                    goto cleanup;
                }
                if (read(fd, string_buf, header.len) != header.len) {
                    retval = READSTAT_ERROR_READ;
                    goto cleanup;
                }
                /* The payload is handed out as a C string, so insist on the terminator. */
                if (string_buf[header.len-1] != '\0') {
                    retval = READSTAT_ERROR_PARSE;
                    goto cleanup;
                }
                *long_string_out = string_buf;
                string_buf = NULL; /* ownership transferred to the caller */
            } else {
                retval = READSTAT_ERROR_PARSE;
                goto cleanup;
            }
            break;
        } else {
            /* Not the record we want: skip its payload. */
            if (readstat_lseek(fd, header.len, SEEK_CUR) == -1) {
                retval = READSTAT_ERROR_SEEK;
                goto cleanup;
            }
        }
    }

cleanup:
    free(string_buf);
    return retval;
}
/*
 * Read one descriptor section: open_tag, exactly len bytes into dst,
 * close_tag. Returns READSTAT_OK, READSTAT_ERROR_READ on a short read, or
 * the status reported by dta_read_tag().
 */
static readstat_error_t dta_read_descriptor_section(int fd, dta_ctx_t *ctx,
        const char *open_tag, void *dst, size_t len, const char *close_tag) {
    readstat_error_t retval = dta_read_tag(fd, ctx, open_tag);
    if (retval != READSTAT_OK)
        return retval;

    ssize_t bytes_read = read(fd, dst, len);
    if (bytes_read < 0 || (size_t)bytes_read != len)
        return READSTAT_ERROR_READ;

    return dta_read_tag(fd, ctx, close_tag);
}

/*
 * Read the variable descriptor sections into the buffers already present
 * in ctx, in this order: variable types, variable names, sort list,
 * formats, value label names, variable labels.
 *
 * The type list is stored on disk with ctx->typlist_entry_len bytes per
 * variable (1 or 2). One-byte entries are widened element by element;
 * two-byte entries are copied verbatim and byte-swapped afterwards when
 * ctx->machine_needs_byte_swap is set. Any other entry length leaves
 * ctx->typlist untouched.
 *
 * Returns READSTAT_OK on success. On failure returns a specific error code
 * (READSTAT_ERROR_READ, READSTAT_ERROR_MALLOC, or the status reported by
 * dta_read_tag()) rather than a bare -1, so callers that only test against
 * READSTAT_OK keep working while others can propagate the real cause.
 */
static readstat_error_t dta_read_descriptors(int fd, dta_ctx_t *ctx) {
    readstat_error_t retval = READSTAT_OK;
    unsigned char *buffer = NULL;
    size_t buffer_len = 0;
    int i;

    retval = dta_read_tag(fd, ctx, "<variable_types>");
    if (retval != READSTAT_OK)
        goto cleanup;

    buffer_len = ctx->nvar * ctx->typlist_entry_len;
    buffer = malloc(buffer_len);
    /* malloc(0) may legitimately return NULL, so only treat NULL as failure for a non-empty request. */
    if (buffer == NULL && buffer_len > 0) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    if (read(fd, buffer, buffer_len) != buffer_len) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    if (ctx->typlist_entry_len == 1) {
        for (i=0; i<ctx->nvar; i++) {
            ctx->typlist[i] = buffer[i];
        }
    } else if (ctx->typlist_entry_len == 2) {
        /* assumes ctx->typlist elements are 16 bits wide — TODO confirm */
        memcpy(ctx->typlist, buffer, buffer_len);
        if (ctx->machine_needs_byte_swap) {
            for (i=0; i<ctx->nvar; i++) {
                ctx->typlist[i] = byteswap2(ctx->typlist[i]);
            }
        }
    }

    retval = dta_read_tag(fd, ctx, "</variable_types>");
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = dta_read_descriptor_section(fd, ctx, "<varnames>",
            ctx->varlist, ctx->varlist_len, "</varnames>");
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = dta_read_descriptor_section(fd, ctx, "<sortlist>",
            ctx->srtlist, ctx->srtlist_len, "</sortlist>");
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = dta_read_descriptor_section(fd, ctx, "<formats>",
            ctx->fmtlist, ctx->fmtlist_len, "</formats>");
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = dta_read_descriptor_section(fd, ctx, "<value_label_names>",
            ctx->lbllist, ctx->lbllist_len, "</value_label_names>");
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = dta_read_descriptor_section(fd, ctx, "<variable_labels>",
            ctx->variable_labels, ctx->variable_labels_len, "</variable_labels>");

cleanup:
    free(buffer);
    return retval;
}
/*
 * Load the ASCII entries of the <strls> section into ctx->strls.
 *
 * Does nothing (returns READSTAT_OK) for files that are not XML-ish.
 * Otherwise seeks to ctx->strls_offset, consumes "<strls>", and reads
 * records until the closing "</strls>" tag:
 *
 *   - each record starts with the 3-byte marker "GSO", followed by a header
 *     parsed by dta_read_strl();
 *   - for DTA_GSO_TYPE_ASCII records a dta_strl_t with strl.len trailing
 *     payload bytes is allocated, appended to ctx->strls and then filled
 *     from the file;
 *   - any other 3-byte marker is a parse error.
 *
 * ctx->strls starts with room for 100 pointers and doubles when full.
 * Entries are appended to ctx->strls before their payload is read, so a
 * failed read still leaves the entry owned by ctx; nothing is freed here.
 *
 * Returns READSTAT_OK, READSTAT_ERROR_SEEK, READSTAT_ERROR_READ,
 * READSTAT_ERROR_PARSE, READSTAT_ERROR_MALLOC, or the status reported by
 * dta_read_tag() / dta_read_strl().
 */
static readstat_error_t dta_read_strls(dta_ctx_t *ctx) {
    if (!ctx->file_is_xmlish)
        return READSTAT_OK;

    readstat_error_t retval = READSTAT_OK;
    readstat_io_t *io = ctx->io;

    if (io->seek(ctx->strls_offset, READSTAT_SEEK_SET, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    retval = dta_read_tag(ctx, "<strls>");
    if (retval != READSTAT_OK)
        goto cleanup;

    ctx->strls_capacity = 100;
    ctx->strls = malloc(ctx->strls_capacity * sizeof(dta_strl_t *));
    if (ctx->strls == NULL) {
        /* Keep capacity consistent with the (absent) table. */
        ctx->strls_capacity = 0;
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    while (1) {
        char tag[3];
        if (io->read(tag, sizeof(tag), io->io_ctx) != sizeof(tag)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }

        if (memcmp(tag, "GSO", sizeof(tag)) == 0) {
            dta_strl_t strl;
            retval = dta_read_strl(ctx, &strl);
            if (retval != READSTAT_OK)
                goto cleanup;

            /*
             * NOTE(review): nothing here consumes the payload of a
             * non-ASCII record. Unless dta_read_strl() skips it, the next
             * 3-byte marker read will land inside that payload — confirm.
             */
            if (strl.type != DTA_GSO_TYPE_ASCII)
                continue;

            if (ctx->strls_count == ctx->strls_capacity) {
                /* Grow via a temporary so the existing table survives a failed realloc. */
                size_t new_capacity = (size_t)ctx->strls_capacity * 2;
                void *grown = realloc(ctx->strls, sizeof(dta_strl_t *) * new_capacity);
                if (grown == NULL) {
                    retval = READSTAT_ERROR_MALLOC;
                    goto cleanup;
                }
                ctx->strls = grown;
                ctx->strls_capacity = new_capacity;
            }

            dta_strl_t *strl_ptr = malloc(sizeof(dta_strl_t) + strl.len);
            if (strl_ptr == NULL) {
                retval = READSTAT_ERROR_MALLOC;
                goto cleanup;
            }
            memcpy(strl_ptr, &strl, sizeof(dta_strl_t));

            /* Register the entry first so ctx owns it even if the read below fails. */
            ctx->strls[ctx->strls_count++] = strl_ptr;

            if (io->read(&strl_ptr->data[0], strl_ptr->len, io->io_ctx) != strl_ptr->len) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
        } else if (memcmp(tag, "</s", sizeof(tag)) == 0) {
            /* The first three bytes of "</strls>" are already consumed; read the rest. */
            retval = dta_read_tag(ctx, "trls>");
            if (retval != READSTAT_OK)
                goto cleanup;
            break;
        } else {
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }
    }

cleanup:
    return retval;
}
/*
 * Walk the expansion fields ("characteristics") and pass notes to
 * ctx->note_handler.
 *
 * Shortcuts:
 *   - ctx->expansion_len_len == 0: nothing to read, returns READSTAT_OK.
 *   - XML-ish file without a note handler: seeks straight to
 *     ctx->data_offset and returns.
 *
 * Otherwise consumes "<characteristics>" and loops over the entries:
 *   - XML-ish files: each entry starts with "<ch>" and the list ends with
 *     "</characteristics>"; the entry type is taken to be 1.
 *   - other files: each entry starts with a one-byte type; an entry with
 *     type 0 and length 0 ends the list.
 *   - the entry length is 2 or 4 bytes wide (ctx->expansion_len_len),
 *     byte-swapped when ctx->machine_needs_byte_swap is set.
 *   - entries whose type is not 1 or whose length exceeds 1<<20 are
 *     rejected with READSTAT_ERROR_NOTE_IS_TOO_LONG. A negative on-disk
 *     length becomes a huge size_t and is rejected the same way.
 *   - when a note handler is set and the entry holds at least
 *     2 * ctx->ch_metadata_len bytes, the entry is read; if it starts with
 *     "_dta" and the second metadata field parses as "note%d", the handler
 *     is called with that index and the text following the two metadata
 *     fields. A non-zero handler result aborts with
 *     READSTAT_ERROR_USER_ABORT. All other entries are skipped.
 *
 * Returns READSTAT_OK or one of READSTAT_ERROR_READ, READSTAT_ERROR_SEEK,
 * READSTAT_ERROR_PARSE, READSTAT_ERROR_MALLOC,
 * READSTAT_ERROR_NOTE_IS_TOO_LONG, READSTAT_ERROR_USER_ABORT, or the status
 * reported by dta_read_tag().
 */
static readstat_error_t dta_read_expansion_fields(dta_ctx_t *ctx) {
    readstat_error_t retval = READSTAT_OK;
    readstat_io_t *io = ctx->io;
    char *buffer = NULL;

    if (ctx->expansion_len_len == 0)
        return READSTAT_OK;

    if (ctx->file_is_xmlish && !ctx->note_handler) {
        if (io->seek(ctx->data_offset, READSTAT_SEEK_SET, io->io_ctx) == -1) {
            return READSTAT_ERROR_SEEK;
        }
        return READSTAT_OK;
    }

    retval = dta_read_tag(ctx, "<characteristics>");
    if (retval != READSTAT_OK)
        goto cleanup;

    while (1) {
        size_t len;
        char data_type;

        if (ctx->file_is_xmlish) {
            char start[4];
            if (io->read(start, sizeof(start), io->io_ctx) != sizeof(start)) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
            if (memcmp(start, "</ch", sizeof(start)) == 0) {
                /* First four bytes of "</characteristics>" are consumed; read the rest. */
                retval = dta_read_tag(ctx, "aracteristics>");
                if (retval != READSTAT_OK)
                    goto cleanup;
                break;
            } else if (memcmp(start, "<ch>", sizeof(start)) != 0) {
                retval = READSTAT_ERROR_PARSE;
                goto cleanup;
            }
            data_type = 1;
        } else {
            if (io->read(&data_type, 1, io->io_ctx) != 1) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
        }

        if (ctx->expansion_len_len == 2) {
            int16_t len16;
            if (io->read(&len16, sizeof(int16_t), io->io_ctx) != sizeof(int16_t)) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
            len = ctx->machine_needs_byte_swap ? byteswap2(len16) : len16;
        } else {
            int32_t len32;
            if (io->read(&len32, sizeof(int32_t), io->io_ctx) != sizeof(int32_t)) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
            /* 32-bit field: must be swapped with the 4-byte swap, not the 2-byte one. */
            len = ctx->machine_needs_byte_swap ? byteswap4(len32) : len32;
        }

        if (data_type == 0 && len == 0)
            break;

        if (data_type != 1 || len > (1<<20)) {
            retval = READSTAT_ERROR_NOTE_IS_TOO_LONG;
            goto cleanup;
        }

        if (ctx->note_handler && len >= 2 * ctx->ch_metadata_len) {
            /* Grow via a temporary so the old buffer is still freed if realloc fails. */
            char *grown = realloc(buffer, len + 1);
            if (grown == NULL) {
                retval = READSTAT_ERROR_MALLOC;
                goto cleanup;
            }
            buffer = grown;
            buffer[len] = '\0';

            if (io->read(buffer, len, io->io_ctx) != len) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }

            int index = 0;
            if (strncmp(&buffer[0], "_dta", 4) == 0 &&
                    sscanf(&buffer[ctx->ch_metadata_len], "note%d", &index) == 1) {
                if (ctx->note_handler(index, &buffer[2*ctx->ch_metadata_len], ctx->user_ctx)) {
                    retval = READSTAT_ERROR_USER_ABORT;
                    goto cleanup;
                }
            }
        } else {
            if (io->seek(len, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
                retval = READSTAT_ERROR_SEEK;
                goto cleanup;
            }
        }

        retval = dta_read_tag(ctx, "</ch>");
        if (retval != READSTAT_OK)
            goto cleanup;
    }

cleanup:
    free(buffer);
    return retval;
}