rdata_ctx_t *init_rdata_ctx(const char *filename) { int fd = readstat_open(filename); if (fd == -1) { return NULL; } rdata_ctx_t *ctx = calloc(1, sizeof(rdata_ctx_t)); rdata_atom_table_t *atom_table = malloc(sizeof(rdata_atom_table_t)); atom_table->count = 0; atom_table->data = NULL; ctx->atom_table = atom_table; ctx->machine_needs_byteswap = 0; if (machine_is_little_endian()) { ctx->machine_needs_byteswap = 1; } ctx->fd = fd; return ctx; }
static readstat_error_t read_por_file_data(readstat_por_ctx_t *ctx) { int i; char string[256]; char error_buf[1024]; int retval = 0; readstat_error_t rs_retval = READSTAT_OK; while (1) { for (i=0; i<ctx->var_count; i++) { spss_varinfo_t *info = &ctx->varinfo[i]; readstat_value_t value = { .type = info->type }; if (info->type == READSTAT_TYPE_STRING) { retval = read_string(ctx, string, sizeof(string)); if (i == 0 && retval == 1) { return 0; } else if (retval == -1) { if (ctx->error_handler) { snprintf(error_buf, sizeof(error_buf), "Error in %s\n", info->name); ctx->error_handler(error_buf, ctx->user_ctx); } rs_retval = READSTAT_ERROR_PARSE; goto cleanup; } value.v.string_value = string; // printf("String value: %s\n", string); } else if (info->type == READSTAT_TYPE_DOUBLE) { retval = read_double(ctx, &value.v.double_value); if (i == 0 && retval == 1) { return READSTAT_OK; } else if (retval != 0) { if (ctx->error_handler) { snprintf(error_buf, sizeof(error_buf), "Error in %s\n", info->name); ctx->error_handler(error_buf, ctx->user_ctx); } rs_retval = READSTAT_ERROR_PARSE; goto cleanup; } spss_tag_missing_double(&value, &info->missingness); } ctx->value_handler(ctx->obs_count, i, value, ctx->user_ctx); } ctx->obs_count++; rs_retval = por_update_progress(ctx); if (rs_retval != READSTAT_OK) break; } cleanup: return rs_retval; } readstat_error_t readstat_parse_por(readstat_parser_t *parser, const char *filename, void *user_ctx) { readstat_error_t retval = READSTAT_OK; unsigned char reverse_lookup[256]; char vanity[200]; readstat_por_ctx_t *ctx = calloc(1, sizeof(readstat_por_ctx_t)); ctx->space = ' '; ctx->var_dict = ck_hash_table_init(1024); ctx->info_handler = parser->info_handler; ctx->variable_handler = parser->variable_handler; ctx->value_handler = parser->value_handler; ctx->value_label_handler = parser->value_label_handler; ctx->error_handler = parser->error_handler; ctx->progress_handler = parser->progress_handler; ctx->user_ctx = user_ctx; if ((ctx->fd = 
readstat_open(filename)) == -1) { free(ctx); return READSTAT_ERROR_OPEN; } if ((ctx->file_size = lseek(ctx->fd, 0, SEEK_END)) == -1) { retval = READSTAT_ERROR_READ; goto cleanup; } if (lseek(ctx->fd, 0, SEEK_SET) == -1) { retval = READSTAT_ERROR_READ; goto cleanup; } if (read_bytes(ctx, vanity, sizeof(vanity)) != sizeof(vanity)) { retval = READSTAT_ERROR_READ; goto cleanup; } if (read_bytes(ctx, reverse_lookup, sizeof(reverse_lookup)) != sizeof(reverse_lookup)) { retval = READSTAT_ERROR_READ; goto cleanup; } ctx->space = reverse_lookup[126]; int i; for (i=0; i<256; i++) ctx->lookup[reverse_lookup[i]] = unicode_lookup[i]; unsigned char check[9]; char tr_check[9]; if (read_bytes(ctx, check, sizeof(check)-1) != sizeof(check)-1) { retval = READSTAT_ERROR_READ; goto cleanup; } check[8] = '\0'; utf8_encode(check, sizeof(check), tr_check, sizeof(tr_check), ctx->lookup); if (strcmp("SPSSPORT", tr_check) != 0) { retval = READSTAT_ERROR_PARSE; goto cleanup; } ctx->var_offset = -1; unsigned char version; char string[256]; if (read_bytes(ctx, &version, sizeof(version)) != sizeof(version)) { retval = READSTAT_ERROR_READ; goto cleanup; } if (read_string(ctx, string, sizeof(string)) == -1) { /* creation date */ retval = READSTAT_ERROR_PARSE; goto cleanup; } if (read_string(ctx, string, sizeof(string)) == -1) { /* creation time */ retval = READSTAT_ERROR_PARSE; goto cleanup; } while (1) { uint16_t tr_tag = read_tag(ctx); switch (tr_tag) { case '1': /* product ID */ case '2': /* author ID */ case '3': /* sub-product ID */ if (read_string(ctx, string, sizeof(string)) == -1) { retval = READSTAT_ERROR_PARSE; goto cleanup; } break; case '4': /* variable count */ retval = read_variable_count_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case '6': /* case weight */ retval = read_case_weight_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case '7': /* variable */ retval = read_variable_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case '8': /* 
missing value */ retval = read_missing_value_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case 'B': /* missing value range */ retval = read_missing_value_range_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case '9': /* LO THRU x */ retval = read_missing_value_lo_range_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case 'A': /* x THRU HI */ retval = read_missing_value_hi_range_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case 'C': /* variable label */ retval = read_variable_label_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case 'D': /* value label */ retval = read_value_label_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case 'E': /* document record */ retval = read_document_record(ctx); if (retval != READSTAT_OK) goto cleanup; break; case 'F': /* file data */ if (ctx->var_offset != ctx->var_count - 1) { retval = READSTAT_ERROR_PARSE; goto cleanup; } for (i=0; i<ctx->var_count; i++) { char label_name_buf[256]; spss_varinfo_t *info = &ctx->varinfo[i]; info->missingness = spss_missingness_for_info(info); readstat_variable_t *variable = spss_init_variable_for_info(info); snprintf(label_name_buf, sizeof(label_name_buf), POR_LABEL_NAME_PREFIX "%d", info->labels_index); int cb_retval = ctx->variable_handler(i, variable, info->labels_index == -1 ? 
NULL : label_name_buf, user_ctx); spss_free_variable(variable); if (cb_retval) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } } if (parser->fweight_handler && ctx->fweight_name[0]) { for (i=0; i<ctx->var_count; i++) { spss_varinfo_t *info = &ctx->varinfo[i]; if (strcmp(info->name, ctx->fweight_name) == 0) { if (parser->fweight_handler(i, user_ctx)) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } break; } } } retval = read_por_file_data(ctx); if (retval != READSTAT_OK) goto cleanup; if (parser->info_handler) { if (parser->info_handler(ctx->obs_count, ctx->var_count, ctx->user_ctx)) { retval = READSTAT_ERROR_USER_ABORT; } } goto cleanup; break; default: retval = READSTAT_ERROR_PARSE; goto cleanup; break; } } cleanup: readstat_close(ctx->fd); por_ctx_free(ctx); return retval; }
readstat_error_t readstat_parse_dta(readstat_parser_t *parser, const char *filename, void *user_ctx) { readstat_error_t retval = READSTAT_OK; int i; size_t record_len = 0; int fd = -1; char *buf = NULL; dta_header_t header; dta_ctx_t *ctx = NULL; char str_buf[2048]; char *long_string = NULL; size_t file_size = 0; if ((fd = readstat_open(filename)) == -1) { retval = READSTAT_ERROR_OPEN; goto cleanup; } char magic[4]; if (read(fd, magic, 4) != 4) { retval = READSTAT_ERROR_READ; goto cleanup; } file_size = readstat_lseek(fd, 0, SEEK_END); if (file_size == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if (readstat_lseek(fd, 0, SEEK_SET) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if (strncmp(magic, "<sta", 4) == 0) { retval = dta_read_xmlish_preamble(fd, ctx, &header); } else { if (read(fd, &header, sizeof(header)) != sizeof(header)) { retval = READSTAT_ERROR_READ; goto cleanup; } } if ((ctx = dta_ctx_init(header.nvar, header.nobs, header.byteorder, header.ds_format)) == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } ctx->user_ctx = user_ctx; ctx->file_size = file_size; ctx->progress_handler = parser->progress_handler; retval = dta_update_progress(fd, ctx); if (retval != READSTAT_OK) goto cleanup; if (parser->info_handler) { if (parser->info_handler(ctx->nobs, ctx->nvar, user_ctx)) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } } if (ctx->file_is_xmlish) { uint16_t label_len = 0; unsigned char timestamp_len; if ((retval = dta_read_tag(fd, ctx, "<label>")) != READSTAT_OK) { goto cleanup; } if (ctx->data_label_len_len == 2) { if (read(fd, &label_len, sizeof(uint16_t)) != sizeof(uint16_t)) { retval = READSTAT_ERROR_READ; goto cleanup; } label_len = ctx->machine_needs_byte_swap ? 
byteswap2(label_len) : label_len; } else if (ctx->data_label_len_len == 1) { unsigned char label_len_char; if (read(fd, &label_len_char, sizeof(unsigned char)) != sizeof(unsigned char)) { retval = READSTAT_ERROR_READ; goto cleanup; } label_len = label_len_char; } if (readstat_lseek(fd, label_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "</label>")) != READSTAT_OK) { goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "<timestamp>")) != READSTAT_OK) { goto cleanup; } if (read(fd, ×tamp_len, 1) != 1) { retval = READSTAT_ERROR_READ; goto cleanup; } if (readstat_lseek(fd, timestamp_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "</timestamp>")) != READSTAT_OK) { goto cleanup; } } else { if (readstat_lseek(fd, ctx->data_label_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } if (ctx->time_stamp_len) { if (readstat_lseek(fd, ctx->time_stamp_len, SEEK_CUR) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } } } if ((retval = dta_read_tag(fd, ctx, "</header>")) != READSTAT_OK) { goto cleanup; } if (dta_read_map(fd, ctx) != READSTAT_OK) { retval = READSTAT_ERROR_READ; goto cleanup; } if (dta_read_descriptors(fd, ctx) != READSTAT_OK) { retval = READSTAT_ERROR_READ; goto cleanup; } for (i=0; i<ctx->nvar; i++) { size_t max_len; readstat_types_t type = dta_type_info(ctx->typlist[i], &max_len, ctx); record_len += max_len; if (type == READSTAT_TYPE_STRING) max_len++; /* might append NULL */ if (parser->variable_handler) { readstat_variable_t *variable = dta_init_variable(ctx, i, type); const char *value_labels = NULL; if (ctx->lbllist[ctx->lbllist_entry_len*i]) value_labels = &ctx->lbllist[ctx->lbllist_entry_len*i]; int cb_retval = parser->variable_handler(i, variable, value_labels, user_ctx); free(variable); if (cb_retval) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } } } if ((retval = dta_skip_expansion_fields(fd, ctx)) != 
READSTAT_OK) { goto cleanup; } if (record_len == 0) { retval = READSTAT_ERROR_PARSE; goto cleanup; } if ((retval = dta_read_tag(fd, ctx, "<data>")) != READSTAT_OK) { goto cleanup; } if ((retval = dta_update_progress(fd, ctx)) != READSTAT_OK) { goto cleanup; } if ((buf = malloc(record_len)) == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } for (i=0; i<ctx->nobs; i++) { if (read(fd, buf, record_len) != record_len) { retval = READSTAT_ERROR_READ; goto cleanup; } int j; off_t offset = 0; for (j=0; j<ctx->nvar; j++) { size_t max_len; readstat_value_t value; memset(&value, 0, sizeof(readstat_value_t)); value.type = dta_type_info(ctx->typlist[j], &max_len, ctx); if (value.type == READSTAT_TYPE_STRING) { readstat_convert(str_buf, sizeof(str_buf), &buf[offset], max_len, ctx->converter); value.v.string_value = str_buf; } else if (value.type == READSTAT_TYPE_LONG_STRING) { uint32_t v, o; v = *((uint32_t *)&buf[offset]); o = *((uint32_t *)&buf[offset+4]); if (ctx->machine_needs_byte_swap) { v = byteswap4(v); o = byteswap4(o); } if (v > 0 && o > 0) { off_t cur_pos = readstat_lseek(fd, 0, SEEK_CUR); if (cur_pos == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } retval = dta_read_long_string(fd, ctx, v, o, &long_string); if (retval != READSTAT_OK) { goto cleanup; } value.v.string_value = long_string; if (readstat_lseek(fd, cur_pos, SEEK_SET) == -1) { retval = READSTAT_ERROR_SEEK; goto cleanup; } } } else if (value.type == READSTAT_TYPE_CHAR) { char byte = buf[offset]; if (ctx->machine_is_twos_complement) { byte = ones_to_twos_complement1(byte); } if (byte > DTA_MAX_CHAR) { value.is_system_missing = 1; if (byte > DTA_MISSING_CHAR) { value.tag = 'a' + (byte - DTA_MISSING_CHAR_A); } } value.v.char_value = byte; } else if (value.type == READSTAT_TYPE_INT16) { int16_t num = *((int16_t *)&buf[offset]); if (ctx->machine_needs_byte_swap) { num = byteswap2(num); } if (ctx->machine_is_twos_complement) { num = ones_to_twos_complement2(num); } if (num > DTA_MAX_INT16) { 
value.is_system_missing = 1; if (num > DTA_MISSING_INT16) { value.tag = 'a' + (num - DTA_MISSING_INT16_A); } } value.v.i16_value = num; } else if (value.type == READSTAT_TYPE_INT32) { int32_t num = *((int32_t *)&buf[offset]); if (ctx->machine_needs_byte_swap) { num = byteswap4(num); } if (ctx->machine_is_twos_complement) { num = ones_to_twos_complement4(num); } if (num > DTA_MAX_INT32) { value.is_system_missing = 1; if (num > DTA_MISSING_INT32) { value.tag = 'a' + (num - DTA_MISSING_INT32_A); } } value.v.i32_value = num; } else if (value.type == READSTAT_TYPE_FLOAT) { uint32_t num = *((uint32_t *)&buf[offset]); float f_num = NAN; if (ctx->machine_needs_byte_swap) { num = byteswap4(num); } if (num > DTA_MAX_FLOAT) { value.is_system_missing = 1; if (num > DTA_MISSING_FLOAT) { value.tag = 'a' + ((num - DTA_MISSING_FLOAT_A) >> 11); } } else {
/*
 * Parse the SAS catalog (sas7bcat) file at `filename`. The parser's
 * value_label_handler and `user_ctx` are stored on the catalog context for use
 * by the block parser.
 *
 * Steps:
 *   1. Read the file header via sas_read_header().
 *   2. Read the first index page and feed it to sas_catalog_augment_index().
 *   3. Pass 1: scan the remaining pages for "XLSR" index pages and add them
 *      to the index.
 *   4. Pass 2: for every collected block pointer, read and parse the block.
 *
 * Returns READSTAT_OK on success, or:
 *   READSTAT_ERROR_MALLOC               on allocation failure
 *   READSTAT_ERROR_OPEN                 if the file cannot be opened
 *   READSTAT_ERROR_PARSE                if the header has u64 set, or the page
 *                                       size is too small to hold an index
 *   READSTAT_ERROR_UNSUPPORTED_CHARSET  if iconv cannot convert the file's
 *                                       encoding to UTF-8
 *   READSTAT_ERROR_SEEK / _READ         on I/O failure
 * Errors from the header and block helpers are passed through unchanged.
 */
readstat_error_t readstat_parse_sas7bcat(readstat_parser_t *parser, const char *filename, void *user_ctx) {
    readstat_error_t retval = READSTAT_OK;
    int64_t i;
    char *page = NULL;
    char *buffer = NULL;

    sas_catalog_ctx_t *ctx = calloc(1, sizeof(sas_catalog_ctx_t));
    sas_header_info_t *hinfo = calloc(1, sizeof(sas_header_info_t));
    if (ctx == NULL || hinfo == NULL) {
        free(ctx);
        free(hinfo);
        return READSTAT_ERROR_MALLOC;
    }

    ctx->block_pointers_capacity = 200;
    ctx->block_pointers = malloc(ctx->block_pointers_capacity * sizeof(uint64_t));
    if (ctx->block_pointers == NULL) {
        /* Nothing else is owned by ctx yet, so plain free() is sufficient. */
        free(ctx);
        free(hinfo);
        return READSTAT_ERROR_MALLOC;
    }

    ctx->value_label_handler = parser->value_label_handler;
    ctx->user_ctx = user_ctx;

    if ((ctx->fd = readstat_open(filename)) == -1) {
        retval = READSTAT_ERROR_OPEN;
        goto cleanup;
    }

    if ((retval = sas_read_header(ctx->fd, hinfo, parser->error_handler, user_ctx)) != READSTAT_OK) {
        goto cleanup;
    }

    ctx->u64 = hinfo->u64;
    ctx->pad1 = hinfo->pad1;
    /* Swap bytes only when host and file disagree on endianness. */
    ctx->bswap = machine_is_little_endian() ^ hinfo->little_endian;
    ctx->header_size = hinfo->header_size;
    ctx->page_count = hinfo->page_count;
    ctx->page_size = hinfo->page_size;

    /* Files with the u64 header flag set are rejected. */
    if (ctx->u64) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    if (strcmp(hinfo->encoding, "UTF-8") != 0 &&
            strcmp(hinfo->encoding, "US-ASCII") != 0) {
        iconv_t converter = iconv_open("UTF-8", hinfo->encoding);
        if (converter == (iconv_t)-1) {
            retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
            goto cleanup;
        }
        ctx->converter = converter;
    }

    /* page_size comes from the file. The code below addresses the page at
     * offset 856 + 2*pad1 (and at offset 16), so anything smaller would index
     * past the end of the page buffer. */
    if (ctx->page_size < 856 + 2*ctx->pad1) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    if ((page = malloc(ctx->page_size)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    if (readstat_lseek(ctx->fd, ctx->header_size+SAS_CATALOG_FIRST_INDEX_PAGE*ctx->page_size, SEEK_SET) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }
    if (read(ctx->fd, page, ctx->page_size) < ctx->page_size) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    sas_catalog_augment_index(&page[856+2*ctx->pad1], ctx->page_size - 856 - 2*ctx->pad1, ctx);

    // Pass 1 -- find the XLSR entries
    for (i=SAS_CATALOG_USELESS_PAGES; i<ctx->page_count; i++) {
        if (readstat_lseek(ctx->fd, ctx->header_size+i*ctx->page_size, SEEK_SET) == -1) {
            retval = READSTAT_ERROR_SEEK;
            goto cleanup;
        }
        if (read(ctx->fd, page, ctx->page_size) < ctx->page_size) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        if (memcmp(&page[16], "XLSR", sizeof("XLSR")-1) == 0) {
            sas_catalog_augment_index(&page[16], ctx->page_size - 16, ctx);
        }
    }

    // Pass 2 -- look up the individual block pointers
    for (i=0; i<ctx->block_pointers_used; i++) {
        /* Each pointer packs the start page in the upper 32 bits and the
         * position within that page in the low 16 bits. */
        int start_page = ctx->block_pointers[i] >> 32;
        int start_page_pos = (ctx->block_pointers[i]) & 0xFFFF;

        /* NOTE(review): on -1 the helper is presumed to have stored the error
         * in retval through the out-pointer -- confirm. */
        int buffer_len = sas_catalog_block_size(start_page, start_page_pos, ctx, &retval);
        if (buffer_len == -1) {
            goto cleanup;
        } else if (buffer_len == 0) {
            continue;
        }

        /* Grow through a temporary so the old buffer is still freed in
         * cleanup if realloc() fails. */
        char *new_buffer = realloc(buffer, buffer_len);
        if (new_buffer == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            goto cleanup;
        }
        buffer = new_buffer;

        if ((retval = sas_catalog_read_block(buffer, buffer_len, start_page, start_page_pos, ctx)) != READSTAT_OK)
            goto cleanup;
        if ((retval = sas_catalog_parse_block(buffer, buffer_len, ctx)) != READSTAT_OK)
            goto cleanup;
    }

cleanup:
    free(page);
    free(buffer);
    sas_catalog_ctx_free(ctx);
    free(hinfo);

    return retval;
}