示例#1
0
readstat_error_t readstat_parse_dta(readstat_parser_t *parser, const char *filename, void *user_ctx) {
    readstat_error_t retval = READSTAT_OK;
    int i;
    size_t  record_len = 0;
    int fd = -1;
    char *buf = NULL;
    dta_header_t  header;
    dta_ctx_t    *ctx = NULL;
    char  str_buf[2048];
    char *long_string = NULL;
    size_t file_size = 0;

    if ((fd = readstat_open(filename)) == -1) {
        retval = READSTAT_ERROR_OPEN;
        goto cleanup;
    }

    char magic[4];
    if (read(fd, magic, 4) != 4) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    file_size = readstat_lseek(fd, 0, SEEK_END);
    if (file_size == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    if (readstat_lseek(fd, 0, SEEK_SET) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    if (strncmp(magic, "<sta", 4) == 0) {
        retval = dta_read_xmlish_preamble(fd, ctx, &header);
    } else {
        if (read(fd, &header, sizeof(header)) != sizeof(header)) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
    }

    if ((ctx = dta_ctx_init(header.nvar, header.nobs, header.byteorder, header.ds_format)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    ctx->user_ctx = user_ctx;
    ctx->file_size = file_size;
    ctx->progress_handler = parser->progress_handler;

    retval = dta_update_progress(fd, ctx);
    if (retval != READSTAT_OK)
        goto cleanup;
    
    if (parser->info_handler) {
        if (parser->info_handler(ctx->nobs, ctx->nvar, user_ctx)) {
            retval = READSTAT_ERROR_USER_ABORT;
            goto cleanup;
        }
    }
    
    if (ctx->file_is_xmlish) {
        uint16_t label_len = 0;
        unsigned char timestamp_len;

        if ((retval = dta_read_tag(fd, ctx, "<label>")) != READSTAT_OK) {
            goto cleanup;
        }
        
        if (ctx->data_label_len_len == 2) {
            if (read(fd, &label_len, sizeof(uint16_t)) != sizeof(uint16_t)) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
            label_len = ctx->machine_needs_byte_swap ? byteswap2(label_len) : label_len;
        } else if (ctx->data_label_len_len == 1) {
            unsigned char label_len_char;
            if (read(fd, &label_len_char, sizeof(unsigned char)) != sizeof(unsigned char)) {
                retval = READSTAT_ERROR_READ;
                goto cleanup;
            }
            label_len = label_len_char;
        }
        
        if (readstat_lseek(fd, label_len, SEEK_CUR) == -1) {
            retval = READSTAT_ERROR_SEEK;
            goto cleanup;
        }
        
        if ((retval = dta_read_tag(fd, ctx, "</label>")) != READSTAT_OK) {
            goto cleanup;
        }
        
        if ((retval = dta_read_tag(fd, ctx, "<timestamp>")) != READSTAT_OK) {
            goto cleanup;
        }
        
        if (read(fd, &timestamp_len, 1) != 1) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        
        if (readstat_lseek(fd, timestamp_len, SEEK_CUR) == -1) {
            retval = READSTAT_ERROR_SEEK;
            goto cleanup;
        }

        if ((retval = dta_read_tag(fd, ctx, "</timestamp>")) != READSTAT_OK) {
            goto cleanup;
        }
    } else {
        if (readstat_lseek(fd, ctx->data_label_len, SEEK_CUR) == -1) {
            retval = READSTAT_ERROR_SEEK;
            goto cleanup;
        }
        
        if (ctx->time_stamp_len) {
            if (readstat_lseek(fd, ctx->time_stamp_len, SEEK_CUR) == -1) {
                retval = READSTAT_ERROR_SEEK;
                goto cleanup;
            }
        }
    }
    
    if ((retval = dta_read_tag(fd, ctx, "</header>")) != READSTAT_OK) {
        goto cleanup;
    }

    if (dta_read_map(fd, ctx) != READSTAT_OK) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    if (dta_read_descriptors(fd, ctx) != READSTAT_OK) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    for (i=0; i<ctx->nvar; i++) {
        size_t      max_len;
        readstat_types_t type = dta_type_info(ctx->typlist[i], &max_len, ctx);

        record_len += max_len;

        if (type == READSTAT_TYPE_STRING)
            max_len++; /* might append NULL */

        if (parser->variable_handler) {
            readstat_variable_t *variable = dta_init_variable(ctx, i, type);

            const char *value_labels = NULL;

            if (ctx->lbllist[ctx->lbllist_entry_len*i])
                value_labels = &ctx->lbllist[ctx->lbllist_entry_len*i];

            int cb_retval = parser->variable_handler(i, variable, value_labels, user_ctx);

            free(variable);

            if (cb_retval) {
                retval = READSTAT_ERROR_USER_ABORT;
                goto cleanup;
            }
        }
    }

    if ((retval = dta_skip_expansion_fields(fd, ctx)) != READSTAT_OK) {
        goto cleanup;
    }
    
    if (record_len == 0) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    if ((retval = dta_read_tag(fd, ctx, "<data>")) != READSTAT_OK) {
        goto cleanup;
    }

    if ((retval = dta_update_progress(fd, ctx)) != READSTAT_OK) {
        goto cleanup;
    }

    if ((buf = malloc(record_len)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    for (i=0; i<ctx->nobs; i++) {
        if (read(fd, buf, record_len) != record_len) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        int j;
        off_t offset = 0;
        for (j=0; j<ctx->nvar; j++) {
            size_t max_len;
            readstat_value_t value;
            memset(&value, 0, sizeof(readstat_value_t));

            value.type = dta_type_info(ctx->typlist[j], &max_len, ctx);

            if (value.type == READSTAT_TYPE_STRING) {
                readstat_convert(str_buf, sizeof(str_buf), &buf[offset], max_len, ctx->converter);
                value.v.string_value = str_buf;
            } else if (value.type == READSTAT_TYPE_LONG_STRING) {
                uint32_t v, o;
                v = *((uint32_t *)&buf[offset]);
                o = *((uint32_t *)&buf[offset+4]);
                if (ctx->machine_needs_byte_swap) {
                    v = byteswap4(v);
                    o = byteswap4(o);
                }
                if (v > 0 && o > 0) {
                    off_t cur_pos = readstat_lseek(fd, 0, SEEK_CUR);
                    if (cur_pos == -1) {
                        retval = READSTAT_ERROR_SEEK;
                        goto cleanup;
                    }
                    retval = dta_read_long_string(fd, ctx, v, o, &long_string);
                    if (retval != READSTAT_OK) {
                        goto cleanup;
                    }
                    value.v.string_value = long_string;
                    if (readstat_lseek(fd, cur_pos, SEEK_SET) == -1) {
                        retval = READSTAT_ERROR_SEEK;
                        goto cleanup;
                    }
                }
            } else if (value.type == READSTAT_TYPE_CHAR) {
                char byte = buf[offset];
                if (ctx->machine_is_twos_complement) {
                    byte = ones_to_twos_complement1(byte);
                }
                if (byte > DTA_MAX_CHAR) {
                    value.is_system_missing = 1;
                    if (byte > DTA_MISSING_CHAR) {
                        value.tag = 'a' + (byte - DTA_MISSING_CHAR_A);
                    }
                }
                value.v.char_value = byte;
            } else if (value.type == READSTAT_TYPE_INT16) {
                int16_t num = *((int16_t *)&buf[offset]);
                if (ctx->machine_needs_byte_swap) {
                    num = byteswap2(num);
                }
                if (ctx->machine_is_twos_complement) {
                    num = ones_to_twos_complement2(num);
                }
                if (num > DTA_MAX_INT16) {
                    value.is_system_missing = 1;
                    if (num > DTA_MISSING_INT16) {
                        value.tag = 'a' + (num - DTA_MISSING_INT16_A);
                    }
                }
                value.v.i16_value = num;
            } else if (value.type == READSTAT_TYPE_INT32) {
                int32_t num = *((int32_t *)&buf[offset]);
                if (ctx->machine_needs_byte_swap) {
                    num = byteswap4(num);
                }
                if (ctx->machine_is_twos_complement) {
                    num = ones_to_twos_complement4(num);
                }
                if (num > DTA_MAX_INT32) {
                    value.is_system_missing = 1;
                    if (num > DTA_MISSING_INT32) {
                        value.tag = 'a' + (num - DTA_MISSING_INT32_A);
                    }
                }
                value.v.i32_value = num;
            } else if (value.type == READSTAT_TYPE_FLOAT) {
                uint32_t num = *((uint32_t *)&buf[offset]);
                float f_num = NAN;
                if (ctx->machine_needs_byte_swap) {
                    num = byteswap4(num);
                }
                if (num > DTA_MAX_FLOAT) {
                    value.is_system_missing = 1;
                    if (num > DTA_MISSING_FLOAT) {
                        value.tag = 'a' + ((num - DTA_MISSING_FLOAT_A) >> 11);
                    }
                } else {
示例#2
0
static readstat_error_t dta_handle_rows(dta_ctx_t *ctx) {
    readstat_io_t *io = ctx->io;
    char *buf = NULL;
    char  str_buf[2048];
    int i;
    readstat_error_t retval = READSTAT_OK;

    if ((buf = malloc(ctx->record_len)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    for (i=0; i<ctx->row_limit; i++) {
        if (io->read(buf, ctx->record_len, io->io_ctx) != ctx->record_len) {
            retval = READSTAT_ERROR_READ;
            goto cleanup;
        }
        int j;
        off_t offset = 0;
        for (j=0; j<ctx->nvar; j++) {
            size_t max_len;
            readstat_value_t value;
            memset(&value, 0, sizeof(readstat_value_t));

            value.type = dta_type_info(ctx->typlist[j], &max_len, ctx);

            if (value.type == READSTAT_TYPE_STRING) {
                readstat_convert(str_buf, sizeof(str_buf), &buf[offset], max_len, ctx->converter);
                value.v.string_value = str_buf;
            } else if (value.type == READSTAT_TYPE_STRING_REF) {
                dta_strl_t key;
                dta_interpret_strl_vo_bytes(ctx, (unsigned char *)&buf[offset], &key);

                dta_strl_t **found = bsearch(&key, ctx->strls, ctx->strls_count, sizeof(dta_strl_t *), &dta_compare_strls);

                if (found) {
                    value.v.string_value = (*found)->data;
                }
                value.type = READSTAT_TYPE_STRING;
            } else if (value.type == READSTAT_TYPE_INT8) {
                int8_t byte = buf[offset];
                if (ctx->machine_is_twos_complement) {
                    byte = ones_to_twos_complement1(byte);
                }
                if (byte > ctx->max_int8) {
                    if (ctx->supports_tagged_missing && byte > DTA_113_MISSING_INT8) {
                        value.tag = 'a' + (byte - DTA_113_MISSING_INT8_A);
                        value.is_tagged_missing = 1;
                    } else {
                        value.is_system_missing = 1;
                    }
                }
                value.v.i8_value = byte;
            } else if (value.type == READSTAT_TYPE_INT16) {
                int16_t num = *((int16_t *)&buf[offset]);
                if (ctx->machine_needs_byte_swap) {
                    num = byteswap2(num);
                }
                if (ctx->machine_is_twos_complement) {
                    num = ones_to_twos_complement2(num);
                }
                if (num > ctx->max_int16) {
                    if (ctx->supports_tagged_missing && num > DTA_113_MISSING_INT16) {
                        value.tag = 'a' + (num - DTA_113_MISSING_INT16_A);
                        value.is_tagged_missing = 1;
                    } else {
                        value.is_system_missing = 1;
                    }
                }
                value.v.i16_value = num;
            } else if (value.type == READSTAT_TYPE_INT32) {
                int32_t num = *((int32_t *)&buf[offset]);
                if (ctx->machine_needs_byte_swap) {
                    num = byteswap4(num);
                }
                if (ctx->machine_is_twos_complement) {
                    num = ones_to_twos_complement4(num);
                }
                if (num > ctx->max_int32) {
                    if (ctx->supports_tagged_missing && num > DTA_113_MISSING_INT32) {
                        value.tag = 'a' + (num - DTA_113_MISSING_INT32_A);
                        value.is_tagged_missing = 1;
                    } else {
                        value.is_system_missing = 1;
                    }
                }
                value.v.i32_value = num;
            } else if (value.type == READSTAT_TYPE_FLOAT) {
                int32_t num = *((int32_t *)&buf[offset]);
                float f_num = NAN;
                if (ctx->machine_needs_byte_swap) {
                    num = byteswap4(num);
                }
                if (num > ctx->max_float) {
                    if (ctx->supports_tagged_missing && num > DTA_113_MISSING_FLOAT) {
                        value.tag = 'a' + ((num - DTA_113_MISSING_FLOAT_A) >> 11);
                        value.is_tagged_missing = 1;
                    } else {
                        value.is_system_missing = 1;
                    }
                } else {