예제 #1
0
/*
 *  Count the rows that tokenize() can read from the file `f`.
 *
 *  f                       Stream handed to new_file_buffer().
 *  delimiter, quote,
 *  comment                 Forwarded unchanged to tokenize().
 *  allow_embedded_newline  Not referenced by this function; TRUE is always
 *                          passed to tokenize().
 *                          NOTE(review): presumably that TRUE occupies the
 *                          embedded-newline slot of tokenize() -- confirm
 *                          before wiring this parameter through.
 *
 *  Returns the number of non-NULL results produced by tokenize(), or -1 if
 *  new_file_buffer() returns NULL.
 *
 *  A NULL result from tokenize() simply ends the count.  tok_error_type is
 *  not inspected, so a tokenizer failure cannot be told apart from running
 *  out of rows here.
 */
int count_rows(FILE *f, char delimiter, char quote, char comment, int allow_embedded_newline)
{
    void *fb;
    int row_count = 0;
    int num_fields;
    char **result;
    char word_buffer[WORD_BUFFER_SIZE];
    int tok_error_type;

    fb = new_file_buffer(f, -1);
    if (fb == NULL) {
        return -1;
    }

    /*
     *  Only the row count matters, so each result is released right away.
     *  (The loop condition already guarantees result != NULL inside the
     *  body, so no further NULL check is needed there.)
     */
    while ((result = tokenize(fb, word_buffer, WORD_BUFFER_SIZE,
                              delimiter, quote, comment, &num_fields, TRUE, &tok_error_type)) != NULL) {
        free(result);
        ++row_count;
    }

    /* NOTE(review): RESTORE_INITIAL presumably rewinds `f` to where it was on entry -- confirm. */
    del_file_buffer(fb, RESTORE_INITIAL);

    return row_count;
}
예제 #2
0
/*
 *  Report how many fields tokenize() finds in the next row of `f`.
 *
 *  delimiter, quote and comment are forwarded unchanged to tokenize().
 *  allow_embedded_newline is not referenced; TRUE is always passed to
 *  tokenize().
 *
 *  Returns the field count stored by tokenize(), or -1 when either
 *  new_file_buffer() or tokenize() returns NULL.
 */
int count_fields(FILE *f, char delimiter, char quote, char comment, int allow_embedded_newline)
{
    char word_buffer[WORD_BUFFER_SIZE];
    int tok_error_type;
    int field_total;
    char **tokens;
    void *buffer = new_file_buffer(f, -1);

    if (buffer == NULL) {
        return -1;
    }

    tokens = tokenize(buffer, word_buffer, WORD_BUFFER_SIZE,
                      delimiter, quote, comment, &field_total, TRUE, &tok_error_type);
    if (tokens != NULL) {
        /* Only the count is needed; the tokens themselves are discarded. */
        free(tokens);
    }
    else {
        field_total = -1;
    }

    del_file_buffer(buffer, RESTORE_INITIAL);

    return field_total;
}
예제 #3
0
파일: file.cpp 프로젝트: richcole/RJL
/*
 * Create an object via new_object(), using whatever context_get() returns
 * for the name "File", and attach to it a buffer made by
 * new_file_buffer(cxt, file).
 *
 * Returns the new object.  No result is checked for NULL here.
 */
Object *new_file(Object *cxt, FILE *file) {
  Object *file_obj;

  file_obj = new_object(cxt, context_get(cxt, "File"));
  file_obj->buffer = (Buffer *) new_file_buffer(cxt, file);

  return file_obj;
}
예제 #4
0
/*
 *  Read up to *nrows rows of delimited text from `f`, convert the selected
 *  columns according to `fmt`, and store the converted values back to back
 *  in an output buffer.
 *
 *  f                Stream handed to new_file_buffer().
 *  nrows            In: maximum number of rows to read (also used to size
 *                   the buffer when it is allocated here).
 *                   Out: number of rows actually stored.
 *  fmt              Format description passed to calc_size() and
 *                   enumerate_fields().  The resulting type characters are
 *                   handled as: b/B/h/H/i/I/q/Q integers of 8..64 bits,
 *                   f/d float/double, c/z complex (two floats / two
 *                   doubles), U datetime stored as a 64-bit count of
 *                   microseconds; anything else is copied as a fixed-width
 *                   string with strncpy().
 *  delimiter, quote,
 *  comment          Forwarded unchanged to tokenize().
 *  sci, decimal     Forwarded unchanged to to_double() / to_complex().
 *  allow_embedded_newline
 *                   Not referenced; TRUE is always passed to tokenize().
 *  datetime_fmt     strptime() format for 'U' fields; NULL or "" selects
 *                   "%Y-%m-%d %H:%M:%S".
 *  tz_offset        Subtracted from the mktime() result before it is scaled
 *                   to microseconds.
 *  usecols,
 *  num_usecols      Column indices to extract.  Negative indices count from
 *                   the end; all are validated against the field count of
 *                   the first data row.
 *  skiprows         Number of leading rows to tokenize and discard.
 *  data_array       Caller-supplied output buffer, or NULL to have one
 *                   allocated here with malloc().
 *  p_error_type,
 *  p_error_lineno   Set to 0 on entry, then to an error code / location if
 *                   something goes wrong.  For an invalid column index the
 *                   "lineno" holds the offending position within usecols.
 *
 *  Returns a pointer to the START of the output buffer (data_array, or the
 *  buffer allocated here), or NULL on failure.  A buffer allocated here is
 *  released before NULL is returned.  If a row with a different number of
 *  fields is met, reading stops, *p_error_type is set to
 *  ERROR_CHANGED_NUMBER_OF_FIELDS, and the rows stored so far are still
 *  returned.  Having fewer rows than `skiprows` is not an error: *nrows is
 *  set to 0 and the buffer is returned.
 */
void *read_rows(FILE *f, int *nrows, char *fmt,
                char delimiter, char quote, char comment,
                char sci, char decimal,
                int allow_embedded_newline,
                char *datetime_fmt,
                int tz_offset,
                int32_t *usecols, int num_usecols,
                int skiprows,
                void *data_array,
                int *p_error_type, int *p_error_lineno)
{
    void *fb = NULL;
    char *data_start = NULL;    /* First byte of the output buffer. */
    char *data_ptr;             /* Write cursor into the output buffer. */
    int owns_data = 0;          /* Nonzero if data_start was malloc'ed here. */
    void *retval = NULL;        /* Stays NULL on every failure path. */
    int num_fields, current_num_fields;
    char **result = NULL;
    int fmt_nfields;
    field_type *ftypes;
    int size;
    int row_count;
    int j;
    int *valid_usecols = NULL;
    char word_buffer[WORD_BUFFER_SIZE];
    int tok_error_type;

    *p_error_type = 0;
    *p_error_lineno = 0;

    if (datetime_fmt == NULL || strlen(datetime_fmt) == 0) {
        datetime_fmt = "%Y-%m-%d %H:%M:%S";
    }

    size = (*nrows) * calc_size(fmt, &fmt_nfields);

    ftypes = enumerate_fields(fmt);  /* Released at `cleanup`. */
    if (ftypes == NULL) {
        /* Out of memory. */
        /* NOTE(review): the other out-of-memory paths below report
           ERROR_OUT_OF_MEMORY -- confirm which constant is intended. */
        *p_error_type = READ_ERROR_OUT_OF_MEMORY;
        return NULL;
    }

    if (data_array == NULL) {
        /* XXX The case where the buffer is allocated here is untested. */
        data_start = malloc(size);
        if (data_start == NULL) {
            *p_error_type = ERROR_OUT_OF_MEMORY;
            goto cleanup;
        }
        owns_data = 1;
    }
    else {
        data_start = data_array;
    }
    data_ptr = data_start;

    fb = new_file_buffer(f, -1);
    if (fb == NULL) {
        *p_error_type = ERROR_OUT_OF_MEMORY;
        goto cleanup;
    }

    /* XXX Check interaction of skiprows with comments. */
    while (skiprows > 0) {
        result = tokenize(fb, word_buffer, WORD_BUFFER_SIZE,
                          delimiter, quote, comment, &num_fields, TRUE, &tok_error_type);
        if (result == NULL) {
            break;
        }
        free(result);
        result = NULL;
        --skiprows;
    }

    if (skiprows > 0) {
        /* There were fewer rows in the file than skiprows. */
        /* This is not treated as an error. The result should be an empty array. */
        *nrows = 0;
        retval = data_start;
        goto cleanup;
    }

    /* XXX Assume *nrows > 0! */
    /*
     *  Read the first row to get the number of fields in the file.
     *  We'll then use this to pre-validate the values in usecols.
     *  (It might be easier to do this in the Python wrapper, but that
     *  would require refactoring the C interface a bit to expose more
     *  to Python.)
     */
    result = tokenize(fb, word_buffer, WORD_BUFFER_SIZE,
                      delimiter, quote, comment, &num_fields, TRUE, &tok_error_type);
    if (result == NULL) {
        *p_error_type = tok_error_type;
        *p_error_lineno = 1;
        goto cleanup;
    }

    valid_usecols = malloc(num_usecols * sizeof *valid_usecols);
    if (valid_usecols == NULL) {
        /* Out of memory. */
        *p_error_type = ERROR_OUT_OF_MEMORY;
        goto cleanup;
    }

    /*
     *  Validate the column indices in usecols, and put the validated
     *  (non-negative) column indices in valid_usecols.
     */
    for (j = 0; j < num_usecols; ++j) {
        int32_t k = usecols[j];

        if (k < -num_fields || k >= num_fields) {
            /* Invalid column index. */
            *p_error_type = ERROR_INVALID_COLUMN_INDEX;
            *p_error_lineno = j;  /* Abuse 'lineno' and put the bad column index there. */
            goto cleanup;
        }
        if (k < 0) {
            k += num_fields;
        }
        valid_usecols[j] = k;
    }

    /*
     *  Convert rows one at a time.  `result` already holds the first row;
     *  later rows are fetched in the loop condition.
     */
    current_num_fields = num_fields;
    row_count = 0;
    do {
        if (current_num_fields != num_fields) {
            *p_error_type = ERROR_CHANGED_NUMBER_OF_FIELDS;
            *p_error_lineno = line_number(fb);
            /* The offending row is still held in `result`; it is freed at `cleanup`. */
            break;
        }

        /* NOTE(review): ftypes is indexed by position in usecols; this
           assumes fmt describes at least num_usecols fields -- confirm. */
        for (j = 0; j < num_usecols; ++j) {

            int error;
            char typ = ftypes[j].typechar;
            /* k is the column index of the field in the file. */
            int k = valid_usecols[j];

            /* XXX Handle error != 0 in the following cases. */
            if (typ == 'b') {
                int8_t x = (int8_t) str_to_int64(result[k], INT8_MIN, INT8_MAX, &error);
                *(int8_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'B') {
                uint8_t x = (uint8_t) str_to_uint64(result[k], UINT8_MAX, &error);
                *(uint8_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'h') {
                int16_t x = (int16_t) str_to_int64(result[k], INT16_MIN, INT16_MAX, &error);
                *(int16_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'H') {
                uint16_t x = (uint16_t) str_to_uint64(result[k], UINT16_MAX, &error);
                *(uint16_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'i') {
                int32_t x = (int32_t) str_to_int64(result[k], INT32_MIN, INT32_MAX, &error);
                *(int32_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'I') {
                uint32_t x = (uint32_t) str_to_uint64(result[k], UINT32_MAX, &error);
                *(uint32_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'q') {
                int64_t x = (int64_t) str_to_int64(result[k], INT64_MIN, INT64_MAX, &error);
                *(int64_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'Q') {
                uint64_t x = (uint64_t) str_to_uint64(result[k], UINT64_MAX, &error);
                *(uint64_t *) data_ptr = x;
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'f' || typ == 'd') {
                /* Convert to float; empty or unparsable text becomes NaN. */
                double x;
                if ((strlen(result[k]) == 0) || !to_double(result[k], &x, sci, decimal)) {
                    /* XXX  Find the canonical platform-independent method to assign nan. */
                    x = 0.0 / 0.0;
                }
                if (typ == 'f') {
                    *(float *) data_ptr = (float) x;
                }
                else {
                    *(double *) data_ptr = x;
                }
                data_ptr += ftypes[j].size;
            }
            else if (typ == 'c' || typ == 'z') {
                /* Convert to complex; real part first, then imaginary part. */
                double x, y;
                if ((strlen(result[k]) == 0) || !to_complex(result[k], &x, &y, sci, decimal)) {
                    /* XXX  Find the canonical platform-independent method to assign nan. */
                    x = 0.0 / 0.0;
                    y = x;
                }
                if (typ == 'c') {
                    *(float *) data_ptr = (float) x;
                    data_ptr += ftypes[j].size / 2;
                    *(float *) data_ptr = (float) y;
                }
                else {
                    *(double *) data_ptr = x;
                    data_ptr += ftypes[j].size / 2;
                    *(double *) data_ptr = y;
                }
                data_ptr += ftypes[j].size / 2;
            }
            else if (typ == 'U') {
                /* Datetime64, microseconds.  Parse or mktime failure stores 0. */
                struct tm tm = {0};
                time_t t;

                if (strptime(result[k], datetime_fmt, &tm) == NULL) {
                    memset(data_ptr, 0, 8);
                }
                else {
                    tm.tm_isdst = -1;
                    t = mktime(&tm);
                    if (t == -1) {
                        memset(data_ptr, 0, 8);
                    }
                    else {
                        *(uint64_t *) data_ptr = (long long) (t - tz_offset) * 1000000L;
                    }
                }
                data_ptr += 8;
            }
            else {
                /*
                 *  String.  The destination is a fixed-width field, so
                 *  strncpy's zero padding is wanted; the field is not
                 *  NUL-terminated when the text fills it completely.
                 */
                strncpy(data_ptr, result[k], ftypes[j].size);
                data_ptr += ftypes[j].size;
            }
        }
        free(result);
        result = NULL;
        ++row_count;
    } while ((row_count < *nrows) && (result = tokenize(fb, word_buffer, WORD_BUFFER_SIZE,
                              delimiter, quote, comment, &current_num_fields, TRUE, &tok_error_type)) != NULL);

    *nrows = row_count;
    /* Hand back the start of the buffer, not the advanced write cursor. */
    retval = data_start;

cleanup:
    /* Every pointer below is either NULL or still owned by this function. */
    free(result);
    free(valid_usecols);
    free(ftypes);
    if (fb != NULL) {
        del_file_buffer(fb, RESTORE_FINAL);
    }
    if (retval == NULL && owns_data) {
        /* Failure: do not leak a buffer the caller never sees. */
        free(data_start);
    }

    return retval;
}