コード例 #1
0
/**
 * @brief Unpacks a word-topic count model held in a bigint[] buffer into two
 * int4 matrices plus a vector of per-topic totals.
 *
 * The payload of args[0] is read as a row-major matrix of int32 values with
 * (topic_num + 1) entries per word; only the first topic_num entries of each
 * row are copied or summed here.
 *
 * @param args[0]   The packed model (bigint[], reinterpreted as int32)
 * @param args[1]   The size of vocabulary
 * @param args[2]   The number of topics
 * @return          A tuple of: the counts for words [0, voc_size/2), the
 *                  counts for words [voc_size/2, voc_size), and the
 *                  per-topic sums over all words
 **/
AnyType lda_parse_model::run(AnyType & args){
    ArrayHandle<int64_t> packed_state = args[0].getAs<ArrayHandle<int64_t> >();
    const int32_t voc_size = args[1].getAs<int32_t>();
    const int32_t topic_num = args[2].getAs<int32_t>();

    // View the bigint[] payload as a flat sequence of 32-bit counters.
    const int32_t *packed_counts =
        reinterpret_cast<const int32_t *>(packed_state.ptr());

    // Words below this index go to the first output, the rest to the second.
    const int32_t half = voc_size / 2;
    int lower_bounds[2] = {1, 1};

    int first_dims[2] = {half, topic_num};
    MutableArrayHandle<int32_t> first_half(
        madlib_construct_md_array(
            NULL, NULL, 2, first_dims, lower_bounds, INT4TI.oid, INT4TI.len,
            INT4TI.byval, INT4TI.align));

    for (int32_t word = 0; word < half; ++word) {
        for (int32_t topic = 0; topic < topic_num; ++topic) {
            first_half[word * topic_num + topic] =
                packed_counts[word * (topic_num + 1) + topic];
        }
    }

    int second_dims[2] = {voc_size - half, topic_num};
    MutableArrayHandle<int32_t> second_half(
        madlib_construct_md_array(
            NULL, NULL, 2, second_dims, lower_bounds, INT4TI.oid, INT4TI.len,
            INT4TI.byval, INT4TI.align));

    for (int32_t word = half; word < voc_size; ++word) {
        for (int32_t topic = 0; topic < topic_num; ++topic) {
            second_half[(word - half) * topic_num + topic] =
                packed_counts[word * (topic_num + 1) + topic];
        }
    }

    MutableNativeColumnVector topic_totals(allocateArray<double>(topic_num));

    // Sum every word's count into its topic's running total.
    for (int32_t word = 0; word < voc_size; ++word) {
        for (int32_t topic = 0; topic < topic_num; ++topic) {
            topic_totals[topic] += static_cast<double>(
                packed_counts[word * (topic_num + 1) + topic]);
        }
    }

    AnyType result;
    result << first_half << second_half << topic_totals;
    return result;
}
コード例 #2
0
/**
 * @brief Returns the transpose of a matrix stored as a 2-D array of double
 * @param args[0]   The input matrix
 * @return          A newly constructed array with the two dimensions swapped
 * @throws std::invalid_argument if the input does not have exactly 2
 *                  dimensions
 **/
AnyType matrix_mem_trans::run(AnyType & args)
{
    ArrayHandle<double> input = args[0].getAs<ArrayHandle<double> >();

    if (input.dims() != 2) {
        throw std::invalid_argument(
            "invalid argument - 2-d array expected");
    }

    const int n_rows = static_cast<int>(input.sizeOfDim(0));
    const int n_cols = static_cast<int>(input.sizeOfDim(1));

    // The output has the input's dimensions in reverse order.
    int out_dims[2] = {n_cols, n_rows};
    int out_lbs[2] = {1, 1};
    MutableArrayHandle<double> output = madlib_construct_md_array(
            NULL, NULL, 2, out_dims, out_lbs, FLOAT8TI.oid,
            FLOAT8TI.len, FLOAT8TI.byval, FLOAT8TI.align);

    // Input element (row, col) is written to output element (col, row).
    for (int row = 0; row < n_rows; ++row) {
        for (int col = 0; col < n_cols; ++col) {
            output[col * n_rows + row] = input[row * n_cols + col];
        }
    }
    return output;
}
コード例 #3
0
/**
 * @brief Transposes a matrix stored as a 2-D bigint array
 * @param args[0]   The input matrix
 * @return          The transposed matrix
 * @throws std::domain_error if the input does not have exactly 2 dimensions
 **/
AnyType lda_transpose::run(AnyType & args)
{
    ArrayHandle<int64_t> source = args[0].getAs<ArrayHandle<int64_t> >();
    if (source.dims() != 2) {
        throw std::domain_error("invalid dimension");
    }

    const int32_t n_rows = static_cast<int32_t>(source.sizeOfDim(0));
    const int32_t n_cols = static_cast<int32_t>(source.sizeOfDim(1));

    // The result has the input's dimensions in reverse order.
    int out_dims[2] = {n_cols, n_rows};
    int out_lbs[2] = {1, 1};
    MutableArrayHandle<int64_t> result(
        madlib_construct_md_array(
            NULL, NULL, 2, out_dims, out_lbs, INT8TI.oid, INT8TI.len,
            INT8TI.byval, INT8TI.align));

    // Walk the input row by row; element (r, c) lands at (c, r) in the result.
    for (int32_t r = 0; r < n_rows; ++r) {
        for (int32_t c = 0; c < n_cols; ++c) {
            result[c * n_rows + r] = source[r * n_cols + c];
        }
    }

    return result;
}
コード例 #4
0
    return r;
}

/**
 * @brief Builds a row_dim x col_dim int4 matrix filled with pseudo-random
 * values
 * @param args[0]   Number of rows; must be at least 1
 * @param args[1]   Number of columns; must be at least 1
 * @return          A 2-D array in which every entry is drand48() * 1000
 *                  converted to int
 * @throws std::invalid_argument if either dimension is less than 1
 **/
AnyType rand_block::run(AnyType & args)
{
    int row_dim = args[0].getAs<int>();
    int col_dim = args[1].getAs<int>();
    if (row_dim < 1 || col_dim < 1) {
        // Adjacent literals are used instead of a backslash continuation:
        // a continuation inside a string literal pulls the next line's
        // indentation into the message text.
        throw std::invalid_argument(
            "invalid argument - row_dim and col_dim "
            "should be positive");
    }

    int dims[2] = {row_dim, col_dim};
    int lbs[2] = {1, 1};
    MutableArrayHandle<int> r = madlib_construct_md_array(
            NULL, NULL, 2, dims, lbs, INT4TI.oid,
            INT4TI.len, INT4TI.byval, INT4TI.align);

    // Fill in row-major order.
    for (int i = 0; i < row_dim; i++){
        for(int j = 0; j < col_dim; j++){
            *(r.ptr() + i * col_dim + j) = static_cast<int>(drand48() * 1000);
        }
    }
    return r;
}

typedef struct __sr_ctx1{
    const double * inarray;
    int32_t dim;
    int32_t maxcall;
    int32_t size;
コード例 #5
0
/**
 * @brief This function is the sfunc for the aggregator computing the topic
 * counts. It scans the topic assignments in a document and updates the word
 * topic counts.
 *
 * The state is a bigint[] whose payload is accessed as a row-major
 * voc_size x (topic_num + 1) matrix of int32 counters. Column topic_num of a
 * word's row is set to 1 when one of that word's counters has passed the
 * ceiling and an increment had to be dropped.
 *
 * @param args[0]   The state variable, current topic counts (a new state is
 *                  constructed when this is NULL)
 * @param args[1]   The unique words in the document
 * @param args[2]   The counts of each unique word in the document
 * @param args[3]   The topic assignments in the document
 * @param args[4]   The size of vocabulary
 * @param args[5]   The number of topics
 * @return          The updated state; args[0] is returned untouched when any
 *                  of args[1], args[2] or args[3] is NULL
 * @throws std::invalid_argument if voc_size or topic_num is NULL or not
 *                  positive, or if the document arrays fail validation
 **/
AnyType lda_count_topic_sfunc::run(AnyType & args)
{
    // Adjacent-literal-free single line: a backslash continuation inside the
    // literal would embed the next line's indentation in the message.
    if(args[4].isNull() || args[5].isNull())
        throw std::invalid_argument(
            "null parameter - voc_size and/or topic_num is null");

    // Nothing to count for this row; hand the state back as it came in.
    if(args[1].isNull() || args[2].isNull() || args[3].isNull())
        return args[0];

    int32_t voc_size = args[4].getAs<int32_t>();
    int32_t topic_num = args[5].getAs<int32_t>();
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> topic_assignment = args[3].getAs<ArrayHandle<int32_t> >();
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch - words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");
    if(__min(topic_assignment) < 0 || __max(topic_assignment) >= topic_num)
        throw std::invalid_argument("invalid values in topics");
    if(static_cast<size_t>(__sum(counts)) != topic_assignment.size())
        throw std::invalid_argument(
            "dimension mismatch - sum(counts) != topic_assignment.size()");

    MutableArrayHandle<int64_t> state(NULL);
    if(args[0].isNull()) {
        // to store a voc_size x (topic_num+1) integer matrix in
        // bigint[] (the +1 is for a flag of ceiling the count),
        // we need padding if the size is odd.
        // 1. when voc_size * (topic_num + 1) is (2n+1), gives (n+1)
        // 2. when voc_size * (topic_num + 1) is (2n), gives (n)
        int dims[1] = {static_cast<int>( (voc_size * (topic_num + 1) + 1) * sizeof(int32_t) / sizeof(int64_t) )};
        int lbs[1] = {1};
        // the reason we use bigint[] because integer[] has limit on number of
        // elements and thus cannot be larger than 500MB
        state = madlib_construct_md_array(
            NULL, NULL, 1, dims, lbs, INT8TI.oid, INT8TI.len, INT8TI.byval,
            INT8TI.align);
    } else {
        state = args[0].getAs<MutableArrayHandle<int64_t> >();
    }
    // Either way, the bigint payload is addressed as int32 counters.
    int32_t *model = reinterpret_cast<int32_t *>(state.ptr());

    // A counter is only incremented while it is at or below this value, so it
    // can never overflow int32_t.
    const int32_t count_ceiling = 2000000000;
    const int32_t row_width = topic_num + 1;

    int32_t unique_word_count = static_cast<int32_t>(words.size());
    int32_t word_index = 0;
    for(int32_t i = 0; i < unique_word_count; i++){
        // wordid was validated above to lie in [0, voc_size).
        int32_t *row = model + words[i] * row_width;
        for(int32_t j = 0; j < counts[i]; j++){
            int32_t topic = topic_assignment[word_index];
            if (row[topic] <= count_ceiling) {
                row[topic]++;
            } else {
                // Record in the extra column that this word hit the ceiling.
                row[topic_num] = 1;
            }
            word_index++;
        }
    }
    return state;
}
コード例 #6
0
ファイル: lda.cpp プロジェクト: adirastogi/madlib
/**
 * @brief This function is the sfunc for the aggregator computing the topic
 * counts. It scans the topic assignments in a document and updates the word
 * topic counts.
 *
 * The state is a (voc_size + 1) x topic_num bigint matrix in row-major
 * order. Rows 0 .. voc_size-1 hold the per-word topic counts; the extra row
 * at index voc_size is incremented for every assignment and so accumulates
 * the per-topic totals.
 *
 * @param args[0]   The state variable, current topic counts (a new state is
 *                  constructed when this is NULL)
 * @param args[1]   The unique words in the document
 * @param args[2]   The counts of each unique word in the document
 * @param args[3]   The topic assignments in the document
 * @param args[4]   The size of vocabulary
 * @param args[5]   The number of topics
 * @return          The updated state; args[0] is returned untouched when any
 *                  of args[1], args[2] or args[3] is NULL
 * @throws std::invalid_argument if voc_size or topic_num is NULL or not
 *                  positive, or if the document arrays fail validation
 **/
AnyType lda_count_topic_sfunc::run(AnyType & args)
{
    // Kept on one line: a backslash continuation inside the literal would
    // embed the next line's indentation in the message.
    if(args[4].isNull() || args[5].isNull())
        throw std::invalid_argument(
            "null parameter - voc_size and/or topic_num is null");

    // Nothing to count for this row; hand the state back as it came in.
    if(args[1].isNull() || args[2].isNull() || args[3].isNull())
        return args[0];

    int32_t voc_size = args[4].getAs<int32_t>();
    int32_t topic_num = args[5].getAs<int32_t>();
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> topic_assignment = args[3].getAs<ArrayHandle<int32_t> >();
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch - words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");
    if(__min(topic_assignment) < 0 || __max(topic_assignment) >= topic_num)
        throw std::invalid_argument("invalid values in topics");
    if(static_cast<size_t>(__sum(counts)) != topic_assignment.size())
        throw std::invalid_argument(
            "dimension mismatch - sum(counts) != topic_assignment.size()");

    MutableArrayHandle<int64_t> state(NULL);
    if(args[0].isNull()){
        // One row per word plus a final row for the per-topic totals.
        int dims[2] = {voc_size + 1, topic_num};
        int lbs[2] = {1, 1};
        state = madlib_construct_md_array(
            NULL, NULL, 2, dims, lbs, INT8TI.oid, INT8TI.len, INT8TI.byval,
            INT8TI.align);
    } else {
        state = args[0].getAs<MutableArrayHandle<int64_t> >();
    }

    int32_t unique_word_count = static_cast<int32_t>(words.size());
    int32_t word_index = 0;
    for(int32_t i = 0; i < unique_word_count; i++){
        int32_t wordid = words[i];
        // counts[i] consecutive entries of topic_assignment belong to word i.
        for(int32_t j = 0; j < counts[i]; j++){
            int32_t topic = topic_assignment[word_index];
            state[wordid * topic_num + topic]++;
            // Row voc_size keeps the running total for each topic.
            state[voc_size * topic_num + topic]++;
            word_index++;
        }
    }

    return state;
}