Exemplo n.º 1
0
/**
 * @brief This function is the sfunc of an aggregator computing the
 * perplexity.  
 * @param args[0]   The current state 
 * @param args[1]   The unique words in the documents
 * @param args[2]   The counts of each unique words
 * @param args[3]   The topic counts in the document
 * @param args[4]   The model (word topic counts and corpus topic
 *                  counts)
 * @param args[5]   The Dirichlet parameter for per-document topic
 *                  multinomial, i.e. alpha
 * @param args[6]   The Dirichlet parameter for per-topic word
 *                  multinomial, i.e. beta
 * @param args[7]   The size of vocabulary
 * @param args[8]   The number of topics
 * @return          The updated state 
 **/
AnyType lda_perplexity_sfunc::run(AnyType & args){
    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> topic_counts = args[3].getAs<ArrayHandle<int32_t> >();
    double alpha = args[5].getAs<double>();
    double beta = args[6].getAs<double>();
    int32_t voc_size = args[7].getAs<int32_t>();
    int32_t topic_num = args[8].getAs<int32_t>();

    if(alpha <= 0)
        throw std::invalid_argument("invalid argument - alpha");
    if(beta <= 0)
        throw std::invalid_argument("invalid argument - beta");
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch: words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");

    if(topic_counts.size() != (size_t)(topic_num))
        throw std::invalid_argument(
            "invalid dimension - topic_counts.size() != topic_num");
    if(__min(topic_counts, 0, topic_num) < 0)
        throw std::invalid_argument("invalid values in topic_counts");

    MutableArrayHandle<int64_t> state(NULL);
    if(args[0].isNull()){
        if(args[4].isNull())
            throw std::invalid_argument("invalid argument - the model \
            parameter should not be null for the first call");
        ArrayHandle<int64_t> model = args[4].getAs<ArrayHandle<int64_t> >();

        if(model.size() != (size_t)((voc_size + 1) * topic_num))
            throw std::invalid_argument(
                "invalid dimension - model.size() != (voc_size + 1) * topic_num");
        if(__min(model) < 0)
            throw std::invalid_argument("invalid topic counts in model");

        state =  madlib_construct_array(NULL,
                                        static_cast<int>(model.size()) + 1,
                                        INT8TI.oid,
                                        INT8TI.len,
                                        INT8TI.byval,
                                        INT8TI.align);

        memcpy(state.ptr(), model.ptr(),  model.size() * sizeof(int64_t));
    }else{
Exemplo n.º 2
0
/**
 * @brief Builds an integer array of the requested length and fills every
 * slot with a value derived from drand48().
 * @param args[0]   The number of elements to generate (must be positive)
 * @return          An int4 array whose entries are drand48() * 1000
 *                  truncated to int
 **/
AnyType rand_vector::run(AnyType & args)
{
    const int dim = args[0].getAs<int>();
    if (dim <= 0) {
        throw std::invalid_argument("invalid argument - dim should be positive");
    }

    MutableArrayHandle<int> result = madlib_construct_array(
            NULL, dim, INT4TI.oid, INT4TI.len, INT4TI.byval, INT4TI.align);

    // Walk the underlying buffer directly, one draw per element.
    int *slot = result.ptr();
    int *const stop = slot + dim;
    while (slot != stop) {
        *slot = static_cast<int>(drand48() * 1000);
        ++slot;
    }

    return result;
}
Exemplo n.º 3
0
/**
 * @brief Returns the next row produced by the set-returning function.
 *
 * Every call copies ctx->dim consecutive values of ctx->inarray into a newly
 * constructed int4 array. Consecutive rows start (ctx->dim + 1) elements
 * apart in the input, so the trailing element of each input row is never
 * copied.
 *
 * @param user_fctx     The SRF context, interpreted as an sr_ctx
 * @param is_last_call  Output flag: true once curcall has reached maxcall
 * @return              The next row, or Null() when no rows are left
 **/
AnyType lda_unnest::SRF_next(void *user_fctx, bool *is_last_call)
{
    sr_ctx *state = static_cast<sr_ctx *>(user_fctx);

    // All rows have been handed out already.
    if (state->curcall == state->maxcall) {
        *is_last_call = true;
        return Null();
    }

    MutableArrayHandle<int32_t> row(
        madlib_construct_array(
            NULL, state->dim, INT4TI.oid, INT4TI.len, INT4TI.byval,
            INT4TI.align));

    int col = 0;
    while (col < state->dim) {
        row[col] = state->inarray[state->curcall * (state->dim + 1) + col];
        ++col;
    }

    *is_last_call = false;
    state->curcall++;

    return row;
}
Exemplo n.º 4
0
/**
 * @brief Returns the next row produced by the set-returning function.
 *
 * Every call copies ctx->dim contiguous int64 values, starting at offset
 * ctx->curcall * ctx->dim of ctx->inarray, into a newly constructed int8
 * array. ctx->maxcall is decremented per row and acts as the count of rows
 * still to be returned.
 *
 * @param user_fctx     The SRF context, interpreted as an sr_ctx
 * @param is_last_call  Output flag: true once maxcall has dropped to zero
 * @return              The next row, or Null() when no rows are left
 **/
AnyType lda_unnest::SRF_next(void *user_fctx, bool *is_last_call)
{
    sr_ctx *state = static_cast<sr_ctx *>(user_fctx);

    // Nothing left to emit.
    if (0 == state->maxcall) {
        *is_last_call = true;
        return Null();
    }

    MutableArrayHandle<int64_t> row(
        madlib_construct_array(
            NULL, state->dim, INT8TI.oid, INT8TI.len, INT8TI.byval,
            INT8TI.align));

    // Rows are stored back to back, so one bulk copy moves the whole row.
    const size_t row_bytes = state->dim * sizeof(int64_t);
    memcpy(row.ptr(), state->inarray + state->curcall * state->dim, row_bytes);

    state->maxcall--;
    state->curcall++;
    *is_last_call = false;

    return row;
}
Exemplo n.º 5
0
/**
 * @brief Randomly assigns a topic to every word of a document and reports
 * both the per-topic counts and the individual assignments.
 * @param args[0]   The number of words in the document (must be >= 1)
 * @param args[1]   The number of topics (must be >= 1)
 * @result          An int4 array of length topic_num + word_count: the
 *                  first topic_num entries are the topic counts, the
 *                  remaining word_count entries are the topic chosen for
 *                  each word
 **/
AnyType lda_random_assign::run(AnyType & args)
{
    const int32_t word_count = args[0].getAs<int32_t>();
    const int32_t topic_num = args[1].getAs<int32_t>();

    if (word_count <= 0) {
        throw std::invalid_argument( "invalid argument - word_count");
    }
    if (topic_num <= 0) {
        throw std::invalid_argument( "invalid argument - topic_num");
    }

    MutableArrayHandle<int32_t> doc_topic(
        madlib_construct_array(
            NULL, topic_num + word_count, INT4TI.oid, INT4TI.len, INT4TI.byval,
            INT4TI.align));

    // Two views onto the same buffer: counts first, assignments after them.
    // NOTE(review): the counters are incremented without being cleared here,
    // so this assumes madlib_construct_array hands back zeroed storage.
    int32_t *topic_tally = doc_topic.ptr();
    int32_t *assignment = topic_tally + topic_num;

    for (int32_t word = 0; word != word_count; ++word) {
        const int32_t pick = static_cast<int32_t>(random() % topic_num);
        ++topic_tally[pick];
        assignment[word] = pick;
    }

    return doc_topic;
}
Exemplo n.º 6
0
/**
 * @brief This function is the sfunc of an aggregator computing the
 * perplexity.
 *
 * The state is an int8 array laid out as three consecutive regions:
 *   1. the model, copied verbatim from args[4]; it is read as int32 cells
 *      arranged in voc_size rows of (topic_num + 1) cells each,
 *   2. topic_num int64 totals, one per topic, obtained on the first call by
 *      summing the first topic_num cells of every model row,
 *   3. one trailing slot reinterpreted as a double that accumulates
 *      count * log(word probability) over all processed documents.
 *
 * @param args[0]   The current state (null on the first call)
 * @param args[1]   The unique words in the documents
 * @param args[2]   The counts of each unique word
 * @param args[3]   The topic counts in the document
 * @param args[4]   The model (word topic counts and corpus topic
 *                  counts); only read on the first call
 * @param args[5]   The Dirichlet parameter for per-document topic
 *                  multinomial, i.e. alpha
 * @param args[6]   The Dirichlet parameter for per-topic word
 *                  multinomial, i.e. beta
 * @param args[7]   The size of vocabulary
 * @param args[8]   The number of topics
 * @return          The updated state
 * @throws std::invalid_argument if any argument fails validation, if the
 *                  model is null on the first call, or if the model or the
 *                  incoming state does not have the size implied by
 *                  voc_size and topic_num
 **/
AnyType lda_perplexity_sfunc::run(AnyType & args){
    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> doc_topic_counts = args[3].getAs<ArrayHandle<int32_t> >();
    double alpha = args[5].getAs<double>();
    double beta = args[6].getAs<double>();
    int32_t voc_size = args[7].getAs<int32_t>();
    int32_t topic_num = args[8].getAs<int32_t>();

    if(alpha <= 0)
        throw std::invalid_argument("invalid argument - alpha");
    if(beta <= 0)
        throw std::invalid_argument("invalid argument - beta");
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    // Sizes are derived only after voc_size and topic_num have been
    // validated, and in size_t, so the products cannot overflow int32_t.
    const size_t topic_cnt = static_cast<size_t>(topic_num);
    const size_t row_stride = topic_cnt + 1;
    const size_t model_cells = static_cast<size_t>(voc_size) * row_stride;
    // Number of int64 slots holding (model_cells + 1) int32 cells; equal to
    // (model_cells + 1) * sizeof(int32_t) / sizeof(int64_t).
    const size_t model64_size =
        (model_cells + 1) / (sizeof(int64_t) / sizeof(int32_t));
    // Model slots + per-topic totals + one slot for the double accumulator.
    const size_t state_size =
        model64_size + topic_cnt + sizeof(double) / sizeof(int64_t);

    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch: words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");

    if(doc_topic_counts.size() != topic_cnt)
        throw std::invalid_argument(
            "invalid dimension - doc_topic_counts.size() != topic_num");
    if(__min(doc_topic_counts, 0, topic_num) < 0)
        throw std::invalid_argument("invalid values in doc_topic_counts");

    MutableArrayHandle<int64_t> state(NULL);
    if (args[0].isNull()) {
        // First call: the state is built from the model, so it must be there.
        if (args[4].isNull())
            throw std::invalid_argument(
                "invalid argument - the model parameter should not be null "
                "for the first call");
        ArrayHandle<int64_t> model64 = args[4].getAs<ArrayHandle<int64_t> >();

        if (model64.size() != model64_size) {
            std::stringstream ss;
            ss << "invalid dimension: model64.size() = " << model64.size();
            throw std::invalid_argument(ss.str());
        }
        if(__min(model64) < 0) {
            throw std::invalid_argument("invalid topic counts in model");
        }

        state =  madlib_construct_array(NULL,
                                        static_cast<int>(state_size),
                                        INT8TI.oid,
                                        INT8TI.len,
                                        INT8TI.byval,
                                        INT8TI.align);

        memcpy(state.ptr(), model64.ptr(), model64.size() * sizeof(int64_t));
        int32_t *_model = reinterpret_cast<int32_t *>(state.ptr());
        int64_t *_total_topic_counts = reinterpret_cast<int64_t *>(state.ptr() + model64_size);
        // NOTE(review): the totals are accumulated without being cleared, so
        // this assumes madlib_construct_array returns zeroed storage.
        for (size_t i = 0; i < static_cast<size_t>(voc_size); i ++) {
            for (size_t j = 0; j < topic_cnt; j ++) {
                _total_topic_counts[j] += _model[i * row_stride + j];
            }
        }
    } else {
        state = args[0].getAs<MutableArrayHandle<int64_t> >();
        // The pointer arithmetic below trusts this layout; reject a state
        // whose size does not match instead of reading out of bounds.
        if (state.size() != state_size) {
            std::stringstream ss;
            ss << "invalid dimension: state.size() = " << state.size();
            throw std::invalid_argument(ss.str());
        }
    }

    int32_t *model = reinterpret_cast<int32_t *>(state.ptr());
    int64_t *total_topic_counts = reinterpret_cast<int64_t *>(state.ptr() + model64_size);
    double *perp = reinterpret_cast<double *>(state.ptr() + state.size() - 1);

    // Total number of word occurrences in the document; accumulated in
    // 64 bits so that many large counts cannot overflow.
    int64_t n_d = 0;
    for(size_t i = 0; i < words.size(); i++){
        n_d += counts[i];
    }
    const double doc_norm = static_cast<double>(n_d) + topic_num * alpha;

    for(size_t i = 0; i < words.size(); i++){
        int32_t w = words[i];
        int32_t n_dw = counts[i];
        // w was validated to lie in [0, voc_size), so the row is in range.
        const int32_t *word_row = model + static_cast<size_t>(w) * row_stride;

        double sum_p = 0.0;
        for(int32_t z = 0; z < topic_num; z++){
                int32_t n_dz = doc_topic_counts[z];
                int32_t n_wz = word_row[z];
                int64_t n_z = total_topic_counts[z];
                sum_p += (static_cast<double>(n_wz) + beta) * (n_dz + alpha)
                            / (static_cast<double>(n_z) + voc_size * beta);
        }
        sum_p /= doc_norm;

        *perp += n_dw * log(sum_p);
    }

    return state;
}