Exemplo n.º 1
0
/**
 * @brief Build the transpose of a 2-D double array.
 *
 * @param args[0]   The input array; must have exactly two dimensions
 * @return          A newly constructed 2-D array whose dimension sizes are
 *                  those of the input swapped, with element (j, i) set to
 *                  input element (i, j)
 * @throws std::invalid_argument if the input is not 2-dimensional
 **/
AnyType matrix_mem_trans::run(AnyType & args)
{
    ArrayHandle<double> input = args[0].getAs<ArrayHandle<double> >();

    if (input.dims() != 2) {
        throw std::invalid_argument(
            "invalid argument - 2-d array expected");
    }

    const int src_rows = static_cast<int>(input.sizeOfDim(0));
    const int src_cols = static_cast<int>(input.sizeOfDim(1));

    // The result has the input's dimension sizes swapped; both lower bounds
    // are 1.
    int out_dims[2] = {src_cols, src_rows};
    int out_lbs[2] = {1, 1};
    MutableArrayHandle<double> transposed = madlib_construct_md_array(
            NULL, NULL, 2, out_dims, out_lbs, FLOAT8TI.oid,
            FLOAT8TI.len, FLOAT8TI.byval, FLOAT8TI.align);

    const double *src = input.ptr();
    double *dst = transposed.ptr();

    // Both buffers are indexed as flat row-major storage: source element
    // (row, col) lands at destination element (col, row).
    for (int row = 0; row < src_rows; ++row) {
        const double *src_row = src + row * src_cols;
        for (int col = 0; col < src_cols; ++col) {
            dst[col * src_rows + row] = src_row[col];
        }
    }

    return transposed;
}
Exemplo n.º 2
0
/**
 * @brief This function is the sfunc of an aggregator computing the
 * perplexity.
 *
 * All per-row arguments are validated on every call. On the first call
 * (args[0] is null) the state is created as an int64 array holding a copy of
 * the model followed by one extra trailing element.
 *
 * @param args[0]   The current state (null on the first call)
 * @param args[1]   The unique words in the documents
 * @param args[2]   The counts of each unique words
 * @param args[3]   The topic counts in the document
 * @param args[4]   The model (word topic counts and corpus topic
 *                  counts); must not be null on the first call
 * @param args[5]   The Dirichlet parameter for per-document topic
 *                  multinomial, i.e. alpha
 * @param args[6]   The Dirichlet parameter for per-topic word
 *                  multinomial, i.e. beta
 * @param args[7]   The size of vocabulary
 * @param args[8]   The number of topics
 * @return          The updated state
 * @throws std::invalid_argument if any argument fails the checks below
 **/
AnyType lda_perplexity_sfunc::run(AnyType & args){
    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> topic_counts = args[3].getAs<ArrayHandle<int32_t> >();
    double alpha = args[5].getAs<double>();
    double beta = args[6].getAs<double>();
    int32_t voc_size = args[7].getAs<int32_t>();
    int32_t topic_num = args[8].getAs<int32_t>();

    // The scalar parameters must all be strictly positive.
    if(alpha <= 0)
        throw std::invalid_argument("invalid argument - alpha");
    if(beta <= 0)
        throw std::invalid_argument("invalid argument - beta");
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    // words and counts are parallel arrays: same length, word ids within
    // [0, voc_size), and every count strictly positive.
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch: words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");

    // topic_counts must hold exactly topic_num non-negative entries.
    if(topic_counts.size() != (size_t)(topic_num))
        throw std::invalid_argument(
            "invalid dimension - topic_counts.size() != topic_num");
    if(__min(topic_counts, 0, topic_num) < 0)
        throw std::invalid_argument("invalid values in topic_counts");

    MutableArrayHandle<int64_t> state(NULL);
    if(args[0].isNull()){
        // First call: there is no state yet, so build it from the model.
        if(args[4].isNull())
            throw std::invalid_argument("invalid argument - the model \
            parameter should not be null for the first call");
        ArrayHandle<int64_t> model = args[4].getAs<ArrayHandle<int64_t> >();

        // The model must hold (voc_size + 1) * topic_num non-negative counts.
        if(model.size() != (size_t)((voc_size + 1) * topic_num))
            throw std::invalid_argument(
                "invalid dimension - model.size() != (voc_size + 1) * topic_num");
        if(__min(model) < 0)
            throw std::invalid_argument("invalid topic counts in model");

        // The state is one element longer than the model.
        // NOTE(review): the extra trailing element is presumably the slot that
        // lda_perplexity_ffunc reads back as a double - confirm.
        state =  madlib_construct_array(NULL,
                                        static_cast<int>(model.size()) + 1,
                                        INT8TI.oid,
                                        INT8TI.len,
                                        INT8TI.byval,
                                        INT8TI.align);

        // Copy the model into the front of the state; the trailing element is
        // not written here.
        memcpy(state.ptr(), model.ptr(),  model.size() * sizeof(int64_t));
    }else{
Exemplo n.º 3
0
/**
 * @brief Split a packed model into two word-topic count matrices plus the
 * per-topic totals.
 *
 * The int64 state buffer is read as a sequence of int32 values laid out in
 * rows of (topic_num + 1) entries; only the first topic_num entries of each
 * row are used here.
 *
 * @param args[0]   The packed model state (int64 array)
 * @param args[1]   The size of vocabulary
 * @param args[2]   The number of topics
 * @return          A tuple of (1) an int32 matrix for the first voc_size/2
 *                  rows, (2) an int32 matrix for the remaining rows, and
 *                  (3) a double vector with the column sums over all rows
 **/
AnyType lda_parse_model::run(AnyType & args){
    ArrayHandle<int64_t> state = args[0].getAs<ArrayHandle<int64_t> >();
    int32_t voc_size = args[1].getAs<int32_t>();
    int32_t topic_num = args[2].getAs<int32_t>();

    const int32_t *model = reinterpret_cast<const int32_t *>(state.ptr());

    const int32_t split = voc_size / 2;     // first row of the second part
    const int32_t row_len = topic_num + 1;  // int32 entries per source row

    int lower_bounds[2] = {1, 1};

    // Part 1: rows [0, split).
    int first_dims[2] = {split, topic_num};
    MutableArrayHandle<int32_t> model_part1(
        madlib_construct_md_array(
            NULL, NULL, 2, first_dims, lower_bounds, INT4TI.oid, INT4TI.len,
            INT4TI.byval, INT4TI.align));

    for (int32_t word = 0; word < split; ++word) {
        for (int32_t topic = 0; topic < topic_num; ++topic) {
            model_part1[word * topic_num + topic] =
                model[word * row_len + topic];
        }
    }

    // Part 2: rows [split, voc_size), re-based so its first row is row 0.
    int second_dims[2] = {voc_size - split, topic_num};
    MutableArrayHandle<int32_t> model_part2(
        madlib_construct_md_array(
            NULL, NULL, 2, second_dims, lower_bounds, INT4TI.oid, INT4TI.len,
            INT4TI.byval, INT4TI.align));

    for (int32_t word = split; word < voc_size; ++word) {
        for (int32_t topic = 0; topic < topic_num; ++topic) {
            model_part2[(word - split) * topic_num + topic] =
                model[word * row_len + topic];
        }
    }

    // Per-topic totals over every row of the model.
    // NOTE(review): the accumulation assumes allocateArray returns
    // zero-initialised storage - confirm.
    MutableNativeColumnVector total_topic_counts(allocateArray<double>(topic_num));

    for (int word = 0; word < voc_size; ++word) {
        for (int topic = 0; topic < topic_num; ++topic) {
            total_topic_counts[topic] +=
                static_cast<double>(model[word * row_len + topic]);
        }
    }

    AnyType result;
    result << model_part1 << model_part2 << total_topic_counts;
    return result;
}
Exemplo n.º 4
0
/**
 * @brief Initialization step of the set-returning function (SRF) that
 * unnests a 2-D array into a set of 1-D arrays.
 *
 * @param args[0]   The 2-D int64 array to unnest
 * @return          A newly allocated sr_ctx recording the array data pointer,
 *                  the size of dimension 0 (maxcall), the size of dimension 1
 *                  (dim), and a call counter (curcall) starting at 0
 * @throws std::invalid_argument if the input is not 2-dimensional
 * @note The context is allocated with new and returned as a raw pointer;
 *       where it is released is not visible in this function.
 **/
void * lda_unnest::SRF_init(AnyType &args)
{
    ArrayHandle<int64_t> source = args[0].getAs<ArrayHandle<int64_t> >();
    if (source.dims() != 2) {
        throw std::invalid_argument("invalid dimension");
    }

    sr_ctx * context = new sr_ctx;
    context->curcall = 0;
    context->maxcall = static_cast<int32_t>(source.sizeOfDim(0));
    context->dim = static_cast<int32_t>(source.sizeOfDim(1));
    // Only the data pointer is stored; the array contents are not copied.
    context->inarray = source.ptr();

    return context;
}
Exemplo n.º 5
0
/**
 * @brief Construct a const matrix mapping on top of a double array handle.
 *
 * The base mapping is given the handle's data pointer (with constness cast
 * away), then the size of dimension 1, then the size of dimension 0. A copy
 * of the handle is kept in mMemoryHandle.
 *
 * @param inHandle  The array handle whose data is mapped
 *
 * NOTE(review): passing dimension 1 before dimension 0 presumably reconciles
 * the array's layout with the storage order of the matrix base class -
 * confirm against the Base definition.
 */
inline
HandleMap<const Matrix, ArrayHandle<double> >::HandleMap(
    const ArrayHandle<double>& inHandle)
  : Base(
        const_cast<double*>(inHandle.ptr()),
        inHandle.sizeOfDim(1),
        inHandle.sizeOfDim(0)),
    mMemoryHandle(inHandle)
{
}
Exemplo n.º 6
0
/**
 * @brief This function is the finalfunc of an aggregator computing the
 * perplexity.
 *
 * The last 8-byte element of the int64 state holds the bit pattern of a
 * double; that value is extracted and returned.
 *
 * @param args[0]   The global state
 * @return          The perplexity
 * @throws std::invalid_argument if the state has no elements
 **/
AnyType lda_perplexity_ffunc::run(AnyType & args){
    ArrayHandle<int64_t> state = args[0].getAs<ArrayHandle<int64_t> >();

    // Without this guard, size() - 1 would wrap around on an empty state and
    // the read below would be far out of bounds.
    if(state.size() == 0)
        throw std::invalid_argument("invalid argument - state is empty");

    // Copy the bytes instead of dereferencing a reinterpret_cast'ed pointer:
    // reading an int64_t object through a double lvalue violates the
    // strict-aliasing rule.
    double perp = 0.0;
    memcpy(&perp, state.ptr() + state.size() - 1, sizeof(double));
    return perp;
}
Exemplo n.º 7
0
/**
 * @brief Add up all elements of an int32 array - for parameter checking
 * @param ah    The array to sum
 * @return      The sum, accumulated in int32_t (0 for an empty array)
 * @note The caller will ensure that ah is always non-null.
 **/
static int32_t __sum(ArrayHandle<int32_t> ah){
    const int32_t * const first = ah.ptr();
    const int32_t * const last = first + ah.size();
    const int32_t zero = 0;
    return std::accumulate(first, last, zero);
}
Exemplo n.º 8
0
/**
 * @brief Get the max value of a slice of an array - for parameter checking
 * @param ah    The array to inspect
 * @param start Index of the first element of the slice
 * @param len   Number of elements in the slice; must be at least 1, because
 *              the result of std::max_element is dereferenced unconditionally
 * @return      The max value among ah[start] .. ah[start + len - 1]
 * @note The caller will ensure that ah is always non-null.
 **/
template<class T> static T __max(
    ArrayHandle<T> ah, size_t start, size_t len){
    const T * const first = ah.ptr() + start;
    const T * const last = first + len;
    const T * const largest = std::max_element(first, last);
    return *largest;
}
Exemplo n.º 9
0
/**
 * @brief This function learns the topics of words in a document and is the
 * main step of a Gibbs sampling iteration. The word topic counts and
 * corpus topic counts are passed to this function in the first call and
 * then carried over to the remaining calls through the user function
 * context (args.getUserFuncContext()) for efficiency.
 *
 * Layout used by this function:
 *  - doc_topic: the first topic_num entries are the per-topic counts of the
 *    document, followed by one topic assignment per word occurrence.
 *  - model/state: voc_size rows of topic_num word-topic counts, followed by
 *    one row of topic_num corpus topic counts.
 *
 * @param args[0]   The unique words in the documents
 * @param args[1]   The counts of each unique words
 * @param args[2]   The topic counts and topic assignments in the document
 * @param args[3]   The model (word topic counts and corpus topic
 *                  counts); only read when no cached state exists yet
 * @param args[4]   The Dirichlet parameter for per-document topic
 *                  multinomial, i.e. alpha
 * @param args[5]   The Dirichlet parameter for per-topic word
 *                  multinomial, i.e. beta
 * @param args[6]   The size of vocabulary
 * @param args[7]   The number of topics
 * @param args[8]   The number of iterations (=1:training, >1:prediction)
 * @return          The updated topic counts and topic assignments for
 *                  the document
 * @throws std::invalid_argument if any argument fails validation
 * @throws std::runtime_error if the cached state is unexpectedly null
 **/
AnyType lda_gibbs_sample::run(AnyType & args)
{
    ArrayHandle<int32_t> words = args[0].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[1].getAs<ArrayHandle<int32_t> >();
    MutableArrayHandle<int32_t> doc_topic = args[2].getAs<MutableArrayHandle<int32_t> >();
    double alpha = args[4].getAs<double>();
    double beta = args[5].getAs<double>();
    int32_t voc_size = args[6].getAs<int32_t>();
    int32_t topic_num = args[7].getAs<int32_t>();
    int32_t iter_num = args[8].getAs<int32_t>();

    // The scalar parameters must all be strictly positive.
    if(alpha <= 0)
        throw std::invalid_argument("invalid argument - alpha");
    if(beta <= 0)
        throw std::invalid_argument("invalid argument - beta");
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");
    if(iter_num <= 0)
        throw std::invalid_argument(
            "invalid argument - iter_num");

    // words and counts are parallel arrays: same length, word ids within
    // [0, voc_size), and every count strictly positive.
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch: words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");

    // doc_topic = topic_num topic counts followed by word_count assignments.
    // The expected length is computed in 64 bits so that
    // word_count + topic_num cannot overflow int32.
    int32_t word_count = __sum(counts);
    const size_t expected_doc_topic_size = static_cast<size_t>(
        static_cast<int64_t>(word_count) + static_cast<int64_t>(topic_num));
    if(doc_topic.size() != expected_doc_topic_size)
        throw std::invalid_argument(
            "invalid dimension - doc_topic.size() != word_count + topic_num");
    if(__min(doc_topic, 0, topic_num) < 0)
        throw std::invalid_argument("invalid values in topic_count");
    if(
        __min(doc_topic, topic_num, word_count) < 0 ||
        __max(doc_topic, topic_num, word_count) >= topic_num)
        throw std::invalid_argument( "invalid values in topic_assignment");

    // Offsets into the flattened model are computed in size_t: the int32
    // products voc_size * topic_num and (voc_size + 1) * topic_num can
    // overflow for large vocabularies, which is undefined behaviour.
    // voc_size and topic_num are known to be positive at this point.
    const size_t topics = static_cast<size_t>(topic_num);
    const size_t corpus_offset = static_cast<size_t>(voc_size) * topics;
    const size_t expected_model_size = corpus_offset + topics;

    if (!args.getUserFuncContext())
    {
        // First call: validate the model and cache a private, mutable copy
        // of it in the user function context.
        // NOTE: the line continuation in the message below embeds the next
        // line's leading spaces in the string; kept as-is so the message
        // text is unchanged.
        if(args[3].isNull())
            throw std::invalid_argument("invalid argument - the model \
            parameter should not be null for the first call");
        ArrayHandle<int64_t> model = args[3].getAs<ArrayHandle<int64_t> >();
        if(model.size() != expected_model_size)
            throw std::invalid_argument(
                "invalid dimension - model.size() != (voc_size + 1) * topic_num");
        if(__min(model) < 0)
            throw std::invalid_argument("invalid topic counts in model");

        int64_t * cached_model =
            static_cast<int64_t *>(
                MemoryContextAllocZero(
                    args.getCacheMemoryContext(),
                    model.size() * sizeof(int64_t)));
        memcpy(cached_model, model.ptr(), model.size() * sizeof(int64_t));
        args.setUserFuncContext(cached_model);
    }

    int64_t * state = static_cast<int64_t *>(args.getUserFuncContext());
    if(NULL == state){
        throw std::runtime_error("args.mSysInfo->user_fctx is null");
    }

    int32_t unique_word_count = static_cast<int32_t>(words.size());
    for(int32_t it = 0; it < iter_num; it++){
        // Topic assignments start right after the topic_num topic counts.
        int32_t word_index = topic_num;
        for(int32_t i = 0; i < unique_word_count; i++) {
            int32_t wordid = words[i];
            // Start of this word's row of topic counts in the model.
            const size_t word_offset = static_cast<size_t>(wordid) * topics;
            for(int32_t j = 0; j < counts[i]; j++){
                int32_t topic = doc_topic[word_index];
                int32_t retopic = __lda_gibbs_sample(
                    topic_num, topic, doc_topic.ptr(),
                    state + word_offset,
                    state + corpus_offset, alpha, beta);

                // Move this word occurrence from its old topic to the newly
                // sampled one in the document's counts.
                doc_topic[word_index] = retopic;
                doc_topic[topic]--;
                doc_topic[retopic]++;

                // The cached model is only updated when iter_num == 1
                // (training, per the parameter description above).
                if(iter_num == 1){
                    state[corpus_offset + topic]--;
                    state[corpus_offset + retopic]++;
                    state[word_offset + topic]--;
                    state[word_offset + retopic]++;
                }
                word_index++;
            }
        }
    }

    return doc_topic;
}