/** * @brief This function is the sfunc of an aggregator computing the * perplexity. * @param args[0] The current state * @param args[1] The unique words in the documents * @param args[2] The counts of each unique words * @param args[3] The topic counts in the document * @param args[4] The model (word topic counts and corpus topic * counts) * @param args[5] The Dirichlet parameter for per-document topic * multinomial, i.e. alpha * @param args[6] The Dirichlet parameter for per-topic word * multinomial, i.e. beta * @param args[7] The size of vocabulary * @param args[8] The number of topics * @return The updated state **/ AnyType lda_perplexity_sfunc::run(AnyType & args){ ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >(); ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >(); ArrayHandle<int32_t> topic_counts = args[3].getAs<ArrayHandle<int32_t> >(); double alpha = args[5].getAs<double>(); double beta = args[6].getAs<double>(); int32_t voc_size = args[7].getAs<int32_t>(); int32_t topic_num = args[8].getAs<int32_t>(); if(alpha <= 0) throw std::invalid_argument("invalid argument - alpha"); if(beta <= 0) throw std::invalid_argument("invalid argument - beta"); if(voc_size <= 0) throw std::invalid_argument( "invalid argument - voc_size"); if(topic_num <= 0) throw std::invalid_argument( "invalid argument - topic_num"); if(words.size() != counts.size()) throw std::invalid_argument( "dimensions mismatch: words.size() != counts.size()"); if(__min(words) < 0 || __max(words) >= voc_size) throw std::invalid_argument( "invalid values in words"); if(__min(counts) <= 0) throw std::invalid_argument( "invalid values in counts"); if(topic_counts.size() != (size_t)(topic_num)) throw std::invalid_argument( "invalid dimension - topic_counts.size() != topic_num"); if(__min(topic_counts, 0, topic_num) < 0) throw std::invalid_argument("invalid values in topic_counts"); MutableArrayHandle<int64_t> state(NULL); if(args[0].isNull()){ if(args[4].isNull()) throw 
std::invalid_argument("invalid argument - the model \ parameter should not be null for the first call"); ArrayHandle<int64_t> model = args[4].getAs<ArrayHandle<int64_t> >(); if(model.size() != (size_t)((voc_size + 1) * topic_num)) throw std::invalid_argument( "invalid dimension - model.size() != (voc_size + 1) * topic_num"); if(__min(model) < 0) throw std::invalid_argument("invalid topic counts in model"); state = madlib_construct_array(NULL, static_cast<int>(model.size()) + 1, INT8TI.oid, INT8TI.len, INT8TI.byval, INT8TI.align); memcpy(state.ptr(), model.ptr(), model.size() * sizeof(int64_t)); }else{
/**
 * @brief Builds an integer array of the requested length filled with
 *        pseudo-random values.
 *
 * Every element is drand48() scaled by 1000 and truncated to int.
 *
 * @param args[0] The number of elements to generate (must be at least 1)
 * @return An int4 array holding args[0] generated values
 **/
AnyType rand_vector::run(AnyType & args)
{
    const int dim = args[0].getAs<int>();
    if (dim <= 0) {
        throw std::invalid_argument("invalid argument - dim should be positive");
    }

    MutableArrayHandle<int> result = madlib_construct_array(
        NULL, dim, INT4TI.oid, INT4TI.len, INT4TI.byval, INT4TI.align);

    // Walk the freshly allocated buffer front to back, one draw per slot.
    int *cursor = result.ptr();
    int * const stop = cursor + dim;
    while (cursor != stop) {
        *cursor = static_cast<int>(drand48() * 1000);
        ++cursor;
    }

    return result;
}
/**
 * @brief Produces the next output row of the set-returning function.
 *
 * Row number curcall is read from the flat input array. Consecutive rows
 * start (dim + 1) elements apart, and only the first dim elements of a row
 * are copied into the returned int4 array.
 *
 * @param user_fctx The call context; it is used as an sr_ctx
 * @param is_last_call Output flag, set to true when curcall has reached
 *        maxcall and to false otherwise
 * @return The next row as an int4 array, or Null() when no rows are left
 **/
AnyType lda_unnest::SRF_next(void *user_fctx, bool *is_last_call)
{
    sr_ctx *fctx = static_cast<sr_ctx *>(user_fctx);

    // All rows have been handed out already.
    if (fctx->curcall == fctx->maxcall) {
        *is_last_call = true;
        return Null();
    }

    MutableArrayHandle<int32_t> row(
        madlib_construct_array(
            NULL, fctx->dim, INT4TI.oid, INT4TI.len, INT4TI.byval,
            INT4TI.align));

    int32_t *dst = row.ptr();
    for (int col = 0; col < fctx->dim; ++col) {
        // The input stride is dim + 1, so one trailing element per row is
        // skipped on purpose.
        dst[col] = fctx->inarray[fctx->curcall * (fctx->dim + 1) + col];
    }

    *is_last_call = false;
    fctx->curcall++;

    return row;
}
/**
 * @brief Produces the next output row of the set-returning function.
 *
 * Each call copies dim consecutive int64 values, starting at element
 * curcall * dim of the input array, into a new int8 array. maxcall counts
 * the rows still to be returned and is decremented on every produced row.
 *
 * @param user_fctx The call context; it is used as an sr_ctx
 * @param is_last_call Output flag, set to true when maxcall has dropped to
 *        zero and to false otherwise
 * @return The next row as an int8 array, or Null() when no rows are left
 **/
AnyType lda_unnest::SRF_next(void *user_fctx, bool *is_last_call)
{
    sr_ctx *fctx = static_cast<sr_ctx *>(user_fctx);

    // Nothing remains to be returned.
    if (fctx->maxcall == 0) {
        *is_last_call = true;
        return Null();
    }

    MutableArrayHandle<int64_t> row(
        madlib_construct_array(
            NULL, fctx->dim, INT8TI.oid, INT8TI.len, INT8TI.byval,
            INT8TI.align));

    // Rows are stored back to back, dim elements each.
    const void *src = fctx->inarray + fctx->curcall * fctx->dim;
    const size_t row_bytes = fctx->dim * sizeof(int64_t);
    memcpy(row.ptr(), src, row_bytes);

    // Advance to the following row and consume one pending call.
    fctx->maxcall--;
    fctx->curcall++;
    *is_last_call = false;

    return row;
}
/**
 * @brief Draws a random topic for every word position of a document and
 *        reports both the per-topic tallies and the individual draws.
 *
 * The returned array has topic_num + word_count elements: the first
 * topic_num elements hold how many words were given each topic, and the
 * remaining word_count elements hold the topic drawn for each word.
 *
 * @param args[0] The word count in the document (must be at least 1)
 * @param args[1] The number of topics (must be at least 1)
 * @result The topic counts followed by the topic assignments
 *         (length = topic_num + word_count)
 **/
AnyType lda_random_assign::run(AnyType & args)
{
    const int32_t word_count = args[0].getAs<int32_t>();
    const int32_t topic_num = args[1].getAs<int32_t>();

    if (word_count <= 0) {
        throw std::invalid_argument("invalid argument - word_count");
    }
    if (topic_num <= 0) {
        throw std::invalid_argument("invalid argument - topic_num");
    }

    MutableArrayHandle<int32_t> doc_topic(
        madlib_construct_array(
            NULL, topic_num + word_count, INT4TI.oid, INT4TI.len,
            INT4TI.byval, INT4TI.align));

    // Two views onto the same buffer: tallies first, assignments after.
    // NOTE(review): the tallies are incremented without being reset here, so
    // this presumably relies on madlib_construct_array returning zero-filled
    // storage when given NULL - confirm.
    int32_t *tally = doc_topic.ptr();
    int32_t *assignment = tally + topic_num;

    for (int32_t pos = 0; pos < word_count; ++pos) {
        const int32_t drawn = static_cast<int32_t>(random() % topic_num);
        ++tally[drawn];
        assignment[pos] = drawn;
    }

    return doc_topic;
}
/**
 * @brief This function is the sfunc of an aggregator computing the
 * perplexity.
 *
 * The state is an int64 array laid out as three consecutive regions:
 *   1. model64_size int64 slots holding a byte copy of the model, which is
 *      read as int32 entries arranged in voc_size rows of (topic_num + 1)
 *      entries (only the first topic_num entries of a row are read here);
 *   2. topic_num int64 slots holding, per topic, the sum of that topic's
 *      entries over all rows of the model;
 *   3. one trailing slot reinterpreted as a double, to which
 *      count * log(p(word | document)) is added for every unique word of
 *      every document processed.
 * On the first call (null state) the state is built from args[4]; on later
 * calls args[4] is not read and the incoming state is updated in place.
 *
 * @param args[0] The current state (null on the first call)
 * @param args[1] The unique words in the documents
 * @param args[2] The counts of each unique words
 * @param args[3] The topic counts in the document
 * @param args[4] The model (word topic counts), passed as an int64 array
 *                whose bytes are interpreted as int32 entries
 * @param args[5] The Dirichlet parameter for per-document topic
 *                multinomial, i.e. alpha
 * @param args[6] The Dirichlet parameter for per-topic word
 *                multinomial, i.e. beta
 * @param args[7] The size of vocabulary
 * @param args[8] The number of topics
 * @return The updated state
 * @throws std::invalid_argument if any argument fails validation
 **/
AnyType lda_perplexity_sfunc::run(AnyType & args){
    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> doc_topic_counts
        = args[3].getAs<ArrayHandle<int32_t> >();
    double alpha = args[5].getAs<double>();
    double beta = args[6].getAs<double>();
    int32_t voc_size = args[7].getAs<int32_t>();
    int32_t topic_num = args[8].getAs<int32_t>();

    if(alpha <= 0)
        throw std::invalid_argument("invalid argument - alpha");
    if(beta <= 0)
        throw std::invalid_argument("invalid argument - beta");
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch: words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");
    if(doc_topic_counts.size() != static_cast<size_t>(topic_num))
        throw std::invalid_argument(
            "invalid dimension - doc_topic_counts.size() != topic_num");
    if(__min(doc_topic_counts, 0, topic_num) < 0)
        throw std::invalid_argument("invalid values in doc_topic_counts");

    // Sizes and offsets are derived only after voc_size and topic_num are
    // known to be positive, and in size_t rather than int32_t, so that a
    // large voc_size * (topic_num + 1) cannot hit signed overflow.
    const size_t row_len = static_cast<size_t>(topic_num) + 1;
    // Number of int64 slots needed to hold voc_size * row_len int32 entries
    // (rounded up).
    const size_t model64_size =
        (static_cast<size_t>(voc_size) * row_len + 1)
        * sizeof(int32_t) / sizeof(int64_t);

    MutableArrayHandle<int64_t> state(NULL);
    if (args[0].isNull()) {
        ArrayHandle<int64_t> model64 = args[4].getAs<ArrayHandle<int64_t> >();

        if (model64.size() != model64_size) {
            std::stringstream ss;
            ss << "invalid dimension: model64.size() = " << model64.size();
            throw std::invalid_argument(ss.str());
        }
        if(__min(model64) < 0) {
            throw std::invalid_argument("invalid topic counts in model");
        }

        // model | topic_num per-topic totals | one double accumulator
        state = madlib_construct_array(NULL,
            static_cast<int>(model64.size())
                + topic_num
                + sizeof(double) / sizeof(int64_t),
            INT8TI.oid, INT8TI.len, INT8TI.byval, INT8TI.align);

        memcpy(state.ptr(), model64.ptr(), model64.size() * sizeof(int64_t));

        int32_t *_model = reinterpret_cast<int32_t *>(state.ptr());
        int64_t *_total_topic_counts
            = reinterpret_cast<int64_t *>(state.ptr() + model64.size());
        // Accumulate the per-topic totals once, so later calls can reuse
        // them from the state.
        for (int32_t i = 0; i < voc_size; i ++) {
            const int32_t *row = _model + static_cast<size_t>(i) * row_len;
            for (int32_t j = 0; j < topic_num; j ++) {
                _total_topic_counts[j] += row[j];
            }
        }
    } else {
        state = args[0].getAs<MutableArrayHandle<int64_t> >();
    }

    int32_t *model = reinterpret_cast<int32_t *>(state.ptr());
    int64_t *total_topic_counts
        = reinterpret_cast<int64_t *>(state.ptr() + model64_size);
    // The last int64 slot of the state stores the running double sum.
    double *perp
        = reinterpret_cast<double *>(state.ptr() + state.size() - 1);

    // Total number of word occurrences in this document; 64-bit so the sum
    // of int32 counts cannot overflow.
    int64_t n_d = 0;
    for(size_t i = 0; i < words.size(); i++){
        n_d += counts[i];
    }

    for(size_t i = 0; i < words.size(); i++){
        const int32_t *word_row
            = model + static_cast<size_t>(words[i]) * row_len;
        int32_t n_dw = counts[i];

        double sum_p = 0.0;
        for(int32_t z = 0; z < topic_num; z++){
            int32_t n_dz = doc_topic_counts[z];
            int32_t n_wz = word_row[z];
            int64_t n_z = total_topic_counts[z];
            sum_p += (static_cast<double>(n_wz) + beta) * (n_dz + alpha)
                    / (static_cast<double>(n_z) + voc_size * beta);
        }
        sum_p /= (static_cast<double>(n_d) + topic_num * alpha);

        *perp += n_dw * log(sum_p);
    }

    return state;
}