/**
 * @brief Unpacks an LDA model state into two word-topic count matrices and
 *        the per-topic totals.
 *
 * The storage of the bigint[] state is read as a row-major
 * voc_size x (topic_num + 1) matrix of 32-bit integers. Only the first
 * topic_num entries of every row are copied out; the trailing entry of each
 * row is skipped.
 *
 * @param args[0] The model state (bigint[])
 * @param args[1] The size of vocabulary
 * @param args[2] The number of topics
 * @return A tuple of three values: the counts of words [0, voc_size/2) as a
 *         2-D integer array, the counts of words [voc_size/2, voc_size) as a
 *         2-D integer array, and the per-topic sum of the counts over all
 *         words (as doubles)
 * @throws std::invalid_argument if voc_size or topic_num is not positive, or
 *         if the state holds fewer than voc_size * (topic_num + 1) 32-bit
 *         entries
 **/
AnyType lda_parse_model::run(AnyType & args){
    ArrayHandle<int64_t> state = args[0].getAs<ArrayHandle<int64_t> >();
    int32_t voc_size = args[1].getAs<int32_t>();
    int32_t topic_num = args[2].getAs<int32_t>();

    if(voc_size <= 0)
        throw std::invalid_argument("invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument("invalid argument - topic_num");

    // All size arithmetic is done in 64 bits so that it cannot wrap around
    // before it is compared against the real size of the state.
    const int64_t row_stride = static_cast<int64_t>(topic_num) + 1;
    const int64_t needed = static_cast<int64_t>(voc_size) * row_stride;
    const int64_t available = static_cast<int64_t>(state.size())
        * static_cast<int64_t>(sizeof(int64_t) / sizeof(int32_t));
    if(available < needed)
        throw std::invalid_argument(
            "invalid argument - state is too small for the given "
            "voc_size and topic_num");

    const int32_t *model = reinterpret_cast<const int32_t *>(state.ptr());
    const int32_t half = voc_size / 2;

    int dims[2] = {half, topic_num};
    int lbs[2] = {1, 1};
    MutableArrayHandle<int32_t> model_part1(
        madlib_construct_md_array(
            NULL, NULL, 2, dims, lbs, INT4TI.oid, INT4TI.len, INT4TI.byval,
            INT4TI.align));

    int dims2[2] = {voc_size - half, topic_num};
    MutableArrayHandle<int32_t> model_part2(
        madlib_construct_md_array(
            NULL, NULL, 2, dims2, lbs, INT4TI.oid, INT4TI.len, INT4TI.byval,
            INT4TI.align));

    MutableNativeColumnVector total_topic_counts(
        allocateArray<double>(topic_num));

    // First half of the vocabulary goes to model_part1. The totals are
    // accumulated in the same pass; words are still visited in ascending
    // order, so every per-topic sum is built in the original order.
    for(int32_t i = 0; i < half; i++){
        const int32_t *row = model + static_cast<int64_t>(i) * row_stride;
        for(int32_t j = 0; j < topic_num; j++){
            model_part1[i * topic_num + j] = row[j];
            total_topic_counts[j] += static_cast<double>(row[j]);
        }
    }

    // Remaining words go to model_part2, re-based so its first row is 0.
    for(int32_t i = half; i < voc_size; i++){
        const int32_t *row = model + static_cast<int64_t>(i) * row_stride;
        for(int32_t j = 0; j < topic_num; j++){
            model_part2[(i - half) * topic_num + j] = row[j];
            total_topic_counts[j] += static_cast<double>(row[j]);
        }
    }

    AnyType tuple;
    tuple << model_part1 << model_part2 << total_topic_counts;
    return tuple;
}
/**
 * @brief Builds the transpose of a 2-D double precision array.
 * @param args[0] The input matrix
 * @return A newly constructed 2-D array whose dimensions are those of the
 *         input swapped and whose element (j, i) equals input element (i, j)
 * @throws std::invalid_argument if the input does not have exactly two
 *         dimensions
 **/
AnyType matrix_mem_trans::run(AnyType & args)
{
    ArrayHandle<double> source = args[0].getAs<ArrayHandle<double> >();
    if (source.dims() != 2) {
        throw std::invalid_argument("invalid argument - 2-d array expected");
    }

    const int n_rows = static_cast<int>(source.sizeOfDim(0));
    const int n_cols = static_cast<int>(source.sizeOfDim(1));

    // The result has the two extents swapped; both lower bounds are 1.
    int out_dims[2] = {n_cols, n_rows};
    int lower_bounds[2] = {1, 1};
    MutableArrayHandle<double> result = madlib_construct_md_array(
        NULL, NULL, 2, out_dims, lower_bounds, FLOAT8TI.oid, FLOAT8TI.len,
        FLOAT8TI.byval, FLOAT8TI.align);

    const double *src = source.ptr();
    double *dst = result.ptr();
    for (int row = 0; row < n_rows; ++row) {
        // Walk one input row and scatter it down one output column.
        const double *src_row = src + row * n_cols;
        for (int col = 0; col < n_cols; ++col) {
            dst[col * n_rows + row] = src_row[col];
        }
    }

    return result;
}
/**
 * @brief Transposes a matrix stored as a 2-D bigint array
 * @param args[0] The input matrix
 * @return A new 2-D bigint array holding the transposed matrix
 * @throws std::domain_error if the input does not have exactly two dimensions
 **/
AnyType lda_transpose::run(AnyType & args)
{
    ArrayHandle<int64_t> matrix = args[0].getAs<ArrayHandle<int64_t> >();
    if(matrix.dims() != 2)
        throw std::domain_error("invalid dimension");

    const int32_t n_rows = static_cast<int32_t>(matrix.sizeOfDim(0));
    const int32_t n_cols = static_cast<int32_t>(matrix.sizeOfDim(1));

    // Output extents are the input extents swapped; lower bounds are 1.
    int out_dims[2] = {n_cols, n_rows};
    int lower_bounds[2] = {1, 1};
    MutableArrayHandle<int64_t> transposed(
        madlib_construct_md_array(
            NULL, NULL, 2, out_dims, lower_bounds, INT8TI.oid, INT8TI.len,
            INT8TI.byval, INT8TI.align));

    for(int32_t r = 0; r < n_rows; r++){
        for(int32_t c = 0; c < n_cols; c++){
            // Input element (r, c) lands at output element (c, r).
            transposed[c * n_rows + r] = matrix[r * n_cols + c];
        }
    }

    return transposed;
}
return r; }

/**
 * @brief Builds a row_dim x col_dim integer array filled with pseudo-random
 *        values.
 *
 * Every element is drand48() * 1000 truncated to an integer.
 *
 * @param args[0] The number of rows
 * @param args[1] The number of columns
 * @return A newly constructed 2-D integer array
 * @throws std::invalid_argument if row_dim or col_dim is smaller than 1
 **/
AnyType rand_block::run(AnyType & args)
{
    int row_dim = args[0].getAs<int>();
    int col_dim = args[1].getAs<int>();
    if (row_dim < 1 || col_dim < 1) {
        // Written as one literal: a backslash line continuation inside the
        // string would embed the source indentation in the message.
        throw std::invalid_argument(
            "invalid argument - row_dim and col_dim should be positive");
    }

    int dims[2] = {row_dim, col_dim};
    int lbs[2] = {1, 1};
    MutableArrayHandle<int> r = madlib_construct_md_array(
        NULL, NULL, 2, dims, lbs, INT4TI.oid, INT4TI.len, INT4TI.byval,
        INT4TI.align);

    int *cells = r.ptr();
    for (int i = 0; i < row_dim; i++){
        for(int j = 0; j < col_dim; j++){
            cells[i * col_dim + j] = static_cast<int>(drand48() * 1000);
        }
    }

    return r;
}

typedef struct __sr_ctx1{ const double * inarray; int32_t dim; int32_t maxcall; int32_t size;
/**
 * @brief This function is the sfunc for the aggregator computing the topic
 * counts. It scans the topic assignments in a document and updates the word
 * topic counts.
 *
 * The state is a bigint[] whose storage is used as a row-major
 * voc_size x (topic_num + 1) matrix of 32-bit integers. Entry
 * (word, topic) is the count for that pair; the extra entry at the end of
 * each row is set to 1 once a count of that word can no longer be
 * incremented (it has exceeded 2e9).
 *
 * @param args[0] The state variable, current topic counts
 * @param args[1] The unique words in the document
 * @param args[2] The counts of each unique word in the document
 * @param args[3] The topic assignments in the document
 * @param args[4] The size of vocabulary
 * @param args[5] The number of topics
 * @return The updated state; args[0] unchanged if any of args[1..3] is null
 * @throws std::invalid_argument if voc_size or topic_num is null or not
 *         positive, if the word/count/topic arrays are inconsistent or hold
 *         out-of-range values, if the model does not fit in an array, or if
 *         an incoming state is too small for the model
 **/
AnyType lda_count_topic_sfunc::run(AnyType & args)
{
    // Written as one literal: a backslash line continuation inside the
    // string would embed the source indentation in the message.
    if(args[4].isNull() || args[5].isNull())
        throw std::invalid_argument(
            "null parameter - voc_size and/or topic_num is null");
    if(args[1].isNull() || args[2].isNull() || args[3].isNull())
        return args[0];

    int32_t voc_size = args[4].getAs<int32_t>();
    int32_t topic_num = args[5].getAs<int32_t>();
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> topic_assignment
        = args[3].getAs<ArrayHandle<int32_t> >();
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch - words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");
    if(__min(topic_assignment) < 0 || __max(topic_assignment) >= topic_num)
        throw std::invalid_argument("invalid values in topics");
    if((size_t)__sum(counts) != topic_assignment.size())
        throw std::invalid_argument(
            "dimension mismatch - sum(counts) != topic_assignment.size()");

    // To store a voc_size x (topic_num + 1) int32 matrix in a bigint[] we
    // need padding if the number of entries is odd:
    // 1. when voc_size * (topic_num + 1) is (2n+1), this gives (n+1)
    // 2. when voc_size * (topic_num + 1) is (2n), this gives (n)
    // The arithmetic is done in 64 bits so the product cannot wrap around.
    const int64_t row_stride = static_cast<int64_t>(topic_num) + 1;
    const int64_t model_cells = static_cast<int64_t>(voc_size) * row_stride;
    const int64_t state_len = (model_cells + 1) / 2;

    MutableArrayHandle<int64_t> state(NULL);
    if(args[0].isNull()) {
        int dims[1] = {static_cast<int>(state_len)};
        // Reject lengths that do not survive the narrowing to int.
        if(static_cast<int64_t>(dims[0]) != state_len)
            throw std::invalid_argument(
                "invalid argument - voc_size x (topic_num + 1) is too large");
        int lbs[1] = {1};
        // The reason we use bigint[] is that integer[] has a limit on the
        // number of elements and thus cannot be larger than 500MB.
        state = madlib_construct_md_array(
            NULL, NULL, 1, dims, lbs, INT8TI.oid, INT8TI.len, INT8TI.byval,
            INT8TI.align);
    } else {
        state = args[0].getAs<MutableArrayHandle<int64_t> >();
        // The loop below writes anywhere inside the model; refuse a state
        // that cannot hold it instead of writing out of bounds.
        if(static_cast<int64_t>(state.size()) < state_len)
            throw std::invalid_argument(
                "invalid argument - state is too small for the given "
                "voc_size and topic_num");
    }
    int32_t *model = reinterpret_cast<int32_t *>(state.ptr());

    int32_t unique_word_count = static_cast<int32_t>(words.size());
    int32_t word_index = 0;
    for(int32_t i = 0; i < unique_word_count; i++){
        int32_t *row = model + static_cast<int64_t>(words[i]) * row_stride;
        for(int32_t j = 0; j < counts[i]; j++){
            int32_t topic = topic_assignment[word_index];
            if (row[topic] <= 2e9) {
                row[topic]++;
            } else {
                // Count is capped: raise the flag in the extra entry.
                row[topic_num] = 1;
            }
            word_index++;
        }
    }

    return state;
}
/**
 * @brief This function is the sfunc for the aggregator computing the topic
 * counts. It scans the topic assignments in a document and updates the word
 * topic counts.
 *
 * The state is a (voc_size + 1) x topic_num bigint matrix in row-major
 * order. Row `word` holds the counts of that word per topic; the extra row
 * at index voc_size accumulates the count of every word per topic.
 *
 * @param args[0] The state variable, current topic counts
 * @param args[1] The unique words in the document
 * @param args[2] The counts of each unique word in the document
 * @param args[3] The topic assignments in the document
 * @param args[4] The size of vocabulary
 * @param args[5] The number of topics
 * @return The updated state; args[0] unchanged if any of args[1..3] is null
 * @throws std::invalid_argument if voc_size or topic_num is null or not
 *         positive, if the word/count/topic arrays are inconsistent or hold
 *         out-of-range values, or if an incoming state is too small for the
 *         model
 **/
AnyType lda_count_topic_sfunc::run(AnyType & args)
{
    // Written as one literal: a backslash line continuation inside the
    // string would embed the source indentation in the message.
    if(args[4].isNull() || args[5].isNull())
        throw std::invalid_argument(
            "null parameter - voc_size and/or topic_num is null");
    if(args[1].isNull() || args[2].isNull() || args[3].isNull())
        return args[0];

    int32_t voc_size = args[4].getAs<int32_t>();
    int32_t topic_num = args[5].getAs<int32_t>();
    if(voc_size <= 0)
        throw std::invalid_argument(
            "invalid argument - voc_size");
    if(topic_num <= 0)
        throw std::invalid_argument(
            "invalid argument - topic_num");

    ArrayHandle<int32_t> words = args[1].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> counts = args[2].getAs<ArrayHandle<int32_t> >();
    ArrayHandle<int32_t> topic_assignment
        = args[3].getAs<ArrayHandle<int32_t> >();
    if(words.size() != counts.size())
        throw std::invalid_argument(
            "dimensions mismatch - words.size() != counts.size()");
    if(__min(words) < 0 || __max(words) >= voc_size)
        throw std::invalid_argument(
            "invalid values in words");
    if(__min(counts) <= 0)
        throw std::invalid_argument(
            "invalid values in counts");
    if(__min(topic_assignment) < 0 || __max(topic_assignment) >= topic_num)
        throw std::invalid_argument("invalid values in topics");
    if((size_t)__sum(counts) != topic_assignment.size())
        throw std::invalid_argument(
            "dimension mismatch - sum(counts) != topic_assignment.size()");

    MutableArrayHandle<int64_t> state(NULL);
    if(args[0].isNull()){
        int dims[2] = {voc_size + 1, topic_num};
        int lbs[2] = {1, 1};
        state = madlib_construct_md_array(
            NULL, NULL, 2, dims, lbs, INT8TI.oid, INT8TI.len, INT8TI.byval,
            INT8TI.align);
    } else {
        state = args[0].getAs<MutableArrayHandle<int64_t> >();
        // The loop below writes up to the last row of the matrix; refuse a
        // state that cannot hold it instead of writing out of bounds.
        const int64_t needed = (static_cast<int64_t>(voc_size) + 1)
            * static_cast<int64_t>(topic_num);
        if(static_cast<int64_t>(state.size()) < needed)
            throw std::invalid_argument(
                "invalid argument - state is too small for the given "
                "voc_size and topic_num");
    }

    // Offset of the extra row that holds the per-topic totals.
    const int32_t totals_offset = voc_size * topic_num;

    int32_t unique_word_count = static_cast<int32_t>(words.size());
    int32_t word_index = 0;
    for(int32_t i = 0; i < unique_word_count; i++){
        const int32_t word_offset = words[i] * topic_num;
        for(int32_t j = 0; j < counts[i]; j++){
            int32_t topic = topic_assignment[word_index];
            state[word_offset + topic]++;
            state[totals_offset + topic]++;
            word_index++;
        }
    }

    return state;
}