int read_csr_mat(const char *file_name, struct csr_mat_t *mat) { FILE *fp = fopen(file_name, "rb"); if (fp == NULL) { return -1; } fread(&mat->rows, sizeof(int), 1, fp); fread(&mat->cols, sizeof(int), 1, fp); fread(&mat->non_zeros, sizeof(INT64), 1, fp); mat->row_ptr = (DWORD*)numa_alloc((mat->rows + 1) * sizeof(DWORD)); mat->col_idx = (int*)numa_alloc(mat->non_zeros * sizeof(int)); mat->vals = (FLOAT*)numa_alloc(mat->non_zeros * sizeof(FLOAT)); fread(mat->row_ptr, sizeof(DWORD), mat->rows + 1, fp); fread(mat->col_idx, sizeof(int), mat->non_zeros, fp); fread(mat->vals, sizeof(FLOAT), mat->non_zeros, fp); printf("Row x Column: %d x %d\n", mat->rows, mat->cols); printf("Non-zero elements number: %ld\n", mat->non_zeros); return 0; }
int csr_reorder(struct csr_mat_t *csr, struct csr_mat_t *csr_re, int *reorder_map) { int *row_len = (int*)malloc(csr->rows * sizeof(int)); int i, j; for (i = 0; i < csr->rows; i++) { reorder_map[i] = i; row_len[i] = csr->row_ptr[i + 1] - csr->row_ptr[i]; } row_sort(row_len, reorder_map, csr->rows); csr_re->rows = csr->rows; csr_re->cols = csr->cols; csr_re->non_zeros = csr->non_zeros; csr_re->row_ptr = (DWORD*)numa_alloc((csr_re->rows + 1) * sizeof(DWORD)); csr_re->col_idx = (int*)numa_alloc(csr_re->non_zeros * sizeof(int)); csr_re->vals = (FLOAT*)numa_alloc(csr_re->non_zeros * sizeof(FLOAT)); int idx = 0; csr_re->row_ptr[0] = 0; for (i = 0; i < csr_re->rows; i++) { memcpy(csr_re->col_idx + idx, csr->col_idx + csr->row_ptr[reorder_map[i]], row_len[i] * sizeof(int)); memcpy(csr_re->vals + idx, csr->vals + csr->row_ptr[reorder_map[i]], row_len[i] * sizeof(FLOAT)); idx += row_len[i]; csr_re->row_ptr[i + 1] = idx; } free(row_len); return 0; }
/*
 * JNI entry point: request `capacity` bytes from numa_alloc() and return the
 * resulting address as a jlong.
 *
 * When the allocation fails a message is printed to stdout and the NULL
 * address (0) is returned; no Java exception is raised here.
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocMemory
  (JNIEnv *env, jobject obj, jlong capacity)
{
    void *region = numa_alloc((size_t) capacity);

    if (region == NULL) {
        printf("failed to allocate local memory\n");
    }

    return (jlong) region;
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    alloc
 * Signature: (I)Ljava/nio/ByteBuffer;
 *
 * Allocates `capacity` bytes with numa_alloc() and wraps them in a direct
 * ByteBuffer.
 *
 * Returns NULL (Java null) when the allocation fails, after printing a
 * message to stdout.  A NULL address is never passed on to
 * NewDirectByteBuffer, so the caller cannot receive a buffer backed by
 * address 0.
 */
JNIEXPORT jobject JNICALL Java_xerial_jnuma_NumaNative_alloc
  (JNIEnv *env, jobject obj, jint capacity)
{
    void* mem = numa_alloc((size_t) capacity);

    if (mem == NULL) {
        printf("failed to allocate local memory\n");
        return NULL;
    }

    return (*env)->NewDirectByteBuffer(env, mem, (jlong) capacity);
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocate
 * Signature: (J)J
 *
 * Requests `capacity` bytes from numa_alloc() and returns the address as a
 * jlong.  If the allocation fails, throwException() is invoked with code 11
 * (its meaning is defined by throwException, not visible here) and 0 is
 * returned.
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocate
  (JNIEnv *env, jobject obj, jlong capacity)
{
    void *region = numa_alloc((size_t) capacity);

    if (region == NULL) {
        throwException(env, obj, 11);
        return 0L;
    }

    return (jlong) region;
}
int csr_transpose(struct csr_mat_t *csr, struct csr_mat_t *csr_t) { csr_t->cols = csr->rows; csr_t->rows = csr->cols; csr_t->non_zeros = csr->non_zeros; csr_t->row_ptr = (DWORD*)numa_alloc((csr_t->rows + 1) * sizeof(DWORD)); csr_t->col_idx = (int*)numa_alloc(csr_t->non_zeros * sizeof(int)); csr_t->vals = (FLOAT*)numa_alloc(csr_t->non_zeros * sizeof(FLOAT)); memset(csr_t->row_ptr, 0, (csr_t->rows + 1) * sizeof(DWORD)); int i, j; for (i = 0; i < csr->rows; i++) { for (j = csr->row_ptr[i]; j < csr->row_ptr[i + 1]; j++) { csr_t->row_ptr[csr->col_idx[j] + 1]++; } } for (i = 1; i <= csr_t->rows; i++) { csr_t->row_ptr[i] += csr_t->row_ptr[i - 1]; } int *row_start = (int*)malloc(csr_t->rows * sizeof(int)); memcpy(row_start, csr_t->row_ptr, csr_t->rows * sizeof(int)); for (i = 0; i < csr->rows; i++) { for (j = csr->row_ptr[i]; j < csr->row_ptr[i + 1]; j++) { int row = row_start[csr->col_idx[j]]; csr_t->col_idx[row] = i; csr_t->vals[row] = csr->vals[j]; row_start[csr->col_idx[j]]++; } } free(row_start); return 0; }
/*
 * Split `csr` into `count` sub-matrices holding roughly equal numbers of
 * non-zeros and store them, with the split boundaries, in `csr_cont`.
 *
 *   dir == SPLIT_HORIZON  : cut between rows; part i owns source rows
 *                           [split_idx[i], split_idx[i + 1]).
 *   dir == SPLIT_VERTICAL : cut between columns; part i owns source columns
 *                           [split_idx[i], split_idx[i + 1]) and its column
 *                           indices are rebased to start at 0.
 *
 * For any other `dir` only dir/count and the two container arrays are set.
 * Always returns 0.  No allocation result is checked.
 */
int split_csr_lb_nz(struct csr_mat_t *csr, struct csr_cont_t *csr_cont, int count, split_dir_t dir) {
    int i, j;
    csr_cont->dir = dir;
    csr_cont->count = count;
    /* count + 1 boundaries delimit count parts. */
    csr_cont->split_idx = (int*)numa_alloc((count + 1) * sizeof(int));
    csr_cont->csrs = (struct csr_mat_t*)numa_alloc(count * sizeof(struct csr_mat_t));
    if (dir == SPLIT_HORIZON) {
        struct csr_mat_t *csrs = csr_cont->csrs;
        int *split_idx = csr_cont->split_idx;
        split_idx[0] = 0;
        /* Target number of non-zeros per part (integer division).
         * NOTE(review): avg_ele and split_val are int; a non_zeros value
         * above INT_MAX would not fit -- confirm expected sizes. */
        int avg_ele = csr->non_zeros / count, split_val;
        for (i = 1, j = 1; i < count; i++) {
            split_val = i * avg_ele;
            /* Advance to the first row boundary at or past the target. */
            while (csr->row_ptr[j] < split_val) {
                j++;
            }
            /* Step back one row if the previous boundary is closer.
             * NOTE(review): when avg_ele is 0, j can drop to 0 here and the
             * next iteration would read row_ptr[j - 1] with j == 0 -- confirm
             * that count never exceeds non_zeros. */
            if (csr->row_ptr[j] - split_val > split_val - csr->row_ptr[j - 1]) {
                j--;
            }
            split_idx[i] = j;
        }
        /* i == count here: the last boundary is the end of the matrix. */
        split_idx[i] = csr->rows;
        /* Offset of the current part's first non-zero in the source arrays. */
        int item_idx = 0;
        for (i = 0; i < count; i++) {
            csrs[i].rows = split_idx[i + 1] - split_idx[i];
            printf("csrs[%d].rows = %d\n", i, csrs[i].rows);
            csrs[i].cols = csr->cols;
            csrs[i].non_zeros = csr->row_ptr[split_idx[i + 1]] - csr->row_ptr[split_idx[i]];
            csrs[i].row_ptr = (DWORD*)numa_alloc((csrs[i].rows + 1) * sizeof(DWORD));
            csrs[i].col_idx = (int*)numa_alloc(csrs[i].non_zeros * sizeof(int));
            csrs[i].vals = (FLOAT*)numa_alloc(csrs[i].non_zeros * sizeof(FLOAT));
            /* Rows are contiguous in CSR, so each part is a straight slice. */
            memcpy(csrs[i].row_ptr, csr->row_ptr + split_idx[i], (csrs[i].rows + 1) * sizeof(DWORD));
            memcpy(csrs[i].col_idx, csr->col_idx + item_idx, csrs[i].non_zeros * sizeof(int));
            memcpy(csrs[i].vals, csr->vals + item_idx, csrs[i].non_zeros * sizeof(FLOAT));
            /* Rebase the copied row pointers so the part starts at offset 0. */
            for (j = 0; j <= csrs[i].rows; j++) {
                csrs[i].row_ptr[j] -= csr->row_ptr[split_idx[i]];
            }
            item_idx += csrs[i].non_zeros;
        }
    } else if (dir == SPLIT_VERTICAL) {
        struct csr_mat_t *csrs = csr_cont->csrs;
        int *split_idx = csr_cont->split_idx;
        split_idx[0] = 0;
        /* col_cnt[c + 1] accumulates the number of non-zeros in column c;
         * col_cnt[0] stays 0.  The array has cols + 1 entries. */
        INT64 *col_cnt = (INT64*)calloc((csr->cols + 1), sizeof(INT64));
        for (i = 0; i < csr->rows; i++) {
            for (j = csr->row_ptr[i]; j < csr->row_ptr[i + 1]; j++) {
                col_cnt[csr->col_idx[j] + 1]++;
            }
        }
        int avg_ele = csr->non_zeros / count, split_val;
        /* Running total of the column counts consumed so far. */
        int cur_col = 0;
        for (i = 1, j = 1; i < count; i++) {
            split_val = i * avg_ele;
            /* Consume column counts until the running total reaches the target. */
            do {
                cur_col += col_cnt[j++];
            } while (cur_col < split_val);
            /* NOTE(review): j was already post-incremented above, so
             * col_cnt[j] is the entry AFTER the one just added, and j can be
             * cols + 1 here, which is past the end of col_cnt.  Confirm
             * whether col_cnt[j - 1] was intended. */
            if (cur_col - split_val > split_val - (cur_col - col_cnt[j])) {
                cur_col -= col_cnt[j--];
            }
            split_idx[i] = j;
        }
        /* i == count here: the last boundary is the end of the matrix. */
        split_idx[i] = csr->cols;
        /* Convert per-column counts into a prefix sum so that
         * col_cnt[b] - col_cnt[a] is the non-zero count of columns [a, b). */
        for (i = 0; i < csr->cols; i++) {
            col_cnt[i + 1] += col_cnt[i];
        }
        for (i = 0; i < count; i++) {
            csrs[i].rows = csr->rows;
            csrs[i].cols = split_idx[i + 1] - split_idx[i];
            csrs[i].non_zeros = col_cnt[split_idx[i + 1]] - col_cnt[split_idx[i]];
            csrs[i].row_ptr = (DWORD*)numa_alloc((csrs[i].rows + 1) * sizeof(DWORD));
            csrs[i].col_idx = (int*)numa_alloc(csrs[i].non_zeros * sizeof(int));
            csrs[i].vals = (FLOAT*)numa_alloc(csrs[i].non_zeros * sizeof(FLOAT));
            memset(csrs[i].row_ptr, 0, (csrs[i].rows + 1) * sizeof(DWORD));
        }
        int col, k;
        for (i = 0; i < csr->rows; i++) {
            /* Every part's row i + 1 starts where its row i ended; the slot
             * is then used as the write cursor for row i. */
            for (j = 0; j < count; j++) {
                csrs[j].row_ptr[i + 1] = csrs[j].row_ptr[i];
            }
            for (j = csr->row_ptr[i]; j < csr->row_ptr[i + 1]; j++) {
                col = csr->col_idx[j];
                /* Linear search for the part whose column range contains col. */
                for (k = 0; k < count; k++) {
                    if (col < split_idx[k + 1]) {
                        break;
                    }
                }
                /* Append the entry to part k with a part-local column index. */
                csrs[k].col_idx[csrs[k].row_ptr[i + 1]] = csr->col_idx[j] - split_idx[k];
                csrs[k].vals[csrs[k].row_ptr[i + 1]] = csr->vals[j];
                csrs[k].row_ptr[i + 1]++;
            }
        }
        free(col_cnt);
    }
    return 0;
}
/* Obtain `bytes` bytes from numa_alloc() and return the result unchanged
 * (including NULL, if that is what numa_alloc() yields). */
void INTERNAL *qt_affinity_alloc(size_t bytes)
{                                      /*{{{ */
    void *region = numa_alloc(bytes);

    return region;
}                                      /*}}} */
int main(int argc, char *argv[]) { if (argc != 2) { fprintf(stderr, "usage: %s csr_matrix_file\n", argv[0]); exit(0); } int i, j, k; struct timespec start, end; int num_threads = 1; #pragma omp parallel { #pragma omp master { num_threads = omp_get_num_threads(); } } printf("Thread number: %d.\n", num_threads); #pragma omp parallel for for (i = 0; i < num_threads; i++) { int cpu = omp_get_thread_num(); thread_bind(cpu); } FILE *fp; struct csr_mat_t csr, csr_re, csr_t, csr_t_re, csr_t_t; struct blk_mat_t blk; struct csr_cont_t csr_h, csr_v; struct blk_cont_t blk_h, blk_t_h; read_csr_mat(argv[1], &csr); int rows = csr.rows; int cols = csr.cols; INT64 non_zeros = csr.non_zeros; csr_transpose(&csr, &csr_t); release_csr_mat(&csr); int *reorder_map = (int*)malloc(cols * sizeof(int)); csr_reorder(&csr_t, &csr_re, reorder_map); release_csr_mat(&csr_t); csr_transpose(&csr_re, &csr_t_t); release_csr_mat(&csr_re); split_csr_lb_nz(&csr_t_t, &csr_h, num_threads, SPLIT_HORIZON); release_csr_mat(&csr_t_t); csr_cont_to_blk_cont(&csr_h, &blk_h); release_csr_cont(&csr_h); printf("Notify: finished the preprocessing.\n"); FLOAT *x = (FLOAT*)numa_alloc(cols * sizeof(FLOAT)); FLOAT *y = (FLOAT*)numa_alloc(rows * sizeof(FLOAT)); for (i = 0; i < cols; i++) { x[i] = 1.0; } // warm up spmv_blks(&blk_h, x, y, NULL); printf("Notify: begin csr spmv.\n"); clock_gettime(CLOCK_MONOTONIC_RAW, &start); for (i = 0; i < LOOP_TIME; i++) { spmv_blks(&blk_h, x, y, NULL); } clock_gettime(CLOCK_MONOTONIC_RAW, &end); double time = get_sec(&start, &end) / LOOP_TIME; double gflops = 2.0 * non_zeros / time * 1e-9; printf("Notify: blk spmv time = %lfs, perf = %lf GFLOPS.\n", time, gflops); // result_file(y, rows); return 0; }
/**
 * Return a block of i_size bytes, typed as T*.
 *
 * The strategy is chosen at compile time by NUMA_BLOCK_ALLOCATOR_TYPE:
 *
 *  - 1 or 2: take the last entry of the list returned by
 *    getBlocksSameSize(i_size) if that list is non-empty; otherwise obtain
 *    fresh memory from numa_alloc() and pass it through first_touch_init().
 *    For type 1, getSingletonRef() is called first and, when
 *    SWEET_THREADING or SWEET_REXI_THREAD_PARALLEL_SUM is set, the list
 *    access is an OpenMP critical section.  For type 2 the list access is
 *    never guarded here.
 *  - 3: same list lookup (critical section when one of the threading macros
 *    is set), but fresh memory comes from posix_memalign() with 4096-byte
 *    alignment.
 *  - anything else: no list lookup; always posix_memalign() with 4096-byte
 *    alignment followed by first_touch_init().
 *
 * A posix_memalign() failure prints a message to std::cerr, asserts and
 * exits with status -1.
 * NOTE(review): in the type 1/2 path the numa_alloc() result is handed to
 * first_touch_init() without a NULL check -- confirm that is acceptable.
 */
static inline
T *alloc(
		std::size_t i_size		///< size of block
)
{
	T *data = nullptr;

#if NUMA_BLOCK_ALLOCATOR_TYPE == 1 || NUMA_BLOCK_ALLOCATOR_TYPE == 2

# if NUMA_BLOCK_ALLOCATOR_TYPE == 1
	// dummy call here to initialize this class as part of the singleton out of critical region
	getSingletonRef();

# if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
# pragma omp critical
# endif
# endif
	{
		// Try to reuse a previously stored block registered under this size.
		std::vector<void*>& block_list = getBlocksSameSize(i_size);

		if (block_list.size() > 0)
		{
			data = (T*)block_list.back();
			block_list.pop_back();
		}
	}

	if (data != nullptr)
		return data;

	// Nothing to reuse: allocate fresh memory and touch it.
	return (T*)first_touch_init(numa_alloc(i_size), i_size);

#elif NUMA_BLOCK_ALLOCATOR_TYPE == 3

#if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
# pragma omp critical
#endif
	{
		// Try to reuse a previously stored block registered under this size.
		std::vector<void*>& block_list = getBlocksSameSize(i_size);

		if (block_list.size() > 0)
		{
			data = (T*)block_list.back();
			block_list.pop_back();
		}
	}

	if (data != nullptr)
		return data;

	// Nothing to reuse: allocate a fresh block aligned to 4096 bytes.
	int retval = posix_memalign((void**)&data, 4096, i_size);
	if (retval != 0)
	{
		std::cerr << "Unable to allocate memory" << std::endl;
		assert(false);
		exit(-1);
	}

	first_touch_init(data, i_size);
	return data;

#else

	// No block list is consulted in this mode: always allocate a new block.
	// posix_memalign is thread safe
	// http://www.qnx.com/developers/docs/6.3.0SP3/neutrino/lib_ref/p/posix_memalign.html
	int retval = posix_memalign((void**)&data, 4096, i_size);
	if (retval != 0)
	{
		std::cerr << "Unable to allocate memory" << std::endl;
		assert(false);
		exit(-1);
	}

	first_touch_init(data, i_size);
	return data;

#endif
}