/*
 * Barrier implemented as a reduction followed by a broadcast.
 *
 * Every caller first takes part in mp_reduce(). Afterwards the thread whose
 * id equals get_sequentializer() calls mp_send_ab(), while every other
 * thread calls mp_receive_forward(0).
 *
 * measurement: optional out-parameter, written with bench_tsc() right after
 *              the reduction.
 *              NOTE(review): it is only written when QRM_DBG_ENABLED is
 *              defined - confirm that this is intended.
 *
 * With QRM_DBG_ENABLED the barrier counter _num_barrier is incremented on
 * entry and used for sanity checks on both phases.
 */
void mp_barrier(cycles_t *measurement)
{
    coreid_t self = get_core_id();

#ifdef QRM_DBG_ENABLED
    ++_num_barrier;
    uint32_t _num_barrier_recv = _num_barrier;
#endif

    debug_printfff(DBG__REDUCE, "barrier enter #%d\n", _num_barrier);

    /* Phase 1: reduction ------------------------------------------- */
#ifdef QRM_DBG_ENABLED
    uint32_t _tmp = mp_reduce(_num_barrier);

    /* Sanity check, evaluated on the sequentializer only */
    if (self == get_sequentializer()) {
        assert (_tmp == get_num_threads()*_num_barrier);
    }

    if (measurement) {
        *measurement = bench_tsc();
    }
#else
    mp_reduce(_num_barrier);
#endif

    /* Phase 2: broadcast ------------------------------------------- */
    if (self != get_sequentializer()) {
#ifdef QRM_DBG_ENABLED
        _num_barrier_recv = mp_receive_forward(0);
#else
        mp_receive_forward(0);
#endif
    } else {
        mp_send_ab(_num_barrier);
    }

#ifdef QRM_DBG_ENABLED
    /* The received value has to match the local barrier counter */
    if (_num_barrier_recv != _num_barrier) {
        debug_printf("ASSERTION fail %d != %d\n", _num_barrier_recv, _num_barrier);
    }
    assert (_num_barrier_recv == _num_barrier);

    /*
     * Optional shared-memory barrier to make absolutely sure that everybody
     * finished the barrier before leaving. This simplifies debugging: the
     * program gets stuck if barriers are broken, rather than some threads
     * (wrongly) continuing and causing problems somewhere else.
     */
#if 0 // Enable separately
    debug_printfff(DBG_REDUCE, "finished barrier .. waiting for others\n");
    shl_barrier_shm(get_num_threads());
#endif
#endif

    debug_printfff(DBG__REDUCE, "barrier complete #%d\n", _num_barrier);
}
static void _convolution(const my_Dvector& vec1, const my_Dvector& vec2, my_Dvector& conv, int init, ThreadPool& pool) { // To do the convolution I trim the beginning and ending points if they are zero (1e-30); int min1 (0), max1 (vec1.size()); _trim_vec(vec1, min1, max1); int min2 (0), max2 (vec2.size()); _trim_vec(vec2, min2, max2); int ini = max(min1+min2-init,0); int fin = min(int(conv.size()), max1 + max2 - 1); unsigned int nr_th = get_num_threads(); unsigned int nr_job = max((fin-ini)/100, min(int(nr_th), fin-ini)); my_Ivector lims (get_bounds(ini, fin, nr_job)); std::vector< std::future<int> > results; //lambda function auto func = [&](my_Dvector& v, int com, int ter) { for (int ii=com;ii<ter;++ii){ const int delta = max(ii+init-min2-max1,0); for (int j=min2+delta,k=(ii+init-min2-delta);j<max2 && k>=min1;++j,--k){ v[ii] += vec1[k]*vec2[j]; } } return 1; }; // for (unsigned int i=0;i<nr_job;++i) results.emplace_back(pool.enqueue( func, ref(conv), lims[i], lims[i+1])); for(auto && result: results) result.get(); }
/// Convolution of vec1 with vec2 returning a vector with the size of vec1.
///
/// Output element ii is the sum of vec1[k]*vec2[j] over all j, k with
/// j + k == ii + vec2.size()/2 that are valid indices of both inputs.
/// The output range is split with get_bounds() and one task per thread
/// (as reported by get_num_threads()) is submitted to `pool`.
///
/// @param vec1  first operand; determines the size of the result
/// @param vec2  second operand
/// @param pool  thread pool executing the partial sums
/// @return the accumulated result vector
my_Dvector convolution_same_orig(const my_Dvector& vec1, const my_Dvector& vec2, ThreadPool& pool)
{
    my_Dvector conv (vec1.size(), 0.0);
    int init = vec2.size()/2;
    unsigned int nr_th = get_num_threads();
    my_Ivector lims (get_bounds(0, conv.size()));
    std::vector< std::future<int> > pending;

    // Worker computing the output rows [first, last).
    auto worker = [&](my_Dvector& out, int first, int last)
    {
        for (int row = first; row < last; ++row) {
            int j = 0;
            int k = row + init;
            // Walk vec2 upwards and vec1 downwards until either runs out.
            while (j < vec2.size() && k >= 0) {
                if (k < vec1.size()) {
                    out[row] += vec1[k]*vec2[j];
                }
                ++j;
                --k;
            }
        }
        return 1;
    };

    for (unsigned int t = 0; t < nr_th; ++t) {
        pending.emplace_back(pool.enqueue(worker, ref(conv), lims[t], lims[t+1]));
    }

    // Wait for all tasks before handing the result back.
    for (auto && task : pending) {
        task.get();
    }

    return conv;
}
uint64_t parallel_nth_prime(int64_t n, uint64_t start) { ParallelPrimeSieve pps; pps.setSieveSize(get_sieve_size()); pps.setNumThreads(get_num_threads()); return pps.nthPrime(n, start); }
uint64_t parallel_count_sextuplets(uint64_t start, uint64_t stop) { ParallelPrimeSieve pps; pps.setSieveSize(get_sieve_size()); pps.setNumThreads(get_num_threads()); return pps.countSextuplets(start, stop); }
/*
 * Return the thread count from the first backend enabled at compile time.
 * The order of the checks below is the selection priority; if none of the
 * listed backends is enabled, get_num_threads() is used.
 */
unsigned Extrae_get_num_threads (void)
{
	unsigned nthreads;

#if defined(OMP_SUPPORT) && !defined(OMPT_INSTRUMENTATION)
	nthreads = omp_get_num_threads();
#elif defined(SMPSS_SUPPORT)
	nthreads = css_get_max_threads();
#elif defined(NANOS_SUPPORT)
	nthreads = get_num_threads();
#elif defined(PTHREAD_SUPPORT)
	nthreads = Backend_getNumberOfThreads();
#elif defined(UPC_SUPPORT)
	nthreads = GetNumUPCthreads();
#else
	nthreads = get_num_threads();
#endif

	return nthreads;
}
/*
 * JNI entry point: returns the value of get_num_threads().
 * The JNI environment and class arguments are not used.
 */
JNIEXPORT jint JNICALL Java_es_bsc_cepbatools_extrae_Wrapper_GetNumThreads(
	JNIEnv *env, jclass jc)
{
	jint nthreads;

	UNREFERENCED(env);
	UNREFERENCED(jc);

	nthreads = get_num_threads();
	return nthreads;
}
/* Releases one level of the (recursive) write lock on all_memory_areas:
 * a nested acquisition only decrements the recursion counter, the
 * outermost one actually unlocks.
 */
void
memcache_unlock(void)
{
    /* It is ok to be asked for locks before all_memory_areas is allocated
     * (during heap init, before we can allocate it); no lock is needed then.
     */
    ASSERT(all_memory_areas != NULL ||
           get_num_threads() <= 1 /*must be only DR thread*/);
    if (all_memory_areas == NULL)
        return;

    if (all_memory_areas_recursion <= 0) {
        /* outermost level: really drop the lock */
        write_unlock(&all_memory_areas->lock);
        return;
    }

    /* nested level: we must still be the owner */
    ASSERT_OWN_WRITE_LOCK(true, &all_memory_areas->lock);
    all_memory_areas_recursion--;
}
/**
 * Index of the calling thread within `threads`.
 *
 * When more than one thread is configured, the calling pthread is compared
 * against the pthread id of every entry in `threads` and the index of the
 * first match is returned. In all other cases (single thread, or no match)
 * the result is 0.
 */
unsigned int Trick::Executive::get_process_id() {

    if ( get_num_threads() <= 1 ) {
        return(0) ;
    }

    const pthread_t self_id = pthread_self() ;
    for ( unsigned int idx = 0 ; idx < threads.size() ; idx++ ) {
        if ( pthread_equal(self_id, threads[idx]->get_pthread_id()) ) {
            return(idx) ;
        }
    }

    return(0) ;
}
/* HACK: provides a recursive write lock for internal and external use.
 * FIXME: this code is copied from dynamo_vm_areas_{un}lock(); the
 * duplication should be eliminated.
 *
 * If the caller already owns the write lock only the recursion counter is
 * bumped, otherwise the lock is acquired.
 */
void
memcache_lock(void)
{
    /* It is ok to be asked for locks before all_memory_areas is allocated
     * (during heap init, before we can allocate it); no lock is needed then.
     */
    ASSERT(all_memory_areas != NULL ||
           get_num_threads() <= 1 /* must be only DR thread */);
    if (all_memory_areas == NULL)
        return;

    if (!self_owns_write_lock(&all_memory_areas->lock)) {
        write_lock(&all_memory_areas->lock);
        return;
    }

    all_memory_areas_recursion++;
    /* we have a 5-deep path:
     *   global_heap_alloc | heap_create_unit | get_guarded_real_memory |
     *   heap_low_on_memory | release_guarded_real_memory
     */
    ASSERT_CURIOSITY(all_memory_areas_recursion <= 4);
}
/*
 * Populate the Java-side thread list and load-object list of this_obj from
 * the process handle ph.
 *
 * For every thread index of ph, the Java method getThreadForThreadId_ID is
 * invoked with the LWP id and the result is appended to the list stored in
 * the field threadList_ID. For every library index, createLoadObject_ID is
 * invoked with (name, 0, base) and the result is appended to the list stored
 * in the field loadObjectList_ID.
 *
 * The function returns early (via CHECK_EXCEPTION) as soon as a JNI call
 * leaves an exception pending.
 */
static void fillThreadsAndLoadObjects(JNIEnv* env, jobject this_obj, struct ps_prochandle* ph) {
  int n = 0, i = 0;

  // add threads
  n = get_num_threads(ph);
  for (i = 0; i < n; i++) {
    jobject thread;
    jobject threadList;
    lwpid_t lwpid;

    lwpid = get_lwp_id(ph, i);
    thread = (*env)->CallObjectMethod(env, this_obj, getThreadForThreadId_ID,
                                      (jlong)lwpid);
    CHECK_EXCEPTION;
    threadList = (*env)->GetObjectField(env, this_obj, threadList_ID);
    CHECK_EXCEPTION;
    (*env)->CallBooleanMethod(env, threadList, listAdd_ID, thread);
    CHECK_EXCEPTION;

    // Release the per-iteration local references; otherwise they pile up
    // until this native method returns, which can exhaust the local
    // reference capacity for processes with many threads.
    (*env)->DeleteLocalRef(env, thread);
    (*env)->DeleteLocalRef(env, threadList);
  }

  // add load objects
  n = get_num_libs(ph);
  for (i = 0; i < n; i++) {
    uintptr_t base;
    const char* name;
    jobject loadObject;
    jobject loadObjectList;
    jstring nameStr;

    base = get_lib_base(ph, i);
    name = get_lib_name(ph, i);

    // NewStringUTF can fail with a pending exception; no further JNI call
    // may be made in that state, so check before using the string.
    nameStr = (*env)->NewStringUTF(env, name);
    CHECK_EXCEPTION;
    loadObject = (*env)->CallObjectMethod(env, this_obj, createLoadObject_ID,
                                          nameStr, (jlong)0, (jlong)base);
    CHECK_EXCEPTION;
    loadObjectList = (*env)->GetObjectField(env, this_obj, loadObjectList_ID);
    CHECK_EXCEPTION;
    (*env)->CallBooleanMethod(env, loadObjectList, listAdd_ID, loadObject);
    CHECK_EXCEPTION;

    // Same reasoning as in the thread loop above.
    (*env)->DeleteLocalRef(env, nameStr);
    (*env)->DeleteLocalRef(env, loadObject);
    (*env)->DeleteLocalRef(env, loadObjectList);
  }
}
/**
 * Runtime entry point.
 *
 * Configures logging and claim checking from the environment (RUST_LOG,
 * CHECK_CLAIMS), creates the service and kernel objects, starts the root
 * task on main_fn with the command line arguments, and then runs
 * get_num_threads() task threads via the kernel.
 *
 * @return the value returned by rust_kernel::start_task_threads(). On
 *         non-Windows builds a zero result ends the calling thread with
 *         pthread_exit() instead of returning.
 */
extern "C" CDECL int
rust_start(uintptr_t main_fn, int argc, char **argv, void* crate_map) {

    update_log_settings(crate_map, getenv("RUST_LOG"));
    enable_claims(getenv("CHECK_CLAIMS"));

    rust_srv *srv = new rust_srv();
    rust_kernel *kernel = new rust_kernel(srv);
    kernel->start();

    rust_scheduler *sched = kernel->get_scheduler();
    command_line_args *args
        = new (kernel) command_line_args(sched->root_task, argc, argv);

    DLOG(sched, dom, "startup: %d args in 0x%" PRIxPTR,
         args->argc, (uintptr_t)args->args);
    for (int idx = 0; idx < args->argc; idx++) {
        DLOG(sched, dom, "startup: arg[%d] = '%s'", idx, args->argv[idx]);
    }

    sched->root_task->start(main_fn, (uintptr_t)args->args);

    int worker_count = get_num_threads();
    DLOG(sched, dom, "Using %d worker threads.", worker_count);

    int exit_status = kernel->start_task_threads(worker_count);

    // Tear down in the reverse order of construction.
    delete args;
    delete kernel;
    delete srv;

#if !defined(__WIN32__)
    // Don't take down the process if the main thread exits without an
    // error.
    if (exit_status == 0) {
        pthread_exit(NULL);
    }
#endif

    return exit_status;
}
int main(int argc, char *argv[]) { int nobs, sizex, nsample = 0; char *location = NULL; int ret = 0; ////////////////////////////////////// /////////////// PARSERS ////////////// ////////////////////////////////////// // Parse the command line ret = parse_command_line(argc,argv,&nobs,&sizex,&nsample,&location); if( ret != PARSER_SUCCESS ) { printf("Parsing failed ! Exiting...\n"); return EXIT_FAILURE; } // Parse the data on master double *buffer_X = (double*)malloc(nobs*sizex*sizeof(double)); double *isigma = (double*)malloc(sizex*sizex*sizeof(double)); double *mu = (double*)malloc(sizex*sizeof(double)); double det_sigma = 0.0; ret = read_data(buffer_X, isigma, &det_sigma, mu, &nobs, &sizex, location); if( ret != PARSER_SUCCESS ) { printf("Parsing failed ! Exiting...\n"); return EXIT_FAILURE; } //////////////////////////////////////// /////////////// Variables ////////////// //////////////////////////////////////// // Thread variables int nthreads = 1; int th_num = 0; int th_nobs = nobs; nthreads = get_num_threads(); // Timing variables double tic, toc, tot_time = 0.0; //// Arrays for all threads // The pool is allocated inside the shared memory double *pool_LV = (double*)malloc(nobs*sizex*sizeof(double)); // Left hand side vector (X-mu) double *pool_tmp = (double*)malloc(nobs*sizex*sizeof(double)); // Temporary holder for (X-mu)*SIG double *pool_ones = (double*)malloc(nobs*sizeof(double)); // Temporary holder to create LV double *pool_res = (double*)malloc(nthreads*sizeof(double)); // Each thread puts its result in pool_res // Use pointers to get the correct location in the array double *LV = NULL; double *tmp = NULL; double *ones = NULL; double *X = NULL; // Holder for final sum double final_sum = 0.0; //////////////////////////////////////// /////////////// Algorithm ////////////// //////////////////////////////////////// //// Start time sampling for(int k = 0; k < nsample; k++) { tic = omp_get_wtime(); final_sum = 0.0; // Main driver #pragma omp parallel 
private(th_num,th_nobs,LV,tmp,ones,X) default(shared) { // Get thread number th_num = omp_get_thread_num(); // Total number of observations for that thread th_nobs = nobs/nthreads; // Use the address to point to the correct location in the vector X = &buffer_X[th_num*nobs*sizex/nthreads]; LV = &pool_LV[th_num*th_nobs*sizex]; tmp = &pool_tmp[th_num*th_nobs*sizex]; ones = &pool_ones[th_num*th_nobs]; // Each process can now calculate the term in the // exponent for a subset of random vectors // Naive approach: for loop on each vector X // pool_res[th_num] += exp_term(); // Guru approach: BLAS log_likelihood(X,isigma,mu,det_sigma,th_nobs,sizex,&pool_res[th_num],LV,tmp,ones); #pragma omp barrier // Reduction: sum all the intermediary results #pragma omp for reduction(+:final_sum) for(int i = 0; i < nthreads; i++) final_sum = final_sum + pool_res[i]; } toc = omp_get_wtime(); tot_time += toc-tic; } printf("Result: %f\n",final_sum); printf("Total time: %f\n",tot_time/(double)nsample); //////////////////////////////////////// /////////////// Clean up /////////////// //////////////////////////////////////// free(pool_res); free(pool_ones); free(pool_tmp); free(pool_LV); free(buffer_X); free(isigma); free(mu); free(location); return EXIT_SUCCESS; }
/// Default options: x is set to -1, the selected option is OPTION_PI,
/// timing is off and the thread count is taken from get_num_threads().
PrimeCountOptions()
  : x(-1),
    option(OPTION_PI),
    time(false),
    threads(get_num_threads())
{
}
// this function follows matlab's convention of same, not numpy's. my_Dvector convolution_same_orig(const my_Dvector& vec1, const my_Dvector& vec2) { ThreadPool pool (get_num_threads()); return convolution_same_orig(vec1, vec2, pool); }
/* Returns 1 when get_multithread_capable() is non-zero and more than one
 * thread is configured, 0 otherwise. get_num_threads() is only consulted
 * when the capability check succeeds.
 */
int get_multithreaded()
{
    if (!get_multithread_capable()) {
        return 0;
    }
    return get_num_threads() > 1;
}
//--------------------------------------------------------------- // START FUNC DECL int num_in_range( char *t1, char *f1, char *t2, char *lb, char *ub, char *cnt ) // STOP FUNC DECL { int status = 0; char *f1_X = NULL; size_t f1_nX = 0; char *lb_X = NULL; size_t lb_nX = 0; char *ub_X = NULL; size_t ub_nX = 0; char *cnt_X = NULL; size_t cnt_nX = 0; int t1_id = INT_MIN, t2_id = INT_MIN; int f1_id = INT_MIN, lb_id = INT_MIN, ub_id = INT_MIN, cnt_id = INT_MIN; FLD_TYPE *f1_meta = NULL, *lb_meta = NULL, *ub_meta = NULL; long long nR1 = INT_MIN, nR2 = INT_MIN, chk_nR1 = INT_MIN; long long **cntptrs = NULL; // For multi-threading int nT; int rc; // result code for thread create pthread_t threads[MAX_NUM_THREADS]; pthread_attr_t attr; void *thread_status; char str_meta_data[1024]; char *opfile = NULL; FILE *ofp = NULL; //---------------------------------------------------------------- if ( ( t1 == NULL ) || ( *t1 == '\0' ) ) { go_BYE(-1); } if ( ( f1 == NULL ) || ( *f1 == '\0' ) ) { go_BYE(-1); } if ( ( t2 == NULL ) || ( *t2 == '\0' ) ) { go_BYE(-1); } if ( ( lb == NULL ) || ( *lb == '\0' ) ) { go_BYE(-1); } if ( ( ub == NULL ) || ( *ub == '\0' ) ) { go_BYE(-1); } if ( ( cnt == NULL ) || ( *cnt == '\0' ) ) { go_BYE(-1); } if ( strcmp(t1, t2) == 0 ) { go_BYE(-1); } zero_string(str_meta_data, 1024); //-------------------------------------------------------- status = is_tbl(t1, &t1_id); cBYE(status); chk_range(t1_id, 0, g_n_tbl); nR1 = g_tbl[t1_id].nR; //-------------------------------------------------------- status = is_fld(NULL, t1_id, f1, &f1_id); cBYE(status); chk_range(f1_id, 0, g_n_fld); f1_meta = &(g_fld[f1_id]); status = rs_mmap(f1_meta->filename, &f1_X, &f1_nX, 0); cBYE(status); // Have not implemented case where f1 has null field if ( f1_meta->nn_fld_id >= 0 ) { go_BYE(-1); } // Have implemented only for int if ( strcmp(f1_meta->fldtype, "int") != 0 ) { cBYE(-1); } //-------------------------------------------------------- status = is_tbl(t2, &t2_id); 
cBYE(status); chk_range(t2_id, 0, g_n_tbl); nR2 = g_tbl[t2_id].nR; //-------------------------------------------------------- status = is_fld(NULL, t2_id, lb, &lb_id); cBYE(status); chk_range(lb_id, 0, g_n_fld); lb_meta = &(g_fld[lb_id]); status = rs_mmap(lb_meta->filename, &lb_X, &lb_nX, 0); cBYE(status); // Have not implemented case where lb has null field if ( lb_meta->nn_fld_id >= 0 ) { go_BYE(-1); } // Have implemented only for int if ( strcmp(lb_meta->fldtype, "int") != 0 ) { cBYE(-1); } //-------------------------------------------------------- status = is_fld(NULL, t2_id, ub, &ub_id); cBYE(status); chk_range(ub_id, 0, g_n_fld); ub_meta = &(g_fld[ub_id]); status = rs_mmap(ub_meta->filename, &ub_X, &ub_nX, 0); cBYE(status); // Have not implemented case where ub has null field if ( ub_meta->nn_fld_id >= 0 ) { go_BYE(-1); } // Have implemented only for int if ( strcmp(ub_meta->fldtype, "int") != 0 ) { cBYE(-1); } //-------------------------------------------------------- // Set up access to input int *inptr = (int *)f1_X; int *lbptr = (int *)lb_X; int *ubptr = (int *)ub_X; //-------------------------------------------------------- //--- Decide on how much parallelism to use for ( int i = 0; i < MAX_NUM_THREADS; i++ ) { g_thread_id[i] = i; g_num_rows[i] = 0; } status = get_num_threads(&nT); cBYE(status); //-------------------------------------------- #define MIN_ROWS_FOR_SUBSAMPLE 10000 // 1048576 if ( nR1 <= MIN_ROWS_FOR_SUBSAMPLE ) { nT = 1; } /* Don't create more threads than you can use */ if ( nT > nR1 ) { nT = nR1; } //-------------------------------------------- /* Make space for output */ long long filesz = nR2 * sizeof(long long); status = open_temp_file(&ofp, &opfile, filesz); cBYE(status); fclose_if_non_null(ofp); status = mk_file(opfile, filesz); cBYE(status); status = rs_mmap(opfile, &cnt_X, &cnt_nX, 1); long long *cntptr = (long long *)cnt_X; /* Make a holding tank for partial results */ cntptrs = malloc(nT * sizeof(long long *)); 
return_if_malloc_failed(cntptrs); for ( int i = 0; i < nT; i++ ) { cntptrs[i] = malloc(nR2 * sizeof(long long)); return_if_malloc_failed(cntptrs[i]); for ( long long j = 0; j <nR2; j++ ) { cntptrs[i][j] = 0; } } // Add count field to meta data sprintf(str_meta_data, "fldtype=long long:n_sizeof=8:filename=%s", opfile); status = add_fld(t2, cnt, str_meta_data, &cnt_id); cBYE(status); chk_range(cnt_id, 0, g_n_fld); //----------------------------------------------------------- // Now we count how much there is in each range // Set up global variables g_nT = nT; g_inptr = inptr; g_lbptr = lbptr; g_ubptr = ubptr; g_cntptrs = cntptrs; g_nR1 = nR1; g_nR2 = nR2; if ( g_nT == 1 ) { core_num_in_range(&(g_thread_id[0])); chk_nR1 = g_num_rows[0]; } else { chk_nR1 = 0; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); for ( int t = 0; t < g_nT; t++ ) { rc = pthread_create(&threads[t], NULL, core_num_in_range, &(g_thread_id[t])); if ( rc ) { go_BYE(-1); } } /* Free attribute and wait for the other threads */ pthread_attr_destroy(&attr); for ( int t = 0; t < g_nT; t++ ) { rc = pthread_join(threads[t], &thread_status); if ( rc ) { go_BYE(-1); } chk_nR1 += g_num_rows[t]; } } if ( chk_nR1 != nR1 ) { go_BYE(-1); } // Accumulate partial results for ( long long i = 0; i < nR2; i++ ) { cntptr[i] = 0; for ( int j= 0; j < nT; j++ ) { cntptr[i] += cntptrs[j][i]; } } BYE: g_write_to_temp_dir = false; rs_munmap(f1_X, f1_nX); rs_munmap(lb_X, lb_nX); rs_munmap(ub_X, ub_nX); rs_munmap(cnt_X, cnt_nX); free_if_non_null(opfile); return(status); }
/// Calculate the number of primes below x using Legendre's formula. /// Run time: O(x) operations, O(x^(1/2)) space. /// int64_t pi_legendre(int64_t x) { return pi_legendre(x, get_num_threads()); }
/// Alias for the fastest prime summing function in primesum. /// @param x integer arithmetic expression e.g. "10^12". /// @pre x <= get_max_x(). /// string pi(const string& x) { return pi(x, get_num_threads()); }
/// Forwards to pi(x, threads) with the thread count taken from
/// get_num_threads().
int256_t pi(int128_t x)
{
  const auto threads = get_num_threads();
  return pi(x, threads);
}
/// Partial sieve function (a.k.a. Legendre-sum). /// phi(x, a) counts the numbers <= x that are not divisible /// by any of the first a primes. /// int64_t phi(int64_t x, int64_t a) { return phi(x, a, get_num_threads()); }
//--------------------------------------------------------------- // START FUNC DECL int parsort1( char *tbl, char *f1, char *f2, char *up_or_down /* not used right now */ ) // STOP FUNC DECL { int status = 0; char *f1_X = NULL; size_t f1_nX = 0; char *op_X = NULL; size_t op_nX = 0; char *cnt_X = NULL; size_t cnt_nX = 0; char *t2f2_X = NULL; size_t t2f2_nX = 0; FLD_TYPE *f1_meta = NULL; FLD_TYPE *f2_meta = NULL; FLD_TYPE *t2f2_meta = NULL; FLD_TYPE *cnt_meta = NULL; long long nR, nR2; int tbl_id = INT_MIN, f1_id = INT_MIN, f2_id = INT_MIN, cnt_id = INT_MIN; int t2f2_id = INT_MIN; char str_meta_data[1024]; FILE *ofp = NULL; char *opfile = NULL; FILE *tfp = NULL; char *tempfile = NULL; char str_rslt[32]; zero_string(str_rslt, 32); char t2[MAX_LEN_TBL_NAME]; int itemp; int *xxx = NULL, *f1lb = NULL, *f1ub = NULL; long long *count = NULL, *chk_count = NULL; int **offsets = NULL, **bak_offsets = NULL; int *inptr = NULL; // For multi-threading int rc; // result code for thread create pthread_t threads[MAX_NUM_THREADS]; pthread_attr_t attr; void *thread_status; // START: For timing struct timeval Tps; struct timezone Tpf; void *Tzp = NULL; long long t_before_sec = 0, t_before_usec = 0, t_before = 0; long long t_after_sec, t_after_usec, t_after; long long t_delta_usec; // STOP : For timing //---------------------------------------------------------------- if ( ( tbl == NULL ) || ( *tbl == '\0' ) ) { go_BYE(-1); } if ( ( f1 == NULL ) || ( *f1 == '\0' ) ) { go_BYE(-1); } if ( ( f2 == NULL ) || ( *f2 == '\0' ) ) { go_BYE(-1); } zero_string(str_meta_data, 1024); /* t2 isa temporary table */ zero_string(t2, MAX_LEN_TBL_NAME); status = qd_uq_str(t2, MAX_LEN_TBL_NAME); strcpy(t2, "t2"); // TODO DELETE THIS g_offsets = NULL; g_count = NULL; //-------------------------------------------------------- status = is_tbl(tbl, &tbl_id); cBYE(status); chk_range(tbl_id, 0, g_n_tbl); nR = g_tbl[tbl_id].nR; status = is_fld(NULL, tbl_id, f1, &f1_id); cBYE(status); chk_range(f1_id, 0, g_n_fld); 
f1_meta = &(g_fld[f1_id]); status = rs_mmap(f1_meta->filename, &f1_X, &f1_nX, 0); cBYE(status); // Not implemented for following cases if ( g_fld[f1_id].nn_fld_id >= 0 ) { go_BYE(-1); } if ( strcmp(f1_meta->fldtype, "int") != 0 ) { go_BYE(-1); } if ( nR <= 1048576 ) { go_BYE(-1); } //--------------------------------------------- status = gettimeofday(&Tps, &Tpf); cBYE(status); t_before_sec = (long long)Tps.tv_sec; t_before_usec = (long long)Tps.tv_usec; t_before = t_before_sec * 1000000 + t_before_usec; int reduction_factor = (int)(sqrt((double)nR)); sprintf(str_rslt, "%d", reduction_factor); status = subsample(tbl, f1, str_rslt, t2, "f2"); cBYE(status); status = gettimeofday(&Tps, &Tpf); cBYE(status); t_after_sec = (long long)Tps.tv_sec; t_after_usec = (long long)Tps.tv_usec; t_after = t_after_sec * 1000000 + t_after_usec; fprintf(stderr, "TIME0 = %lld \n", t_after - t_before); t_before = t_after; // Must have sufficient diversity of values status = f1opf2(t2, "f2", "op=shift:val=-1", "nextf2"); cBYE(status); status = drop_nn_fld(t2, "nextf2"); cBYE(status); status = f1f2opf3(t2, "f2", "nextf2", "==", "x"); cBYE(status); status = f_to_s(t2, "x", "sum", str_rslt); char *endptr; long long lltemp = strtoll(str_rslt, &endptr, 10); if ( lltemp != 0 ) { go_BYE(-1); } //------------------------------------------------- // Get range of values of f1 status = f_to_s(tbl, f1, "max", str_rslt); int f1max = strtoll(str_rslt, &endptr, 10); status = f_to_s(tbl, f1, "min", str_rslt); int f1min = strtoll(str_rslt, &endptr, 10); //------------------------------------------------- // Now we sort the values that we sampled status = fop(t2, "f2", "sortA"); cBYE(status); // status = pr_fld(t2, "f2", "", stdout); status = get_nR(t2, &nR2); // Now each thread selects a range to work on int nT; for ( int i = 0; i < MAX_NUM_THREADS; i++ ) { g_thread_id[i] = i; } status = get_num_threads(&nT); cBYE(status); //-------------------------------------------- #define MIN_ROWS_FOR_PARSORT1 1048576 
if ( nR <= MIN_ROWS_FOR_PARSORT1 ) { nT = 1; } /* Don't create more threads than you can use */ if ( nT > nR ) { nT = nR; } //-------------------------------------------- double block_size = (double)nR2 / (double)nT; status = is_fld(t2, -1, "f2", &t2f2_id); cBYE(status); chk_range(t2f2_id, 0, g_n_fld); t2f2_meta = &(g_fld[t2f2_id]); status = rs_mmap(t2f2_meta->filename, &t2f2_X, &t2f2_nX, 0); cBYE(status); int *iptr = (int *)t2f2_X; xxx = malloc(nT * sizeof(int)); return_if_malloc_failed(xxx); f1lb = malloc(nT * sizeof(int)); return_if_malloc_failed(f1lb); f1ub = malloc(nT * sizeof(int)); return_if_malloc_failed(f1ub); /* FOR OLD_WAY count = malloc(nT * sizeof(long long)); return_if_malloc_failed(count); */ chk_count = malloc(nT * sizeof(long long)); return_if_malloc_failed(chk_count); g_count = malloc(nT * sizeof(long long)); return_if_malloc_failed(g_count); for ( int i = 0; i < nT; i++ ) { // FOR OLD_WAY count[i]= 0; chk_count[i]= 0; int j = i+1; long long idx = j * block_size; if ( idx >= nR2 ) { idx = nR2 -1 ; } int y = iptr[idx]; xxx[i] = y; // fprintf(stdout,"idx = %lld: j = %d: y = %d \n", idx, j, y); } for ( int i = 0; i < nT; i++ ) { if ( ( i == 0 ) && ( i == (nT - 1 ) ) ) { f1lb[i] = f1min; f1ub[i] = f1max; } else if ( i == 0 ) { f1lb[i] = f1min; f1ub[i] = xxx[i]; } else if ( i == (nT -1 ) ) { f1lb[i] = xxx[i-1] + 1; f1ub[i] = f1max; } else { f1lb[i] = xxx[i-1] + 1; f1ub[i] = xxx[i]; } } // STOP: Each thread has now a range to work on // Create a temporary table t3 to store ranges char t3[MAX_LEN_TBL_NAME]; int t3_id; zero_string(t3, MAX_LEN_TBL_NAME); status = qd_uq_str(t3, MAX_LEN_TBL_NAME); strcpy(t3, "t3"); // TODO DELETE THIS sprintf(str_rslt, "%d", nT); status = add_tbl(t3, str_rslt, &t3_id); // Add lower bound to t3 status = open_temp_file(&tfp, &tempfile, -1); cBYE(status); fclose_if_non_null(tfp); tfp = fopen(tempfile, "wb"); return_if_fopen_failed(tfp, tempfile, "wb"); fwrite(f1lb, sizeof(int), nT, tfp); fclose_if_non_null(tfp); 
sprintf(str_meta_data, "fldtype=%s:n_sizeof=%d:filename=%s", f1_meta->fldtype, f1_meta->n_sizeof, tempfile); status = add_fld(t3, "lb", str_meta_data, &itemp); cBYE(status); free_if_non_null(tempfile); // Add upper bound to t3 status = open_temp_file(&tfp, &tempfile, -1); cBYE(status); fclose_if_non_null(tfp); tfp = fopen(tempfile, "wb"); return_if_fopen_failed(tfp, tempfile, "wb"); fwrite(f1ub, sizeof(int), nT, tfp); fclose_if_non_null(tfp); sprintf(str_meta_data, "fldtype=%s:n_sizeof=%d:filename=%s", f1_meta->fldtype, f1_meta->n_sizeof, tempfile); status = add_fld(t3, "ub", str_meta_data, &itemp); cBYE(status); free_if_non_null(tempfile); #undef OLD_WAY #ifdef OLD_WAY // Now we count how much there is in each range inptr = (int *)f1_X; for ( long long i = 0; i < nR; i++ ) { int ival = *inptr++; int range_idx = INT_MIN; // TODO: Improve sequential search for ( int j = 0; j < nT; j++ ) { if ( ival >= f1lb[j] && ( ival <= f1ub[j] ) ) { range_idx = j; break; } } count[range_idx]++; } /* for ( int i = 0; i < nT; i++ ) { fprintf(stdout,"%d: (%d, %d) = %lld \n", i, f1lb[i], f1ub[i], count[i]); } */ #else status = num_in_range(tbl, f1, t3, "lb", "ub", "cnt"); cBYE(status); // Get a pointer to the count field status = is_tbl(t3, &t3_id); chk_range(t3_id, 0, g_n_tbl); status = is_fld(NULL, t3_id, "cnt", &cnt_id); chk_range(cnt_id, 0, g_n_fld); cnt_meta = &(g_fld[cnt_id]); status = rs_mmap(cnt_meta->filename, &cnt_X, &cnt_nX, 0); cBYE(status); count = (long long *)cnt_X; #endif status = gettimeofday(&Tps, &Tpf); cBYE(status); t_after_sec = (long long)Tps.tv_sec; t_after_usec = (long long)Tps.tv_usec; t_after = t_after_sec * 1000000 + t_after_usec; fprintf(stderr, "TIME1 = %lld \n", t_after - t_before); t_before = t_after; bak_offsets = malloc(nT * sizeof(int *)); return_if_malloc_failed(bak_offsets); g_offsets = malloc(nT * sizeof(int *)); return_if_malloc_failed(g_offsets); #ifdef OLD_WAY // Make space for output long long filesz = nR * f1_meta->n_sizeof; status = 
open_temp_file(&ofp, &opfile, filesz); cBYE(status); status = mk_file(opfile, filesz); cBYE(status); status = rs_mmap(opfile, &op_X, &op_nX, 1); cBYE(status); offsets = malloc(nT * sizeof(int *)); return_if_malloc_failed(offsets); long long cum_count = 0; for ( int i = 0; i < nT; i++ ) { bak_offsets[i] = offsets[i] = (int *)op_X; if ( i > 0 ) { cum_count += count[i-1]; offsets[i] += cum_count; bak_offsets[i] = offsets[i]; } } inptr = (int *)f1_X; // Now we place each item into its thread bucket for ( long long i = 0; i < nR; i++ ) { int ival = *inptr++; int range_idx = INT_MIN; // TODO: Improve sequential search for ( int j = 0; j < nT; j++ ) { if ( ival >= f1lb[j] && ( ival <= f1ub[j] ) ) { range_idx = j; break; } } int *xptr = offsets[range_idx]; *xptr = ival; offsets[range_idx]++; chk_count[range_idx]++; if ( chk_count[range_idx] > count[range_idx] ) { go_BYE(-1); } } cum_count = 0; for ( int i = 0; i < nT-1; i++ ) { if ( offsets[i] != bak_offsets[i+1] ) { go_BYE(-1); } } #else status = mv_range(tbl, f1, f2, t3, "lb", "ub", "cnt"); cBYE(status); status = is_fld(NULL, tbl_id, f2, &f2_id); chk_range(f2_id, 0, g_n_fld); f2_meta = &(g_fld[f2_id]); status = rs_mmap(f2_meta->filename, &op_X, &op_nX, 1); cBYE(status); #endif long long cum_count = 0; for ( int i = 0; i < nT; i++ ) { bak_offsets[i] = (int *)op_X; if ( i > 0 ) { cum_count += count[i-1]; bak_offsets[i] += cum_count; } } status = gettimeofday(&Tps, &Tpf); cBYE(status); t_after_sec = (long long)Tps.tv_sec; t_after_usec = (long long)Tps.tv_usec; t_after = t_after_sec * 1000000 + t_after_usec; fprintf(stderr, "TIME2 = %lld \n", t_after - t_before); t_before = t_after; // Set up global variables g_nT = nT; for ( int i = 0; i < nT; i++ ) { g_offsets[i] = bak_offsets[i]; g_count[i] = count[i]; } if ( g_nT == 1 ) { core_parsort1(&(g_thread_id[0])); } else { pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); for ( int t = 0; t < g_nT; t++ ) { rc = pthread_create(&threads[t], 
NULL, core_parsort1, &(g_thread_id[t])); if ( rc ) { go_BYE(-1); } } /* Free attribute and wait for the other threads */ pthread_attr_destroy(&attr); for ( int t = 0; t < g_nT; t++ ) { rc = pthread_join(threads[t], &thread_status); if ( rc ) { go_BYE(-1); } } } /* SEQUENTIAL CODE for ( int i = 0; i < nT; i++ ) { qsort_asc_int(bak_offsets[i], count[i], sizeof(int), NULL); } */ status = gettimeofday(&Tps, &Tpf); cBYE(status); t_after_sec = (long long)Tps.tv_sec; t_after_usec = (long long)Tps.tv_usec; t_after = t_after_sec * 1000000 + t_after_usec; fprintf(stderr, "TIME3 = %lld \n", t_after - t_before); // Indicate the dst_fld is sorted ascending status = set_fld_info(tbl, f2, "sort=1"); rs_munmap(op_X, op_nX); status = del_tbl(t2, -1); cBYE(status); status = del_tbl(t3, -1); cBYE(status); BYE: rs_munmap(op_X, op_nX); rs_munmap(cnt_X, cnt_nX); free_if_non_null(xxx); free_if_non_null(f1lb); free_if_non_null(f1ub); // Do not delete unless using OLD_WAY free_if_non_null(count); free_if_non_null(g_count); free_if_non_null(g_offsets); free_if_non_null(offsets); free_if_non_null(bak_offsets); free_if_non_null(chk_count); fclose_if_non_null(ofp); g_write_to_temp_dir = false; rs_munmap(f1_X, f1_nX); rs_munmap(op_X, op_nX); free_if_non_null(opfile); return(status); }
//--------------------------------------------------------------- // START FUNC DECL int crossprod( char *t1, char *f1, char *t2, char *f2, char *t3 ) // STOP FUNC DECL { int status = 0; char *Y1 = NULL; size_t nY1 = 0; char *Y2 = NULL; size_t nY2 = 0; char *f1_X = NULL; size_t f1_nX = 0; char *f1_opfile = NULL; int f1type, f2type; char *f2_X = NULL; size_t f2_nX = 0; char *f2_opfile = NULL; long long f1size, f2size; int t2f1_fld_id = INT_MIN, t2f2_fld_id = INT_MIN; int t3_id = INT_MIN, itemp; long long chk_nR1 = 0, nR1, nR2, nR3; char str_meta_data[1024]; char *t3f1_opfile = NULL, *t3f2_opfile = NULL; FILE *ofp = NULL; char buffer[32]; // For multi-threading int rc; // result code for thread create pthread_t threads[MAX_NUM_THREADS]; pthread_attr_t attr; void *thread_status; //---------------------------------------------------------------- zero_string(str_meta_data, 1024); zero_string(buffer, 32); if ( strcmp(f1, f2) == 0 ) { go_BYE(-1); } /* Remove f1 != f2 restriction later. To do so, we need to specify * fields of t3 explicitly */ //---------------------------------------------------------------- status = get_data(t1, f1, &nR1, &f1_X, &f1_nX, &f1_opfile, &f1type, &f1size); cBYE(status); status = get_data(t2, f2, &nR2, &f2_X, &f2_nX, &f2_opfile, &f2type, &f2size); cBYE(status); nR3 = nR1 * nR2; if ( nR3 == 0 ) { fprintf(stderr, "No data to create t3 \n"); goto BYE; } // Create storage for field 1 in Table t3 */ long long filesz = nR3 * f1size; status = open_temp_file(&ofp, &t3f1_opfile, filesz); cBYE(status); fclose_if_non_null(ofp); status = mk_file(t3f1_opfile, filesz); cBYE(status); status = rs_mmap(t3f1_opfile, &Y1, &nY1, 1); cBYE(status); // Create storage for field 2 in Table t3 */ filesz = nR3 * f2size; status = open_temp_file(&ofp, &t3f2_opfile, filesz); cBYE(status); fclose_if_non_null(ofp); status = mk_file(t3f2_opfile, filesz); cBYE(status); status = rs_mmap(t3f2_opfile, &Y2, &nY2, 1); cBYE(status); 
//---------------------------------------------------------------- /* Set up parallelism computations. Parallelization strategy is * simple. Partition field 1 (nR1 rows) among the threads */ g_nR1 = nR1; g_nR2 = nR2; g_nR3 = nR3; g_f1type = f1type; g_f2type = f2type; g_f1size = f1size; g_f2size = f2size; g_f1_X = f1_X; g_f2_X = f2_X; g_Y1 = Y1; g_Y2 = Y2; for ( int i = 0; i < MAX_NUM_THREADS; i++ ) { g_thread_id[i] = i; g_num_rows_processed[i] = 0; } status = get_num_threads(&g_nT); cBYE(status); //-------------------------------------------- #define MIN_ROWS_FOR_CROSSPROD 4 // 1024 if ( nR1 <= MIN_ROWS_FOR_CROSSPROD ) { g_nT = 1; } /* Don't create more threads than you can use */ if ( g_nT > nR1 ) { g_nT = nR1; } if ( g_nT == 1 ) { core_crossprod(&(g_thread_id[0])); chk_nR1 = g_num_rows_processed[0]; } else { /* Create threads */ pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); for ( int t = 0; t < g_nT; t++ ) { rc = pthread_create(&threads[t], NULL, core_crossprod, &(g_thread_id[t])); if ( rc ) { go_BYE(-1); } } /* Free attribute and wait for the other threads */ pthread_attr_destroy(&attr); for ( int t = 0; t < g_nT; t++ ) { rc = pthread_join(threads[t], &thread_status); if ( rc ) { go_BYE(-1); } chk_nR1 += g_num_rows_processed[t]; } } if ( chk_nR1 != nR1 ) { go_BYE(-1); } //---------------------------------------------------------------- // Add output fields to t3 meta data status = is_tbl(t3, &t3_id); cBYE(status); if ( t3_id >= 0 ) { status = del_tbl(NULL, t3_id); cBYE(status); } sprintf(buffer, "%lld", nR3); status = add_tbl(t3, buffer, &itemp); cBYE(status); sprintf(str_meta_data, "fldtype=int:n_sizeof=%u:filename=%s", f1size, t3f1_opfile); status = add_fld(t3, f1, str_meta_data, &t2f1_fld_id); cBYE(status); zero_string(str_meta_data, 1024); sprintf(str_meta_data, "fldtype=int:n_sizeof=%u:filename=%s", f2size, t3f2_opfile); status = add_fld(t3, f2, str_meta_data, &t2f2_fld_id); cBYE(status); BYE: 
fclose_if_non_null(ofp); rs_munmap(f1_X, f1_nX); rs_munmap(f2_X, f2_nX); if ( f1_opfile != NULL ) { unlink(f1_opfile); free_if_non_null(f1_opfile); } if ( f2_opfile != NULL ) { unlink(f2_opfile); free_if_non_null(f2_opfile); } free_if_non_null(t3f1_opfile); free_if_non_null(t3f2_opfile); return(status); }