/** * Create random unique keys starting from firstkey */ void * random_unique_gen_thread(void * args) { create_arg_t * arg = (create_arg_t *) args; relation_t * rel = & arg->rel; int64_t firstkey = arg->firstkey; int64_t maxid = arg->maxid; uint64_t i; value_t randstart = 5; /* rand() % 1000; */ /* for randomly seeding nrand48() */ unsigned short state[3] = {0, 0, 0}; unsigned int seed = time(NULL) + * (unsigned int *) pthread_self(); memcpy(state, &seed, sizeof(seed)); for (i = 0; i < rel->num_tuples; i++) { rel->tuples[i].key = firstkey; rel->tuples[i].payload = randstart + i; if(firstkey == maxid) firstkey = 0; firstkey ++; } /* randomly shuffle elements */ /* knuth_shuffle48(rel, state); */ /* wait at a barrier until all threads finish initializing data */ int rv; BARRIER_ARRIVE(arg->barrier, rv); /* parallel synchronized knuth-shuffle */ volatile char * locks = (volatile char *)(arg->locks); relation_t * fullrel = arg->fullrel; uint64_t rel_offset_in_full = rel->tuples - fullrel->tuples; uint64_t k = rel_offset_in_full + rel->num_tuples - 1; for (i = rel->num_tuples - 1; i > 0; i--, k--) { int64_t j = RAND_RANGE48(k, state); lock(locks+k); /* lock this rel-idx=i, fullrel-idx=k */ lock(locks+j); /* lock full rel-idx=j */ intkey_t tmp = fullrel->tuples[k].key; fullrel->tuples[k].key = fullrel->tuples[j].key; fullrel->tuples[j].key = tmp; unlock(locks+j); unlock(locks+k); } return 0; }
/** * The main thread of parallel radix join. It does partitioning in parallel with * other threads and during the join phase, picks up join tasks from the task * queue and calls appropriate JoinFunction to compute the join task. * * @param param * * @return */ void * prj_thread(void * param) { arg_t * args = (arg_t*) param; int32_t my_tid = args->my_tid; const int fanOut = 1 << (NUM_RADIX_BITS / NUM_PASSES); const int R = (NUM_RADIX_BITS / NUM_PASSES); const int D = (NUM_RADIX_BITS - (NUM_RADIX_BITS / NUM_PASSES)); const int thresh1 = MAX((1<<D), (1<<R)) * THRESHOLD1(args->nthreads); uint64_t results = 0; int i; int rv; part_t part; task_t * task; task_queue_t * part_queue; task_queue_t * join_queue; #ifdef SKEW_HANDLING task_queue_t * skew_queue; #endif int32_t * outputR = (int32_t *) calloc((fanOut+1), sizeof(int32_t)); int32_t * outputS = (int32_t *) calloc((fanOut+1), sizeof(int32_t)); MALLOC_CHECK((outputR && outputS)); part_queue = args->part_queue; join_queue = args->join_queue; #ifdef SKEW_HANDLING skew_queue = args->skew_queue; #endif args->histR[my_tid] = (int32_t *) calloc(fanOut, sizeof(int32_t)); args->histS[my_tid] = (int32_t *) calloc(fanOut, sizeof(int32_t)); /* in the first pass, partitioning is done together by all threads */ args->parts_processed = 0; #ifdef PERF_COUNTERS if(my_tid == 0){ PCM_initPerformanceMonitor(NULL, NULL); PCM_start(); } #endif /* wait at a barrier until each thread starts and then start the timer */ BARRIER_ARRIVE(args->barrier, rv); /* if monitoring synchronization stats */ SYNC_TIMERS_START(args, my_tid); #ifndef NO_TIMING if(my_tid == 0){ /* thread-0 checkpoints the time */ gettimeofday(&args->start, NULL); startTimer(&args->timer1); startTimer(&args->timer2); startTimer(&args->timer3); } #endif /********** 1st pass of multi-pass partitioning ************/ part.R = 0; part.D = NUM_RADIX_BITS / NUM_PASSES; part.thrargs = args; part.padding = PADDING_TUPLES; /* 1. partitioning for relation R */ part.rel = args->relR; part.tmp = args->tmpR; part.hist = args->histR; part.output = outputR; part.num_tuples = args->numR; part.total_tuples = args->totalR; part.relidx = 0; #ifdef USE_SWWC_OPTIMIZED_PART parallel_radix_partition_optimized(&part); #else parallel_radix_partition(&part); #endif /* 2. partitioning for relation S */ part.rel = args->relS; part.tmp = args->tmpS; part.hist = args->histS; part.output = outputS; part.num_tuples = args->numS; part.total_tuples = args->totalS; part.relidx = 1; #ifdef USE_SWWC_OPTIMIZED_PART parallel_radix_partition_optimized(&part); #else parallel_radix_partition(&part); #endif /* wait at a barrier until each thread copies out */ BARRIER_ARRIVE(args->barrier, rv); /********** end of 1st partitioning phase ******************/ /* 3. first thread creates partitioning tasks for 2nd pass */ if(my_tid == 0) { for(i = 0; i < fanOut; i++) { int32_t ntupR = outputR[i+1] - outputR[i] - PADDING_TUPLES; int32_t ntupS = outputS[i+1] - outputS[i] - PADDING_TUPLES; #ifdef SKEW_HANDLING if(ntupR > thresh1 || ntupS > thresh1){ DEBUGMSG(1, "Adding to skew_queue= R:%d, S:%d\n", ntupR, ntupS); task_t * t = task_queue_get_slot(skew_queue); t->relR.num_tuples = t->tmpR.num_tuples = ntupR; t->relR.tuples = args->tmpR + outputR[i]; t->tmpR.tuples = args->relR + outputR[i]; t->relS.num_tuples = t->tmpS.num_tuples = ntupS; t->relS.tuples = args->tmpS + outputS[i]; t->tmpS.tuples = args->relS + outputS[i]; task_queue_add(skew_queue, t); } else #endif if(ntupR > 0 && ntupS > 0) { task_t * t = task_queue_get_slot(part_queue); t->relR.num_tuples = t->tmpR.num_tuples = ntupR; t->relR.tuples = args->tmpR + outputR[i]; t->tmpR.tuples = args->relR + outputR[i]; t->relS.num_tuples = t->tmpS.num_tuples = ntupS; t->relS.tuples = args->tmpS + outputS[i]; t->tmpS.tuples = args->relS + outputS[i]; task_queue_add(part_queue, t); } } /* debug partitioning task queue */ DEBUGMSG(1, "Pass-2: # partitioning tasks = %d\n", part_queue->count); } SYNC_TIMER_STOP(&args->localtimer.sync3); /* wait at a barrier until first thread adds all partitioning tasks */ BARRIER_ARRIVE(args->barrier, rv); /* global barrier sync point-3 */ SYNC_GLOBAL_STOP(&args->globaltimer->sync3, my_tid); /************ 2nd pass of multi-pass partitioning ********************/ /* 4. now each thread further partitions and add to join task queue **/ #if NUM_PASSES==1 /* If the partitioning is single pass we directly add tasks from pass-1 */ task_queue_t * swap = join_queue; join_queue = part_queue; /* part_queue is used as a temporary queue for handling skewed parts */ part_queue = swap; #elif NUM_PASSES==2 while((task = task_queue_get_atomic(part_queue))){ serial_radix_partition(task, join_queue, R, D); } #else #warning Only 2-pass partitioning is implemented, set NUM_PASSES to 2! #endif #ifdef SKEW_HANDLING /* Partitioning pass-2 for skewed relations */ part.R = R; part.D = D; part.thrargs = args; part.padding = SMALL_PADDING_TUPLES; while(1) { if(my_tid == 0) { *args->skewtask = task_queue_get_atomic(skew_queue); } BARRIER_ARRIVE(args->barrier, rv); if( *args->skewtask == NULL) break; DEBUGMSG((my_tid==0), "Got skew task = R: %d, S: %d\n", (*args->skewtask)->relR.num_tuples, (*args->skewtask)->relS.num_tuples); int32_t numperthr = (*args->skewtask)->relR.num_tuples / args->nthreads; const int fanOut2 = (1 << D); free(outputR); free(outputS); outputR = (int32_t*) calloc(fanOut2 + 1, sizeof(int32_t)); outputS = (int32_t*) calloc(fanOut2 + 1, sizeof(int32_t)); free(args->histR[my_tid]); free(args->histS[my_tid]); args->histR[my_tid] = (int32_t*) calloc(fanOut2, sizeof(int32_t)); args->histS[my_tid] = (int32_t*) calloc(fanOut2, sizeof(int32_t)); /* wait until each thread allocates memory */ BARRIER_ARRIVE(args->barrier, rv); /* 1. partitioning for relation R */ part.rel = (*args->skewtask)->relR.tuples + my_tid * numperthr; part.tmp = (*args->skewtask)->tmpR.tuples; part.hist = args->histR; part.output = outputR; part.num_tuples = (my_tid == (args->nthreads-1)) ? ((*args->skewtask)->relR.num_tuples - my_tid * numperthr) : numperthr; part.total_tuples = (*args->skewtask)->relR.num_tuples; part.relidx = 2; /* meaning this is pass-2, no syncstats */ parallel_radix_partition(&part); numperthr = (*args->skewtask)->relS.num_tuples / args->nthreads; /* 2. partitioning for relation S */ part.rel = (*args->skewtask)->relS.tuples + my_tid * numperthr; part.tmp = (*args->skewtask)->tmpS.tuples; part.hist = args->histS; part.output = outputS; part.num_tuples = (my_tid == (args->nthreads-1)) ? ((*args->skewtask)->relS.num_tuples - my_tid * numperthr) : numperthr; part.total_tuples = (*args->skewtask)->relS.num_tuples; part.relidx = 2; /* meaning this is pass-2, no syncstats */ parallel_radix_partition(&part); /* wait at a barrier until each thread copies out */ BARRIER_ARRIVE(args->barrier, rv); /* first thread adds join tasks */ if(my_tid == 0) { const int THR1 = THRESHOLD1(args->nthreads); for(i = 0; i < fanOut2; i++) { int32_t ntupR = outputR[i+1] - outputR[i] - SMALL_PADDING_TUPLES; int32_t ntupS = outputS[i+1] - outputS[i] - SMALL_PADDING_TUPLES; if(ntupR > THR1 || ntupS > THR1){ DEBUGMSG(1, "Large join task = R: %d, S: %d\n", ntupR, ntupS); /* use part_queue temporarily */ for(int k=0; k < args->nthreads; k++) { int ns = (k == args->nthreads-1) ? (ntupS - k*(ntupS/args->nthreads)) : (ntupS/args->nthreads); task_t * t = task_queue_get_slot(part_queue); t->relR.num_tuples = t->tmpR.num_tuples = ntupR; t->relR.tuples = (*args->skewtask)->tmpR.tuples + outputR[i]; t->tmpR.tuples = (*args->skewtask)->relR.tuples + outputR[i]; t->relS.num_tuples = t->tmpS.num_tuples = ns; //ntupS; t->relS.tuples = (*args->skewtask)->tmpS.tuples + outputS[i] //; + k*(ntupS/args->nthreads); t->tmpS.tuples = (*args->skewtask)->relS.tuples + outputS[i] //; + k*(ntupS/args->nthreads); task_queue_add(part_queue, t); } } else if(ntupR > 0 && ntupS > 0) { task_t * t = task_queue_get_slot(join_queue); t->relR.num_tuples = t->tmpR.num_tuples = ntupR; t->relR.tuples = (*args->skewtask)->tmpR.tuples + outputR[i]; t->tmpR.tuples = (*args->skewtask)->relR.tuples + outputR[i]; t->relS.num_tuples = t->tmpS.num_tuples = ntupS; t->relS.tuples = (*args->skewtask)->tmpS.tuples + outputS[i]; t->tmpS.tuples = (*args->skewtask)->relS.tuples + outputS[i]; task_queue_add(join_queue, t); DEBUGMSG(1, "Join added = R: %d, S: %d\n", t->relR.num_tuples, t->relS.num_tuples); } } } } /* add large join tasks in part_queue to the front of the join queue */ if(my_tid == 0) { while((task = task_queue_get_atomic(part_queue))) task_queue_add(join_queue, task); } #endif free(outputR); free(outputS); SYNC_TIMER_STOP(&args->localtimer.sync4); /* wait at a barrier until all threads add all join tasks */ BARRIER_ARRIVE(args->barrier, rv); /* global barrier sync point-4 */ SYNC_GLOBAL_STOP(&args->globaltimer->sync4, my_tid); #ifndef NO_TIMING if(my_tid == 0) stopTimer(&args->timer3);/* partitioning finished */ #endif DEBUGMSG((my_tid == 0), "Number of join tasks = %d\n", join_queue->count); #ifdef PERF_COUNTERS if(my_tid == 0){ PCM_stop(); PCM_log("======= Partitioning phase profiling results ======\n"); PCM_printResults(); PCM_start(); } /* Just to make sure we get consistent performance numbers */ BARRIER_ARRIVE(args->barrier, rv); #endif while((task = task_queue_get_atomic(join_queue))){ /* do the actual join. join method differs for different algorithms, i.e. bucket chaining, histogram-based, histogram-based with simd & prefetching */ results += args->join_function(&task->relR, &task->relS, &task->tmpR); args->parts_processed ++; } args->result = results; /* this thread is finished */ SYNC_TIMER_STOP(&args->localtimer.finish_time); #ifndef NO_TIMING /* this is for just reliable timing of finish time */ BARRIER_ARRIVE(args->barrier, rv); if(my_tid == 0) { /* Actually with this setup we're not timing build */ stopTimer(&args->timer2);/* build finished */ stopTimer(&args->timer1);/* probe finished */ gettimeofday(&args->end, NULL); } #endif /* global finish time */ SYNC_GLOBAL_STOP(&args->globaltimer->finish_time, my_tid); #ifdef PERF_COUNTERS if(my_tid == 0) { PCM_stop(); PCM_log("=========== Build+Probe profiling results =========\n"); PCM_printResults(); PCM_log("===================================================\n"); PCM_cleanup(); } /* Just to make sure we get consistent performance numbers */ BARRIER_ARRIVE(args->barrier, rv); #endif return 0; }