/*
 * crc - software CRC over a list, processed in BRAM_SIZE-element chunks.
 *
 * list_ptr : start of the data; it is walked as an array of Huint.
 * size     : number of Huint elements in the list.
 * done     : not used for slave accelerator calls.
 *
 * Returns SUCCESS when every chunk's sw_crc() call returns SUCCESS (also
 * when size is 0, in which case sw_crc() is never called); otherwise the
 * first non-SUCCESS value returned by sw_crc().
 *
 * Side effect: increments the accelerator SW counter of the VHWTI whose
 * base is read through getpvr(1, ...).
 */
Hint crc (void * list_ptr, Huint size, Huint * done)
{
    // Get VHWTI from PVRs
    // NOTE(review): poly_init in this file passes the variable itself
    // (getpvr(1,vhwti_base)) rather than its address -- confirm which form
    // getpvr expects.
    Huint vhwti_base = 0;
    getpvr(1,&vhwti_base);

    // Increment SW counter
    Huint sw_counter = _hwti_get_accelerator_sw_counter( vhwti_base );
    _hwti_set_accelerator_sw_counter( vhwti_base, ++sw_counter);

    // Number of chunks needed: one per full BRAM_SIZE elements, plus one
    // more for a partial trailing chunk.
    Huint chunk_total = size / BRAM_SIZE;
    if ((size % BRAM_SIZE) != 0) {
        chunk_total++;
    }

    Hint status = SUCCESS;
    Huint chunk = 0;

    while (chunk < chunk_total) {
        // Only the last chunk can be shorter than BRAM_SIZE.
        Huint chunk_len = 0;
        if (size < (BRAM_SIZE*(chunk+1))) {
            chunk_len = size-(BRAM_SIZE*chunk);
        } else {
            chunk_len = BRAM_SIZE;
        }

        // Starting element of this chunk.
        Huint * chunk_ptr = (Huint *) ((Huint *)list_ptr + BRAM_SIZE*chunk);

        // Run crc in software; stop at the first failure.
        status = (sw_crc((void *) chunk_ptr, chunk_len));
        if (status != SUCCESS) {
            break;
        }

        chunk++;
    }

    return status;
}
// -------------------------------------------------------------- // // Initialization routine for all polymorphic functions // // -------------------------------------------------------------- // Hbool poly_init(Hint acc, Huint size) { // Get VHWTI Huint vhwti_base = 0; // Get VHWTI from PVRs getpvr(1,vhwti_base); // Use Accelerator? Hbool use_accelerator = useHW(acc,size); if (use_accelerator) { // Increment HW counter Huint hw_counter = _hwti_get_accelerator_hw_counter( vhwti_base ); _hwti_set_accelerator_hw_counter( vhwti_base, ++hw_counter); } else { // Increment SW counter Huint sw_counter = _hwti_get_accelerator_sw_counter( vhwti_base ); _hwti_set_accelerator_sw_counter( vhwti_base, ++sw_counter); } return use_accelerator; }
int main() { printf("--- combined2 Kernel benchmark ---\n"); printf("Number of Slave processors: %d\n", NUM_AVAILABLE_HETERO_CPUS); #ifdef OPCODE_FLAGGING printf("-->Opcode flagging ENABLED\n"); #else printf("-->Opcode flagging DISABLED\n"); #endif // Initialize various host tables once. init_host_tables(); // Create Queue software thread hthread_t queue_tid; hthread_attr_t queue_attr; hthread_attr_init(&queue_attr); if (thread_create(&queue_tid, &queue_attr,queue_thread_FUNC_ID,(void *) &exec_time[0], SOFTWARE_THREAD,0)){ printf("Error creating Queue thread\n"); while(1); } // Reset create_overhead = 0; Huint i = 0; // PI pi_t thread_data[PI_NUM_THREADS]; for (i = 0; i < PI_NUM_THREADS; i++) { thread_data[i].pi = 0; thread_data[i].MaxIterations = PI_MAX_ITERATIONS; } // HISTOGRAM // Thread attribute structures histogram_t * thread_arg = (histogram_t *) malloc(sizeof(histogram_t) * HISTOGRAM_NUM_THREADS); assert (thread_arg != NULL); // Array Structures int my_array[HISTOGRAM_NUM_THREADS][ARR_SIZE]; int my_hist[HISTOGRAM_NUM_THREADS][NUM_BINS]; int num_ops = 0, j = 0;; // Initialize histograms for (j = 0; j < HISTOGRAM_NUM_THREADS; j++) { int i; for (i = 0; i < NUM_BINS; i++) my_hist[j][i] = 0; for (i = 0; i < ARR_SIZE; i++) my_array[j][i] = i+num_ops % MOD_VAL; } // Initialize thread argument for (j = 0; j < HISTOGRAM_NUM_THREADS; j++) { thread_arg[j].array = (int *)&my_array[j][0]; thread_arg[j].hist = (int *)&my_hist[j][0]; thread_arg[j].max_value = MOD_VAL - 1; thread_arg[j].min_value = 0; } // -------- DISTANCE --------------- // // Thread attribute structures distance_t distance_arg[DISTANCE_NUM_THREADS]; float vals_x0[DISTANCE_ARR_LENGTH]; float vals_x1[DISTANCE_ARR_LENGTH]; float vals_y0[DISTANCE_ARR_LENGTH]; float vals_y1[DISTANCE_ARR_LENGTH]; float vals_ds[DISTANCE_ARR_LENGTH]; for (j = 0; j < DISTANCE_ARR_LENGTH; j++) { vals_x0[j] = (float) DISTANCE_ARR_LENGTH - j; vals_y0[j] = (float) DISTANCE_ARR_LENGTH - j; vals_x1[j] = (float) j + 1; vals_y1[j] = 
(float) DISTANCE_ARR_LENGTH - j + 1; } // Initialize thread arguments int num_items = DISTANCE_ARR_LENGTH/DISTANCE_NUM_THREADS; int extra_items = DISTANCE_ARR_LENGTH - (num_items*DISTANCE_NUM_THREADS); for ( j= 0; j < DISTANCE_NUM_THREADS; j++) { distance_arg[j].x0s = &vals_x0[j*(num_items)]; distance_arg[j].y0s = &vals_y0[j*(num_items)]; distance_arg[j].x1s = &vals_x1[j*(num_items)]; distance_arg[j].y1s = &vals_y1[j*(num_items)]; distance_arg[j].distances = &vals_ds[j*(num_items)]; distance_arg[j].length = num_items; } // Add in extra items for the last thread if needed distance_arg[j-1].length += extra_items; // Matrix Multiply matrix_t matrix_arg[MATRIX_NUM_THREADS]; int n; for (n = 0; n < MATRIX_NUM_THREADS; n++) { for (i = 0; i < MATRIX_A_ROW; i++) { for (j = 0; j < MATRIX_A_COL; j++) { matrix_arg[n].matrixA[i][j] = i + j; matrix_arg[n].matrixB[i][j] = i + j; matrix_arg[n].matrixC[i][j] = 0; } } } // -------- Find MAx-------------- // max_t findmax_arg[FINDMAX_NUM_THREADS]; for (i = 0; i < FINDMAX_NUM_THREADS; i++) { findmax_arg[i].length = FINDMAX_LENGTH; findmax_arg[i].shift_amount = sizeof(findmax_arg[i].A[0]); for (j = 0; j < FINDMAX_LENGTH; j++) { findmax_arg[i].A[j] = (int) (rand() % FINDMAX_LENGTH); findmax_arg[i].B[j] = (int) (rand() % FINDMAX_LENGTH); findmax_arg[i].result[j] = 0; } } // Set all threads to detached for(i = 0; i < NUM_THREADS; i++) { hthread_attr_init(&attr[i]); hthread_attr_setdetachstate(&attr[i], HTHREAD_CREATE_DETACHED); } hthread_time_t start = hthread_time_get(); thread_create( &tid[0 ], &attr[0 ], distance_thread_FUNC_ID, (void *) &distance_arg[2], DYNAMIC_HW, 0); thread_create( &tid[1 ], &attr[1 ], pi_thread_FUNC_ID, (void *) &thread_data[10], DYNAMIC_HW, 0); thread_create( &tid[2 ], &attr[2 ], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[6], DYNAMIC_HW, 0); thread_create( &tid[3 ], &attr[3 ], distance_thread_FUNC_ID, (void *) &distance_arg[0], DYNAMIC_HW, 0); thread_create( &tid[4 ], &attr[4 ], distance_thread_FUNC_ID, 
(void *) &distance_arg[6], DYNAMIC_HW, 0); thread_create( &tid[5 ], &attr[5 ], pi_thread_FUNC_ID, (void *) &thread_data[4], DYNAMIC_HW, 0); thread_create( &tid[6 ], &attr[6 ], find_max_thread_FUNC_ID, (void *) &findmax_arg[9], DYNAMIC_HW, 0); thread_create( &tid[7 ], &attr[7 ], pi_thread_FUNC_ID, (void *) &thread_data[12], DYNAMIC_HW, 0); thread_create( &tid[8 ], &attr[8 ], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[0], DYNAMIC_HW, 0); thread_create( &tid[9 ], &attr[9 ], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[10], &attr[10], find_max_thread_FUNC_ID, (void *) &findmax_arg[8], DYNAMIC_HW, 0); thread_create( &tid[11], &attr[11], pi_thread_FUNC_ID, (void *) &thread_data[2], DYNAMIC_HW, 0); thread_create( &tid[12], &attr[12], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[13], &attr[13], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[14], &attr[14], distance_thread_FUNC_ID, (void *) &distance_arg[7], DYNAMIC_HW, 0); thread_create( &tid[15], &attr[15], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[16], &attr[16], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[17], DYNAMIC_HW, 0); thread_create( &tid[17], &attr[17], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[18], &attr[18], find_max_thread_FUNC_ID, (void *) &findmax_arg[4], DYNAMIC_HW, 0); thread_create( &tid[19], &attr[19], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[11], DYNAMIC_HW, 0); thread_create( &tid[20], &attr[20], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[8], DYNAMIC_HW, 0); thread_create( &tid[21], &attr[21], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[13], DYNAMIC_HW, 0); thread_create( &tid[22], &attr[22], pi_thread_FUNC_ID, (void *) &thread_data[6], DYNAMIC_HW, 0); thread_create( &tid[23], &attr[23], distance_thread_FUNC_ID, (void *) &distance_arg[4], DYNAMIC_HW, 0); 
thread_create( &tid[24], &attr[24], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[25], &attr[25], distance_thread_FUNC_ID, (void *) &distance_arg[1], DYNAMIC_HW, 0); thread_create( &tid[26], &attr[26], pi_thread_FUNC_ID, (void *) &thread_data[11], DYNAMIC_HW, 0); thread_create( &tid[27], &attr[27], find_max_thread_FUNC_ID, (void *) &findmax_arg[3], DYNAMIC_HW, 0); thread_create( &tid[28], &attr[28], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[15], DYNAMIC_HW, 0); thread_create( &tid[29], &attr[29], distance_thread_FUNC_ID, (void *) &distance_arg[8], DYNAMIC_HW, 0); thread_create( &tid[30], &attr[30], pi_thread_FUNC_ID, (void *) &thread_data[9], DYNAMIC_HW, 0); thread_create( &tid[31], &attr[31], find_max_thread_FUNC_ID, (void *) &findmax_arg[10], DYNAMIC_HW, 0); thread_create( &tid[32], &attr[32], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[33], &attr[33], pi_thread_FUNC_ID, (void *) &thread_data[1], DYNAMIC_HW, 0); thread_create( &tid[34], &attr[34], histogram_thread_FUNC_ID, (void*)(&thread_arg[1]),DYNAMIC_HW,0 ); thread_create( &tid[35], &attr[35], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[36], &attr[36], distance_thread_FUNC_ID, (void *) &distance_arg[5], DYNAMIC_HW, 0); thread_create( &tid[37], &attr[37], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[38], &attr[38], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[39], &attr[39], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[40], &attr[40], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[41], &attr[41], pi_thread_FUNC_ID, (void *) &thread_data[0], DYNAMIC_HW, 0); thread_create( &tid[42], &attr[42], pi_thread_FUNC_ID, (void *) &thread_data[14], DYNAMIC_HW, 0); thread_create( &tid[43], &attr[43], 
matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[12], DYNAMIC_HW, 0); thread_create( &tid[44], &attr[44], histogram_thread_FUNC_ID, (void*)(&thread_arg[2]),DYNAMIC_HW,0 ); thread_create( &tid[45], &attr[45], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[3], DYNAMIC_HW, 0); thread_create( &tid[46], &attr[46], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[47], &attr[47], distance_thread_FUNC_ID, (void *) &distance_arg[3], DYNAMIC_HW, 0); thread_create( &tid[48], &attr[48], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[2], DYNAMIC_HW, 0); thread_create( &tid[49], &attr[49], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[16], DYNAMIC_HW, 0); thread_create( &tid[50], &attr[50], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[7], DYNAMIC_HW, 0); thread_create( &tid[51], &attr[51], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[5], DYNAMIC_HW, 0); thread_create( &tid[52], &attr[52], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[53], &attr[53], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[14], DYNAMIC_HW, 0); thread_create( &tid[54], &attr[54], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[10], DYNAMIC_HW, 0); thread_create( &tid[55], &attr[55], pi_thread_FUNC_ID, (void *) &thread_data[13], DYNAMIC_HW, 0); thread_create( &tid[56], &attr[56], find_max_thread_FUNC_ID, (void *) &findmax_arg[12], DYNAMIC_HW, 0); thread_create( &tid[57], &attr[57], pi_thread_FUNC_ID, (void *) &thread_data[8], DYNAMIC_HW, 0); thread_create( &tid[58], &attr[58], find_max_thread_FUNC_ID, (void *) &findmax_arg[5], DYNAMIC_HW, 0); thread_create( &tid[59], &attr[59], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[60], &attr[60], find_max_thread_FUNC_ID, (void *) &findmax_arg[1], DYNAMIC_HW, 0); thread_create( &tid[61], &attr[61], find_max_thread_FUNC_ID, (void *) &findmax_arg[0], DYNAMIC_HW, 0); thread_create( &tid[62], &attr[62], matrix_mult_thread_FUNC_ID, (void 
*) &matrix_arg[1], DYNAMIC_HW, 0); thread_create( &tid[63], &attr[63], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[64], &attr[64], pi_thread_FUNC_ID, (void *) &thread_data[7], DYNAMIC_HW, 0); thread_create( &tid[65], &attr[65], find_max_thread_FUNC_ID, (void *) &findmax_arg[11], DYNAMIC_HW, 0); thread_create( &tid[66], &attr[66], find_max_thread_FUNC_ID, (void *) &findmax_arg[6], DYNAMIC_HW, 0); thread_create( &tid[67], &attr[67], find_max_thread_FUNC_ID, (void *) &findmax_arg[2], DYNAMIC_HW, 0); thread_create( &tid[68], &attr[68], pi_thread_FUNC_ID, (void *) &thread_data[5], DYNAMIC_HW, 0); thread_create( &tid[69], &attr[69], histogram_thread_FUNC_ID, (void*)(&thread_arg[1]),DYNAMIC_HW,0 ); thread_create( &tid[70], &attr[70], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[71], &attr[71], find_max_thread_FUNC_ID, (void *) &findmax_arg[7], DYNAMIC_HW, 0); thread_create( &tid[72], &attr[72], pi_thread_FUNC_ID, (void *) &thread_data[3], DYNAMIC_HW, 0); thread_create( &tid[73], &attr[73], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); thread_create( &tid[74], &attr[74], histogram_thread_FUNC_ID, (void*)(&thread_arg[0]),DYNAMIC_HW,0 ); thread_create( &tid[75], &attr[75], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[9], DYNAMIC_HW, 0); thread_create( &tid[76], &attr[76], matrix_mult_thread_FUNC_ID, (void *) &matrix_arg[4], DYNAMIC_HW, 0); thread_create( &tid[77], &attr[77], mandel_thread_FUNC_ID, (void *) MANDEL_MAX_ITERATIONS, DYNAMIC_HW, 0); // Wait until all threads are finished while(get_num_free_slaves() < NUM_AVAILABLE_HETERO_CPUS || thread_entries != 0) { if (thread_entries != 0) hthread_yield(); } hthread_time_t stop = hthread_time_get(); printf("---------------------------\n"); hthread_time_t diff; hthread_time_diff(diff, stop, start); printf("Total Execution Time: %.2f ms\n", hthread_time_msec(diff)); printf("Total Execution Time: %.2f us\n", 
hthread_time_usec(diff)); #if 0 // Grab the total number of calls statistic. printf("Total number of thread_create (DYNAMIC) calls: %d\n", total_calls); printf("---------------------------------------------------\n"); printf("Perfect Ratio: %03d / %03d = %0.2f\n", perfect_match_counter, total_calls, perfect_match_counter / (1.0f * total_calls)); printf("Best Ratio: %03d / %03d = %0.2f\n", best_match_counter, total_calls, best_match_counter / (1.0f * total_calls)); printf("Better Ratio: %03d / %03d = %0.2f\n", better_match_counter, total_calls, better_match_counter / (1.0f * total_calls)); printf("Possible Ratio: %03d / %03d = %0.2f\n", possible_match_counter, total_calls, possible_match_counter / (1.0f * total_calls)); perfect_match_counter = 0; best_match_counter = 0; better_match_counter = 0; possible_match_counter = 0; Huint hw_counter[NUM_AVAILABLE_HETERO_CPUS]; Huint sw_counter[NUM_AVAILABLE_HETERO_CPUS]; Huint pr_counter[NUM_AVAILABLE_HETERO_CPUS]; Huint total_hw_count = 0; Huint total_sw_count = 0; Huint total_pr_count = 0; for (i = 0; i < NUM_AVAILABLE_HETERO_CPUS; i++) { hw_counter[i] = _hwti_get_accelerator_hw_counter(hwti_array[i]); sw_counter[i] = _hwti_get_accelerator_sw_counter(hwti_array[i]); pr_counter[i] = _hwti_get_accelerator_pr_counter(hwti_array[i]); total_hw_count += hw_counter[i]; total_sw_count += sw_counter[i]; total_pr_count += pr_counter[i]; // Manually Reset _hwti_set_accelerator_hw_counter(hwti_array[i], 0); _hwti_set_accelerator_sw_counter(hwti_array[i], 0); _hwti_set_accelerator_pr_counter(hwti_array[i], 0); } printf("Total HW Counter: %d\n", total_hw_count); printf("Total SW Counter: %d\n", total_sw_count); printf("Total PR Counter: %d\n", total_pr_count); printf("-----------------------\n"); if (total_hw_count) // if total_hw_count != 0 printf("Total PR Counter / HW Counter = %f\n", total_pr_count / (1.0 *total_hw_count)); printf("Total PR Counter / HW+SW Counter = %f\n", total_pr_count / (1.0 *(total_hw_count+total_sw_count))); 
#endif printf("Total OS overhead (thread_create) = %f msec\n", hthread_time_msec(create_overhead)); #if 0 hthread_time_t software_time = 0; for (i = 0; i < NUM_AVAILABLE_HETERO_CPUS; i++) { volatile hthread_time_t * temp = (hthread_time_t *) (hwti_array[i] + 0x100); printf("%d: Software Execution = %f msec\n",i, hthread_time_msec(*temp)); software_time += *temp; } printf("Total Software Execution = %f msec\n", hthread_time_msec(software_time)); #endif // Display thread times for (i = 0; i < NUM_THREADS; i++) { // Determine which slave ran this thread based on address Huint base = attr[i].hardware_addr - HT_HWTI_COMMAND_OFFSET; Huint slave_num = (base & 0x00FF0000) >> 16; printf("Execution time (TID : %d, Slave : %d, HW ADDRESS = 0x%08x)\n", tid[i], slave_num, attr[i].hardware_addr); } printf("--- Done ---\n"); return 0; }