void * task_pingpong_black(void *arg) { arg_t * Arg = (arg_t *)arg; register int i; int result = 0; ERRHAND_TILERA(tmc_cpus_set_my_cpu(Arg->cpu)); /*ERRHAND_TILERA*/(tmc_udn_activate()); pthread_barrier_wait(&computation_start); #if TEST_VERBOSE >= 1 printf("[INFO] black: %d\n", tmc_cpus_get_my_cpu()); #endif for (i=0; i<Arg->num_scambi; i++) { int * received; received = ch_receive(CH0_IMPL)(Arg->ch[0]); ch_send(CH1_IMPL)(Arg->ch[1], received); if (NULL == received) { result ++; fprintf(stderr, "[ERROR] black: null received\n"); } } ERRHAND_TILERA(tmc_udn_close()); return (void *)result; }
void * task_pingpong_white(void *arg) { arg_t * Arg = (arg_t *)arg; register int i; int integer; int result = 0; ERRHAND_TILERA(tmc_cpus_set_my_cpu(Arg->cpu)); /*ERRHAND_TILERA*/(tmc_udn_activate()); pthread_barrier_wait(&computation_start); #if TEST_VERBOSE >= 1 printf("[INFO] white: cpu %d\n", tmc_cpus_get_my_cpu()); #endif for (i=0; i<Arg->num_scambi; i++) { int * received = NULL; uint_reg_t a, b; integer = i; a = GET_CLOCK_CYCLE; atomic_compiler_barrier(); ch_send(CH0_IMPL)(Arg->ch[0], &integer); received = ch_receive(CH1_IMPL)(Arg->ch[1]); atomic_compiler_barrier(); b = GET_CLOCK_CYCLE; if (b>a) { prepareStatistics(Tscambio, b-a); } else { //fprintf(stderr, "\n\n>>> %u %u \n\n", a, b); } #if TEST_DEBUG >= 1 if (b-a > 700) { fprintf(stderr, "i %-20d Tscambio %"PRIu64"\n", i, Tscambio[0]); } #endif if (NULL == received) { result ++; fprintf(stderr, "[ERROR] white: null received\n"); } else { if (i != *received) result ++; } } ERRHAND_TILERA(tmc_udn_close()); return (void *)result; }
void ssmp_mem_init_platf(int id, int num_ues) { ssmp_id_ = id; ssmp_num_ues_ = num_ues; // Now that we're bound to a core, attach to our UDN rectangle. if (tmc_udn_activate() < 0) tmc_task_die("Failure in 'tmc_udn_activate()'."); udn_header = (DynamicHeader* ) memalign(SSMP_CACHE_LINE_SIZE, num_ues * sizeof (DynamicHeader)); if (udn_header == NULL) { tmc_task_die("Failure in allocating dynamic headers"); } int r; for (r = 0; r < num_ues; r++) { int _cpu = tmc_cpus_find_nth_cpu(&cpus, id_to_core[r]); DynamicHeader header = tmc_udn_header_from_cpu(_cpu); udn_header[r] = header; } }
/** Main function. */ int main(int argc, char** argv) { // Number of instances of this program to run // (including the initial parent process). int instances = 4; // Detect whether we're the parent or an exec'd child, int is_parent = is_parent_process(); // Get the application's affinity set. // We'll use the first N available cpus from this set. // NOTE: this means parent should _not_ call any functions // that shrink the affinity set prior to go_parellel(); cpu_set_t cpus; int status = tmc_cpus_get_my_affinity(&cpus); check_tmc_status(status, "tmc_cpus_get_my_affinity()"); // Define UDN cpu set as first N available cpus status = udn_init(instances, &cpus); check_tmc_status(status, "udn_init()"); // Initialize "common" shared memory with default size. status = tmc_cmem_init(0); check_tmc_status(status, "tmc_cmem_init()"); // Allocate barrier data structure in shared memory. tmc_sync_barrier_t* barrier = NULL; if (is_parent) { // Allocate/initialize barrier data structure in common memory. barrier = (tmc_sync_barrier_t*) tmc_cmem_malloc(sizeof(*barrier)); if (barrier == NULL) tmc_task_die("barrier_init(): " "Failed to allocate barrier data structure."); tmc_sync_barrier_init(barrier, instances); } // Pass the barrier pointer to any exec'd children. share_pointer("SHARED_BARRIER_POINTER", (void**) &barrier); // Fork/exec any additional child processes, // each locked to its own tile, // and get index [0 -- instances-1] of current process. int index = go_parallel(instances, &cpus, argc, argv); pid_t pid = getpid(); printf("Process(pid=%i), index=%i: started.\n", pid, index); // Enable UDN access for this process (parent or child). // Note: this needs to be done after we're locked to a tile. status = tmc_udn_activate(); check_tmc_status(status, "tmc_udn_activate()"); // Wait here until all other processes have caught up. tmc_sync_barrier_wait(barrier); // Send/receive a value over the UDN. int from = 0; int to = instances - 1; if (index == from) { int value = 42; printf("Process(pid=%i), index=%i: sending value %i to cpu %i...\n", pid, index, value, to); udn_send_to_nth_cpu(to, &cpus, value); printf("Process(pid=%i), index=%i: sent value %i to cpu %i.\n", pid, index, value, to); } else if (index == to) { int received = 0; printf("Process(pid=%i), index=%i: receiving value...\n", pid, index); received = udn_receive(); printf("Process(pid=%i), index=%i: received value %i...\n", pid, index, received); } // Wait here until all other processes have caught up. tmc_sync_barrier_wait(barrier); printf("Process(pid=%i), index=%i: finished.\n", pid, index); // We're done. return 0; }
int main(int argc, char** argv) { // Process arguments. int i = 1; while (i < argc) { // Allow "-i FILE" to override STDIN. if (i + 2 <= argc && !strcmp(argv[i], "-i")) { const char* file = argv[i+1]; if (dup2(open(file, O_RDONLY), STDIN_FILENO) < 0) { fprintf(stderr, "Could not open '%s'.\n", file); exit(1); } i += 2; } // Allow "-o FILE" to override STDOUT. else if (i + 2 <= argc && !strcmp(argv[i], "-o")) { const char* file = argv[i+1]; int fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0666); if (dup2(fd, STDOUT_FILENO) < 0) { fprintf(stderr, "Could not open '%s'.\n", file); exit(1); } i += 2; } else { break; } } // Get the UDN coordinates of the BME server tile from our arguments. int server_x, server_y; if (i + 1 != argc || sscanf(argv[i], "%d,%d", &server_x, &server_y) != 2) { fprintf(stderr, "usage: linux_client [-i IN] [-o OUT] <server_x>,<server_y>\n"); exit(1); } // Create a UDN header for the server. DynamicHeader bme_server = { .bits.dest_x = server_x, .bits.dest_y = server_y }; // Bind ourselves to our current CPU, and set up a UDN hardwall // which encompasses the entire chip, so that we can communicate // with the BME server. cpu_set_t cpus; tmc_cpus_clear(&cpus); tmc_cpus_grid_add_all(&cpus); tmc_cpus_set_my_cpu(tmc_cpus_get_my_current_cpu()); if (tmc_udn_init(&cpus) != 0) { perror("UDN hardwall create failed"); exit(1); } if (tmc_udn_activate() != 0) { perror("UDN hardwall activate failed"); exit(1); } // Get one huge page of memory. tmc_alloc_t alloc = TMC_ALLOC_INIT; tmc_alloc_set_huge(&alloc); tmc_alloc_set_home(&alloc, 0); tmc_alloc_set_shared(&alloc); int mlength = 1 << 24; void* maddr = tmc_alloc_map(&alloc, mlength); if (maddr == NULL) { perror("can't mmap"); exit(1); } // Lock down that memory and get its physical address and caching // information, using the bme_mem device driver. struct bme_user_mem_desc_io user_mem_desc; struct bme_phys_mem_desc_io phys_mem_desc; int fd = open("/dev/bme/mem", O_RDWR); if (fd < 0) { perror("couldn't open /dev/bme/mem"); exit(1); } // First we find out how many pages are in the region to be locked down. // (Given our allocation above, we know we must have exactly one large page, // but this is an example of what you would do for large regions.) //user_mem_desc.user.va = maddr; user_mem_desc.user.va = (uintptr_t)maddr; // user_mem_desc.user.va = (__u64)maddr; user_mem_desc.user.len = mlength; if (ioctl(fd, BME_IOC_GET_NUM_PAGES, &user_mem_desc) != 0) { perror("BME_IOC_GET_NUM_PAGES ioctl failed"); exit(1); } // Now that we know how many pages are there, we can request that they be // locked into physical memory, and retrieve their physical address and // cache mapping information. phys_mem_desc.user.va = (uintptr_t)maddr; phys_mem_desc.user.len = mlength; phys_mem_desc.phys = (uintptr_t)malloc(sizeof(struct bme_phys_mem_desc) * user_mem_desc.num_pages); phys_mem_desc.num_pages = user_mem_desc.num_pages; if (ioctl(fd, BME_IOC_LOCK_MEMORY, &phys_mem_desc) != 0) { perror("BME_IOC_LOCK_MEMORY ioctl failed"); exit(1); } // Send the BME application a message telling it about the memory we // just locked down. Since this is an example, we're only sending one // message, for one page. DynamicHeader my_hdr = tmc_udn_header_from_cpu(tmc_cpus_get_my_cpu()); struct bme_phys_mem_desc *phys = (struct bme_phys_mem_desc *)(uintptr_t)phys_mem_desc.phys; tmc_udn_send_6(bme_server, UDN0_DEMUX_TAG, EX_MSG_MAPPING, my_hdr.word, phys->pa, phys->pa >> 32, phys->pte, phys->pte >> 32); uint32_t reply = udn0_receive(); if (reply) { fprintf(stderr, "client: got bad response %d to MAPPING message\n", reply); exit(1); } // Now read our standard input into a buffer in the shared page; send // a request to the BME tile to process that data, putting the output // elsewhere in the shared page; and then write it to standard output. char* inbuf = maddr; char* outbuf = inbuf + PROCESSING_BUFSIZE; int len; while ((len = read(STDIN_FILENO, inbuf, PROCESSING_BUFSIZE)) > 0) { // Note that our message gives the server the offsets of the input and // output buffers, rather than pointers to them. This is because the // server has not mapped in the data at the same set of virtual addresses // we're using. We could arrange this, if desired, although it required // more coordination between the client and server. tmc_udn_send_5(bme_server, UDN0_DEMUX_TAG, EX_MSG_PROCESS, my_hdr.word, 0, len, PROCESSING_BUFSIZE); reply = udn0_receive(); if (reply != len) { fprintf(stderr, "client: got bad response %d to PROCESS " "message (expected %d)\n", reply, len); exit(1); } if (write(STDOUT_FILENO, outbuf, len) != len) { perror("write"); exit(1); } } return 0; }
//------------------------------------------------------------------------------ //--------------------------Thread function------------------------------------- //------------------------------------------------------------------------------ void *thread_fn(void *arg) { int ID=(*((int*)arg)); //Necessary local variables int count, count1, count2, start_addr, iterator; int row_count_A, col_count_B, el_count; DATA_TYPE temp_sum; uint_reg_t gather_sig; DATA_TYPE weight[3][3] = {{ -1, 0, 1 },{ -2, 0, 2 },{ -1, 0, 1 }}; //Set cpu and activate udn if(tmc_cpus_set_my_cpu(core_map[ID])!=0) { printf("Thread: %d CPU setting failed.\n",ID); exit(1); } tmc_udn_activate(); //Thread memory initialization DATA_TYPE *image, *gx, *gy; int n_out_rows=nrows-2; int factor=sizeof(uint_reg_t)/sizeof(DATA_TYPE); if(ID==0) { image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols); gx = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols); gy = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols); for(count=0; count<nrows; count++) { for(count1=0; count1<ncols; count1++) { image[count*ncols+count1]=count; } } } else { if(ID<n_out_rows%nthreads) { image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1+2)*ncols); gx = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1 )*ncols); gy = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1 )*ncols); } else { image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+2)*ncols); gx = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads )*ncols); gy = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads )*ncols); } } //calculate x and y co-ordinates int x=ID%(xmax+1); int y=ID/(xmax+1); //Time management variables double zero; //Reference time for iterations double scatter_s, scatter_e, scatter_d, compute_s, compute_e, compute_d, gather_s, gather_e, gather_d,total_s,total_e,total_d; struct timespec st; for(iterator=0; iterator<iterations; iterator++) { //-------------------------------------------------------------------------- //----------------------------Start of benchmark---------------------------- //--------------------Do this shit over iteration times--------------------- //-------------------------------------------------------------------------- //-------------------------Set reference time ------------------------------ clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); zero=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3; clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); total_s=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3; //------------------------Step 1: Naive scatter----------------------------- //Set start time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); scatter_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; if(ID==0) { start_addr=((n_out_rows%nthreads==0)?(n_out_rows/nthreads+1):(n_out_rows/nthreads+1+1)); for(count=1; count<nthreads; count++) { if(count<n_out_rows%nthreads) { send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=n_out_rows/nthreads+1; } else { send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=n_out_rows/nthreads; } } } else { if(ID<n_out_rows%nthreads) { receive126((uint_reg_t*)image,(n_out_rows/nthreads+1+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } else { receive126((uint_reg_t*)image,(n_out_rows/nthreads+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } } //Set end time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); scatter_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; scatter_d=scatter_e-scatter_s; //-------------------------------Sobel compute------------------------------ //Set start time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); compute_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; //Compute Gx int i,j; if(ID<n_out_rows%nthreads) { temp_sum=(DATA_TYPE)0; for(count=1; count<(n_out_rows/nthreads+1+2-1); count++) { for(count1=1; count1<(ncols-1); count1++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i]; } } gx[(count-1)*ncols+count1]=temp_sum; } } } else { temp_sum=(DATA_TYPE)0; for(count=1; count<(n_out_rows/nthreads+2-1); count++) { for(count1=1; count1<(ncols-1); count1++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i]; } } gx[(count-1)*ncols+count2]=temp_sum; } } } //Compute Gy if(ID<n_out_rows%nthreads) { temp_sum=(DATA_TYPE)0; for(count1=1; count1<(ncols-1); count1++) { for(count=1; count<(n_out_rows/nthreads+1+2-1); count++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j]; } } gy[(count-1)*ncols+count1]=temp_sum; } } } else { temp_sum=(DATA_TYPE)0; for(count1=1; count1<(ncols-1); count1++) { for(count=1; count<(n_out_rows/nthreads+2-1); count++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j]; } } gy[(count-1)*ncols+count2]=temp_sum; } } } //Set end time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); compute_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; compute_d=compute_e-compute_s; //---------------------------Gather----------------------------------------- //Set start time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); gather_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; if(ID==0) { start_addr=1; for(count=1; count<nthreads; count++) { if(count<n_out_rows%nthreads) { //Send a signal to a certain waiting thread DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]); tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig); //Collect Gx from that thread receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); //Collect Gy from that thread receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=(n_out_rows/nthreads+1); } else { //Send a signal to a certain waiting thread DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]); tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig); //Collect Gx from that thread receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); //Collect Gy from that thread receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=(n_out_rows/nthreads); } } } else { if(ID<n_out_rows%nthreads) { //Wait for signal from main signal gather_sig=tmc_udn2_receive(); //Send the partial gx send126((uint_reg_t*)gx,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); //Send the partial gy send126((uint_reg_t*)gy,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } else { //Wait for signal from main signal gather_sig=tmc_udn2_receive(); //Send the partial gx send126((uint_reg_t*)gx,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); //Send the partial gy send126((uint_reg_t*)gy,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } } //Set end time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); gather_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; gather_d=gather_e-gather_s; clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); total_e=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3; total_d+=(total_e-total_s); //---------------------------Log the data----------------------------------- log_memory[iterator*nthreads*9+ID*9+SCA_S]=scatter_s; log_memory[iterator*nthreads*9+ID*9+SCA_E]=scatter_e; log_memory[iterator*nthreads*9+ID*9+SCA_D]=scatter_d; log_memory[iterator*nthreads*9+ID*9+COMP_S]=compute_s; log_memory[iterator*nthreads*9+ID*9+COMP_E]=compute_e; log_memory[iterator*nthreads*9+ID*9+COMP_D]=compute_d; log_memory[iterator*nthreads*9+ID*9+GATH_S]=gather_s; log_memory[iterator*nthreads*9+ID*9+GATH_E]=gather_e; log_memory[iterator*nthreads*9+ID*9+GATH_D]=gather_d; //---------------------Barrier---------------------------------------------- barrier(ID); if(ID==0) printf("Iteration: %d\n",iterator); //-------------------------------------------------------------------------- //------------------------------End of benchmark---------------------------- //-------------------------------------------------------------------------- } //Print total time printf("%lf\n",total_d/iterations); //Free thread local memory free(image); free(gx); free(gy); pthread_exit(NULL); }