/*
 * UDN token-pass barrier.
 *
 * Protocol (as visible here): the thread pinned to core 0 injects a token
 * (a one-word UDN1 message) toward barrier_map[0]; every thread then blocks
 * on tmc_udn1_receive(), and each non-core-0 thread forwards the token to
 * barrier_map[ID] after receiving it.
 *
 * NOTE(review): barrier_map presumably encodes a ring/chain of CPUs so the
 * token visits every participant — confirm against its initialization; as
 * written, a thread leaves this function as soon as the token reaches it.
 *
 * Fix: bcast_sig was sent uninitialized.  The payload value is never
 * inspected, but reading an indeterminate automatic variable is undefined
 * behavior, so it is now explicitly zeroed.
 */
void barrier(int ID)
{
    uint_reg_t bcast_sig = 0;  /* token payload; value itself is unused */

    if (core_map[ID] == 0) {
        /* Thread on core 0 starts the token around the chain. */
        DynamicHeader header = tmc_udn_header_from_cpu(barrier_map[0]);
        tmc_udn_send_1(header, UDN1_DEMUX_TAG, bcast_sig);
    }

    /* Everyone (including the initiator) waits for the token. */
    bcast_sig = tmc_udn1_receive();

    if (core_map[ID] != 0) {
        /* Forward the token to the next participant in the chain. */
        DynamicHeader header = tmc_udn_header_from_cpu(barrier_map[ID]);
        tmc_udn_send_1(header, UDN1_DEMUX_TAG, bcast_sig);
    }
}
void ssmp_mem_init_platf(int id, int num_ues) { ssmp_id_ = id; ssmp_num_ues_ = num_ues; // Now that we're bound to a core, attach to our UDN rectangle. if (tmc_udn_activate() < 0) tmc_task_die("Failure in 'tmc_udn_activate()'."); udn_header = (DynamicHeader* ) memalign(SSMP_CACHE_LINE_SIZE, num_ues * sizeof (DynamicHeader)); if (udn_header == NULL) { tmc_task_die("Failure in allocating dynamic headers"); } int r; for (r = 0; r < num_ues; r++) { int _cpu = tmc_cpus_find_nth_cpu(&cpus, id_to_core[r]); DynamicHeader header = tmc_udn_header_from_cpu(_cpu); udn_header[r] = header; } }
int main(int argc, char** argv) { // Process arguments. int i = 1; while (i < argc) { // Allow "-i FILE" to override STDIN. if (i + 2 <= argc && !strcmp(argv[i], "-i")) { const char* file = argv[i+1]; if (dup2(open(file, O_RDONLY), STDIN_FILENO) < 0) { fprintf(stderr, "Could not open '%s'.\n", file); exit(1); } i += 2; } // Allow "-o FILE" to override STDOUT. else if (i + 2 <= argc && !strcmp(argv[i], "-o")) { const char* file = argv[i+1]; int fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0666); if (dup2(fd, STDOUT_FILENO) < 0) { fprintf(stderr, "Could not open '%s'.\n", file); exit(1); } i += 2; } else { break; } } // Get the UDN coordinates of the BME server tile from our arguments. int server_x, server_y; if (i + 1 != argc || sscanf(argv[i], "%d,%d", &server_x, &server_y) != 2) { fprintf(stderr, "usage: linux_client [-i IN] [-o OUT] <server_x>,<server_y>\n"); exit(1); } // Create a UDN header for the server. DynamicHeader bme_server = { .bits.dest_x = server_x, .bits.dest_y = server_y }; // Bind ourselves to our current CPU, and set up a UDN hardwall // which encompasses the entire chip, so that we can communicate // with the BME server. cpu_set_t cpus; tmc_cpus_clear(&cpus); tmc_cpus_grid_add_all(&cpus); tmc_cpus_set_my_cpu(tmc_cpus_get_my_current_cpu()); if (tmc_udn_init(&cpus) != 0) { perror("UDN hardwall create failed"); exit(1); } if (tmc_udn_activate() != 0) { perror("UDN hardwall activate failed"); exit(1); } // Get one huge page of memory. tmc_alloc_t alloc = TMC_ALLOC_INIT; tmc_alloc_set_huge(&alloc); tmc_alloc_set_home(&alloc, 0); tmc_alloc_set_shared(&alloc); int mlength = 1 << 24; void* maddr = tmc_alloc_map(&alloc, mlength); if (maddr == NULL) { perror("can't mmap"); exit(1); } // Lock down that memory and get its physical address and caching // information, using the bme_mem device driver. 
struct bme_user_mem_desc_io user_mem_desc; struct bme_phys_mem_desc_io phys_mem_desc; int fd = open("/dev/bme/mem", O_RDWR); if (fd < 0) { perror("couldn't open /dev/bme/mem"); exit(1); } // First we find out how many pages are in the region to be locked down. // (Given our allocation above, we know we must have exactly one large page, // but this is an example of what you would do for large regions.) //user_mem_desc.user.va = maddr; user_mem_desc.user.va = (uintptr_t)maddr; // user_mem_desc.user.va = (__u64)maddr; user_mem_desc.user.len = mlength; if (ioctl(fd, BME_IOC_GET_NUM_PAGES, &user_mem_desc) != 0) { perror("BME_IOC_GET_NUM_PAGES ioctl failed"); exit(1); } // Now that we know how many pages are there, we can request that they be // locked into physical memory, and retrieve their physical address and // cache mapping information. phys_mem_desc.user.va = (uintptr_t)maddr; phys_mem_desc.user.len = mlength; phys_mem_desc.phys = (uintptr_t)malloc(sizeof(struct bme_phys_mem_desc) * user_mem_desc.num_pages); phys_mem_desc.num_pages = user_mem_desc.num_pages; if (ioctl(fd, BME_IOC_LOCK_MEMORY, &phys_mem_desc) != 0) { perror("BME_IOC_LOCK_MEMORY ioctl failed"); exit(1); } // Send the BME application a message telling it about the memory we // just locked down. Since this is an example, we're only sending one // message, for one page. DynamicHeader my_hdr = tmc_udn_header_from_cpu(tmc_cpus_get_my_cpu()); struct bme_phys_mem_desc *phys = (struct bme_phys_mem_desc *)(uintptr_t)phys_mem_desc.phys; tmc_udn_send_6(bme_server, UDN0_DEMUX_TAG, EX_MSG_MAPPING, my_hdr.word, phys->pa, phys->pa >> 32, phys->pte, phys->pte >> 32); uint32_t reply = udn0_receive(); if (reply) { fprintf(stderr, "client: got bad response %d to MAPPING message\n", reply); exit(1); } // Now read our standard input into a buffer in the shared page; send // a request to the BME tile to process that data, putting the output // elsewhere in the shared page; and then write it to standard output. 
char* inbuf = maddr; char* outbuf = inbuf + PROCESSING_BUFSIZE; int len; while ((len = read(STDIN_FILENO, inbuf, PROCESSING_BUFSIZE)) > 0) { // Note that our message gives the server the offsets of the input and // output buffers, rather than pointers to them. This is because the // server has not mapped in the data at the same set of virtual addresses // we're using. We could arrange this, if desired, although it required // more coordination between the client and server. tmc_udn_send_5(bme_server, UDN0_DEMUX_TAG, EX_MSG_PROCESS, my_hdr.word, 0, len, PROCESSING_BUFSIZE); reply = udn0_receive(); if (reply != len) { fprintf(stderr, "client: got bad response %d to PROCESS " "message (expected %d)\n", reply, len); exit(1); } if (write(STDOUT_FILENO, outbuf, len) != len) { perror("write"); exit(1); } } return 0; }
//------------------------------------------------------------------------------ //--------------------------Thread function------------------------------------- //------------------------------------------------------------------------------ void *thread_fn(void *arg) { int ID=(*((int*)arg)); //Necessary local variables int count, count1, count2, start_addr, iterator; int row_count_A, col_count_B, el_count; DATA_TYPE temp_sum; uint_reg_t gather_sig; DATA_TYPE weight[3][3] = {{ -1, 0, 1 },{ -2, 0, 2 },{ -1, 0, 1 }}; //Set cpu and activate udn if(tmc_cpus_set_my_cpu(core_map[ID])!=0) { printf("Thread: %d CPU setting failed.\n",ID); exit(1); } tmc_udn_activate(); //Thread memory initialization DATA_TYPE *image, *gx, *gy; int n_out_rows=nrows-2; int factor=sizeof(uint_reg_t)/sizeof(DATA_TYPE); if(ID==0) { image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols); gx = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols); gy = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols); for(count=0; count<nrows; count++) { for(count1=0; count1<ncols; count1++) { image[count*ncols+count1]=count; } } } else { if(ID<n_out_rows%nthreads) { image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1+2)*ncols); gx = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1 )*ncols); gy = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1 )*ncols); } else { image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+2)*ncols); gx = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads )*ncols); gy = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads )*ncols); } } //calculate x and y co-ordinates int x=ID%(xmax+1); int y=ID/(xmax+1); //Time management variables double zero; //Reference time for iterations double scatter_s, scatter_e, scatter_d, compute_s, compute_e, compute_d, gather_s, gather_e, gather_d,total_s,total_e,total_d; struct timespec st; for(iterator=0; iterator<iterations; iterator++) { 
//-------------------------------------------------------------------------- //----------------------------Start of benchmark---------------------------- //--------------------Do this shit over iteration times--------------------- //-------------------------------------------------------------------------- //-------------------------Set reference time ------------------------------ clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); zero=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3; clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); total_s=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3; //------------------------Step 1: Naive scatter----------------------------- //Set start time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); scatter_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; if(ID==0) { start_addr=((n_out_rows%nthreads==0)?(n_out_rows/nthreads+1):(n_out_rows/nthreads+1+1)); for(count=1; count<nthreads; count++) { if(count<n_out_rows%nthreads) { send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=n_out_rows/nthreads+1; } else { send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=n_out_rows/nthreads; } } } else { if(ID<n_out_rows%nthreads) { receive126((uint_reg_t*)image,(n_out_rows/nthreads+1+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } else { receive126((uint_reg_t*)image,(n_out_rows/nthreads+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } } //Set end time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); scatter_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; scatter_d=scatter_e-scatter_s; //-------------------------------Sobel compute------------------------------ //Set start time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); compute_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; //Compute Gx int i,j; if(ID<n_out_rows%nthreads) { temp_sum=(DATA_TYPE)0; for(count=1; 
count<(n_out_rows/nthreads+1+2-1); count++) { for(count1=1; count1<(ncols-1); count1++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i]; } } gx[(count-1)*ncols+count1]=temp_sum; } } } else { temp_sum=(DATA_TYPE)0; for(count=1; count<(n_out_rows/nthreads+2-1); count++) { for(count1=1; count1<(ncols-1); count1++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i]; } } gx[(count-1)*ncols+count2]=temp_sum; } } } //Compute Gy if(ID<n_out_rows%nthreads) { temp_sum=(DATA_TYPE)0; for(count1=1; count1<(ncols-1); count1++) { for(count=1; count<(n_out_rows/nthreads+1+2-1); count++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j]; } } gy[(count-1)*ncols+count1]=temp_sum; } } } else { temp_sum=(DATA_TYPE)0; for(count1=1; count1<(ncols-1); count1++) { for(count=1; count<(n_out_rows/nthreads+2-1); count++) { for(j=-1;j<=1;j++) { for(i=-1; i<=1; i++) { temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j]; } } gy[(count-1)*ncols+count2]=temp_sum; } } } //Set end time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); compute_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; compute_d=compute_e-compute_s; //---------------------------Gather----------------------------------------- //Set start time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); gather_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; if(ID==0) { start_addr=1; for(count=1; count<nthreads; count++) { if(count<n_out_rows%nthreads) { //Send a signal to a certain waiting thread DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]); tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig); //Collect Gx from that thread receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); //Collect Gy from that thread 
receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=(n_out_rows/nthreads+1); } else { //Send a signal to a certain waiting thread DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]); tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig); //Collect Gx from that thread receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); //Collect Gy from that thread receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG); start_addr+=(n_out_rows/nthreads); } } } else { if(ID<n_out_rows%nthreads) { //Wait for signal from main signal gather_sig=tmc_udn2_receive(); //Send the partial gx send126((uint_reg_t*)gx,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); //Send the partial gy send126((uint_reg_t*)gy,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } else { //Wait for signal from main signal gather_sig=tmc_udn2_receive(); //Send the partial gx send126((uint_reg_t*)gx,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); //Send the partial gy send126((uint_reg_t*)gy,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG); } } //Set end time clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); gather_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero; gather_d=gather_e-gather_s; clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st); total_e=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3; total_d+=(total_e-total_s); //---------------------------Log the data----------------------------------- log_memory[iterator*nthreads*9+ID*9+SCA_S]=scatter_s; log_memory[iterator*nthreads*9+ID*9+SCA_E]=scatter_e; log_memory[iterator*nthreads*9+ID*9+SCA_D]=scatter_d; log_memory[iterator*nthreads*9+ID*9+COMP_S]=compute_s; log_memory[iterator*nthreads*9+ID*9+COMP_E]=compute_e; log_memory[iterator*nthreads*9+ID*9+COMP_D]=compute_d; 
log_memory[iterator*nthreads*9+ID*9+GATH_S]=gather_s; log_memory[iterator*nthreads*9+ID*9+GATH_E]=gather_e; log_memory[iterator*nthreads*9+ID*9+GATH_D]=gather_d; //---------------------Barrier---------------------------------------------- barrier(ID); if(ID==0) printf("Iteration: %d\n",iterator); //-------------------------------------------------------------------------- //------------------------------End of benchmark---------------------------- //-------------------------------------------------------------------------- } //Print total time printf("%lf\n",total_d/iterations); //Free thread local memory free(image); free(gx); free(gy); pthread_exit(NULL); }