void process_image_simple(struct image* img){ unsigned char *input, *output, *temp; unsigned int addr1, addr2, i, j, k, r, g, b; int block_nr = img->block_nr; vector unsigned char *v1, *v2, *v3, *v4, *v5 ; input = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4); temp = malloc_align(NUM_CHANNELS * img->width, 4); v1 = (vector unsigned char *) &input[0]; v2 = (vector unsigned char *) &input[1 * img->width * NUM_CHANNELS]; v3 = (vector unsigned char *) &input[2 * img->width * NUM_CHANNELS]; v4 = (vector unsigned char *) &input[3 * img->width * NUM_CHANNELS]; v5 = (vector unsigned char *) temp; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS * img->width / NUM_IMAGES_WIDTH; for (i=0; i<img->height / SCALE_FACTOR; i++){ //get 4 lines addr1 = ((unsigned int)img->src) + i * img->width * NUM_CHANNELS * SCALE_FACTOR; mfc_get(input, addr1, SCALE_FACTOR * img->width * NUM_CHANNELS, MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); //compute the scaled line for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[j], v2[j]), spu_avg(v3[j], v4[j])); } for (j=0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } //put the scaled line back mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block 
mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); } free_align(temp); free_align(input); free_align(output); }
/*
 * Allocate an image header plus a pixel buffer holding width * height + 1
 * pixel_t entries, and record the dimensions in the header.
 *
 * Returns the new image. NOTE(review): neither malloc_align result is
 * checked before use.
 */
image alloc_img(unsigned int width, unsigned int height)
{
	image result = malloc_align(sizeof (image_t), 4);

	result->buf = malloc_align((width * height + 1) * sizeof (pixel_t), 4);
	result->height = height;
	result->width = width;

	return result;
}
/*
 * Allocate count bytes through malloc_align.
 *
 * Requests of PAGE_SIZE bytes or more are passed PAGE_SIZE as the alignment
 * argument; smaller requests are passed 1.
 */
void* malloc(size_t count)
{
	if (count >= PAGE_SIZE) {
		return malloc_align(count, PAGE_SIZE);
	}

	return malloc_align(count, 1);
}
SortedMap *createSortedMap(int length) { // Create sorted list of length "length". SortedMap *map = (SortedMap *) malloc_align(sizeof(SortedMap), 7); map->size = 0; map->length = length; map->keys = (double *) malloc_align(map->length * sizeof(double), 7); map->values = (int *) malloc_align(map->length * sizeof(int), 7); return map; }
/*
 * Read a plain-text PPM ("P3") image from fin.
 *
 * fin       - open input stream positioned at the start of the file
 * a         - receives the pixel array allocated with malloc_align
 *             (width * height entries); set to NULL on any failure
 * width     - receives the image width read from the header
 * height    - receives the image height read from the header
 * max_color - receives the maximum colour value read from the header
 *
 * At most width * height pixels are stored. Reading stops at the first
 * incomplete or non-numeric triple, so a malformed file can neither
 * overrun the buffer nor loop forever; pixels that were not read are left
 * uninitialised.
 */
void read_from_file(FILE *fin, struct pixel **a, int *width, int *height, int *max_color)
{
	char line[256];
	size_t total;
	size_t i;
	int red, green, blue;

	printf("PPU reading from file\n");

	/* Give callers a defined value to test when we bail out early. */
	*a = NULL;

	/* Check if the file is ppm */
	if (!fgets(line, sizeof(line), fin) || strncmp(line, "P3", 2)) {
		fprintf(stderr, "The input file is not ppm\n");
		return;
	}

	/* Read initial parameters */
	if (fscanf(fin, "%d", width) != 1 ||
	    fscanf(fin, "%d", height) != 1 ||
	    fscanf(fin, "%d", max_color) != 1) {
		fprintf(stderr, "Malformed ppm header\n");
		return;
	}
	printf("PPU reads %d, %d, %d\n", *width, *height, *max_color);

	if (*width <= 0 || *height <= 0) {
		fprintf(stderr, "Invalid ppm dimensions\n");
		return;
	}

	/* Multiply in size_t so the pixel count cannot overflow int. */
	total = (size_t)*width * (size_t)*height;
	if (total > (size_t)-1 / sizeof(struct pixel)) {
		fprintf(stderr, "ppm image is too large\n");
		return;
	}

	*a = malloc_align(total * sizeof(struct pixel), 4);
	if (!(*a)) {
		perror("Error on allocating memory for image");
		return;
	}

	/* Read the pixels */
	for (i = 0; i < total; i++) {
		/* Anything other than a full triple (EOF, junk, partial) ends the read. */
		if (fscanf(fin, "%d %d %d", &red, &green, &blue) != 3) {
			break;
		}
		(*a)[i].red = red;
		(*a)[i].green = green;
		(*a)[i].blue = blue;
	}
}
END_TEST

/*******************************************************************************
 * malloc_align/free_align
 *
 * Calls malloc_align for every (size, align) pair with size in
 * [0, countof(ptr)) and align in [0, countof(ptr[0])) and keeps each result:
 *  - when align is non-zero, the returned address must be a multiple of align;
 *  - when size is non-zero, the pointer must be non-NULL and the whole
 *    allocation is filled with 0xEF.
 * Afterwards every stored pointer, including those from size 0 or align 0
 * requests, is passed to free_align.
 *
 * NOTE(review): the END_TEST that closes this test is not visible here.
 */
START_TEST(test_malloc_align)
{
	void *ptr[128][256];
	int size, align;

	/* Allocation pass: one allocation per (size, align) combination. */
	for (size = 0; size < countof(ptr); size++) {
		for (align = 0; align < countof(ptr[0]); align++) {
			ptr[size][align] = malloc_align(size, align);
			if (align) {
				ck_assert((uintptr_t)ptr[size][align] % align == 0);
			}
			if (size) {
				ck_assert(ptr[size][align]);
				memset(ptr[size][align], 0xEF, size);
			}
		}
	}

	/* Release pass: done only after all allocations are live at once. */
	for (size = 0; size < countof(ptr); size++) {
		for (align = 0; align < countof(ptr[0]); align++) {
			free_align(ptr[size][align]);
		}
	}
}
int main(){ int i; int N=1024; float pi=0.0; pthread_t pthreads[SPU_THREADS]; context ctxs[SPU_THREADS] __attribute__ ((aligned(16))); for(i=0;i<SPU_THREADS;i++){ ctxs[i].N=N; ctxs[i].Nstart=(N/SPU_THREADS)*i; ctxs[i].Nend=(N/SPU_THREADS)*(i+1); ctxs[i].pi=(float*) malloc_align(sizeof(float),7); pthread_create(&pthreads[i], NULL, &pthread_run_spe, &ctxs[i]); } for (i=0; i<SPU_THREADS; i++) pthread_join (pthreads[i], NULL); for(i=0;i<SPU_THREADS;i++) pi+=*(ctxs[i].pi); for(i=0;i<SPU_THREADS;i++) free_align(ctxs[i].pi); printf("PI = %f\n",pi); return (0); }
/*
 * Small driver: allocate 1024 bytes with malloc_align and release them
 * immediately with free_align. The alignment argument defaults to 24 and is
 * taken from argv[1] (via atoi) when one is given.
 */
int main(int argc, char* argv[])
{
	int align = (argc > 1) ? atoi(argv[1]) : 24;
	int i = 42;
	int* i_p = &i;
	long* l_p = (long*) i_p;
	char* c_p = (char*) i_p;

	printf("coucou\n");

	/*
	 * Disabled pointer-arithmetic experiment that used the views above:
	 *   bitprint((long)i_p); bitprint((long)(i_p+1));
	 *   bitprint((long)l_p); bitprint((long)(l_p+1));
	 *   bitprint((long)c_p); bitprint((long)(c_p+1));
	 */

	free_align(malloc_align(1024, align));

	exit(0);
}
static void do_alloc(char *file, uint32_t align, uint8_t zone) { int id = fw_cfg_file_id(file); int n = fw_cfg_file_size(id); char *p; if (id == -1) panic(); if (align < 16) align = 16; if (zone == ALLOC_FSEG) p = malloc_fseg_align(n, align); else p = malloc_align(n, align); set_file_addr(id, p); fw_cfg_read_file(id, p, n); /* For PVH boot, save the PA where the RSDP is stored */ if (zone == ALLOC_FSEG) { if (!memcmp(p, "RSD PTR ", 8)) { start_info.rsdp_paddr = (uintptr_t)id_to_addr(id); } } }
/* allocate image data */ void alloc_image(struct image* img) { //img->data = calloc(NUM_CHANNELS * img->width * img->height, sizeof(char)); img->data = malloc_align(NUM_CHANNELS * img->width * img->height * sizeof(char), 4); if (!img->data){ PRINT_ERR_MSG_AND_EXIT("Calloc failed\n"); } }
/*
 * Allocate every working buffer of a trace context.
 *
 * - aw/ah are the image width/height rounded up to a multiple of 16.
 * - ksize is derived from p.sigma and forced odd; when it is greater than 1
 *   the kernel and the bibuf/btbuf/sibuf buffers are allocated, otherwise
 *   ksize and kpad are zeroed and those pointers are set to NULL.
 * - In OL_TRACE_CANNY mode sibuf (if still NULL), stbuf, sxbuf, sybuf and
 *   smbuf are allocated; in other modes the last four are set to NULL.
 * - tracebuf is allocated and zero-filled; the sb and pb buffers get
 *   p.width * 16 entries each with their cursor and end pointers set.
 *
 * NOTE(review): no allocation result is checked.
 */
static void alloc_bufs(OLTraceCtx *ctx)
{
	size_t trace_bytes;

	ctx->aw = (ctx->p.width+15) & ~15;
	ctx->ah = (ctx->p.height+15) & ~15;

	ctx->ksize = ((int)round(ctx->p.sigma * 6 + 1)) | 1;

	if (ctx->ksize > 1) {
		ctx->k = malloc_align(16 * ctx->ksize, 64);
		ctx->kpad = ctx->ksize / 2;
		ctx->bibuf = malloc_align(ctx->aw * (ctx->ah + 2 * ctx->kpad), 64);
		ctx->btbuf = malloc_align(ctx->ah * (ctx->aw + 2 * ctx->kpad), 64);
		ctx->sibuf = malloc_align(ctx->aw * (ctx->ah + 2), 64);
	} else {
		/* Kernel of size 1 or less: no kernel and no blur buffers. */
		ctx->ksize = 0;
		ctx->k = NULL;
		ctx->kpad = 0;
		ctx->bibuf = NULL;
		ctx->btbuf = NULL;
		ctx->sibuf = NULL;
	}

	if (ctx->p.mode != OL_TRACE_CANNY) {
		ctx->stbuf = NULL;
		ctx->sxbuf = NULL;
		ctx->sybuf = NULL;
		ctx->smbuf = NULL;
	} else {
		/* sibuf is needed in this mode even when the branch above skipped it. */
		if (!ctx->sibuf)
			ctx->sibuf = malloc_align(ctx->aw * (ctx->ah + 2), 64);
		ctx->stbuf = malloc_align(sizeof(*ctx->stbuf) * ctx->ah * (ctx->aw + 2), 64);
		ctx->sxbuf = malloc_align(sizeof(*ctx->sxbuf) * ctx->aw * ctx->ah, 64);
		ctx->sybuf = malloc_align(sizeof(*ctx->sybuf) * ctx->aw * ctx->ah, 64);
		ctx->smbuf = malloc_align(sizeof(*ctx->smbuf) * ctx->aw * ctx->ah, 64);
	}

	/* Trace buffer uses the unpadded dimensions and starts zeroed. */
	trace_bytes = ctx->p.width * ctx->p.height * sizeof(*ctx->tracebuf);
	ctx->tracebuf = malloc(trace_bytes);
	memset(ctx->tracebuf, 0, trace_bytes);

	ctx->sb_size = ctx->p.width * 16;
	ctx->sb = malloc(ctx->sb_size * sizeof(*ctx->sb));
	ctx->sbp = ctx->sb;
	ctx->sb_end = ctx->sb + ctx->sb_size;

	ctx->pb_size = ctx->p.width * 16;
	ctx->pb = malloc(ctx->pb_size * sizeof(*ctx->pb));
	ctx->pbp = ctx->pb;
	ctx->pb_end = ctx->pb + ctx->pb_size;
}
/*
 * Allocate a 1000-byte buffer through malloc_align, passing 1024 as the
 * second argument, release it, and exit with status 1.
 *
 * NOTE(review): the buffer is released with free() rather than free_align();
 * confirm that this project's malloc_align returns a pointer free() accepts.
 */
int main ()
{
	int size = 1000;
	int align_arg = 1024;
	char *buffer = (char *)malloc_align(size, align_arg);

	free(buffer);

	return 1;
}
/*
 * Build a vector of SPU_THREADS pseudo-random seeds.
 *
 * Each entry is rand() % 12345612. The vector comes from malloc_align
 * (alignment argument 4). Returns NULL, after reporting through perror,
 * when the allocation fails.
 */
int* make_seed_vector()
{
	int *seeds = malloc_align(SPU_THREADS * sizeof(int), 4);
	int idx = 0;

	if (!seeds) {
		perror("malloc_align failed in make_seed_vector()");
		return NULL;
	}

	while (idx < SPU_THREADS) {
		seeds[idx] = rand()%12345612;
		idx++;
	}

	return seeds;
}
/*
 * SPU worker loop.
 *
 * Allocates four buffers of param.bitset_size bytes (in, out, use, def) with
 * malloc_align and exits with status 1 if any allocation fails. Reserves four
 * MFC tags; a failed reservation is only reported with printf.
 *
 * Then loops forever:
 *   1. read an index from the inbound mailbox; UINT_MAX prints an exit
 *      message and returns;
 *   2. compute offset = param.bitset_subsets * index and fetch the four
 *      bitsets at that offset from bs_in_addr / bs_out_addr / bs_use_addr /
 *      bs_def_addr, one tag per transfer, waiting for all four;
 *   3. call bitset_megaop(param, in, out, use, def);
 *   4. write `in` back to bs_in_addr + offset and wait for that transfer;
 *   5. post the index to the outbound interrupt mailbox.
 *
 * The D(...) sections print the first 100 bits of each set.
 *
 * NOTE(review): the function's closing brace is not visible in this excerpt,
 * and one string literal is split across a line break here, so the code
 * below is left exactly as found. The visible return path does not release
 * the four buffers.
 */
void work(param_t param) { printf("SPU[%u] work()\n", param.proc); unsigned int inbox, offset; unsigned int *in = malloc_align(param.bitset_size, ALIGN_EXP); unsigned int *out = malloc_align(param.bitset_size, ALIGN_EXP); unsigned int *use = malloc_align(param.bitset_size, ALIGN_EXP); unsigned int *def = malloc_align(param.bitset_size, ALIGN_EXP); if(in == NULL || out == NULL || use == NULL || def == NULL) { printf("malloc_align() failed\n"); exit(1); } unsigned tag_1, tag_2, tag_3, tag_4; unsigned int tag_id; /* Reserve a tag for application usage */ if ((tag_1 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_1\n"); } if ((tag_2 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_2\n"); } if ((tag_3 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_3\n"); } if ((tag_4 = mfc_tag_reserve()) == MFC_TAG_INVALID) { printf("ERROR: unable to reserve a tag_4\n"); } while(1) { inbox = spu_read_in_mbox(); if(inbox == UINT_MAX) { printf("SPU[%u] received exit signal.. 
exiting.\n", param.proc); return; } offset = param.bitset_subsets*inbox; mfc_get(in, (unsigned int) (param.bs_in_addr + offset), param.bitset_size, tag_1, 0, 0); mfc_get(out, (unsigned int) (param.bs_out_addr + offset), param.bitset_size, tag_2, 0, 0); mfc_get(use, (unsigned int) (param.bs_use_addr + offset), param.bitset_size, tag_3, 0, 0); mfc_get(def, (unsigned int) (param.bs_def_addr + offset), param.bitset_size, tag_4, 0, 0); mfc_write_tag_mask(1 << tag_1 | 1 << tag_2 | 1 << tag_3 | 1 << tag_4); mfc_read_tag_status_all(); D(printf("SPU[%d] index: %u bitset_subsets: %u offset: %u\n", param.proc, inbox, param.bitset_subsets, offset); printf("SPU[%d]\t&use: %p\n\t&def: %p\n\t&out: %p\n\t&in: %p\n", param.proc, (void*)param.bs_use_addr, (void*)param.bs_def_addr, (void*)param.bs_out_addr, (void*)param.bs_in_addr); void *tmp_ptr = (void*) (param.bs_use_addr + offset); printf("SPU[%d] read\t\t&%p = use(%p)={", param.proc, (void*)use, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(use, i) ) { printf("%d ", i); } } printf("}\n"); tmp_ptr = (void*) (param.bs_def_addr + offset); printf("SPU[%d] read\t\t&%p = def(%p)={", param.proc, (void*)def, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(def, i) ) { printf("%d ", i); } } printf("}\n"); tmp_ptr = (void*) (param.bs_out_addr + offset); printf("SPU[%d] read\t\t&%p = out(%p)={", param.proc, (void*)out, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(out, i) ) { printf("%d ", i); } } printf("}\n"); tmp_ptr = (void*) (param.bs_in_addr + offset); printf("SPU[%d] read\t\t&%p = in (%p)={", param.proc, (void*)in, tmp_ptr); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(in, i) ) { printf("%d ", i); } } printf("}\n")); bitset_megaop(param, in, out, use, def); D(printf("SPU[%d] calculated\tin={", param.proc); for (int i = 0; i < 100; ++i){ if ( bitset_get_bit(in, i) ) { printf("%d ", i); } } printf("}\n");) mfc_put(in, (unsigned int) (param.bs_in_addr + offset), param.bitset_size, tag_1, 
0, 0); mfc_write_tag_mask(1 << tag_1); mfc_read_tag_status_all(); spu_write_out_intr_mbox(inbox); }
/*
 * Does the actual processing of the frame.
 *
 * ppu_data - addresses of the input image and big_image descriptors plus
 *            this worker's spe_id
 *
 * Both descriptors are fetched by DMA, then the input image is processed
 * four lines at a time: each chunk is fetched, reduced to one line by
 * compute_lines_average and compute_columns_average, and handed to
 * store_line together with the chunk index.
 *
 * The scaled-line buffer is sized from the line's own dimensions
 * (width / SCALE_FACTOR pixels of NUM_CHANNELS bytes). The previous fixed
 * NUM_CHANNELS * 3 bytes was too small for a full scaled line; the buffer is
 * never made smaller than that earlier size.
 *
 * NOTE(review): the malloc_align result is not checked, and the required
 * size assumes compute_columns_average writes one full scaled line -
 * confirm against that helper.
 */
static void do_work(ppu_data_t ppu_data)
{
	struct image input;
	struct image big_image;

	dprintf("SPU[%d] ppu_data.input:%p ppu_big_img:%p sizeof(struct image):%lu\n",
			ppu_data.spe_id, (void *)ppu_data.input, (void *)ppu_data.big_image,
			sizeof(struct image));

	/* Get input image and big_image details */
	mfc_get((void *)(&input), (uint32_t)(ppu_data.input),
			(uint32_t)(sizeof(struct image)), tag_id, 0, 0);
	mfc_get((void *)(&big_image), (uint32_t)(ppu_data.big_image),
			(uint32_t)(sizeof(struct image)), tag_id, 0, 0);
	waittag(tag_id);

	dprintf("SPU[%d] got structs\n"
			"input.width=%u\tinput.height=%u\n"
			"big_image.width=%u\tbig_image.height=%u\n"
			"input.data=%p\tbig_image.data=%p\n",
			ppu_data.spe_id, input.width, input.height,
			big_image.width, big_image.height,
			(void *)input.data, (void *)big_image.data);

	struct image img_chunk;
	unsigned int buf_line_sz = input.width * NUM_CHANNELS;
	int transfer_sz = 4 * buf_line_sz;

	img_chunk.width = input.width;
	img_chunk.height = 4;
	alloc_image(&img_chunk);

	struct image img_scaled_line;
	img_scaled_line.width = input.width / SCALE_FACTOR;
	img_scaled_line.height = 1;

	/* Bytes needed for one scaled line, never below the earlier fixed size. */
	unsigned int scaled_line_sz = NUM_CHANNELS * img_scaled_line.width * img_scaled_line.height;
	if (scaled_line_sz < NUM_CHANNELS * 3) {
		scaled_line_sz = NUM_CHANNELS * 3;
	}

	/* Hack for memory align of local image data to have the same 4 bits in its
	 * address as the remote corresponding address in PPU */
	int left_padding = (ppu_data.spe_id % 4) * 4;
	unsigned char* addr_to_free = malloc_align(scaled_line_sz * sizeof(char) + left_padding, 4);
	img_scaled_line.data = addr_to_free + left_padding;

	unsigned int i;
	/* Process 4 lines from the initial image at a time */
	for (i = 0; i < input.height / img_chunk.height; ++i) {
		/* Get the image chunk from PPU through DMA transfer */
		dprintf("SPU[%d] getting image_chunk %d of size %d\n",
				ppu_data.spe_id, i, transfer_sz);
		dprintf("SPU[%d] input.data=%p img_chunk.data=%p "
				"start_addr=%p\n",
				ppu_data.spe_id, (void *)input.data, (void *)img_chunk.data,
				(void *)((uint32_t)(input.data) + i * transfer_sz));

		mfc_get((void *)(img_chunk.data), (uint32_t)(input.data) + i * transfer_sz,
				(uint32_t)(transfer_sz), tag_id, 0, 0);
		waittag(tag_id);
		dprintf("SPU[%d] got image_chunk %d\n", ppu_data.spe_id, i);

		compute_lines_average(&img_chunk, buf_line_sz);

		/* Make average for column. avg = (c0.r + c1.r) / 2 etc*/
		compute_columns_average(&img_chunk, &img_scaled_line);

		store_line(&img_scaled_line, ppu_data, &big_image, i);
	}

	free_image(&img_chunk);
	free_align(addr_to_free);
}
void process_image_2lines(struct image* img){ unsigned char *input, *output, *output2, *temp; unsigned int addr1, addr2, i, j, k, r1, g1, b1, r2, g2, b2; int block_nr = img->block_nr; vector unsigned char *v1_1, *v1_2, *v1_3, *v1_4, *v1_5; vector unsigned char *v2_1, *v2_2, *v2_3, *v2_4, *v2_5; // optimization unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width; unsigned int num_channels_X_img_width_X_SCALE_FACTOR = num_channels_X_img_width * SCALE_FACTOR; input = malloc_align(2 * num_channels_X_img_width_X_SCALE_FACTOR, 4); output = malloc_align(num_channels_X_img_width / SCALE_FACTOR, 4); output2 = malloc_align(num_channels_X_img_width / SCALE_FACTOR, 4); temp = malloc_align(2 * NUM_CHANNELS * img->width, 4); // first line v1_1 = (vector unsigned char *) &input[0]; v1_2 = (vector unsigned char *) &input[1 * num_channels_X_img_width]; v1_3 = (vector unsigned char *) &input[2 * num_channels_X_img_width]; v1_4 = (vector unsigned char *) &input[3 * num_channels_X_img_width]; v1_5 = (vector unsigned char *) temp; // second line v2_1 = (vector unsigned char *) &input[4 * num_channels_X_img_width]; v2_2 = (vector unsigned char *) &input[5 * num_channels_X_img_width]; v2_3 = (vector unsigned char *) &input[6 * num_channels_X_img_width]; v2_4 = (vector unsigned char *) &input[7 * num_channels_X_img_width]; v2_5 = (vector unsigned char *) &temp[num_channels_X_img_width]; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS * img->width / NUM_IMAGES_WIDTH; for (i = 0; i<img->height / SCALE_FACTOR / 2; i++){ // get 8 lines addr1 = ((unsigned int)img->src) + 2 * i * num_channels_X_img_width_X_SCALE_FACTOR; mfc_get(input, addr1, 2 * num_channels_X_img_width * SCALE_FACTOR, MY_TAG, 0, 0); mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); // compute the 2 scaled line for (j = 0; 
j < num_channels_X_img_width / 16; j++){ v1_5[j] = spu_avg(spu_avg(v1_1[j], v1_2[j]), spu_avg(v1_3[j], v1_4[j])); v2_5[j] = spu_avg(spu_avg(v2_1[j], v2_2[j]), spu_avg(v2_3[j], v2_4[j])); } for (j = 0; j < img->width; j += SCALE_FACTOR){ r1 = g1 = b1 = 0; r2 = b2 = g2 = 0; for (k = j; k < j + SCALE_FACTOR; k++) { unsigned int k_X_NUM_CHANNELS = k * NUM_CHANNELS; r1 += temp[k_X_NUM_CHANNELS + 0]; g1 += temp[k_X_NUM_CHANNELS + 1]; b1 += temp[k_X_NUM_CHANNELS + 2]; r2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 0]; g2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 1]; b2 += temp[num_channels_X_img_width + k_X_NUM_CHANNELS + 2]; } r1 /= SCALE_FACTOR; b1 /= SCALE_FACTOR; g1 /= SCALE_FACTOR; r2 /= SCALE_FACTOR; b2 /= SCALE_FACTOR; g2 /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r1; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g1; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b1; output2[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r2; output2[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g2; output2[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b2; } //put the scaled line back mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block // trimite si al 2-lea set mfc_put(output2, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); } free_align(temp); free_align(input); free_align(output); free_align(output2); }
void process_image_double(struct image* img){ unsigned char *input[2], *output, *temp; unsigned int addr1, addr2, i, j, k, r, g, b; int block_nr = img->block_nr; vector unsigned char *v1[2], *v2[2], *v3[2], *v4[2], *v5; int buf, nxt_buf; //index of the buffer (0/1) input[0] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); input[1] = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4); output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4); temp = malloc_align(NUM_CHANNELS * img->width, 4); //optimization unsigned int num_channels_X_img_width = NUM_CHANNELS * img->width; v1[0] = (vector unsigned char *) &input[0][0]; v2[0] = (vector unsigned char *) &input[0][1 * num_channels_X_img_width]; v3[0] = (vector unsigned char *) &input[0][2 * num_channels_X_img_width]; v4[0] = (vector unsigned char *) &input[0][3 * num_channels_X_img_width]; v5 = (vector unsigned char *) temp; v1[1] = (vector unsigned char *) &input[1][0]; v2[1] = (vector unsigned char *) &input[1][1 * num_channels_X_img_width]; v3[1] = (vector unsigned char *) &input[1][2 * num_channels_X_img_width]; v4[1] = (vector unsigned char *) &input[1][3 * num_channels_X_img_width]; addr2 = (unsigned int)img->dst; //start of image addr2 += (block_nr / NUM_IMAGES_HEIGHT) * num_channels_X_img_width * img->height / NUM_IMAGES_HEIGHT; //start line of spu block addr2 += (block_nr % NUM_IMAGES_WIDTH) * num_channels_X_img_width / NUM_IMAGES_WIDTH; addr1 = ((unsigned int)img->src); buf = 0; // first data transfer mfc_getb(input[buf], addr1, SCALE_FACTOR * num_channels_X_img_width, 0, 0, 0); for (i = 1; i<img->height / SCALE_FACTOR; i++){ // get 4 lines nxt_buf = buf ^ 1; //ask for next data buffer from PPU //mfg_get with barrier addr1 = ((unsigned int)img->src) + i * num_channels_X_img_width * SCALE_FACTOR; mfc_getb(input[nxt_buf], addr1, SCALE_FACTOR * num_channels_X_img_width, nxt_buf, 0, 0); mfc_write_tag_mask(1 << buf); mfc_read_tag_status_all(); // process current buffer for (j = 0; j < 
img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j])); } for (j = 0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } // sent precedent buffer to PPU mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; //line inside spu block mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); buf = nxt_buf; //prepare next iteration } mfc_write_tag_mask(1 << buf); mfc_read_tag_status_all(); // process last buffer for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){ v5[j] = spu_avg(spu_avg(v1[buf][j], v2[buf][j]), spu_avg(v3[buf][j], v4[buf][j])); } for (j=0; j < img->width; j+=SCALE_FACTOR){ r = g = b = 0; for (k = j; k < j + SCALE_FACTOR; k++) { r += temp[k * NUM_CHANNELS + 0]; g += temp[k * NUM_CHANNELS + 1]; b += temp[k * NUM_CHANNELS + 2]; } r /= SCALE_FACTOR; b /= SCALE_FACTOR; g /= SCALE_FACTOR; output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r; output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g; output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b; } // send last buffer to PPU mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0); addr2 += img->width * NUM_CHANNELS; mfc_write_tag_mask(1 << MY_TAG); mfc_read_tag_status_all(); free_align(temp); free_align(input[0]); free_align(input[1]); free_align(output); }
/*
 * Allocate `size` bytes by forwarding to malloc_align with 0x1 as the
 * alignment argument, and return whatever it returns.
 */
u32int *malloc(u32int size)
{
	u32int *block = malloc_align(size, 0x1);

	return block;
}
int main(int argc, char **argv) { if (argc != 8) { printf("Usage: ./tema3 mod_vect mod_dma num_spus in.pgm out.cmp out.pgm results.txt"); return -1; } int mod_vect = atoi(argv[1]); int mod_dma = atoi(argv[2]); int num_spus = atoi(argv[3]); char *inpgm = argv[4]; char *outcmp = argv[5]; char *outpgm = argv[6]; char *results = argv[7]; int i; struct img initial_image, decompressed_image; struct c_img compressed_image; struct timeval start_total, end_total, start_op, end_op; double total_time = 0, op_time = 0; gettimeofday(&start_total, NULL); // citeste imaginea initiala read_pgm(inpgm, &initial_image); gettimeofday(&start_op, NULL); compressed_image.width = initial_image.width; compressed_image.height = initial_image.height; int nr_cmp_blocks = (1LL * initial_image.width * initial_image.height) / (BLOCK_SIZE * BLOCK_SIZE); compressed_image.blocks = (struct block *)malloc_align(nr_cmp_blocks * sizeof(struct block), 7); pthread_t *compress_threads = (pthread_t*)malloc_align(num_spus * sizeof(pthread_t), 7); struct package_t *cthread_arg = (struct package_t *)malloc_align(num_spus * sizeof(struct package_t), 7); int nr_of_blocks = (initial_image.width * initial_image.height) / (BLOCK_SIZE * BLOCK_SIZE); int average_blocks = nr_of_blocks / num_spus; int rest_blocks = nr_of_blocks % num_spus; int offset = 0; for(i = 0; i < num_spus; i++) { /* completeaza structura package_t de trimis la spu pentru fiecare spu*/ cthread_arg[i].action_type = 0; cthread_arg[i].mod_vect = mod_vect; cthread_arg[i].mod_dma = mod_dma; cthread_arg[i].num_spus = num_spus; cthread_arg[i].nr_blocks = average_blocks; cthread_arg[i].index_block = offset; cthread_arg[i].img_pgm.width = initial_image.width; cthread_arg[i].img_pgm.height = initial_image.height; cthread_arg[i].img_pgm.pixels = initial_image.pixels + ((offset / (initial_image.width / BLOCK_SIZE)) * BLOCK_SIZE * initial_image.width + (offset % (initial_image.width / BLOCK_SIZE)) * BLOCK_SIZE); cthread_arg[i].img_cmp.width = 
compressed_image.width; cthread_arg[i].img_cmp.height = compressed_image.height; cthread_arg[i].img_cmp.blocks = compressed_image.blocks + ((offset / (initial_image.width / BLOCK_SIZE)) * (initial_image.width / BLOCK_SIZE) + (offset % (initial_image.width / BLOCK_SIZE))); offset += average_blocks; nr_of_blocks -= average_blocks; if (rest_blocks != 0 && i != num_spus - 1) { average_blocks = nr_of_blocks / (num_spus - 1 - i); rest_blocks = nr_of_blocks % (num_spus - 1 - i); } /* Create thread for each SPE context */ if (pthread_create (&compress_threads[i], NULL, &ppu_pthread_function, &cthread_arg[i])) { perror ("Failed creating thread"); exit (1); } } /* Wait for SPU-thread to complete execution. */ for (i = 0; i < num_spus; i++) { if (pthread_join (compress_threads[i], NULL)) { perror("Failed pthread_join"); exit (1); } } free_align(compress_threads); free_align(cthread_arg); decompressed_image.width = initial_image.width; decompressed_image.height = initial_image.height; int nr_dec_blocks = (1LL * initial_image.width * initial_image.height) / (BLOCK_SIZE * BLOCK_SIZE); decompressed_image.pixels = (unsigned char *)malloc_align(initial_image.height * initial_image.width * sizeof(unsigned char), 7); pthread_t *decompress_threads = (pthread_t*)malloc_align(num_spus * sizeof(pthread_t), 7); struct package_t *dthread_arg = (struct package_t *)malloc_align(num_spus * sizeof(struct package_t), 7); int dec_average_blocks = nr_dec_blocks / num_spus; int dec_rest_blocks = nr_dec_blocks % num_spus; int dec_offset = 0; for(i = 0; i < num_spus; i++) { /* completeaza structura package_t de trimis la spu pentru fiecare spu*/ dthread_arg[i].action_type = 1; dthread_arg[i].mod_vect = mod_vect; dthread_arg[i].mod_dma = mod_dma; dthread_arg[i].num_spus = num_spus; dthread_arg[i].nr_blocks = dec_average_blocks; dthread_arg[i].index_block = dec_offset; dthread_arg[i].img_pgm.width = initial_image.width; dthread_arg[i].img_pgm.height = initial_image.height; 
dthread_arg[i].img_pgm.pixels = decompressed_image.pixels + ((dec_offset / (initial_image.width / BLOCK_SIZE)) * BLOCK_SIZE * initial_image.width + (dec_offset % (initial_image.width / BLOCK_SIZE)) * BLOCK_SIZE); dthread_arg[i].img_cmp.width = compressed_image.width; dthread_arg[i].img_cmp.height = compressed_image.height; dthread_arg[i].img_cmp.blocks = compressed_image.blocks + ((dec_offset / (initial_image.width / BLOCK_SIZE)) * (initial_image.width / BLOCK_SIZE) + (dec_offset % (initial_image.width / BLOCK_SIZE))); dec_offset += dec_average_blocks; nr_dec_blocks -= dec_average_blocks; if (dec_rest_blocks != 0 && i != num_spus - 1) { dec_average_blocks = nr_dec_blocks / (num_spus - 1 - i); dec_rest_blocks = nr_dec_blocks % (num_spus - 1 - i); } /* Create thread for each SPE context */ if (pthread_create (&decompress_threads[i], NULL, &ppu_pthread_function, &dthread_arg[i])) { perror ("Failed creating thread"); exit (1); } } /* Wait for SPU-thread to complete execution. */ for (i = 0; i < num_spus; i++) { if (pthread_join (decompress_threads[i], NULL)) { perror("Failed pthread_join"); exit (1); } } gettimeofday(&end_op, NULL); write_cmp(outcmp, &compressed_image); write_pgm(outpgm, &decompressed_image); free_align(compressed_image.blocks); free_align(decompressed_image.pixels); free_align(decompress_threads); free_align(dthread_arg); gettimeofday(&end_total, NULL); total_time += GET_TIME_DELTA(start_total, end_total); op_time += GET_TIME_DELTA(start_op, end_op); freopen(results, "a+", stdout); printf("%i %lf %lf\n", num_spus, op_time, total_time); fclose(stdout); return 0; }
int main(int argc, char **argv) { init_spus(); srand((unsigned)time(NULL)); char *fis_in, *fis_out; int zoom, rows, cols, i, j, overlap_spu, overlap_ppu, patch_w, patch_h, nr_patches; if (argc < 8) { fprintf(stderr, "Error: Missing some parameters.\n"); fprintf(stderr, "Run: ./program fis_in fis_out zoom nr_bucati_dim1 nr_bucati_dim2 banda_de_suprapunere_dim1 banda_de_suprapunere_dim2\n"); return -1; } fis_in = argv[1]; fis_out = argv[2]; zoom = atoi(argv[3]); rows = atoi(argv[4]); cols = atoi(argv[5]); overlap_spu = atoi(argv[6]); overlap_ppu = atoi(argv[7]); image img_src = read_ppm(fis_in); if (img_src == NULL) { fprintf(stderr, "Error reading image file.\n"); return -1; } patch_w = (zoom * img_src->width) / cols; patch_h = (zoom * img_src->height) / rows; nr_patches = rows * cols; printf("PPU: NR PATCHES NECESARY = %d\n", nr_patches); int **spu_patch_id_vector = alloc_patch_id_vector(rows); if (spu_patch_id_vector == NULL) return -1; printf("PPU: ZOOM=%d ROWS=%d COLS=%d img->width=%d img->height=%d patch_w=%d patch_h=%d\n", zoom, rows, cols, img_src->width, img_src->height, patch_w, patch_h); int* rand_seed = make_seed_vector(); if (rand_seed == NULL) return -1; int ***min_borders = malloc_align(SPU_THREADS * sizeof(int**), 4); if (min_borders == NULL) { perror("PPU: malloc_align failed in main"); return -1; } for (i = 0; i < SPU_THREADS; i++) { min_borders[i] = alloc_aligned_matrix((rows-1), overlap_spu); if (min_borders[i] == NULL) return -1; } pixel_t **patches_to_send = make_patches(img_src, patch_w, patch_h, nr_patches); send_patch_info(&patch_w, &patch_h, &rows, &nr_patches, spu_patch_id_vector, patches_to_send, rand_seed, &overlap_spu, min_borders); stop_spus(); int out_img_width = zoom * img_src->width; int out_img_height = zoom * img_src->height; image img_dst = alloc_img(out_img_width, out_img_height); for (i = 0; i < SPU_THREADS; i++) { printf("PPU: spu[%d]: ID= ", i); for (j = 0; j < rows; j++) printf("%d ", spu_patch_id_vector[i][j]); printf("\n"); 
} make_final_image(img_dst, patch_w, patch_h, spu_patch_id_vector, rows, patches_to_send); write_ppm(fis_out, img_dst); free_img(img_src); free_img(img_dst); free_seed_vector(rand_seed); free_patch_id_vector(spu_patch_id_vector); for (i = 0; i < SPU_THREADS; i++) free_aligned_matrix(min_borders[i], rows-1); return 0; }
// Array form of operator new: obtains sz bytes from malloc_align (called with
// the size only) and returns the result unchanged.
inline void* operator new[](size_t sz)
{
    void* storage = malloc_align(sz);
    return storage;
}