/*
 * Write one disassembly listing per FFT shader for log2_N = 8..16.
 *
 * Each listing goes to its own file (shader_256.s, shader_512.s,
 * shader_1k.s, ... shader_64k.s, relative paths) by re-pointing stdout at
 * that file, so the helpers that print to stdout need no file argument.
 *
 * Returns 0 on success, 1 if a file name cannot be built, a file cannot be
 * opened, or flushing a listing fails.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    for (int i = 8; i < 17; i++) {
        const int size = 1 << i;

        /* Below 1024 points the label is the point count, otherwise it is
         * the count in units of 1k.  The same label is used for both the
         * file name and the assembly label so they cannot drift apart. */
        const int label = size < 1024 ? size : size / 1024;
        const char *suffix = size < 1024 ? "" : "k";

        char filename[256];
        const int len = snprintf(filename, sizeof filename,
                                 "shader_%d%s.s", label, suffix);
        if (len < 0 || (size_t)len >= sizeof filename) {
            fprintf(stderr, "cannot build file name for log2_N = %d\n", i);
            return 1;
        }

        /* freopen() itself closes the file stdout was previously attached
         * to.  stdout must NOT be fclose()d between iterations: a stream
         * pointer is indeterminate once closed, so handing it back to
         * freopen() would be undefined behaviour. */
        if (freopen(filename, "w", stdout) == NULL) {
            perror(filename);
            return 1;
        }

        puts(
            "/*\n"
            "Disassembly of Andrew Holme's QPU FFT for Raspberry Pi.\n"
            "From: https://github.com/raspberrypi/firmware/tree/master/opt/vc/src/hello_pi/hello_fft\n"
            "*/\n"
        );
        copyright();
        printf("shader_%d%s:\n", label, suffix);

        /* show_qpu_fragment(address, instruction count) */
        show_qpu_fragment(gpu_fft_shader_code(i), gpu_fft_shader_size(i)/4);
        printf("\n");

        /* Push the listing out now so a write error is reported against
         * the file it belongs to rather than being lost at the next
         * freopen() or at exit. */
        if (fflush(stdout) == EOF) {
            perror(filename);
            return 1;
        }
    }

    return 0;
}
int gpu_fft_prepare( int mb, // mailbox file_desc int log2_N, // log2(FFT_length) = 8...22 int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft() int jobs, // number of transforms in batch struct GPU_FFT **fft) { unsigned info_bytes, twid_bytes, fft_bytes, data_bytes, fft_bytes_4k, buff_bytes, num_buff, code_bytes, unif_bytes, mail_bytes; unsigned size, *uptr, vc_tw, vc_buff, vc_data; int i, j, q, shared, unique, passes, ret; struct GPU_FFT_PTR ptr; struct GPU_FFT *info; if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2; info_bytes = 4096; fft_bytes = sizeof(COMPLEX)<<log2_N; code_bytes = gpu_fft_shader_size(log2_N); twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique); unif_bytes = sizeof(int)*GPU_FFT_QPUS*(4+2*jobs*passes); mail_bytes = sizeof(int)*GPU_FFT_QPUS*2; // (MM) Use as few temporary buffers as possible if (fft_bytes >= GPU_FFT_MIN_BUFF_SIZE) // FFT size larger than cache => use exactly one buffer num_buff = 1; else { // use up to GPU_FFT_TEMP_BUFF_SIZE bytes buffer, but no more than needed num_buff = GPU_FFT_MIN_BUFF_SIZE / fft_bytes; i = (passes-1) * jobs; if (i > GPU_FFT_MAX_BUFF_COUNT) i = GPU_FFT_MAX_BUFF_COUNT; if (num_buff > i) num_buff = i; } // Missing 4k alignment for 256 points fft_bytes_4k = ((fft_bytes-1)|4095)+1; buff_bytes = fft_bytes_4k * num_buff; data_bytes = fft_bytes * jobs; size = info_bytes + // header buff_bytes + // additional buffers data_bytes + // source data code_bytes + // shader, aligned twid_bytes + // twiddles unif_bytes + // uniforms mail_bytes; // mailbox message //fprintf(stderr, \ "fft_bytes = %x\n" \ "code_bytes = %x\n" \ "twid_bytes = %x\n" \ "unif_bytes = %x\n" \ "mail_bytes = %x\n" \ "buff_bytes = %x\n" \ "data_bytes = %x\n" \ "num_buff = %x\n" \ "size = %x\n", \ fft_bytes, code_bytes, twid_bytes, unif_bytes, mail_bytes, buff_bytes, data_bytes, num_buff, size); ret = gpu_fft_alloc(mb, size, &ptr); if (ret) return ret; // Header info = (struct GPU_FFT *) ptr.arm.vptr; 
gpu_fft_ptr_inc(&ptr, info_bytes); // For transpose info->x = 1<<log2_N; info->y = jobs; // (MM) Use dedicated temporary buffers so all transforms can operate in place, // except if we only have 1 buffer. info->out = ptr.arm.cptr; vc_buff = gpu_fft_ptr_inc(&ptr, buff_bytes); info->step = fft_bytes / sizeof(COMPLEX); info->in = ptr.arm.cptr; if (num_buff > 1 || !(passes & 1)) info->out = info->in; vc_data = gpu_fft_ptr_inc(&ptr, data_bytes); //fprintf(stderr, \ "vc_buff = %x\n" \ "vc_data = %x\n" \ "in = %x\n" \ "out = %x\n", \ vc_buff, vc_data, info->in, info->out); // Shader code memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes); info->base.vc_code = gpu_fft_ptr_inc(&ptr, code_bytes); // Twiddles gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr); vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes); uptr = ptr.arm.uptr; // Uniforms for (q=0; q<GPU_FFT_QPUS; q++) { int current_buff = num_buff-1; *uptr++ = vc_tw; *uptr++ = vc_tw + sizeof(COMPLEX)*16*(shared + q*unique); *uptr++ = q; for (i=0; i < jobs; ++i) { unsigned data = vc_data + i * fft_bytes; *uptr++ = data; if (num_buff == 1) { // use ping pong buffers unsigned buff = vc_buff + current_buff * fft_bytes_4k; for (j = 1; j < passes; j++) { uptr[0] = uptr[1] = buff; // swap buffers buff = data; data = uptr[0]; uptr += 2; } if (passes & 1) ++current_buff; *uptr++ = buff; } else { // use dedicated buffers for (j = 1; j < passes; j++) { // round robin current_buff = (current_buff+1) % num_buff; uptr[0] = uptr[1] = vc_buff + current_buff * fft_bytes_4k; uptr += 2; } *uptr++ = data; } } *uptr++ = 0; //if (q == 0) for (i = -(4+2*jobs*passes); i < 0; ++i) fprintf(stderr, "%x\n", uptr[i]); info->base.vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int)*(4+2*jobs*passes)); } if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) { // Direct register poking with busy wait info->base.vc_msg = 0; } else { // Mailbox message for (q=0; q<GPU_FFT_QPUS; q++) { *uptr++ = info->base.vc_unifs[q]; *uptr++ = info->base.vc_code; } 
info->base.vc_msg = ptr.vc; } *fft = info; return 0; }
int gpu_fft_prepare( int mb, // mailbox file_desc int log2_N, // log2(FFT_length) = 8...22 int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft() int jobs, // number of transforms in batch struct GPU_FFT **fft) { uint32_t *uptr; unsigned info_bytes, twid_bytes, data_bytes, code_bytes, unif_bytes, mail_bytes; unsigned size, vc_tw, vc_data; int i, q, shared, unique, passes, ret; struct GPU_FFT_BASE *base; struct GPU_FFT_PTR ptr; struct GPU_FFT *info; if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2; info_bytes = 4096; data_bytes = (1+((sizeof(COMPLEX)<<log2_N)|4095)); code_bytes = gpu_fft_shader_size(log2_N); twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique); unif_bytes = sizeof(int32_t)*GPU_FFT_QPUS*(5+jobs*2); mail_bytes = sizeof(int32_t)*GPU_FFT_QPUS*2; size = info_bytes + // header data_bytes*jobs*2 + // ping-pong data, aligned code_bytes + // shader, aligned twid_bytes + // twiddles unif_bytes + // uniforms mail_bytes; // mailbox message ret = gpu_fft_alloc(mb, size, &ptr); if (ret) return ret; // Header info = (struct GPU_FFT *) ptr.arm.vptr; base = (struct GPU_FFT_BASE *) info; gpu_fft_ptr_inc(&ptr, info_bytes); // For transpose info->x = 1<<log2_N; info->y = jobs; // Ping-pong buffers leave results in or out of place info->in = info->out = ptr.arm.cptr; info->step = data_bytes / sizeof(COMPLEX); if (passes&1) info->out += info->step * jobs; // odd => out of place vc_data = gpu_fft_ptr_inc(&ptr, data_bytes*jobs*2); // Shader code memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes); base->vc_code = gpu_fft_ptr_inc(&ptr, code_bytes); // Twiddles gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr); vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes); uptr = ptr.arm.uptr; // Uniforms for (q=0; q<GPU_FFT_QPUS; q++) { *uptr++ = vc_tw; *uptr++ = vc_tw + sizeof(COMPLEX)*16*(shared + q*unique); *uptr++ = q; for (i=0; i<jobs; i++) { *uptr++ = vc_data + data_bytes*i; *uptr++ = vc_data + data_bytes*i + data_bytes*jobs; } *uptr++ = 0; 
*uptr++ = (q==0); // For mailbox: IRQ enable, master only base->vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int32_t)*(5+jobs*2)); } if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) { // Direct register poking with busy wait base->vc_msg = 0; } else { // Mailbox message for (q=0; q<GPU_FFT_QPUS; q++) { *uptr++ = base->vc_unifs[q]; *uptr++ = base->vc_code; } base->vc_msg = ptr.vc; } *fft = info; return 0; }