Example #1
0
/*
 * Writes one disassembly listing per FFT size, log2(N) = 8..16.
 *
 * For each size, stdout is redirected into "shader_<N>.s" (N < 1024) or
 * "shader_<N/1024>k.s" (N >= 1024); a banner, the copyright() output, a
 * "shader_<...>:" label and the show_qpu_fragment() listing are then
 * printed into that file.
 *
 * Returns 0 on success, 1 if an output file cannot be opened or flushed.
 */
int main(int argc, char **argv) {
	(void)argc;
	(void)argv;

	// show_qpu_fragment(address, instruction count);
	for (int i=8; i<17; i++) {
		int size = 1<<i;
		// Same number/suffix pair is used for the file name and the label.
		int label = size<1024 ? size : size/1024;
		const char *suffix = size<1024 ? "" : "k";
		char filename[256];

		snprintf(filename, sizeof filename, "shader_%d%s.s", label, suffix);

		// freopen() closes whatever stdout was attached to before, so the
		// previous iteration's file is closed here.
		if (freopen(filename, "w", stdout) == NULL) {
			perror(filename);
			return 1;
		}

		puts(
			"/*\n"
			"Disassembly of Andrew Holme's QPU FFT for Raspberry Pi.\n"
			"From: https://github.com/raspberrypi/firmware/tree/master/opt/vc/src/hello_pi/hello_fft\n"
			"*/\n"
		);
		copyright();
		printf("shader_%d%s:\n", label, suffix);
		show_qpu_fragment(gpu_fft_shader_code(i), gpu_fft_shader_size(i)/4);
		printf("\n");

		// Flush rather than fclose(): stdout is reused by the next freopen(),
		// and a FILE pointer must not be used after it has been closed.
		if (fflush(stdout) == EOF) {
			perror(filename);
			return 1;
		}
	}
	return 0;
}
Example #2
0
/*
 * Prepares a batch of FFTs: obtains one block from gpu_fft_alloc() and
 * carves it, in this order, into
 *   header (struct GPU_FFT, 4096 bytes) | temporary buffers | source data |
 *   shader code | twiddles | per-QPU uniforms | mailbox message.
 *
 * On success stores the header pointer in *fft and returns 0.
 * Returns -2 when gpu_fft_twiddle_size() returns nonzero, or the nonzero
 * result of gpu_fft_alloc() when that call fails.
 */
int gpu_fft_prepare(
    int mb,         // mailbox file_desc
    int log2_N,     // log2(FFT_length) = 8...22
    int direction,  // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
    int jobs,       // number of transforms in batch
    struct GPU_FFT **fft) {

    unsigned info_bytes, twid_bytes, fft_bytes, data_bytes,
             fft_bytes_4k, buff_bytes, num_buff,
             code_bytes, unif_bytes, mail_bytes;
    unsigned size, *uptr, vc_tw, vc_buff, vc_data;
    int i, j, q, shared, unique, passes, ret;

    struct GPU_FFT_PTR ptr;
    struct GPU_FFT *info;

    // Fills in shared, unique and passes for this FFT length; nonzero means failure.
    if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2;

    info_bytes = 4096;
    fft_bytes  = sizeof(COMPLEX)<<log2_N;   // one transform: 2^log2_N COMPLEX values
    code_bytes = gpu_fft_shader_size(log2_N);
    twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique);
    // Per QPU: 3 leading words + 2*passes words per job + 1 terminating zero.
    unif_bytes = sizeof(int)*GPU_FFT_QPUS*(4+2*jobs*passes);
    // Per QPU: one (uniforms address, code address) pair.
    mail_bytes = sizeof(int)*GPU_FFT_QPUS*2;

    // (MM) Use as few temporary buffers as possible
    if (fft_bytes >= GPU_FFT_MIN_BUFF_SIZE)
        // FFT size larger than cache => use exactly one buffer
        num_buff = 1;
    else
    {   // use up to GPU_FFT_MIN_BUFF_SIZE bytes of buffer, but no more than needed
        num_buff = GPU_FFT_MIN_BUFF_SIZE / fft_bytes;
        // At most one buffer per intermediate pass of every job, capped at
        // GPU_FFT_MAX_BUFF_COUNT.
        i = (passes-1) * jobs;
        if (i > GPU_FFT_MAX_BUFF_COUNT)
            i = GPU_FFT_MAX_BUFF_COUNT;
        if (num_buff > i)
            num_buff = i;
    }
    // Missing 4k alignment for 256 points
    // Round the per-buffer size up to the next multiple of 4096
    // (an exact multiple is left unchanged).
    fft_bytes_4k = ((fft_bytes-1)|4095)+1;
    buff_bytes = fft_bytes_4k * num_buff;
    data_bytes = fft_bytes * jobs;

    size  = info_bytes +  // header
            buff_bytes +  // additional buffers
            data_bytes +  // source data
            code_bytes +  // shader, aligned
            twid_bytes +  // twiddles
            unif_bytes +  // uniforms
            mail_bytes;   // mailbox message

    // Disabled debug dump of the computed sizes; the trailing backslashes keep
    // the whole statement inside the line comment below.
    //fprintf(stderr, \
        "fft_bytes = %x\n" \
        "code_bytes = %x\n" \
        "twid_bytes = %x\n" \
        "unif_bytes = %x\n" \
        "mail_bytes = %x\n" \
        "buff_bytes = %x\n" \
        "data_bytes = %x\n" \
        "num_buff = %x\n" \
        "size = %x\n", \
        fft_bytes, code_bytes, twid_bytes, unif_bytes, mail_bytes, buff_bytes, data_bytes, num_buff, size);
    ret = gpu_fft_alloc(mb, size, &ptr);
    if (ret) return ret;

    // Header
    // The header lives at the start of the block; ptr is then advanced past it.
    info = (struct GPU_FFT *) ptr.arm.vptr;
    gpu_fft_ptr_inc(&ptr, info_bytes);

    // For transpose
    info->x = 1<<log2_N;
    info->y = jobs;

    // (MM) Use dedicated temporary buffers so all transforms can operate in place,
    // except if we only have 1 buffer.
    // out initially points at the temp-buffer region, in at the source data.
    info->out = ptr.arm.cptr;
    vc_buff = gpu_fft_ptr_inc(&ptr, buff_bytes);
    info->step = fft_bytes / sizeof(COMPLEX);   // COMPLEX elements per transform
    info->in = ptr.arm.cptr;
    // Results end up in place (out == in) when several temp buffers are used
    // or when the number of passes is even; otherwise out stays in the temp
    // region.
    if (num_buff > 1 || !(passes & 1))
        info->out = info->in;
    vc_data = gpu_fft_ptr_inc(&ptr, data_bytes);
    // Disabled debug dump of the buffer addresses (same backslash trick).
    //fprintf(stderr, \
        "vc_buff = %x\n" \
        "vc_data = %x\n" \
        "in = %x\n" \
        "out = %x\n", \
        vc_buff, vc_data, info->in, info->out);

    // Shader code
    memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes);
    info->base.vc_code = gpu_fft_ptr_inc(&ptr, code_bytes);

    // Twiddles
    gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr);
    vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes);

    // uptr walks the uniforms word by word; ptr is advanced by the same amount
    // once per QPU at the end of each loop iteration.
    uptr = ptr.arm.uptr;

    // Uniforms
    for (q=0; q<GPU_FFT_QPUS; q++) {
        int current_buff = num_buff-1;
        *uptr++ = vc_tw;                                            // start of the twiddle area
        *uptr++ = vc_tw + sizeof(COMPLEX)*16*(shared + q*unique);   // q-th "unique" twiddle slice
        *uptr++ = q;                                                // QPU index
        // Per job: data address, then (passes-1) pairs of identical buffer
        // addresses, then one final address -- 2*passes words in total.
        for (i=0; i < jobs; ++i) {
            unsigned data = vc_data + i * fft_bytes;
            *uptr++ = data;
            if (num_buff == 1)
            {   // use ping pong buffers
                unsigned buff = vc_buff + current_buff * fft_bytes_4k;
                for (j = 1; j < passes; j++)
                {   uptr[0] = uptr[1] = buff;
                    // swap buffers
                    buff = data;
                    data = uptr[0];
                    uptr += 2;
                }
                // NOTE(review): with a single temp buffer and an odd pass count
                // this moves the buffer index one slot further for the next job,
                // i.e. past the temp region into the data region -- presumably
                // intentional (matches info->out + job*step); confirm.
                if (passes & 1)
                    ++current_buff;
                *uptr++ = buff;
            } else
            {   // use dedicated buffers
                for (j = 1; j < passes; j++)
                {   // round robin
                    current_buff = (current_buff+1) % num_buff;
                    uptr[0] = uptr[1] = vc_buff + current_buff * fft_bytes_4k;
                    uptr += 2;
                }
                // Last word is the job's own data address, so the result is in place.
                *uptr++ = data;
            }
        }
        *uptr++ = 0;    // terminating word of this QPU's uniforms
        //if (q == 0) for (i = -(4+2*jobs*passes); i < 0; ++i) fprintf(stderr, "%x\n", uptr[i]);
        info->base.vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int)*(4+2*jobs*passes));
    }

    if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) {
        // Direct register poking with busy wait
        info->base.vc_msg = 0;
    }
    else {
        // Mailbox message
        // Written directly after the uniforms: one (uniforms, code) pair per QPU.
        for (q=0; q<GPU_FFT_QPUS; q++) {
            *uptr++ = info->base.vc_unifs[q];
            *uptr++ = info->base.vc_code;
        }

        // ptr was advanced past all uniforms above, so ptr.vc is where the
        // message starts.
        info->base.vc_msg = ptr.vc;
    }

    *fft = info;
    return 0;
}
Example #3
0
int gpu_fft_prepare(
    int mb,         // mailbox file_desc
    int log2_N,     // log2(FFT_length) = 8...22
    int direction,  // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
    int jobs,       // number of transforms in batch
    struct GPU_FFT **fft) {

    uint32_t *uptr;
    unsigned info_bytes, twid_bytes, data_bytes, code_bytes, unif_bytes, mail_bytes;
    unsigned size, vc_tw, vc_data;
    int i, q, shared, unique, passes, ret;

    struct GPU_FFT_BASE *base;
    struct GPU_FFT_PTR ptr;
    struct GPU_FFT *info;

    if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2;

    info_bytes = 4096;
    data_bytes = (1+((sizeof(COMPLEX)<<log2_N)|4095));
    code_bytes = gpu_fft_shader_size(log2_N);
    twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique);
    unif_bytes = sizeof(int32_t)*GPU_FFT_QPUS*(5+jobs*2);
    mail_bytes = sizeof(int32_t)*GPU_FFT_QPUS*2;

    size  = info_bytes +        // header
            data_bytes*jobs*2 + // ping-pong data, aligned
            code_bytes +        // shader, aligned
            twid_bytes +        // twiddles
            unif_bytes +        // uniforms
            mail_bytes;         // mailbox message

    ret = gpu_fft_alloc(mb, size, &ptr);
    if (ret) return ret;

    // Header
    info = (struct GPU_FFT *) ptr.arm.vptr;
    base = (struct GPU_FFT_BASE *) info;
    gpu_fft_ptr_inc(&ptr, info_bytes);

    // For transpose
    info->x = 1<<log2_N;
    info->y = jobs;

    // Ping-pong buffers leave results in or out of place
    info->in = info->out = ptr.arm.cptr;
    info->step = data_bytes / sizeof(COMPLEX);
    if (passes&1) info->out += info->step * jobs; // odd => out of place
    vc_data = gpu_fft_ptr_inc(&ptr, data_bytes*jobs*2);

    // Shader code
    memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes);
    base->vc_code = gpu_fft_ptr_inc(&ptr, code_bytes);

    // Twiddles
    gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr);
    vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes);

    uptr = ptr.arm.uptr;

    // Uniforms
    for (q=0; q<GPU_FFT_QPUS; q++) {
        *uptr++ = vc_tw;
        *uptr++ = vc_tw + sizeof(COMPLEX)*16*(shared + q*unique);
        *uptr++ = q;
        for (i=0; i<jobs; i++) {
            *uptr++ = vc_data + data_bytes*i;
            *uptr++ = vc_data + data_bytes*i + data_bytes*jobs;
        }
        *uptr++ = 0;
        *uptr++ = (q==0); // For mailbox: IRQ enable, master only

        base->vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int32_t)*(5+jobs*2));
    }

    if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) {
        // Direct register poking with busy wait
        base->vc_msg = 0;
    }
    else {
        // Mailbox message
        for (q=0; q<GPU_FFT_QPUS; q++) {
            *uptr++ = base->vc_unifs[q];
            *uptr++ = base->vc_code;
        }

        base->vc_msg = ptr.vc;
    }

    *fft = info;
    return 0;
}