Example No. 1
//---
//Copies sizeBytes from src to dst, staging the transfer through the engine's pinned host buffers.
//IN: dst - dest pointer - must be accessible from host CPU.
//IN: src - src pointer for copy.  Must be accessible from the agent this buffer is associated with (via _hsaAgent).
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.
void UnpinnedCopyEngine::CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
    {
        std::lock_guard<std::mutex> l (_copyLock);

        const char *srcp0 = static_cast<const char*> (src);
        char *dstp1 = static_cast<char*> (dst);

        for (int i=0; i<_numBuffers; i++) {
            hsa_signal_store_relaxed(_completionSignal[i], 0);
        }

        if (sizeBytes >= UINT64_MAX/2) {
            THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT);
        }

        int64_t bytesRemaining0 = sizeBytes; // bytes still to copy from src into the staging buffers.
        int64_t bytesRemaining1 = sizeBytes; // bytes still to copy from the staging buffers into the final dest.

        while (bytesRemaining1 > 0)
        {
            // First launch the async copies from the device source into the host staging buffers:
            for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers);  bytesRemaining0 -= _bufferSize, bufferIndex++) {

                size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0;

                tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu  async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
                hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1);
                hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]);
                if (hsa_status != HSA_STATUS_SUCCESS) {
                    THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
                }

                srcp0 += theseBytes;


                // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
                waitFor = NULL;
            }

            // Now unload the staging buffers:
            for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers);  bytesRemaining1 -= _bufferSize, bufferIndex++) {

                size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1;

                tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1);
                hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);

                tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
                memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes);

                dstp1 += theseBytes;
            }
        }
    }
}
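The staging loop above leans on HSA's decrementing completion signals: each signal is armed by storing 1, hsa_amd_memory_async_copy decrements it when the transfer finishes, and the host waits for the value to drop below 1. A minimal, self-contained sketch of that handshake (hsa_signal_subtract_release stands in for the copy engine finishing; the hsa.h include path may differ between installs):

#include <hsa/hsa.h>     /* header path may vary between ROCm installs */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    hsa_signal_t sig;
    if (hsa_init() != HSA_STATUS_SUCCESS) return 1;
    if (hsa_signal_create(0, 0, NULL, &sig) != HSA_STATUS_SUCCESS) return 1;

    hsa_signal_store_relaxed(sig, 1);      /* arm the signal before submitting work */
    hsa_signal_subtract_release(sig, 1);   /* stand-in for the async copy completing */

    /* Same wait the staging loops use: block until the value drops below 1. */
    hsa_signal_wait_acquire(sig, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                            HSA_WAIT_STATE_ACTIVE);
    printf("copy-completion signal reached zero\n");

    hsa_signal_destroy(sig);
    hsa_shut_down();
    return 0;
}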
Example No. 2
//---
//Copies sizeBytes from src to dst, staging the transfer through the engine's pinned host buffers.
//IN: dst - dest pointer - must be accessible from the agent this buffer is associated with (via _hsaAgent).
//IN: src - src pointer for copy.  Must be accessible from host CPU.
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.
void UnpinnedCopyEngine::CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
    {
        std::lock_guard<std::mutex> l (_copyLock);

        const char *srcp = static_cast<const char*> (src);
        char *dstp = static_cast<char*> (dst);

        for (int i=0; i<_numBuffers; i++) {
            hsa_signal_store_relaxed(_completionSignal[i], 0);
        }

        if (sizeBytes >= UINT64_MAX/2) {
            THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT);
        }
        int bufferIndex = 0;
        for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ;  bytesRemaining -= _bufferSize) {

            size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining;

            tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle);
            hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);

            tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]);
            // TODO - use uncached memcpy, someday.
            memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes);


            hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1);
            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]);
            tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);
            if (hsa_status != HSA_STATUS_SUCCESS) {
                THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
            }

            srcp += theseBytes;
            dstp += theseBytes;
            if (++bufferIndex >= _numBuffers) {
                bufferIndex = 0;
            }

            // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
            waitFor = NULL;
        }


        for (int i=0; i<_numBuffers; i++) {
            hsa_signal_wait_acquire(_completionSignal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
        }
    }
}
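The H2D path cycles through _numBuffers pinned staging buffers in round-robin order, carving the transfer into _bufferSize chunks. The chunking arithmetic can be exercised on its own; in the sketch below the buffer count and sizes are illustrative values, not the engine's defaults:

#include <cstdio>
#include <cstdint>

int main()
{
    const int64_t bufferSize = 64 * 1024;   // illustrative chunk size
    const int     numBuffers = 4;           // illustrative buffer count
    const int64_t sizeBytes  = 200 * 1024;  // total transfer size

    int bufferIndex = 0;
    for (int64_t bytesRemaining = sizeBytes; bytesRemaining > 0; bytesRemaining -= bufferSize) {
        int64_t theseBytes = (bytesRemaining > bufferSize) ? bufferSize : bytesRemaining;
        std::printf("chunk of %lld bytes -> stagingBuf[%d]\n",
                    static_cast<long long>(theseBytes), bufferIndex);
        if (++bufferIndex >= numBuffers) {
            bufferIndex = 0;                // wrap around, reusing drained buffers
        }
    }
    return 0;
}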
Example No. 3
//---
//Copies sizeBytes from src to dst, pinning the host source in place rather than staging through a buffer.
//IN: dst - dest pointer - must be accessible from the agent this buffer is associated with (via _hsaAgent).
//IN: src - src pointer for copy.  Must be accessible from host CPU.
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.
void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
    std::lock_guard<std::mutex> l (_copyLock);

    const char *srcp = static_cast<const char*> (src);
    char *dstp = static_cast<char*> (dst);

    for (int i=0; i<_numBuffers; i++) {
        hsa_signal_store_relaxed(_completionSignal[i], 0);
    }

    if (sizeBytes >= UINT64_MAX/2) {
        THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT);
    }
    int bufferIndex = 0;

    size_t theseBytes= sizeBytes;
    //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle);
    //hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);

    //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO
    void *locked_srcp;
    //hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsaAgent, 1, &locked_srcp);
    hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (srcp), theseBytes, &_hsaAgent, 1, &locked_srcp);
    //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex);
    //printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp);

    if (hsa_status != HSA_STATUS_SUCCESS) {
        THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
    }

    hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1);

    hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]);
    //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);

    if (hsa_status != HSA_STATUS_SUCCESS) {
        THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
    }
    tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle);
    hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
    hsa_amd_memory_unlock(const_cast<char*> (srcp));
    // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
    waitFor = NULL;
}
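The pin-in-place variants skip the staging memcpy by locking the host buffer so the DMA engine can access it directly. A hedged sketch of that lock/copy/unlock pattern as a standalone helper follows; the agents, signal, and error-handling policy are assumptions made for illustration, not the engine's internals:

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <stdint.h>

/* Copy `bytes` from a pageable host buffer to device memory by pinning the host
   buffer in place for the duration of the transfer. Returns the first failing
   HSA status, or HSA_STATUS_SUCCESS. */
static hsa_status_t copy_h2d_pin_in_place(void* dst, const void* src, size_t bytes,
                                          hsa_agent_t deviceAgent, hsa_agent_t cpuAgent,
                                          hsa_signal_t completion)
{
    void* locked_src = NULL;
    hsa_status_t st = hsa_amd_memory_lock((void*)src, bytes, &deviceAgent, 1, &locked_src);
    if (st != HSA_STATUS_SUCCESS) return st;

    hsa_signal_store_relaxed(completion, 1);
    st = hsa_amd_memory_async_copy(dst, deviceAgent, locked_src, cpuAgent,
                                   bytes, 0, NULL, completion);
    if (st == HSA_STATUS_SUCCESS) {
        hsa_signal_wait_acquire(completion, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                HSA_WAIT_STATE_ACTIVE);
    }
    hsa_amd_memory_unlock((void*)src);   /* unpin regardless of copy outcome */
    return st;
}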
Example No. 4
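//---
//Copies sizeBytes from src to dst, pinning the host destination in place rather than staging through a buffer.
//IN: dst - dest pointer - must be accessible from host CPU (it is locked for the device copy).
//IN: src - src pointer for copy.  Must be accessible from the agent this buffer is associated with (via _hsaAgent).
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.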
void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
    std::lock_guard<std::mutex> l (_copyLock);

    const char *srcp = static_cast<const char*> (src);
    char *dstp = static_cast<char*> (dst);

    for (int i=0; i<_numBuffers; i++) {
        hsa_signal_store_relaxed(_completionSignal[i], 0);
    }

    if (sizeBytes >= UINT64_MAX/2) {
        THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT);
    }
    int bufferIndex = 0;
    size_t theseBytes= sizeBytes;
    void *locked_destp;

    hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (dstp), theseBytes, &_hsaAgent, 1, &locked_destp);


    if (hsa_status != HSA_STATUS_SUCCESS) {
        THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
    }

    hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1);

    hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]);

    if (hsa_status != HSA_STATUS_SUCCESS) {
        THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
    }
    tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle);
    hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
    hsa_amd_memory_unlock(const_cast<char*> (dstp));

    // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
    waitFor = NULL;
}
Example No. 5
void hsa_enqueue()
{
    hsa_status_t err = HSA_STATUS_SUCCESS;
    /*
     * Obtain the current queue write index.
     */
    uint64_t index = hsa_queue_load_write_index_relaxed(queue);

    /*
     * Write the aql packet at the calculated queue index address.
     */
    const uint32_t queueMask = queue->size - 1;
    hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]);

    dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
    dispatch_packet->setup  |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    dispatch_packet->workgroup_size_x = (uint16_t)LOCAL_SIZE;
    dispatch_packet->workgroup_size_y = (uint16_t)1;
    dispatch_packet->workgroup_size_z = (uint16_t)1;
    dispatch_packet->grid_size_x = (uint32_t) (N_DCNT);
    dispatch_packet->grid_size_y = 1;
    dispatch_packet->grid_size_z = 1;
    dispatch_packet->completion_signal = signal;
    dispatch_packet->kernel_object = kernel_object;
    dispatch_packet->kernarg_address = (void*) kernarg_address;
    dispatch_packet->private_segment_size = private_segment_size;
    dispatch_packet->group_segment_size = group_segment_size;
    __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE);

    /*
     * Increment the write index and ring the doorbell to dispatch the kernel.
     */
    hsa_queue_store_write_index_relaxed(queue, index+1);
    hsa_signal_store_relaxed(queue->doorbell_signal, index);
    check(Dispatching the kernel, err);

    /*
     * Wait on the dispatch completion signal until the kernel is finished.
     */
    hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);


}
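hsa_enqueue() ORs the fence scopes into the packet header in place and then publishes only the low byte (the packet type) with an 8-bit release store; Example No. 8 below instead builds the full 16-bit header in a local variable. The bit packing involved can be shown in isolation (a sketch that only prints the composed values):

#include <hsa/hsa.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t header = 0;
    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
    header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;

    uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;

    /* The packet type occupies bits 0-7 and the fence scopes sit in the upper
       byte, which is why the 8-bit store of the type above does not clobber
       the already-set scopes. */
    printf("header = 0x%04x, setup = 0x%04x\n", header, setup);
    return 0;
}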
Example No. 6
int main(int argc, char **argv)
{
    struct timespec timer_1, timer_2;

    hsa_status_t err;

    err = hsa_init();
    check(Initializing the hsa runtime, err);

    /* 
     * Iterate over the agents and pick the gpu agent using 
     * the get_gpu_agent callback.
     */
    hsa_agent_t agent;
    err = hsa_iterate_agents(get_gpu_agent, &agent);
    if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; }
    check(Getting a gpu agent, err);

    /*
     * Query the name of the agent.
     */
    char name[64] = { 0 };
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
    check(Querying the agent name, err);
    printf("The agent name is %s.\n", name);

    /*
     * Query the maximum size of the queue.
     */
    uint32_t queue_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
    check(Querying the agent maximum queue size, err);
    printf("The maximum queue size is %u.\n", (unsigned int) queue_size);

    /*
     * Create a queue using the maximum size.
     */
    hsa_queue_t* queue; 
    err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
    check(Creating the queue, err);

    /*
     * Load the BRIG binary.
     */
    hsa_ext_module_t module;
    load_module_from_file("vector_copy.brig",&module);

    /*
     * Create hsa program.
     */
    hsa_ext_program_t program;
    memset(&program,0,sizeof(hsa_ext_program_t));
    err = hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program);
    check(Create the program, err);

    /*
     * Add the BRIG module to hsa program.
     */
    err = hsa_ext_program_add_module(program, module);
    check(Adding the brig module to the program, err);

    /*
     * Determine the agents ISA.
     */
    hsa_isa_t isa;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa);
    check(Query the agents isa, err);

    /*
     * Finalize the program and extract the code object.
     */
    hsa_ext_control_directives_t control_directives;
    memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t));
    hsa_code_object_t code_object;
    err = hsa_ext_program_finalize(program, isa, 0, control_directives, "-O0", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object);
    check(Finalizing the program, err);

    /*
     * Destroy the program, it is no longer needed.
     */
    err=hsa_ext_program_destroy(program);
    check(Destroying the program, err);

    /*
     * Create the empty executable.
     */
    hsa_executable_t executable;
    err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable);
    check(Create the executable, err);

    /*
     * Load the code object.
     */
    err = hsa_executable_load_code_object(executable, agent, code_object, "");
    check(Loading the code object, err);

    /*
     * Freeze the executable; it can now be queried for symbols.
     */
    err = hsa_executable_freeze(executable, "");
    check(Freeze the executable, err);

   /*
    * Extract the symbol from the executable.
    */
    hsa_executable_symbol_t symbol;
    err = hsa_executable_get_symbol(executable, "", "&__OpenCL_vector_copy_kernel", agent, 0, &symbol);
    check(Extract the symbol from the executable, err);

    /*
     * Extract dispatch information from the symbol
     */
    uint64_t kernel_object;
    uint32_t kernarg_segment_size;
    uint32_t group_segment_size;
    uint32_t private_segment_size;

    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object);
    check(Extracting the symbol from the executable, err);
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size);
    check(Extracting the kernarg segment size from the executable, err);
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
    check(Extracting the group segment size from the executable, err);
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size);
    check(Extracting the private segment from the executable, err);
     /*
     * Create a signal to wait for the dispatch to finish.
     */ 
    hsa_signal_t signal;
    err=hsa_signal_create(1, 0, NULL, &signal);
    check(Creating a HSA signal, err);

    /*
     * Allocate and initialize the kernel arguments and data.
     */
    int* in=(int*)malloc(SIZE);
    int i;
    for(i=0;i<ELEMENT;i++)
        in[i]=(rand()%50000+1);
    err=hsa_memory_register(in, SIZE);
    check(Registering argument memory for input parameter, err);

    int* out=(int*)malloc(SIZE);
    memset(out, 0, SIZE);
    err=hsa_memory_register(out, SIZE);
    check(Registering argument memory for output parameter, err);
    
    int element = ELEMENT;
    int iter = ITER;
    
    

    struct __attribute__ ((aligned(16))) args_t {
        uint64_t global_offset_0;
        uint64_t global_offset_1;
        uint64_t global_offset_2;
        uint64_t printf_buffer;
        uint64_t vqueue_pointer;
        uint64_t aqlwrap_pointer;
        void* in;
        void* out;
        int iter;
        int element;
    } args;
    memset(&args, 0, sizeof(args));
    args.in=in;
    args.out=out;
    args.element=element;
    args.iter=iter;

    /*
     * Find a memory region that supports kernel arguments.
     */
    hsa_region_t kernarg_region;
    kernarg_region.handle=(uint64_t)-1;
    hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region);
    err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
    check(Finding a kernarg memory region, err);
    void* kernarg_address = NULL;

    /*
     * Allocate the kernel argument buffer from the correct region.
     */   
    err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address);
    check(Allocating kernel argument memory buffer, err);
    memcpy(kernarg_address, &args, sizeof(args));
 
    /*
     * Obtain the current queue write index.
     */
    uint64_t index = hsa_queue_load_write_index_relaxed(queue);

    /*
     * Write the aql packet at the calculated queue index address.
     */
    const uint32_t queueMask = queue->size - 1;
    hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]);

    dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
    dispatch_packet->setup  |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    dispatch_packet->workgroup_size_x = (uint16_t)LOCAL_SIZE;
    dispatch_packet->workgroup_size_y = (uint16_t)1;
    dispatch_packet->workgroup_size_z = (uint16_t)1;
    dispatch_packet->grid_size_x = (uint32_t) (GLOBAL_SIZE);
    dispatch_packet->grid_size_y = 1;
    dispatch_packet->grid_size_z = 1;
    dispatch_packet->completion_signal = signal;
    dispatch_packet->kernel_object = kernel_object;
    dispatch_packet->kernarg_address = (void*) kernarg_address;
    dispatch_packet->private_segment_size = private_segment_size;
    dispatch_packet->group_segment_size = group_segment_size;
    __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE);

    /*
     * Increment the write index and ring the doorbell to dispatch the kernel.
     */
    tic(&timer_1);
    hsa_queue_store_write_index_relaxed(queue, index+1);
    hsa_signal_store_relaxed(queue->doorbell_signal, index);
    check(Dispatching the kernel, err);

    /*
     * Wait on the dispatch completion signal until the kernel is finished.
     */
    hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
    toc("Execution Period", &timer_1, &timer_2);
    /*
     * Validate the data in the output buffer.
     */
    int temp = 0;
    for(i=0;i<element;i++)
    {
        if(temp<in[i])
            temp = in[i];
    }

    if(temp==out[GLOBAL_SIZE])
        printf("PASS \n");
    else 
        printf("FAIL out=%d in=%d \n",out[GLOBAL_SIZE],temp);

	
    /*
     * Cleanup all allocated resources.
     */
    err=hsa_signal_destroy(signal);
    check(Destroying the signal, err);

    err=hsa_executable_destroy(executable);
    check(Destroying the executable, err);

    err=hsa_code_object_destroy(code_object);
    check(Destroying the code object, err);

    err=hsa_queue_destroy(queue);
    check(Destroying the queue, err);
    
    err=hsa_shut_down();
    check(Shutting down the runtime, err);

    free(in);
    free(out);
//printf("kernarg_segment_size:%d group_segment_size:%d private_segment_size:%d",kernarg_segment_size,group_segment_size,private_segment_size);
 
    return 0;
}
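The tic()/toc() timing helpers wrapped around the dispatch are not shown in this listing; a plausible minimal sketch using clock_gettime is given below. The real helpers may differ.

#include <stdio.h>
#include <time.h>

static void tic(struct timespec *t)
{
    clock_gettime(CLOCK_MONOTONIC, t);
}

static void toc(const char *label, struct timespec *start, struct timespec *end)
{
    clock_gettime(CLOCK_MONOTONIC, end);
    double ms = (end->tv_sec - start->tv_sec) * 1e3 +
                (end->tv_nsec - start->tv_nsec) / 1e6;
    printf("%s: %.3f ms\n", label, ms);
}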
Example No. 7
//---
//Copies sizeBytes from src to dst between two agents, staging the transfer through the engine's pinned host buffers.
//IN: dst - dest pointer - must be accessible from dstAgent.
//IN: src - src pointer for copy.  Must be accessible from srcAgent.
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.
void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor)
{
    std::lock_guard<std::mutex> l (_copyLock);

    const char *srcp0 = static_cast<const char*> (src);
    char *dstp1 = static_cast<char*> (dst);

    for (int i=0; i<_numBuffers; i++) {
        hsa_signal_store_relaxed(_completionSignal[i], 0);
        hsa_signal_store_relaxed(_completionSignal2[i], 0);
    }

    if (sizeBytes >= UINT64_MAX/2) {
        THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT);
    }

    int64_t bytesRemaining0 = sizeBytes; // bytes still to copy from src into the staging buffers.
    int64_t bytesRemaining1 = sizeBytes; // bytes still to copy from the staging buffers into the final dest.

    while (bytesRemaining1 > 0) {
        // First launch the async copies from the source agent into the host staging buffers:
        for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers);  bytesRemaining0 -= _bufferSize, bufferIndex++) {

            size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0;

            // Wait to make sure we are not overwriting a buffer before it has been drained:
            hsa_signal_wait_acquire(_completionSignal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);

            tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu  async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
            hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1);
            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]);
            if (hsa_status != HSA_STATUS_SUCCESS) {
                THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
            }

            srcp0 += theseBytes;


            // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
            waitFor = NULL;
        }

        // Now unload the staging buffers:
        for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers);  bytesRemaining1 -= _bufferSize, bufferIndex++) {

            size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1;

            tprintf (DB_COPY2, "P2P: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1);

            bool hostWait = false; // TODO - remove me

            if (hostWait) {
                // Host-side wait, should not be necessary:
                hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
            }

            tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
            hsa_signal_store_relaxed(_completionSignal2[bufferIndex], 1);
            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent /*not used*/, theseBytes,
                                      hostWait ? 0:1, hostWait ? NULL : &_completionSignal[bufferIndex],
                                      _completionSignal2[bufferIndex]);
            if (hsa_status != HSA_STATUS_SUCCESS) {
                THROW_ERROR (hipErrorRuntimeMemory, hsa_status);
            }

            dstp1 += theseBytes;
        }
    }


    // Wait for the staging-buffer to dest copies to complete:
    for (int i=0; i<_numBuffers; i++) {
        hsa_signal_wait_acquire(_completionSignal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
    }
}
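A notable detail in CopyPeerToPeer is that the second hop passes _completionSignal[bufferIndex] to hsa_amd_memory_async_copy as a dependent signal, so the staging-buffer unload waits on the device side instead of blocking the host. A hedged sketch of that chaining for a single chunk (agents, buffers, and signals are assumed to be set up by the caller):

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <stdint.h>

/* Stage one chunk from srcAgent to dstAgent through a pinned host buffer,
   letting copy #2 depend on copy #1's signal so the host only waits on the
   final completion. */
static hsa_status_t p2p_chunk(void* dst, const void* src, void* stagingBuf, size_t bytes,
                              hsa_agent_t srcAgent, hsa_agent_t dstAgent, hsa_agent_t cpuAgent,
                              hsa_signal_t sig0, hsa_signal_t sig1)
{
    hsa_signal_store_relaxed(sig0, 1);
    hsa_signal_store_relaxed(sig1, 1);

    /* Hop 1: source device -> pinned staging buffer. */
    hsa_status_t st = hsa_amd_memory_async_copy(stagingBuf, cpuAgent, src, srcAgent,
                                                bytes, 0, NULL, sig0);
    if (st != HSA_STATUS_SUCCESS) return st;

    /* Hop 2: staging buffer -> destination device, gated on sig0 in hardware. */
    st = hsa_amd_memory_async_copy(dst, dstAgent, stagingBuf, cpuAgent,
                                   bytes, 1, &sig0, sig1);
    if (st != HSA_STATUS_SUCCESS) return st;

    hsa_signal_wait_acquire(sig1, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                            HSA_WAIT_STATE_ACTIVE);
    return HSA_STATUS_SUCCESS;
}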
Example No. 8
int main(int argc, char **argv) {
    hsa_status_t err;

    err = hsa_init();
    check(Initializing the hsa runtime, err);

    /*
     * Determine if the finalizer 1.0 extension is supported.
     */
    bool support;

    err = hsa_system_extension_supported(HSA_EXTENSION_FINALIZER, 1, 0, &support);

    check(Checking finalizer 1.0 extension support, err);

    /*
     * Generate the finalizer function table.
     */
    hsa_ext_finalizer_1_00_pfn_t table_1_00;

    err = hsa_system_get_extension_table(HSA_EXTENSION_FINALIZER, 1, 0, &table_1_00);

    check(Generating function table for finalizer, err);

    /* 
     * Iterate over the agents and pick the gpu agent using 
     * the get_gpu_agent callback.
     */
    hsa_agent_t agent;
    err = hsa_iterate_agents(get_gpu_agent, &agent);
    if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; }
    check(Getting a gpu agent, err);

    /*
     * Query the name of the agent.
     */
    char name[64] = { 0 };
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
    check(Querying the agent name, err);
    printf("The agent name is %s.\n", name);

    /*
     * Query the maximum size of the queue.
     */
    uint32_t queue_size = 0;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
    check(Querying the agent maximum queue size, err);
    printf("The maximum queue size is %u.\n", (unsigned int) queue_size);

    /*
     * Create a queue using the maximum size.
     */
    hsa_queue_t* queue; 
    err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
    check(Creating the queue, err);

    /*
     * Load the BRIG binary.
     */
    hsa_ext_module_t module;
    load_module_from_file("vector_copy.brig",&module);

    /*
     * Create hsa program.
     */
    hsa_ext_program_t program;
    memset(&program,0,sizeof(hsa_ext_program_t));
    err = table_1_00.hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program);
    check(Create the program, err);

    /*
     * Add the BRIG module to hsa program.
     */
    err = table_1_00.hsa_ext_program_add_module(program, module);
    check(Adding the brig module to the program, err);

    /*
     * Determine the agents ISA.
     */
    hsa_isa_t isa;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa);
    check(Query the agents isa, err);

    /*
     * Finalize the program and extract the code object.
     */
    hsa_ext_control_directives_t control_directives;
    memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t));
    hsa_code_object_t code_object;
    err = table_1_00.hsa_ext_program_finalize(program, isa, 0, control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object);
    check(Finalizing the program, err);

    /*
     * Destroy the program, it is no longer needed.
     */
    err=table_1_00.hsa_ext_program_destroy(program);
    check(Destroying the program, err);

    /*
     * Create the empty executable.
     */
    hsa_executable_t executable;
    err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable);
    check(Create the executable, err);

    /*
     * Load the code object.
     */
    err = hsa_executable_load_code_object(executable, agent, code_object, "");
    check(Loading the code object, err);

    /*
     * Freeze the executable; it can now be queried for symbols.
     */
    err = hsa_executable_freeze(executable, "");
    check(Freeze the executable, err);

   /*
    * Extract the symbol from the executable.
    */
    hsa_executable_symbol_t symbol;
    err = hsa_executable_get_symbol(executable, NULL, "&__vector_copy_kernel", agent, 0, &symbol);
    check(Extract the symbol from the executable, err);

    /*
     * Extract dispatch information from the symbol
     */
    uint64_t kernel_object;
    uint32_t kernarg_segment_size;
    uint32_t group_segment_size;
    uint32_t private_segment_size;
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object);
    check(Extracting the symbol from the executable, err);
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size);
    check(Extracting the kernarg segment size from the executable, err);
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
    check(Extracting the group segment size from the executable, err);
    err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size);
    check(Extracting the private segment from the executable, err);

    /*
     * Create a signal to wait for the dispatch to finish.
     */ 
    hsa_signal_t signal;
    err=hsa_signal_create(1, 0, NULL, &signal);
    check(Creating a HSA signal, err);

    /*
     * Allocate and initialize the kernel arguments and data.
     */
    char* in=(char*)malloc(1024*1024*4);
    memset(in, 1, 1024*1024*4);
    err=hsa_memory_register(in, 1024*1024*4);
    check(Registering argument memory for input parameter, err);

    char* out=(char*)malloc(1024*1024*4);
    memset(out, 0, 1024*1024*4);
    err=hsa_memory_register(out, 1024*1024*4);
    check(Registering argument memory for output parameter, err);

    struct __attribute__ ((aligned(16))) args_t {
        void* in;
        void* out;
    } args;

    args.in=in;
    args.out=out;

    /*
     * Find a memory region that supports kernel arguments.
     */
    hsa_region_t kernarg_region;
    kernarg_region.handle=(uint64_t)-1;
    hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region);
    err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
    check(Finding a kernarg memory region, err);
    void* kernarg_address = NULL;

    /*
     * Allocate the kernel argument buffer from the correct region.
     */   
    err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address);
    check(Allocating kernel argument memory buffer, err);
    memcpy(kernarg_address, &args, sizeof(args));
 
    /*
     * Obtain the current queue write index.
     */
    uint64_t index = hsa_queue_load_write_index_relaxed(queue);

    /*
     * Write the aql packet at the calculated queue index address.
     */
    const uint32_t queueMask = queue->size - 1;
    hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]);

    dispatch_packet->setup  |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    dispatch_packet->workgroup_size_x = (uint16_t)256;
    dispatch_packet->workgroup_size_y = (uint16_t)1;
    dispatch_packet->workgroup_size_z = (uint16_t)1;
    dispatch_packet->grid_size_x = (uint32_t) (1024*1024);
    dispatch_packet->grid_size_y = 1;
    dispatch_packet->grid_size_z = 1;
    dispatch_packet->completion_signal = signal;
    dispatch_packet->kernel_object = kernel_object;
    dispatch_packet->kernarg_address = (void*) kernarg_address;
    dispatch_packet->private_segment_size = private_segment_size;
    dispatch_packet->group_segment_size = group_segment_size;

    uint16_t header = 0;
    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
    header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;

    __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE);

    /*
     * Increment the write index and ring the doorbell to dispatch the kernel.
     */
    hsa_queue_store_write_index_relaxed(queue, index+1);
    hsa_signal_store_relaxed(queue->doorbell_signal, index);
    check(Dispatching the kernel, err);

    /*
     * Wait on the dispatch completion signal until the kernel is finished.
     */
    hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);

    /*
     * Validate the data in the output buffer.
     */
    int valid=1;
    int fail_index=0;
    for(int i=0; i<1024*1024; i++) {
        if(out[i]!=in[i]) {
            fail_index=i;
            valid=0;
            break;
        }
    }

    if(valid) {
        printf("Passed validation.\n");
    } else {
        printf("VALIDATION FAILED!\nBad index: %d\n", fail_index);
    }

    /*
     * Cleanup all allocated resources.
     */
    err = hsa_memory_free(kernarg_address);
    check(Freeing kernel argument memory buffer, err);

    err=hsa_signal_destroy(signal);
    check(Destroying the signal, err);

    err=hsa_executable_destroy(executable);
    check(Destroying the executable, err);

    err=hsa_code_object_destroy(code_object);
    check(Destroying the code object, err);

    err=hsa_queue_destroy(queue);
    check(Destroying the queue, err);
    
    err=hsa_shut_down();
    check(Shutting down the runtime, err);

    free(in);
    free(out);

    return 0;
}
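The check() macro and the get_gpu_agent / get_kernarg_memory_region callbacks used by Examples No. 6 and No. 8 are defined outside these listings; they conventionally look like the following sketch, modeled on the stock vector_copy sample:

#include <hsa/hsa.h>
#include <stdio.h>
#include <stdlib.h>

#define check(msg, status)                        \
    if ((status) != HSA_STATUS_SUCCESS) {         \
        printf("%s failed.\n", #msg);             \
        exit(1);                                  \
    } else {                                      \
        printf("%s succeeded.\n", #msg);          \
    }

/* Pick the first GPU agent reported by the runtime. */
static hsa_status_t get_gpu_agent(hsa_agent_t agent, void *data)
{
    hsa_device_type_t device_type;
    hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
    if (status == HSA_STATUS_SUCCESS && device_type == HSA_DEVICE_TYPE_GPU) {
        *(hsa_agent_t *)data = agent;
        return HSA_STATUS_INFO_BREAK;   /* stop iterating once a GPU is found */
    }
    return HSA_STATUS_SUCCESS;
}

/* Pick a global region flagged as suitable for kernel arguments. */
static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data)
{
    hsa_region_segment_t segment;
    hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
    if (segment != HSA_REGION_SEGMENT_GLOBAL) {
        return HSA_STATUS_SUCCESS;
    }
    hsa_region_global_flag_t flags;
    hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
    if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
        *(hsa_region_t *)data = region;
        return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
}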
Example No. 9
void
pocl_hsa_run
(void *data,
 _cl_command_node* cmd)
{
  struct data *d;
  struct pocl_argument *al;
  unsigned i;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  hsa_signal_value_t initial_value = 1;
#if 0
  /* Not yet supported by the reference library. */
  hsa_kernel_dispatch_packet_t kernel_packet;
#else
  hsa_dispatch_packet_t kernel_packet;
#endif
  hsa_signal_t kernel_completion_signal = 0;
  hsa_region_t region;
  int error;
  amdgpu_args_t *args;
  /* 32b word offset in the kernel arguments buffer we can push the next
     argument to. */
  int args_offset = 0;

  assert (data != NULL);
  d = (struct data *) data;

  d->current_kernel = kernel;

  memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t));

#if 0
  /* TODO: not yet supported by the open source runtime implementation.
     Assume the HSA Full profile so we can simply use host malloc().
   */
  hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, &region);

  if (hsa_memory_allocate(region, sizeof(amdgpu_args_t),
                          (void**)&args) !=
      HSA_STATUS_SUCCESS)
    {
      assert (0 && "hsa_memory_allocate() failed.");
    }
#else
  args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t));
#endif

  kernel_packet.kernarg_address = (uint64_t)args;

  /* Process the kernel arguments. Convert the opaque buffer
     pointers to real device pointers, allocate dynamic local
     memory buffers, etc. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (kernel->arg_info[i].is_local)
        {
          POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented.");
#if 0
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");
          /* Assuming the pointers are 64b (or actually the same as in
             host) due to HSA. TODO: the 32b profile. */
          if (al->value == NULL)
            {
              args->kernel_args[args_offset] = 0;
              args->kernel_args[args_offset + 1] = 0;
            }
          else
            {
              *(uint64_t*)&args->kernel_args[args_offset] =
                (uint64_t)(*(cl_mem *) (al->value))->
                device_ptrs[cmd->device->dev_id].mem_ptr;
            }
          args_offset += 2;

#if 0
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            {
              arguments[i] = malloc (sizeof (void *));
              *(void **)arguments[i] = NULL;
            }
          else
            arguments[i] =
              &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented.");
#if 0
          dev_image_t di;
          fill_dev_image_t (&di, al, cmd->device);

          void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL);
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = devptr;
          pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t));
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented.");
#if 0
          dev_sampler_t ds;
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc
            (data, 0, sizeof(dev_sampler_t), NULL);
          pocl_hsa_write (data, &ds, *(void**)arguments[i], 0,
                            sizeof(dev_sampler_t));
#endif
        }
      else
        {
          if (args_offset >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");

          /* Assuming the scalar fits to a 32b slot. TODO! */
          assert (al->size <= 4);
          args->kernel_args[args_offset] = *(uint32_t*)al->value;
          ++args_offset;
        }
    }

  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
      POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented.");
#if 0
      al = &(cmd->command.run.arguments[i]);
      arguments[i] = malloc (sizeof (void *));
      *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL);
#endif
    }


  args->workgroup_size_x = kernel_packet.workgroup_size_x = cmd->command.run.local_x;
  args->workgroup_size_y = kernel_packet.workgroup_size_y = cmd->command.run.local_y;
  args->workgroup_size_z = kernel_packet.workgroup_size_z = cmd->command.run.local_z;

  kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x;
  kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y;
  kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z;

  /* AMDGPU specific OpenCL argument data. */

  args->wgs_x = pc->num_groups[0];
  args->wgs_y = pc->num_groups[1];
  args->wgs_z = pc->num_groups[2];

  kernel_packet.dimensions = 1;
  if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2;
  if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3;

  kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH;
  kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.barrier = 1;

  kernel_packet.kernel_object_address =
    *(hsa_amd_code_t*)cmd->command.run.device_data[1];

  error =  hsa_signal_create(initial_value, 0, NULL, &kernel_completion_signal);
  assert (error == HSA_STATUS_SUCCESS);

  kernel_packet.completion_signal = kernel_completion_signal;

  {
    /* Launch the kernel by allocating a slot in the queue, writing the
       command to it, signaling the update with a door bell and finally,
       block waiting until finish signalled with the completion_signal. */
    const uint32_t queue_mask = d->queue->size - 1;
    uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue);
    hsa_signal_value_t sigval;
    ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] =
      kernel_packet;
    hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1);
    hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index);

    sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0,
                                     (uint64_t)(-1), HSA_WAIT_EXPECTANCY_UNKNOWN);
  }

  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_info[i].is_local)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
#if 0
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER ||
               (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER &&
                *(void**)args->kernel_args[i] == NULL))
        {
          POCL_MEM_FREE(arguments[i]);
        }
#endif
    }
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
#if 0
      pocl_hsa_free(data, 0, *(void **)(arguments[i]));
      POCL_MEM_FREE(arguments[i]);
#endif
    }
  free(args);
}
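The amdgpu_args_t layout and MAX_KERNEL_ARG_WORDS come from pocl headers that are not reproduced here. Reconstructing only the fields pocl_hsa_run actually touches gives a sketch roughly like the one below; the field order and the word limit are assumptions:

#include <stdint.h>

#define MAX_KERNEL_ARG_WORDS 512   /* illustrative limit, not pocl's actual value */

typedef struct amdgpu_args_t {
  uint32_t wgs_x, wgs_y, wgs_z;                 /* number of work-groups per dimension */
  uint32_t workgroup_size_x;                    /* local work-group size per dimension */
  uint32_t workgroup_size_y;
  uint32_t workgroup_size_z;
  uint32_t kernel_args[MAX_KERNEL_ARG_WORDS];   /* flat 32-bit kernel argument words */
} amdgpu_args_t;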
Example No. 10
template <typename T>
void TemplateDispatch<T>::dispatchKernel(T run_args, hsa_signal_t& signal,
                                         const Launch_params_t lparm) {
    hsa_dispatch_packet_t run_Aql;
    HSAContextKaveriImpl::KernelImpl *p_impl =
            (HSAContextKaveriImpl::KernelImpl *) m_p_kernel;

    hsa_status_t err;
    status_t status = STATUS_SUCCESS;
    /*  Create a signal to wait for the dispatch to finish.  */
    err = hsa_signal_create(1, 0, NULL, &signal);
    STATUS_CHECK(err, __LINE__);

    /*  Setup this call to this kernel dispatch packet from scratch.  */
    memset(&run_Aql, 0, sizeof(run_Aql));
    run_Aql.completion_signal = signal;

    /*  Set the dimensions passed from the application */
    run_Aql.dimensions = (uint16_t) lparm.ndim;
    run_Aql.grid_size_x = lparm.gdims[0];
    run_Aql.workgroup_size_x = lparm.ldims[0];
    if (lparm.ndim > 1) {
        run_Aql.grid_size_y = lparm.gdims[1];
        run_Aql.workgroup_size_y = lparm.ldims[1];
    } else {
        run_Aql.grid_size_y = 1;
        run_Aql.workgroup_size_y = 1;
    }
    if (lparm.ndim > 2) {
        run_Aql.grid_size_z = lparm.gdims[2];
        run_Aql.workgroup_size_z = lparm.ldims[2];
    } else {
        run_Aql.grid_size_z = 1;
        run_Aql.workgroup_size_z = 1;
    }

    /*  In the future, we may use environment variables for some of these */
    run_Aql.header.type = HSA_PACKET_TYPE_DISPATCH;
    run_Aql.header.acquire_fence_scope = 2;
    run_Aql.header.release_fence_scope = 2;
    run_Aql.header.barrier = 1;
    run_Aql.group_segment_size =
            p_impl->hsaCodeDescriptor->workgroup_group_segment_byte_size;
    run_Aql.private_segment_size =
            p_impl->hsaCodeDescriptor->workitem_private_segment_byte_size;

    /*  copy args from the custom run_args structure */
    /*  FIXME We should align kernel_arg_buffer because run_args is aligned */
    memcpy(run_kernel_arg_buffer, &run_args, sizeof(run_args));

    /*  Bind kernelcode to the packet.  */
    run_Aql.kernel_object_address = p_impl->hsaCodeDescriptor->code.handle;

    /*  Bind kernel argument buffer to the aql packet.  */
    run_Aql.kernarg_address = (uint64_t) run_kernel_arg_buffer;

    /*  Obtain the current queue write index. increases with each call to kernel  */
    uint64_t index = hsa_queue_load_write_index_relaxed(
            p_impl->context->commandQueue);
    /* printf("DEBUG:Call #%d to kernel \"%s\" \n",(int) index+1,"run"); */

    /*  Write the run_Aql packet at the calculated queue index address.  */
    const uint32_t queueMask = p_impl->context->commandQueue->size - 1;
    const uint32_t pos = index & queueMask;
    ((hsa_dispatch_packet_t*) (p_impl->context->commandQueue->base_address))[pos] =
            run_Aql;

    /* Increment the write index and ring the doorbell to dispatch the kernel.  */
    hsa_queue_store_write_index_relaxed(p_impl->context->commandQueue, index + 1);

    hsa_signal_store_relaxed(p_impl->context->commandQueue->doorbell_signal, index);
    return;
}
Example No. 11
template <typename T>
void MultiKernelDispatch<T>::queueKernel(HSAContext::Kernel* p_kernel,
		T kern_args, Launch_params_t lparm, bool wait) {

	if (wait)
	{
		hsa_status_t err;
		m_expected_value = 1;
		err = hsa_signal_create(m_expected_value, 0, NULL, &m_signal);
	}

	HSAContextKaveriImpl* p_ctx = (HSAContextKaveriImpl*) m_pcontext;
	/*  Obtain the current queue write index. increases with each call to kernel  */
	uint64_t index = hsa_queue_load_write_index_relaxed(p_ctx->commandQueue);
	/* printf("DEBUG:Call #%d to kernel \"%s\" \n",(int) index+1,"run"); */

	HSAContextKaveriImpl::KernelImpl *p_impl =
			(HSAContextKaveriImpl::KernelImpl *) p_kernel;

	/*  Setup this call to this kernel dispatch packet from scratch.  */
	memset(&m_aql, 0, sizeof(m_aql));

	/*  Set the dimensions passed from the application */
	m_aql.dimensions = (uint16_t) lparm.ndim;
	m_aql.grid_size_x = lparm.gdims[0];
	m_aql.workgroup_size_x = lparm.ldims[0];
	m_aql.completion_signal = m_signal;
	if (lparm.ndim > 1) {
		m_aql.grid_size_y = lparm.gdims[1];
		m_aql.workgroup_size_y = lparm.ldims[1];
	} else {
		m_aql.grid_size_y = 1;
		m_aql.workgroup_size_y = 1;
	}
	if (lparm.ndim > 2) {
		m_aql.grid_size_z = lparm.gdims[2];
		m_aql.workgroup_size_z = lparm.ldims[2];
	} else {
		m_aql.grid_size_z = 1;
		m_aql.workgroup_size_z = 1;
	}

	/*  In the future, we may use environment variables for some of these */
	m_aql.header.type = HSA_PACKET_TYPE_DISPATCH;
	m_aql.header.acquire_fence_scope = 2;
	m_aql.header.release_fence_scope = 2;
	m_aql.header.barrier = 1;
	m_aql.group_segment_size =
			p_impl->hsaCodeDescriptor->workgroup_group_segment_byte_size;
	m_aql.private_segment_size =
			p_impl->hsaCodeDescriptor->workitem_private_segment_byte_size;

	void * run_kernel_arg_buffer = p_impl->m_run_arg_buffer;
	/*  copy args from the custom run_args structure */
	/*  FIXME We should align kernel_arg_buffer because run_args is aligned */
	memcpy(run_kernel_arg_buffer, &kern_args, sizeof(kern_args));

	/*  Bind kernelcode to the packet.  */
	m_aql.kernel_object_address = p_impl->hsaCodeDescriptor->code.handle;

	/*  Bind kernel argument buffer to the aql packet.  */
	m_aql.kernarg_address = (uint64_t) run_kernel_arg_buffer;


	const uint32_t queueMask = p_ctx->commandQueue->size - 1;
	const uint32_t pos = (index) & queueMask;
	((hsa_dispatch_packet_t*) (p_ctx->commandQueue->base_address))[pos] = m_aql;

	if (wait)
		hsa_signal_store_relaxed(p_ctx->commandQueue->doorbell_signal, index);
	/* Increment the write index and ring the doorbell to dispatch the kernel.  */
	hsa_queue_store_write_index_relaxed(p_ctx->commandQueue, index + 1);

	if (wait)
	{
		hsa_signal_wait_acquire(m_signal, HSA_LT, m_expected_value, (uint64_t)-1, HSA_WAIT_EXPECTANCY_SHORT);
		hsa_signal_destroy(m_signal);
		m_signal = 0;
	}
	m_expected_value--;

}
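Examples No. 10 and No. 11 are written against the provisional (pre-1.0) HSA headers, hence hsa_dispatch_packet_t, HSA_LT, and HSA_WAIT_EXPECTANCY_SHORT. Under the finalized 1.0 API used in Example No. 8, the blocking wait at the end of queueKernel would be expressed roughly as below (a sketch, not a drop-in replacement):

#include <hsa/hsa.h>
#include <stdint.h>

/* Block until the completion signal drops below the expected value. */
static void wait_for_dispatch(hsa_signal_t completion, hsa_signal_value_t expected)
{
    hsa_signal_wait_acquire(completion, HSA_SIGNAL_CONDITION_LT, expected,
                            UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
}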