Example #1
0
void dfft_cuda_redistribute(dfft_plan *plan, int size, int *embed, int *d_embed,
            cuda_cpx_t *d_work, int c2b)
    {
    int i;

    int d = ((c2b) ? (plan->max_depth + 1) : plan->max_depth);

    if (plan->init)
        {
        for (i = 0; i < plan->ndim; ++i)
            {
            /* no bit reversal */
            plan->rev_global[d][i] = 0;
            plan->rev_partial[d][i] = 0;
            plan->rev_j1[d][i] = 0;
            }

        if (!c2b)
            {
            for (i = 0; i < plan->ndim; ++i)
                {
                /* block to cyclic */
                plan->c0[d][i] = 1;
                plan->c1[d][i] = plan->pdim[i];
                }
            }
        else
            {
            for (i = 0; i < plan->ndim; ++i)
                {
                /* cyclic to block */
                plan->c0[d][i] = plan->pdim[i];
                plan->c1[d][i] = 1;
                }
            }

        cudaMemcpy(plan->d_c0[d], plan->c0[d], sizeof(int)*plan->ndim,cudaMemcpyDefault);
        CHECK_CUDA();
        cudaMemcpy(plan->d_c1[d], plan->c1[d], sizeof(int)*plan->ndim,cudaMemcpyDefault);
        CHECK_CUDA();
        cudaMemcpy(plan->d_rev_global[d], plan->rev_global[d], sizeof(int)*plan->ndim,cudaMemcpyDefault);
        CHECK_CUDA();
        cudaMemcpy(plan->d_rev_partial[d], plan->rev_partial[d], sizeof(int)*plan->ndim,cudaMemcpyDefault);
        CHECK_CUDA();
        cudaMemcpy(plan->d_rev_j1[d], plan->rev_j1[d], sizeof(int)*plan->ndim,cudaMemcpyDefault);
        CHECK_CUDA();
        }
    else
        {
        int dir = (c2b ? 0 : 1 );
        dfft_cuda_redistribute_nd(plan, d, size, embed, d_embed, dir,
              d_work,  NULL, NULL, NULL, NULL, NULL, NULL);
        } 
    }
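The examples in this listing lean on project-specific error-checking macros that are not shown: the dfft code calls a zero-argument CHECK_CUDA() after an unchecked runtime call, while the dlib/hclib code wraps the call itself, as in CHECK_CUDA(cudaFree(ptr)). The sketch below is only a plausible reconstruction of such macros under that assumption; the names CHECK_CUDA_LAST and CHECK_CUDA_CALL are hypothetical, chosen so as not to imply these are the real definitions.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* hypothetical zero-argument variant: check the most recent runtime error */
#define CHECK_CUDA_LAST()                                               \
    do {                                                                \
        cudaError_t err_ = cudaGetLastError();                          \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

/* hypothetical call-wrapping variant: check the return code of one call */
#define CHECK_CUDA_CALL(call)                                           \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)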
Example #2
0
 void gpu_data::
 wait_for_transfer_to_finish() const
 {
     if (have_active_transfer)
     {
         CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
         have_active_transfer = false;
         // Check for errors.  These calls to cudaGetLastError() are what help us find
         // out if our kernel launches have been failing.
         CHECK_CUDA(cudaGetLastError());
     }
 }
Example #3
0
void hclib_free_at(place_t *pl, void *ptr) {
    if (is_cpu_place(pl)) {
        if (is_pinned_cpu_mem(ptr)) {
            hclib_memory_tree_remove(ptr, &hclib_context->pinned_host_allocs);
            CHECK_CUDA(cudaFreeHost(ptr));
        } else {
            free(ptr);
        }
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        CHECK_CUDA(cudaFree(ptr));
#endif
    } else {
        unsupported_place_type_err(pl);
    }
}
Example #4
0
 void gpu_data::
 copy_to_host() const
 {
     if (!host_current)
     {
         wait_for_transfer_to_finish();
         CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
         host_current = true;
         // At this point we know our RAM block isn't in use because cudaMemcpy()
         // implicitly syncs with the device. 
         device_in_use = false;
         // Check for errors.  These calls to cudaGetLastError() are what help us find
         // out if our kernel launches have been failing.
         CHECK_CUDA(cudaGetLastError());
     }
 }
Example #5
0
 void gpu_data::
 async_copy_to_device() const
 {
     if (!device_current)
     {
         if (device_in_use)
         {
             // Wait for any possible CUDA kernels that might be using our memory block to
             // complete before we overwrite the memory.
             CHECK_CUDA(cudaStreamSynchronize(0));
             device_in_use = false;
         }
         CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
         have_active_transfer = true;
         device_current = true;
     }
 }
Example #6
0
void *hclib_allocate_at(place_t *pl, size_t nbytes, int flags) {
    HASSERT(pl);
    HASSERT(nbytes > 0);
#ifdef VERBOSE
    fprintf(stderr, "hclib_allocate_at: pl=%p nbytes=%lu flags=%d, is_cpu? %s",
            pl, (unsigned long)nbytes, flags,
            is_cpu_place(pl) ? "true" : "false");
#ifdef HC_CUDA
    fprintf(stderr, ", is_nvgpu? %s, cuda_id=%d",
            is_nvgpu_place(pl) ? "true" : "false", pl->cuda_id);
#endif
    fprintf(stderr, "\n");
#endif

    if (is_cpu_place(pl)) {
#ifdef HC_CUDA
        if (flags & PHYSICAL) {
            void *ptr;
            const cudaError_t alloc_err = cudaMallocHost((void **)&ptr, nbytes);
            if (alloc_err != cudaSuccess) {
#ifdef VERBOSE
                fprintf(stderr, "Physical allocation at CPU place failed with "
                        "reason \"%s\"\n", cudaGetErrorString(alloc_err));
#endif
                return NULL;
            } else {
                hclib_memory_tree_insert(ptr, nbytes,
                                         &hclib_context->pinned_host_allocs);
                return ptr;
            }
        }
#else
        HASSERT(flags == NONE);
#endif
        return malloc(nbytes);
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        HASSERT(flags == NONE);
        void *ptr;
        HASSERT(pl->cuda_id >= 0);
        CHECK_CUDA(cudaSetDevice(pl->cuda_id));
        const cudaError_t alloc_err = cudaMalloc((void **)&ptr, nbytes);
        if (alloc_err != cudaSuccess) {
#ifdef VERBOSE
            fprintf(stderr, "Allocation at NVGPU place failed with reason "
                    "\"%s\"\n", cudaGetErrorString(alloc_err));
#endif
            return NULL;
        } else {
            return ptr;
        }
#endif
    } else {
        unsupported_place_type_err(pl);
        return NULL; // will never reach here
    }
}
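A short usage sketch for the allocator above (not from the original sources): requesting pinned host memory at a CPU place with the PHYSICAL flag and releasing it through hclib_free_at from Example #3. The hclib_get_current_place() lookup is assumed here purely for illustration, as is the inclusion of the hclib headers and <stdio.h>.

/* hedged usage sketch: pinned host allocation at a CPU place */
static void pinned_alloc_example(size_t nbytes) {
    place_t *pl = hclib_get_current_place();        /* assumed helper */
    void *buf = hclib_allocate_at(pl, nbytes, PHYSICAL);
    if (buf == NULL) {
        fprintf(stderr, "pinned allocation of %lu bytes failed\n",
                (unsigned long)nbytes);
        return;
    }
    /* ... use buf, e.g. as a staging buffer for cudaMemcpyAsync ... */
    hclib_free_at(pl, buf);    /* releases it via cudaFreeHost (see Example #3) */
}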
Example #7
0
/* plan_long: complete local FFT
   plan_short: partial local FFT
   input and output are M-cyclic (M=pdim[current_dim])
   (out-of-place version, overwrites input)
   */
void cuda_mpifft1d_dif(int *dim,
            int *pdim,
            int ndim,
            int current_dim,
            int* pidx,
            int inverse,
            int size,
            int *embed,
            cuda_cpx_t *d_in,
            cuda_cpx_t *d_out,
            cuda_cpx_t *h_stage_in,
            cuda_cpx_t *h_stage_out,
            cuda_plan_t plan_short,
            cuda_plan_t plan_long,
            int *rho_L,
            int *rho_pk0,
            int *rho_Lk0,
            int *dfft_nsend,
            int *dfft_nrecv,
            int *dfft_offset_send,
            int *dfft_offset_recv,
            MPI_Comm comm,
            int check_err,
            int row_m)
    {
    int p = pdim[current_dim];
    int length = dim[current_dim]/pdim[current_dim];

    /* compute stride for column major matrix storage */
    int stride = size/embed[current_dim];

    int c;
    for (c = p; c >1; c /= length)
        {
        /* do local out-of-place FFT (long-distance butterflies) */
        dfft_cuda_local_fft(d_in, d_out, plan_long, inverse);

        /* apply twiddle factors */
        double alpha = ((double)(pidx[current_dim] %c))/(double)c;

        gpu_twiddle(size, length, stride, alpha, d_out, d_in, inverse);
        if (check_err) CHECK_CUDA();

        /* in-place redistribute from group-cyclic c -> c1 */
        int rev = 1;
        int c1 = ((c > length) ? (c/length) : 1);
        dfft_cuda_redistribute_cyclic_to_block_1d(dim,pdim,ndim,current_dim,
            c, c1, pidx, rev, size, embed, d_in,d_out,h_stage_in, h_stage_out,
            rho_L,rho_pk0, dfft_nsend,dfft_nrecv,dfft_offset_send,
            dfft_offset_recv, comm, check_err, row_m);
        }

    /* perform remaining short-distance butterflies,
     * out-of-place 1d FFT */
    dfft_cuda_local_fft(d_in, d_out, plan_short,inverse);
    } 
Example #8
0
    void memcpy (
        gpu_data& dest, 
        const gpu_data& src
    )
    {
        DLIB_CASSERT(dest.size() == src.size(), "");
        if (src.size() == 0)
            return;

        // copy the memory efficiently based on which copy is current in each object.
        if (dest.device_ready() && src.device_ready())
            CHECK_CUDA(cudaMemcpy(dest.device(), src.device(),          src.size()*sizeof(float), cudaMemcpyDeviceToDevice));
        else if (!dest.device_ready() && src.device_ready())
            CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.device(), src.size()*sizeof(float), cudaMemcpyDeviceToHost));
        else if (dest.device_ready() && !src.device_ready())
            CHECK_CUDA(cudaMemcpy(dest.device(), src.host(),            src.size()*sizeof(float), cudaMemcpyHostToDevice));
        else 
            CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.host(),   src.size()*sizeof(float), cudaMemcpyHostToHost));
    }
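A brief usage sketch (illustrative only, assuming the dlib gpu_data header is available): copying one gpu_data buffer into another of equal size through the overload above. The unqualified memcpy call relies on argument-dependent lookup finding the gpu_data overload.

    // hedged usage sketch for the gpu_data memcpy overload above
    void copy_gpu_data_example()
    {
        dlib::gpu_data a, b;
        a.set_size(1024);
        b.set_size(1024);
        for (size_t i = 0; i < a.size(); ++i)
            a.host()[i] = static_cast<float>(i);   // populate the host side
        memcpy(b, a);   // picks D2D/H2D/D2H/H2H based on which side is current
    }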
Example #9
0
    void synchronize_stream(cudaStream_t stream)
    {
#if !defined CUDA_VERSION
#error CUDA_VERSION not defined
#elif CUDA_VERSION >= 9020 && CUDA_VERSION <= 10010
        // This should be pretty much the same as cudaStreamSynchronize, which for some
        // reason makes training freeze in some cases.
        // (see https://github.com/davisking/dlib/issues/1513)
        while (true)
        {
            cudaError_t err = cudaStreamQuery(stream);
            switch (err)
            {
            case cudaSuccess: return;      // now we are synchronized
            case cudaErrorNotReady: break; // continue waiting
            default: CHECK_CUDA(err);      // unexpected error: throw
            }
        }
#else // CUDA_VERSION
        CHECK_CUDA(cudaStreamSynchronize(stream));
#endif // CUDA_VERSION
    }
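As a usage note (my own sketch, not from the source), the wrapper can stand in wherever cudaStreamSynchronize would otherwise be called directly, e.g. to drain the default stream:

    // hedged usage sketch: drain the default stream via the polling wrapper
    void flush_default_stream()
    {
        synchronize_stream(0);             // 0 denotes the legacy default stream
        CHECK_CUDA(cudaGetLastError());    // surface any earlier launch failures
    }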
Example #10
0
/*
 * n-dimensional fft routine, based on 1-d transforms (in-place)
 */
void cuda_mpifftnd_dif(int *dim,
            int *pdim,
            int ndim,
            int* pidx,
            int inv,
            int size_in,
            int *inembed,
            int *oembed,
            cuda_cpx_t *d_work,
            cuda_cpx_t *d_scratch,
            cuda_cpx_t *h_stage_in,
            cuda_cpx_t *h_stage_out,
            cuda_plan_t *plans_short,
            cuda_plan_t *plans_long,
            int **rho_L,
            int **rho_pk0,
            int **rho_Lk0,
            int *dfft_nsend,
            int *dfft_nrecv,
            int *dfft_offset_send,
            int *dfft_offset_recv,
            MPI_Comm comm,
            int check_err,
            int row_m)
    {
    int size = size_in;
    int current_dim;
    for (current_dim = 0; current_dim < ndim; ++current_dim)
        {
        /* assume input in local column major */
        cuda_mpifft1d_dif(dim, pdim,ndim,current_dim,pidx, inv,
            size, inembed, d_work, d_scratch,h_stage_in, h_stage_out,
            plans_short[current_dim],
            plans_long[current_dim], rho_L[current_dim],
            rho_pk0[current_dim],rho_Lk0[current_dim],
            dfft_nsend,dfft_nrecv,dfft_offset_send,dfft_offset_recv,
            comm,check_err,row_m);

        int l = dim[current_dim]/pdim[current_dim];
        int stride = size/inembed[current_dim];

        /* transpose local matrix */
        gpu_transpose(size,l,stride, oembed[current_dim],d_scratch, d_work);
        if (check_err) CHECK_CUDA();

        /* update size */
        size *= oembed[current_dim];
        size /= inembed[current_dim];
        }
    }
Example #11
0
/*****************************************************************************
 * Distributed FFT interface
 *****************************************************************************/
int dfft_cuda_execute(cuda_cpx_t *d_in, cuda_cpx_t *d_out, int dir, dfft_plan *p)
    {
    int out_of_place = (d_in == d_out) ? 0 : 1;

    int check_err = p->check_cuda_errors;
    cuda_cpx_t *d_work;

    if (!p->init)
        {
        if (out_of_place)
            {
            d_work = p->d_scratch_3;
            cudaMemcpy(d_work, d_in, p->size_in*sizeof(cuda_cpx_t),cudaMemcpyDefault);
            if (check_err) CHECK_CUDA();
            }
        else
            {
            d_work = d_in;
            }
        }

    if (p->init || (!dir && !p->input_cyclic) || (dir && !p->output_cyclic))
        {
        /* redistribution of input */
        dfft_cuda_redistribute(p,p->size_in, p->inembed, p->d_iembed, d_work, 0); 
        }

    /* multi-dimensional FFT */
    /*cuda_mpifftnd_dif(p.gdim, p.pdim, p.ndim, p.pidx, dir,
        p.size_in,p.inembed,p.oembed, d_work, d_scratch,
        p.h_stage_in, p.h_stage_out,
        dir ? p.cuda_plans_short_inverse : p.cuda_plans_short_forward,
        dir ? p.cuda_plans_long_inverse : p.cuda_plans_long_forward,
        p.rho_L, p.rho_pk0, p.rho_Lk0, p.nsend,p.nrecv,
        p.offset_send,p.offset_recv, p.comm,check_err,p.row_m); */
    
    cuda_fftnd_multi(p, d_work, d_out,
                     dir ? p->cuda_plans_multi_bw : p->cuda_plans_multi_fw,
                     dir ? p->cuda_plans_final_bw : p->cuda_plans_final_fw,
                     dir);

    if (p->init || (dir && !p->input_cyclic) || (!dir && !p->output_cyclic))
        {
        /* redistribution of output */
        dfft_cuda_redistribute(p,p->size_out, p->oembed, p->d_oembed, d_out, 1); 
        }

    return 0;
    }
Example #12
0
char *hclib_get_place_name(place_t *pl) {
    if (is_cpu_place(pl)) {
        return (char *)cpu_place_name;
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        struct cudaDeviceProp props;
        CHECK_CUDA(cudaGetDeviceProperties(&props, pl->cuda_id));
        char *gpu_name = (char *)malloc(sizeof(props.name));
        memcpy(gpu_name, props.name, sizeof(props.name));
        return gpu_name;
#endif
    } else {
        return unsupported_place_type_err(pl);
    }
}
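One caller-side detail worth spelling out (hedged sketch below, not from the hclib sources): the GPU branch returns a freshly malloc'd copy of the device name, while the CPU branch returns the shared static cpu_place_name, so only the former should be freed.

/* hedged usage sketch for hclib_get_place_name ownership */
static void print_place_name(place_t *pl) {
    char *name = hclib_get_place_name(pl);
    printf("place: %s\n", name);
    if (!is_cpu_place(pl)) {
        free(name);   /* GPU branch malloc'd the string; CPU branch is static */
    }
}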
Example #13
0
    void memcpy (
        gpu_data& dest, 
        size_t dest_offset,
        const gpu_data& src,
        size_t src_offset,
        size_t num
    )
    {
        DLIB_CASSERT(dest_offset + num <= dest.size());
        DLIB_CASSERT(src_offset + num <= src.size());
        if (num == 0)
            return;

        // if there is aliasing
        if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
        {
            // if they perfectly alias each other then there is nothing to do
            if (dest_offset == src_offset)
                return;
            else
                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
        }
        else
        {
            // if we write to the entire thing then we can use device_write_only()
            if (dest_offset == 0 && num == dest.size())
            {
                // copy the memory efficiently based on which copy is current in each object.
                if (src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset,  num*sizeof(float), cudaMemcpyDeviceToDevice));
                else 
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset,    num*sizeof(float), cudaMemcpyHostToDevice));
            }
            else
            {
                // copy the memory efficiently based on which copy is current in each object.
                if (dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
                else if (!dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset,   num*sizeof(float), cudaMemcpyDeviceToHost));
                else if (dest.device_ready() && !src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset,   num*sizeof(float), cudaMemcpyHostToDevice));
                else 
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset,     num*sizeof(float), cudaMemcpyHostToHost));
            }
        }
    }
Example #14
0
    void initialize(std::string test_name) {

        m_test_name = test_name;

        if(LogToFile) {
            m_ofsLog.open("Log.txt");
            m_ofsData.close();
            if (m_ofsLog.is_open()) {
                std::cout << " File opened: " << "Log.txt" <<std::endl;
            }
            m_oLog.rdbuf(m_ofsLog.rdbuf());
        } else {
            m_oLog.rdbuf(std::cout.rdbuf());
        }

        m_filename = m_test_name + TTestVariant::getTestVariantDescription();

        m_ofsData.close();
        m_ofsData.open(m_filename+".dump");
        if (m_ofsData.is_open()) {
            std::cout << " File opened: " << m_filename <<std::endl;
        } else {
            ERRORMSG("Could not open data file: " << m_filename);
        }
        m_oData.rdbuf(m_ofsData.rdbuf());


        // Set GPU Device to use!
        m_oLog << " Set GPU Device: " << UseGPUDeviceID <<std::endl;
        cudaDeviceReset();
        CHECK_CUDA(cudaSetDevice(UseGPUDeviceID));


        m_testVariant.initialize(&m_oLog,&m_oData);

        m_oLog << " Kernel Performance Test  ======================================="<<std::endl;
        m_oData<< "# Kernel Performance Test:  Data Dump for file:" <<m_filename<<".xml" << std::endl;

        // Init XML
        m_dataXML.reset();
        std::stringstream xml("<PerformanceTest type=\"KernelTest\">"
                              "<Description></Description>"
                              "<DataTable>"
                              "<Header></Header>"
                              "<Data></Data>"
                              "</DataTable>"
                              "</PerformanceTest>");

        bool r = m_dataXML.load(xml);
        ASSERTMSG(r,"Could not load initial xml data file");

        // DESCRIPTION ==========================
        // Add GPU INFO
        XMLNodeType descNode = m_dataXML.child("PerformanceTest").child("Description");
        // Write header to file for this TestMethod!
        cudaDeviceProp props;
        CHECK_CUDA(cudaGetDeviceProperties(&props,UseGPUDeviceID));
        std::stringstream s;
        utilCuda::writeCudaDeviceProbs(s,props,UseGPUDeviceID);
        auto gpuInfo = descNode.append_child("GPUInfos").append_child(XMLStringNode);
        gpuInfo.set_value(s.str().c_str());


        // Write variant specific descriptions!
        auto desc = m_testVariant.getDescriptions();
        for(auto & d : desc){
            auto gpuInfo = descNode.append_child(d.first.c_str()).append_child(XMLStringNode);
            gpuInfo.set_value(d.second.c_str());
        }

        // DATA =========================================
        XMLNodeType dataTable = m_dataXML.child("PerformanceTest").child("DataTable");
        auto headerNode = dataTable.child("Header");
        // Write variant specific header
        {
            std::vector<std::string> head = m_testVariant.getDataColumHeader();
            for(auto & h : head){
                auto col = headerNode.append_child("Column").append_child(XMLStringNode);
                col.set_value(h.c_str());
            }
        }
        {
            // Write TestMethod Specific Column Header!
            std::vector<std::string> head;
            head.push_back("nFlop");
            head.push_back("GFlops");
            head.push_back("Memory Bandwith [Bytes/sec]");
            head.push_back("elapsedTimeCopyToGPU_Avg [s]");
            head.push_back("gpuIterationTime_Avg [s]");
            head.push_back("elapsedTimeCopyFromGPU_Avg [s]");
            head.push_back("cpuIterationTime_Avg [s]");
            head.push_back("nIterationsForTradeoff");
            head.push_back("speedUpFactor");
            head.push_back("maxRelTol_Avg (over all TestProblems)");
            head.push_back("avgRelTol_Avg (over all TestProblems)");
            head.push_back("maxUlp_Avg (over all TestProblems)");
            head.push_back("avgUlp_Avg (over all TestProblems)");
            for(auto & h : head){
                auto col = headerNode.append_child("Column").append_child(XMLStringNode);
                col.set_value(h.c_str());
            }
        }

        // Save the XML already, for safety!
        m_dataXML.save_file((m_filename+".xml").c_str(),"    ");


    }
Example #15
0
/* n-dimensional FFT using local multidimensional FFTs 
 * and n-dimensional redistributions
 */
void cuda_fftnd_multi(dfft_plan *p,
                      cuda_cpx_t *d_in,
                      cuda_cpx_t *d_out,
                      cuda_plan_t **cuda_plans_multi,
                      cuda_plan_t *cuda_plans_final,
                      int inv)
    {
    int d,i,j;
    /* initialize current stage */
    if (p->init && p->max_depth > 0)
        {
        for (i = 0; i < p->ndim; ++i)
            p->c0[p->max_depth-1][i] = p->pdim[i];
        }

    int rev_global, rev_local;
    int res;
    for (d = p->max_depth-1; d>=0; d--)
        {
        cuda_cpx_t *cur_in = d_in;
        cuda_cpx_t *cur_out = p->d_scratch;
        if (!p->init)
            {
            for (j =0; j < p->n_fft[d]; ++j)
                {
                if (p->depth[j] > d)
                    {
                    /* do local FFT */
                    res = dfft_cuda_local_fft(cur_in, cur_out, cuda_plans_multi[d][j], inv);
                    CHECK_LOCAL_FFT(res);
                    if (p->check_cuda_errors) CHECK_CUDA();
                    }
                else
                    {
                    /* transpose only */
                    int l = p->gdim[j]/p->pdim[j];
                    int stride = p->size_in/p->inembed[j];

                    gpu_transpose(p->size_in,l,stride, p->inembed[j],cur_in,cur_out);
                    if (p->check_cuda_errors) CHECK_CUDA();
                    }

                /* swap pointers */
                cuda_cpx_t *tmp;
                tmp = cur_in;
                cur_in = cur_out;
                cur_out = tmp;
                }
            }
        else
            {
            /* initialize twiddle factors */
            for (i =0; i < p->ndim; ++i)
                {
                if (p->depth[i] > d)
                    p->h_alpha[d][i] = ((double)(p->pidx[i] % p->c0[d][i]))/(double)p->c0[d][i];
                else
                    p->h_alpha[d][i] = 0.0;
                }

            /* copy to device */
            cudaMemcpy(p->d_alpha[d], p->h_alpha[d], sizeof(cuda_scalar_t)*p->ndim,cudaMemcpyDefault);
            CHECK_CUDA();
            }

        if (!p->init)
            {
            /* twiddle */
            gpu_twiddle_nd(p->size_in, p->ndim, p->d_iembed, p->d_length,
                p->d_alpha[d], cur_in, d_in, inv);
            if (p->check_cuda_errors) CHECK_CUDA();
            }

        if (p->init)
            {
            /* update cycle */
            for (i = 0; i< p->ndim; ++i)
                {
                int length = p->gdim[i] / p->pdim[i];
                /* only update if necessary */
                if (p->depth[i] > d)
                    {
                    if (d >0)
                        {
                        /* decimate in steps of 'length' */
                        p->c1[d][i] = p->c0[d][i]/length; 

                        /* the previous FFT produced bit-reversed output compared 
                         * to an unordered FFT */
                        p->rev_j1[d][i] = 1;
                        p->rev_global[d][i] = 0;
                        p->rev_partial[d][i] = 0;
                        }
                    else
                        {
                        /* in the last stage, we go back to cyclic, after a bit reversal */
                        p->rev_j1[d][i] = 1;
                        p->rev_global[d][i] = 1;
                        p->rev_partial[d][i] = 1;
                        p->c1[d][i] = p->pdim[i]; 
                        }
                    }
                else
                    {
                    p->c1[d][i] = p->c0[d][i];
                    p->rev_global[d][i] = 0;
                    p->rev_partial[d][i] = 0;
                    p->rev_j1[d][i] = 0;
                    }
                }

            /* copy to device */
            cudaMemcpy(p->d_c0[d], p->c0[d], sizeof(int)*p->ndim,cudaMemcpyDefault);
            CHECK_CUDA();
            cudaMemcpy(p->d_c1[d], p->c1[d], sizeof(int)*p->ndim,cudaMemcpyDefault);
            CHECK_CUDA();
            cudaMemcpy(p->d_rev_global[d], p->rev_global[d], sizeof(int)*p->ndim,cudaMemcpyDefault);
            CHECK_CUDA();
            cudaMemcpy(p->d_rev_partial[d], p->rev_partial[d], sizeof(int)*p->ndim,cudaMemcpyDefault);
            CHECK_CUDA();
            cudaMemcpy(p->d_rev_j1[d], p->rev_j1[d], sizeof(int)*p->ndim,cudaMemcpyDefault);
            CHECK_CUDA();
            }

        if (!p->init)
            {
            /* redistribute */
            dfft_cuda_redistribute_nd(p, d, p->size_in, p->inembed, p->d_iembed,
                      0, d_in, NULL, NULL, NULL, p->rho_pk0, NULL, NULL);
            }
     
        /* old cycle == new cycle */
        if (p->init && d>0)
            {
            for (i = 0; i < p->ndim; ++i)
                p->c0[d-1][i] = p->c1[d][i];
            }
        }

    /* final stage */
    if (!p->init)
        {
        if (!p->final_multi)
            {
            int size = p->size_in;
            for (i = 0; i < p->ndim; ++i)
                {
                /* do 1d FFT */
                cuda_cpx_t *d_in_ptr = ((i == 0) ? d_in : p->d_scratch);
                res = dfft_cuda_local_fft(d_in_ptr, p->d_scratch_2, cuda_plans_final[i] , inv);
                CHECK_LOCAL_FFT(res);
                if (p->check_cuda_errors) CHECK_CUDA();

                /* transpose */
                int l = p->gdim[i]/p->pdim[i];
                int stride = size/p->inembed[i];

                /* transpose local matrix */
                cuda_cpx_t *d_out_ptr = ((i == p->ndim-1) ? d_out : p->d_scratch);
                gpu_transpose(size,l,stride, p->oembed[i],p->d_scratch_2, d_out_ptr);
                if (p->check_cuda_errors) CHECK_CUDA();

                /* update size */
                size *= p->oembed[i];
                size /= p->inembed[i];
                }
            }
        else
            {
            /* do multidimensional fft */
            int res;
            res = dfft_cuda_local_fft(d_in, d_out, cuda_plans_final[0] , inv);
            CHECK_LOCAL_FFT(res);
            if (p->check_cuda_errors) CHECK_CUDA();
            }
        }
    }
Example #16
0
void initialize_device(const size_t size, double **d_input, double **d_output) {

        CHECK_CUDA(cudaMalloc((void **)d_output, size));
        CHECK_CUDA(cudaMalloc((void **)d_input, size));
}
Example #17
0
int dfft_cuda_create_plan(dfft_plan *p,
    int ndim, int *gdim,
    int *inembed, int *oembed,
    int *pdim, int *pidx, int row_m,
    int input_cyclic, int output_cyclic,
    MPI_Comm comm,
    int *proc_map)
    {
    int res = dfft_create_plan_common(p, ndim, gdim, inembed, oembed,
        pdim, pidx, row_m, input_cyclic, output_cyclic, comm, proc_map, 1);

    #ifndef ENABLE_MPI_CUDA
    /* allocate staging bufs */
    /* we need to use posix_memalign/cudaHostRegister instead
     * of cudaHostAlloc, because cudaHostAlloc doesn't have hooks
     * in the MPI library, and using it would lead to data corruption
     */
    int size = p->scratch_size*sizeof(cuda_cpx_t);
    int page_size = getpagesize();
    size = ((size + page_size - 1) / page_size) * page_size;
    posix_memalign((void **)&(p->h_stage_in),page_size,size);
    posix_memalign((void **)&(p->h_stage_out),page_size,size);
    cudaHostRegister(p->h_stage_in, size, cudaHostAllocDefault);
    CHECK_CUDA();
    cudaHostRegister(p->h_stage_out, size, cudaHostAllocDefault);
    CHECK_CUDA();
    #endif

    /* allocate memory for passing variables */
   cudaMalloc((void **)&(p->d_pidx), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_pdim), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_iembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_oembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_length), sizeof(int)*ndim);
    CHECK_CUDA();

    /* initialize cuda buffers */
    int *h_length = (int *)malloc(sizeof(int)*ndim);
    int i;
    for (i = 0; i < ndim; ++i)
        h_length[i] = gdim[i]/pdim[i];
    cudaMemcpy(p->d_pidx, pidx, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_pdim, pdim, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_iembed, p->inembed, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_oembed, p->oembed, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_length, h_length, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    free(h_length);

    int dmax = p->max_depth + 2;
    p->d_rev_j1 = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_global = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_partial = (int **) malloc(sizeof(int *)*dmax);
    p->d_c0 = (int **) malloc(sizeof(int *)*dmax);
    p->d_c1 = (int **) malloc(sizeof(int *)*dmax);
    if (p->max_depth)
        {
        p->h_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        p->d_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        }

    int d;
    for (d = 0; d < dmax; ++d)
        {
        cudaMalloc((void **)&(p->d_rev_j1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_partial[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_global[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c0[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        }

    for (d = 0; d < p->max_depth; ++d)
        {
        cudaMalloc((void **)&(p->d_alpha[d]), sizeof(cuda_scalar_t)*ndim); 
        CHECK_CUDA();
        p->h_alpha[d] = (cuda_scalar_t *) malloc(sizeof(cuda_scalar_t)*ndim);
        }

    /* perform initialization run */
    dfft_cuda_execute(NULL, NULL, 0, p);

    /* initialization finished */
    p->init = 0;

    return res;
    } 
Example #18
0
void copy_device_to_host(const size_t size, double *h_input, double *h_output, double *d_input, double *d_output) {

        CHECK_CUDA(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(h_input, d_input, size, cudaMemcpyDeviceToHost));
}
Example #19
0
void copy_host_to_device(const size_t size, double *h_input, double *h_output, double *d_input, double *d_output) {

        CHECK_CUDA(cudaMemcpy(d_output, h_output, size, cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice));
}
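The small helpers in Examples #16, #18, and #19 suggest a simple round trip; the sketch below is my own, under the assumption that the caller owns the host buffers and launches its kernels in between the copies.

/* hedged usage sketch combining initialize_device and the two copy helpers */
void roundtrip_example(const size_t n, double *h_input, double *h_output) {
        const size_t size = n * sizeof(double);
        double *d_input = NULL, *d_output = NULL;
        initialize_device(size, &d_input, &d_output);
        copy_host_to_device(size, h_input, h_output, d_input, d_output);
        /* ... launch kernels that read d_input and write d_output ... */
        copy_device_to_host(size, h_input, h_output, d_input, d_output);
        CHECK_CUDA(cudaFree(d_input));
        CHECK_CUDA(cudaFree(d_output));
}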
Example #20
0
/*
 * Redistribute from group-cyclic with cycle c0 to cycle c1>=c0
 */
void dfft_cuda_redistribute_block_to_cyclic_1d(
                  int *dim,
                  int *pdim,
                  int ndim,
                  int current_dim,
                  int c0,
                  int c1,
                  int* pidx,
                  int size_in,
                  int *embed,
                  cuda_cpx_t *d_work,
                  cuda_cpx_t *d_scratch,
                  cuda_cpx_t *h_stage_in,
                  cuda_cpx_t *h_stage_out,
                  int *dfft_nsend,
                  int *dfft_nrecv,
                  int *dfft_offset_send,
                  int *dfft_offset_recv,
                  MPI_Comm comm,
                  int check_err,
                  int row_m)
    {
    /* exit early if nothing needs to be done */
    if (c0 == c1) return;

    int length = dim[current_dim]/pdim[current_dim];

    /* compute stride for column major matrix storage */
    int stride = size_in/embed[current_dim];

    /* processor index along current dimension */
    int s = pidx[current_dim];
    int p = pdim[current_dim];

    int ratio = c1/c0;
    int size = ((length/ratio > 1) ? (length/ratio) : 1);
    int npackets = length/size;
    size *= stride;

    int pdim_tot=1;
    int k;
    for (k = 0; k < ndim; ++k)
        pdim_tot *= pdim[k];

    int t;
    for (t = 0; t<pdim_tot; ++t)
        {
        dfft_nsend[t] = 0;
        dfft_nrecv[t] = 0;
        dfft_offset_send[t] = 0;
        dfft_offset_recv[t] = 0;
        }

    int j0;
    int j2;

    j0 = s % c0;
    j2 = s / c0;

    /* initialize send offsets and pack data */
    int j;
    for (j = 0; j < npackets; ++j)
        {
        int offset = j*size;
        int jglob = j2*c0*length + j * c0 + j0;
        int desti = (jglob/(c1*length))*c1+ jglob%c1;
        int destproc = 0;
        if (row_m)
            {
            for (k = ndim-1; k >=0 ;--k)
                {
                destproc *= pdim[k];
                destproc += ((current_dim == k) ? desti : pidx[k]);
                }
            }
        else
            {
            for (k = 0; k < ndim; ++k)
                {
                destproc *= pdim[k];
                destproc += ((current_dim == k) ? desti : pidx[k]);
                }
            }
        dfft_nsend[destproc] = size*sizeof(cuda_cpx_t);
        dfft_offset_send[destproc] = offset*sizeof(cuda_cpx_t);
        }

    /* pack data */
    gpu_b2c_pack(npackets*size, ratio, size, npackets, stride, d_work, d_scratch);
    if (check_err) CHECK_CUDA();

    /* initialize recv offsets */
    int offset = 0;
    j0 = s % c1;
    j2 = s/c1;

    int r;
    for (r = 0; r < npackets; ++r)
        {
        offset = r*size;
        j = r*size/stride;
        int jglob = j2*c1*length+ j * c1 + j0;
        int srci = (jglob/(c0*length))*c0+jglob%c0;
        int srcproc = 0;
        int k;
        if (row_m)
            {
            for (k = ndim-1; k >= 0; --k)
                {
                srcproc *= pdim[k];
                srcproc += ((current_dim == k) ? srci : pidx[k]);
                } 
            }
        else
            {
            for (k = 0; k < ndim; ++k)
                {
                srcproc *= pdim[k];
                srcproc += ((current_dim == k) ? srci : pidx[k]);
                }
            } 
        dfft_nrecv[srcproc] = size*sizeof(cuda_cpx_t);
        dfft_offset_recv[srcproc] = offset*sizeof(cuda_cpx_t);
        }

    /* synchronize */
    MPI_Barrier(comm);

    /* communicate */
    #ifdef ENABLE_MPI_CUDA
    MPI_Alltoallv(d_scratch,dfft_nsend, dfft_offset_send, MPI_BYTE,
                  d_work, dfft_nrecv, dfft_offset_recv, MPI_BYTE,
                  comm);
    #else
    // stage into host buf
    cudaMemcpy(h_stage_in, d_scratch, sizeof(cuda_cpx_t)*npackets*size,cudaMemcpyDefault); 
    if (check_err) CHECK_CUDA();

    MPI_Alltoallv(h_stage_in,dfft_nsend, dfft_offset_send, MPI_BYTE,
                  h_stage_out, dfft_nrecv, dfft_offset_recv, MPI_BYTE,
                  comm);

    // copy back received data
    cudaMemcpy(d_work,h_stage_out, sizeof(cuda_cpx_t)*size_in,cudaMemcpyDefault); 
    if (check_err) CHECK_CUDA();
    #endif
    }
Example #21
0
/* init the hpt and place deques */
void hc_hpt_init(hc_context *context) {
    int i, j;
#ifdef HPT_DESCENTWORKER_PERPLACE
    /*
     * each place has a deque for all workers beneath it (transitively) in the
     * HPT.
     */
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        int nworkers = pl->ndeques;
        pl->deques = malloc(sizeof(hc_deque_t) * nworkers);
        HASSERT(pl->deques);
        for (j = 0; j < nworkers; j++) {
            hc_deque_t *deq = &(pl->deques[j]);
            init_hc_deque_t(deq, pl);
        }
    }
#else // HPT_ALLWORKER_PERPLACE each place has a deque for each worker
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        const int ndeques = context->nworkers;
#ifdef TODO
        if (is_device_place(pl)) ndeques = 1;
#endif
        pl->ndeques = ndeques;
        pl->deques = (hc_deque_t *) malloc(sizeof(hc_deque_t) * ndeques);
        for (j = 0; j < ndeques; j++) {
            hc_deque_t *hc_deq = &(pl->deques[j]);
            init_hc_deque_t(hc_deq, pl);
        }
    }
#endif

    /*
     * link the deques for each cpu worker. The deque index is the same as
     * ws->id to simplify the search. For every worker, iterate over all places
     * and store a pointer from the place's deque for that worker to the worker
     * state for that worker.
     *
     * This builds a tree of deques from the worker, to its parent's deque for
     * it, to its grandparent's deque for it, up to the root. It would seem that
     * the majority of deques are therefore unused (i.e. even though we allocate
     * a deque for every worker in a platform in every place, only the deques
     * for workers that are beneath that place in the HPT are used). However,
     * this does make lookups of the deque in a place for a given worker
     * constant time based on offset in place->deques.
     */
#ifdef HC_CUDA
    int ngpus = 0;
    int gpu_counter = 0;
    cudaError_t cuda_err = cudaGetDeviceCount(&ngpus);
    if (cuda_err == cudaErrorNoDevice) {
        ngpus = 0;
    }

    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        pl->cuda_id = -1;
        if (is_nvgpu_place(pl)) {
            pl->cuda_id = gpu_counter++;
            CHECK_CUDA(cudaSetDevice(pl->cuda_id));
            CHECK_CUDA(cudaStreamCreate(&pl->cuda_stream));
        }
    }
#endif

    for (i = 0; i < context->nworkers; i++) {
        hclib_worker_state *ws = context->workers[i];
        const int id = ws->id;
        for (j = 0; j < context->nplaces; j++) {
            place_t *pl = context->places[j];
            if (is_cpu_place(pl)) {
                hc_deque_t *hc_deq = &(pl->deques[id]);
                hc_deq->ws = ws;
#ifdef HC_CUDA
            } else if (is_nvgpu_place(pl)) {
                hc_deque_t *hc_deq = &(pl->deques[id]);
                hc_deq->ws = ws;

#endif
            } else {
                /* unhandled or ignored situation */
                HASSERT(0);
            }
        }

        /* here we link the deques of the ancestor places for this worker */
        place_t *parent = ws->pl;
        place_t *current = parent;
        ws->deques = &(current->deques[id]);
        while (parent->parent != NULL) {
            parent = parent->parent;
            current->deques[id].prev = &(parent->deques[id]);
            parent->deques[id].nnext = &(current->deques[id]);
            current = parent;
        }
        ws->current = &(current->deques[id]);
    }

#ifdef VERBOSE
    /*Print HPT*/
    int level = context->places[0]->level;
    printf("Level %d: ", level);
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        if (level != pl->level) {
            printf("\n");
            level = pl->level;
            printf("Level %d: ", level);
        }

        printf("Place %d %s ", pl->id, place_type_to_str(pl->type));
        hclib_worker_state *w = pl->workers;
        if (w != NULL) {
            printf("[ ");
            while (w != NULL) {
                printf("W%d ", w->id);
                w = w->next_worker;
            }
            printf("] ");
        }

        place_t *c = pl->child;
        if (c != NULL) {
            printf("{ ");
            while (c != NULL) {
                printf("%d ", c->id);
                c = c->nnext;
            }
            printf("} ");
        }
        printf("\t");
    }
    printf("\n");
#endif
}
Example #22
0
    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                CHECK_CUDA(cudaStreamSynchronize(0));
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                CHECK_CUDA(cudaStreamSynchronize(0));
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;

            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // free memory blocks before we allocate new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if(err!=cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }

            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }
Example #23
0
/*
 * n-dimensional redistribute from group-cyclic with cycle c0 to cycle c1
 * 1 <=c0,c1 <= pdim[i]
 */
void dfft_cuda_redistribute_nd( dfft_plan *plan,int stage, int size_in, int *embed, int *d_embed, int dir,
                  cuda_cpx_t *d_work, int **rho_c0, int **rho_c1, int **rho_plc0c1, int **rho_pc0,
                  int **rho_pc1, int **rho_c0c1pl)
    {
    /* exit early if nothing needs to be done */
    int res = 0;
    int i;
    for (i = 0; i < plan->ndim; ++i)
        if (!(plan->c0[stage][i] == plan->c1[stage][i])
            || plan->rev_global[stage][i]) res = 1;
    if (!res) return;

    int pdim_tot=1;
    int k,t;
    for (k = 0; k < plan->ndim; ++k)
        pdim_tot *= plan->pdim[k];
    for (t = 0; t<pdim_tot; ++t)
        {
        plan->nsend[t] = 0;
        plan->nrecv[t] = 0;
        plan->offset_send[t] = 0;
        plan->offset_recv[t] = 0;
        }

    int roffs = 0;
    int soffs = 0;
    for (t = 0; t < pdim_tot; ++t)
        {
        int send_size = 1;
        int recv_size = 1;

        /* send and recv flags */
        int send =1;
        int recv=1;
        int current_dim;
        int tmp = t;
        int tmp_pidx = pdim_tot;
        for (current_dim = 0; current_dim < plan->ndim; ++current_dim)
            {
            /* find coordinate of remote processor along current dimension */
            if (!plan->row_m)
                {
                tmp_pidx /= plan->pdim[current_dim];
                i = tmp / tmp_pidx;
                tmp %= tmp_pidx;
                }
            else
                {
                i = (tmp % plan->pdim[current_dim]);
                tmp/=plan->pdim[current_dim];
                }
            int length = plan->gdim[current_dim]/plan->pdim[current_dim];

            /* processor index along current dimension */
            int c0 = plan->c0[stage][current_dim];
            int c1 = plan->c1[stage][current_dim];

            int s = plan->pidx[current_dim];
            int j0_local = s % c0;
            int j2_local = s / c0;
            int j0_new_local = s % c1;
            int j2_new_local = s / c1;
            int p = plan->pdim[current_dim];

            int ratio;
            /* dir == 1: block to cyclic,
               dir == 0: cyclic to block */
            if (dir)
                ratio = c1/c0;
            else
                ratio = c0/c1;

            int size;


            /* initialize send offsets */
            int j0_remote = i % c0;
            int j2_remote = i / c0;
            int j0_new_remote = i % c1;
            int j2_new_remote = i / c1;

            if (dir)
                {
                send &= ((j0_local == (j0_new_remote % c0))
                    && (j2_new_remote == j2_local / ratio));
                recv &= ((j0_remote == (j0_new_local % c0))
                    && (j2_new_local == j2_remote / ratio));
                }
            else
                {
                /* assume dir == 0 */
                if (!plan->rev_global[stage][current_dim])
                    {
                    send &= (((j0_local % c1) == j0_new_remote)
                        && (j2_local == (j2_new_remote/ratio)));
                    recv &= (((j0_remote % c1) == j0_new_local)
                        && (j2_remote == (j2_new_local/ratio)));
                    }
                else
                    {
                    /* global bitreversed output */
                    if (p/c1 > c0)
                        {
                        /* this section is usually not called during a DFFT */
                        k = c0*c1/plan->pdim[i];
                        send &= ((j0_local == rho_c0[current_dim][j2_new_remote/k]) &&
                            (rho_c1[current_dim][j0_new_remote] == j2_local/k));
                        recv &= ((j0_remote == rho_c0[current_dim][j2_new_local/k]) &&
                            (rho_c1[current_dim][j0_new_local] == j2_remote/k));

                        if (p/c1 > length*c0)
                            {
                            k = p/(length*c0*c1);
                            send &= (rho_plc0c1[current_dim][j2_new_remote%k]
                                == (j2_local % k));
                            recv &= (rho_plc0c1[current_dim][j2_new_local%k]
                                == (j2_remote % k));
                            }
                        }
                    else
                        {
                        k = c0*c1/p;
                        if (p/c1 > 1)
                            {
                            send &= (((rho_pc1[current_dim][j2_new_remote] == j0_local%(p/c1))
                                &&(rho_pc0[current_dim][j0_new_remote % (p/c0)] == j2_local)));
                            recv &= (((rho_pc1[current_dim][j2_new_local] == j0_remote%(p/c1))
                                && (rho_pc0[current_dim][j0_new_local % (p/c0)] == j2_remote)));
                            }
                        else
                            {
                            send &= (((j2_new_remote == j0_local%(p/c1))
                                &&(rho_pc0[current_dim][j0_new_remote % (p/c0)] == j2_local)));
                            recv &= (((j2_new_local == j0_remote%(p/c1))
                                && (rho_pc0[current_dim][j0_new_local % (p/c0)] == j2_remote)));
                            }

                        if (p*length/c1 < c0)
                            {
                            /* this section is usually not called during a DFFT */
                            k = c0*c1/p/length;
                            send &= (rho_c0c1pl[current_dim][j0_new_remote/(c1/k)] ==
                                j0_local/(c0/k));
                            recv &= (rho_c0c1pl[current_dim][j0_new_local/(c1/k)] ==
                                j0_remote/(c0/k));
                            }
                        }
                    } /* rev_global */
                } /* dir */
            if (!plan->rev_global[stage][current_dim] && (ratio >= length))
                {
                if (dir)
                    {
                    send &= ((j0_new_remote / (length*c0))
                        == (j2_local % (ratio/length)));
                    recv &= ((j0_new_local / (length*c0))
                        == (j2_remote % (ratio/length)));
                    }
                else
                    {
                    send &= ((j0_local / (length*c1))
                        == (j2_new_remote % (ratio/length)));
                    recv &= ((j0_remote / (length*c1))
                        == (j2_new_local % (ratio/length)));
                    }
                } 

            /* determine packet length for current dimension */
            if (! plan->rev_global[stage][current_dim])
                {
                if (ratio >= length)
                    size = 1;
                else
                    size = length/ratio;
                }
            else
                {
                if (p/c1 >= c0)
                    {
                    // usually not entered
                    size = ((p/c1 <= length*c0) ? (length*c0*c1/p) : 1);
                    }
                else
                    {
                    size = ((length*p/c1 >= c0) ? (length*p/c1/c0) : 1);
                    }
                }
            recv_size *= (recv ? size : 0);
            send_size *= (send ? size : 0);
            } /* end loop over dimensions */

        int rank = plan->proc_map[t];
        plan->nsend[rank] = send_size*sizeof(cuda_cpx_t);
        plan->nrecv[rank] = recv_size*sizeof(cuda_cpx_t);
        plan->offset_send[rank] = soffs*sizeof(cuda_cpx_t);
        plan->offset_recv[rank] = roffs*sizeof(cuda_cpx_t);
        roffs += recv_size;
        soffs += send_size;
        } /* end loop over processors */

    /* pack data */
    if (dir)
        {
        gpu_b2c_pack_nd(size_in, plan->d_c0[stage], plan->d_c1[stage], plan->ndim, d_embed,
            plan->d_length, plan->row_m, d_work, plan->d_scratch);
        if (plan->check_cuda_errors) CHECK_CUDA();
        }
    else
        {
        gpu_c2b_pack_nd(size_in, plan->d_c0[stage], plan->d_c1[stage], plan->ndim, d_embed,
            plan->d_length, plan->row_m, plan->d_pdim, plan->d_rev_j1[stage], plan->d_rev_global[stage],
            d_work, plan->d_scratch);
        if (plan->check_cuda_errors) CHECK_CUDA();
        }

    /* synchronize */
    MPI_Barrier(plan->comm);

    /* communicate */
    #ifdef ENABLE_MPI_CUDA
    MPI_Alltoallv(plan->d_scratch,plan->nsend, plan->offset_send, MPI_BYTE,
                  plan->d_scratch_2, plan->nrecv, plan->offset_recv, MPI_BYTE,
                  plan->comm);
    #else
    // stage into host buf
    cudaMemcpy(plan->h_stage_in, plan->d_scratch, sizeof(cuda_cpx_t)*size_in,cudaMemcpyDefault);
    if (plan->check_cuda_errors) CHECK_CUDA();

    MPI_Alltoallv(plan->h_stage_in,plan->nsend, plan->offset_send, MPI_BYTE,
                  plan->h_stage_out,plan->nrecv, plan->offset_recv, MPI_BYTE,
                  plan->comm);

    // copy back received data
    cudaMemcpy(plan->d_scratch_2,plan->h_stage_out, sizeof(cuda_cpx_t)*size_in,cudaMemcpyDefault);
    if (plan->check_cuda_errors) CHECK_CUDA();
    #endif

    /* unpack data */
    if (dir)
        {
        gpu_b2c_unpack_nd(size_in, plan->d_c0[stage], plan->d_c1[stage], plan->ndim, d_embed,
            plan->d_length, plan->row_m, plan->d_scratch_2, d_work);
        if (plan->check_cuda_errors) CHECK_CUDA();
        }
    else
        {
        gpu_c2b_unpack_nd(size_in, plan->d_c0[stage], plan->d_c1[stage], plan->ndim, d_embed,
            plan->d_length, plan->row_m, plan->d_pdim, plan->d_rev_global[stage],
            plan->d_rev_partial[stage], plan->d_scratch_2, d_work);
        if (plan->check_cuda_errors) CHECK_CUDA();
        }
    }
Example #24
0
/* Redistribute from group-cyclic with cycle c0 to cycle c1, where c0 >= c1
 * rev=1 if local order is reversed
 *
 * if rev = 1 and np >= c0 (last stage) it really transforms
 * into a hybrid-distribution, which after the last local ordered
 * DFT becomes the cyclic distribution
 */
void dfft_cuda_redistribute_cyclic_to_block_1d(int *dim,
                     int *pdim,
                     int ndim,
                     int current_dim,
                     int c0,
                     int c1,
                     int* pidx,
                     int rev,
                     int size_in,
                     int *embed,
                     cuda_cpx_t *d_work,
                     cuda_cpx_t *d_scratch,
                     cuda_cpx_t *h_stage_in,
                     cuda_cpx_t *h_stage_out,
                     int *rho_L,
                     int *rho_pk0,
                     int *dfft_nsend,
                     int *dfft_nrecv,
                     int *dfft_offset_send,
                     int *dfft_offset_recv,
                     MPI_Comm comm,
                     int check_err,
                     int row_m
                     )
    {
    if (c1 == c0) return;

    /* length along current dimension */
    int length = dim[current_dim]/pdim[current_dim];
    int size = length*c1/c0;
    size = (size ? size : 1);
    int npackets = length/size; 

    int stride = size_in/embed[current_dim];

    /* processor index along current dimension */
    int s=pidx[current_dim];
    /* number of procs along current dimension */
    int p=pdim[current_dim];

    size *= stride;

    int offset = 0;
    int recv_size,send_size;
    int j0_local = s%c0;
    int j2_local = s/c0;
    int j0_new_local = s%c1;
    int j2_new_local = s/c1;

    int pdim_tot=1;
    int k;
    for (k = 0; k < ndim; ++k)
        pdim_tot *= pdim[k];

    int i;
    for (i = 0; i < pdim_tot; ++i)
        {
        dfft_nsend[i] = 0;
        dfft_nrecv[i] = 0;
        dfft_offset_send[i] = 0;
        dfft_offset_recv[i] = 0;
        }

    for (i = 0; i < p; ++i)
        {
        int j0_remote = i%c0;
        int j2_remote = i/c0;

        int j0_new_remote = i % c1;
        int j2_new_remote = i/c1;
    
        /* decision to send and/or receive */
        int send = 0;
        int recv = 0;
        if (rev && (length >= c0))
            {
            /* redistribute into block with reversed processor id
               and swapped-partially reversed local order (the c0 LSB
               of the local index are MSB, and the n/p/c0 MSB
               are LSB and are reversed) */
            send = (((j2_new_remote % (p/c0)) == (rho_pk0[j2_local])) ? 1 : 0);
            recv = (((j2_new_local % (p/c0)) == (rho_pk0[j2_remote])) ? 1 : 0);
            }
        else
            {
            send = (((j2_new_remote / (c0/c1)) == j2_local) && ((j0_local % c1)==j0_new_remote) ? 1 : 0); 
            recv = (((j2_new_local / (c0/c1)) == j2_remote) &&  ((j0_remote % c1)==j0_new_local) ? 1 : 0);

            if (length*c1 < c0)
                {
                send &= (j0_local/(length*c1) == j2_new_remote % (c0/(length*c1)));
                recv &= (j0_remote/(length*c1) == j2_new_local % (c0/(length*c1)));
                }
            }

        /* offset of first element sent */
        int j1;
        if (length*c1 >= c0)
            {
            j1 = (j2_new_remote % (c0/c1))*length*c1/c0;
            }
        else
            {
            j1 = (j2_new_remote / (c0/(length*c1))) % length;
            }

        if (rev)
            {
            if (length >= c0)
                {
                j1 = j2_new_remote/(p/c0);
                }
            else
                j1 = rho_L[j1];
            }
        
        /* mirror remote decision to send */
        send_size = (send ? size : 0);
        recv_size = (recv ? size : 0);

        int destproc = 0;
        int k;
        if (row_m)
            {
            for (k = ndim-1; k >=0 ;--k)
                {
                destproc *= pdim[k];
                destproc += ((current_dim == k) ? i : pidx[k]);
                }
            }
        else
            {
            for (k = 0; k < ndim; ++k)
                {
                destproc *= pdim[k];
                destproc += ((current_dim == k) ? i : pidx[k]);
                }
            }
 
        dfft_offset_send[destproc] = (send ? (stride*j1*sizeof(cuda_cpx_t)) : 0);
        if (rev && (length > c0/c1))
            {
            /* we are receiving directly into the work buffer */
            dfft_offset_recv[destproc] = stride*j0_remote*length/c0*sizeof(cuda_cpx_t);
            }
        else
            {
            dfft_offset_recv[destproc] = offset*sizeof(cuda_cpx_t);
            }

        dfft_nsend[destproc] = send_size*sizeof(cuda_cpx_t);
        dfft_nrecv[destproc] = recv_size*sizeof(cuda_cpx_t);
        offset+=(recv ? size : 0);
        }

    /* we need to pack data if the local input buffer is reversed
       and we are sending more than one element */
    if (rev && (size > stride))
        {
        offset = 0;
        int i;
        for (i = 0; i <p; ++i)
            {
            int destproc = 0;
            int k;
            if (row_m)
                {
                for (k = ndim-1; k >=0 ;--k)
                    {
                    destproc *= pdim[k];
                    destproc += ((current_dim == k) ? i : pidx[k]);
                    }
                }
            else
                {
                for (k = 0; k < ndim; ++k)
                    {
                    destproc *= pdim[k];
                    destproc += ((current_dim == k) ? i : pidx[k]);
                    }
                }
 
            int j1_offset = dfft_offset_send[destproc]/sizeof(cuda_cpx_t)/stride;

            /* we are sending from the packed temporary buffer; offsets are
               counted in packed rows of 'stride' elements */
            dfft_offset_send[destproc] = offset*sizeof(cuda_cpx_t)*stride;
            int n = dfft_nsend[destproc]/stride/sizeof(cuda_cpx_t);
            int j;
            offset += n;
            }

        /* pack data */
        gpu_b2c_pack(size_in, c0, size, c0, stride, d_work, d_scratch);
        if (check_err) CHECK_CUDA();
       
        /* perform communication */
        MPI_Barrier(comm);
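        /* with CUDA-aware MPI (ENABLE_MPI_CUDA) device pointers are handed to
           MPI_Alltoallv directly; otherwise the data is staged through the
           host buffers with cudaMemcpy before and after the exchange */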
        #ifdef ENABLE_MPI_CUDA
        MPI_Alltoallv(d_scratch,dfft_nsend, dfft_offset_send, MPI_BYTE,
                      d_work, dfft_nrecv, dfft_offset_recv, MPI_BYTE,
                      comm);
        #else
        // stage into host buf
        cudaMemcpy(h_stage_in, d_scratch, sizeof(cuda_cpx_t)*length*stride,cudaMemcpyDefault); 
        if (check_err) CHECK_CUDA();

        MPI_Alltoallv(h_stage_in,dfft_nsend, dfft_offset_send, MPI_BYTE,
                      h_stage_out, dfft_nrecv, dfft_offset_recv, MPI_BYTE,
                      comm);

        // copy back received data
        cudaMemcpy(d_work,h_stage_out, sizeof(cuda_cpx_t)*npackets*size,cudaMemcpyDefault); 
        if (check_err) CHECK_CUDA();
        #endif
        }
    else
        {
        /* perform communication */
        MPI_Barrier(comm);
        #ifdef ENABLE_MPI_CUDA
        MPI_Alltoallv(d_work,dfft_nsend, dfft_offset_send, MPI_BYTE,
                      d_scratch, dfft_nrecv, dfft_offset_recv, MPI_BYTE,
                      comm);
        #else
        // stage into host buf
        cudaMemcpy(h_stage_in, d_work, sizeof(cuda_cpx_t)*size_in,cudaMemcpyDefault); 
        if (check_err) CHECK_CUDA();

        MPI_Alltoallv(h_stage_in,dfft_nsend, dfft_offset_send, MPI_BYTE,
                      h_stage_out, dfft_nrecv, dfft_offset_recv, MPI_BYTE,
                      comm);

        // copy back received data
        cudaMemcpy(d_scratch,h_stage_out, sizeof(cuda_cpx_t)*npackets*size,cudaMemcpyDefault); 
        if (check_err) CHECK_CUDA();
        #endif

        /* unpack */
        gpu_c2b_unpack(npackets*size, length, c0, c1, size, j0_new_local, stride, rev, d_work, d_scratch);
        if (check_err) CHECK_CUDA();
        }
    }
Example #25
0
int main(){


    // general function

    //utilCuda::printAllCudaDeviceSpecs();


{
    utilCuda::ContextPtrType ptr = utilCuda::createCudaContextOnDevice(0);
    ptr->setActive();

    std::cout << ptr->device().deviceString() << std::endl;

    // Make a managed device memory pointer (intrusive pointer, cleans itself up)
    std::vector<utilCuda::DeviceMemPtr<char> > pDevs;
    for(int i=0;i<10;i++){
        pDevs.push_back( ptr->malloc<char>(10<<20) );
        //std::cout << static_cast<utilCuda::CudaAllocBuckets*>(ptr->getAllocator())->allocated() << std::endl;
    }

    pDevs.erase(pDevs.begin(),pDevs.begin()+2);


    std::cout << ptr->device().memInfoString() << std::endl;
    pDevs.push_back( ptr->malloc<char>(2000<<20) ); // Too big for an allocator bucket, goes into uncached memory!
    pDevs.pop_back();
    std::cout << ptr->device().memInfoString() << std::endl;

    // Make another
    utilCuda::DeviceMemPtr<int> pDev2 = ptr->genRandom<int>(100,4,10);
    utilCuda::printArray(*pDev2,"%4d", 10);

    pDevs.clear();
    std::cout << ptr->device().memInfoString() << std::endl;
}


std::cout << utilCuda::CudaDevice::selected().memInfoString() << std::endl;
// Memory Stress Test
{
    utilCuda::ContextPtrType ptr = utilCuda::createCudaContextOnDevice(0);
    ptr->setActive();
    std::vector<utilCuda::DeviceMemPtr<char> > pDevs;
    std::default_random_engine generator;
    std::uniform_int_distribution<int> distribution(1,300);
    int dice_roll = distribution(generator);

    for(int l=0;l<200;l++){
        //add
        for(int i=0;i<5;i++){
                std::cout << "add" <<",";
            pDevs.push_back( ptr->malloc<char>(distribution(generator)<<20) );
        }
        std::cout << static_cast<utilCuda::CudaAllocBuckets*>(ptr->getAllocator())->capacity();
        //remove
        for(int i=0;i<5;i++){
            std::cout << "remove" << ",";
            pDevs.pop_back();
        }
        std::cout << static_cast<utilCuda::CudaAllocBuckets*>(ptr->getAllocator())->capacity();
    }
    std::cout << std::endl;
}
std::cout << utilCuda::CudaDevice::selected().memInfoString() << std::endl;


{
    utilCuda::ContextPtrType c = utilCuda::createCudaContextOnDevice(0,false);
    c->setActive();

     // Make another
     {
         utilCuda::DeviceMatrixPtr<double> pA_dev = c->mallocMatrix<double,false>(13000,13000);
         std::cout << "Size of CudaMatrix: " << sizeof(*pA_dev) << std::endl;
         std::cout << "pitch:" << pA_dev->get().m_outerStrideBytes << std::endl;
         std::cout << c->device().memInfoString() << std::endl;
     }

     // Make another aligned matrix :)
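     // (the second template argument of mallocMatrix appears to select
     //  pitched/aligned storage; compare the pitch values printed in the
     //  two blocks)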
     {
         std::cout << c->device().memInfoString() << std::endl;
         utilCuda::DeviceMatrixPtr<double> pA_dev = c->mallocMatrix<double,true>(13000,2);
         std::cout << "pitch:" << pA_dev->get().m_outerStrideBytes << std::endl;
         std::cout << c->device().memInfoString() << std::endl;

         Eigen::MatrixXd A(13000,2);

         std::cout << "Copy From Matrix" <<std::endl;
         CHECK_CUDA(pA_dev->fromHost(A));

         std::cout << "Copy From temporary expression A+A" <<std::endl;
         CHECK_CUDA(pA_dev->fromHost(A+A));

         Eigen::MatrixXd B(13000,4);
//
         std::cout << "Copy From temporary expression (block)" <<std::endl;
         CHECK_CUDA(pA_dev->fromHost(B.leftCols(2)));


         Eigen::MatrixXd C(13400,4);

         std::cout << "Copy From temporary expression (block)" <<std::endl;
         CHECK_CUDA(pA_dev->fromHost(C.block(400,2,13000,2)));

         std::cout << "Copy From temporary expression (block)" <<std::endl;
         //CHECK_CUDA(pA_dev->fromHost(C.block(30,2,12500,2))); // fails because it is not the right size!

     }


     {
         utilCuda::DeviceMatrixPtr<double> pA_dev = c->mallocMatrix<double,true>(55,5);
         Eigen::MatrixXd A(55,5);
         A.setOnes();
         std::cout << "A: " << std::endl << A << std::endl;
         pA_dev->fromHost(A+A);
         utilCuda::printArray(*pA_dev,"%4f"); // internally copies the matrix back to the host before outputting!
     }


}
    // see how the system changes the GPU mem! :-)
    for(int i=0; i < 10; i++){
        sleep(1);
        std::cout << utilCuda::CudaDevice::selected().memInfoString() << std::endl;
    }

    utilCuda::destroyDeviceGroup();

}
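Note: every example above relies on a project-specific CHECK_CUDA macro (some call it with no arguments after a CUDA call, others wrap the call itself); its definition is not part of these listings and varies between projects. A minimal sketch of the usual pattern, using hypothetical names, could look like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

/* Hypothetical error-checking macro in the spirit of the CHECK_CUDA calls
   above; real projects differ in how they report errors and abort. */
#define MY_CHECK_CUDA(call)                                                \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                         cudaGetErrorString(err_), __FILE__, __LINE__);    \
            std::abort();                                                  \
        }                                                                  \
    } while (0)

/* The zero-argument form used above can be emulated by checking the last
   (possibly asynchronous) error instead of a return value. */
#define MY_CHECK_CUDA_LAST() MY_CHECK_CUDA(cudaGetLastError())

For instance, MY_CHECK_CUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDefault)); aborts with a readable message if the copy fails, and MY_CHECK_CUDA_LAST(); after a kernel launch surfaces launch errors.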