__kernel void memset_uint4(__global int *mem, const int size, __private int val) { \n\
int tid = get_local_id(0); \n\
int bx = (get_group_id(1)) * (get_num_groups(0)) + get_group_id(0); \n\
int i = tid + (bx) * (get_local_size(0)); \n\
//debug \n\
//if (i == 0) { printf(\"memset size = %i value = %i buffer %i \\n\",size,val,mem[0]); } \n\
if (i < size ) { mem[i]=val; } \n\
}" };
示例#2
0
size_t _CL_OVERLOADABLE
get_global_id(unsigned int dimindx)
{
  switch(dimindx)
    {
        /* TODO: add get_global_offset(X) to these! */
    case 0: return get_local_size(0) * get_group_id(0) + get_local_id(0);
    case 1: return get_local_size(1) * get_group_id(1) + get_local_id(1);
    case 2: return get_local_size(2) * get_group_id(2) + get_local_id(2);
    default: return 0;
    }
}
示例#3
0
/// \fn _copyKernel
/// \brief generate a copy kernel program
compute::program _copyKernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void copy_kernel(__global const float *src, __global float *dst)
        {
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
            
            uint width = get_num_groups(0) * TILE_DIM;
            
            for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
            {
                dst[(y+i)*width +x] = src[(y+i)*width + x];
            }
        }
示例#4
0
文件: cfg_rpc.c 项目: kiryu/kamailio
static void rpc_set_delayed(rpc_t* rpc, void* c)
{
    str	group, var;
    int	i, err;
    char	*ch;
    unsigned int	*group_id;

    if (rpc->scan(c, "SS", &group, &var) < 2)
        return;

    if (get_group_id(&group, &group_id)) {
        rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\"");
        return;
    }

    if (rpc->scan(c, "d", &i) == 1)
        err = cfg_set_delayed_int(ctx, &group, group_id, &var, i);
    else if (rpc->scan(c, "s", &ch) == 1)
        err = cfg_set_delayed_string(ctx, &group, group_id, &var, ch);
    else
        return; /* error */

    if (err) {
        rpc->fault(c, 400, "Failed to set the variable");
        return;
    }
}
示例#5
0
void handle_work_checkout_event(
    awe_server_state * ns,
    tw_bf * b,
    awe_msg * m,
    tw_lp * lp)
{
    tw_event *e;
    awe_msg *msg;


    e = codes_event_new(m->src, ns_tw_lookahead, lp);
    msg = tw_event_data(e);
    msg->event_type = WORK_CHECKOUT;
    memset(msg->object_id, 0, sizeof(msg->object_id));
    
    tw_lpid client_id = m->src;
//    char group_name[MAX_LENGTH_GROUP];
    //char lp_type_name[MAX_LENGTH_GROUP];
    //int  lp_type_id, grp_id, grp_rep_id, offset;
 //   codes_mapping_get_lp_info(client_id, group_name, &grp_id, &lp_type_id,
   //         lp_type_name, &grp_rep_id, &offset);

    int group_id = 0;
    group_id = get_group_id(client_id);

    /*if queue is empty, msg->object_id is "", otherwise msg->object-id is the dequeued workid*/
    int got_work = 0;
    char workid[MAX_LENGTH_ID];
    if (!g_queue_is_empty(work_queue)) {
        if (group_id == 1 && sched_policy>0) {  //client from remote site
            char* work = NULL;
            if (sched_policy==1) {
                work = get_first_work_by_stage(5); //checkout task 5 (blat) only for remote site
            } else if (sched_policy==2) {
            	work = get_first_work_by_greedy(WorkOrder);
            }
            if (work) {
            	strcpy(workid, work);
            	got_work = 1;
            }
        } else {
        	strcpy(workid, g_queue_pop_head(work_queue));
        	got_work = 1;
        }
    }

    if (got_work) { //eligible work found, send back to the requesting client
        fprintf(event_log, "%lf;awe_server;%lu;WC;work=%s client=%lu\n", now_sec(lp), lp->gid, workid, m->src);
        assert (strlen(workid) > 10);
        strcpy(msg->object_id, workid);
        tw_event_send(e);
    } else {  //no eligible work found, put client request to the waiting queue
        tw_lpid *clientid = NULL;
        clientid = malloc(sizeof(tw_lpid));
        *clientid = m->src;
        g_queue_push_tail(client_req_queue, clientid);
    }
    return;
}
示例#6
0
void ComputeGradinetsRTLR_SetGradients(local float* tmpGradients, global float* gradients, global float* gradientSums)
{
    Reduce_Sum(tmpGradients);

    if (get_local_id(0) == 0)
    {
        int ijValueIndex = get_group_id(0);
        if (gradients != null) gradients[ijValueIndex] = tmpGradients[0];
        if (gradientSums != null) gradientSums[ijValueIndex] += tmpGradients[0];
    }
}
__kernel void kernel_scan(__global float* input, __global float* output) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);
    int group_id = get_group_id(0);

    output[global_idx] = input[global_idx];
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    for(int i = 1; i < block_size; i <<= 1) {
        if(global_idx >= i)
            output[global_idx] += output[global_idx - i];
        mem_fence(CLK_GLOBAL_MEM_FENCE);
    }
}
示例#8
0
__kernel void kernel_reduce(__global float* input, __global float* output) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);
    int group_id = get_group_id(0);

    for(int i = block_size/2; i > 0; i >>= 1) {
        if(local_idx < i)
            input[global_idx] += input[global_idx + i];

        barrier(CLK_GLOBAL_MEM_FENCE);
    }

    if(local_idx == 0)
        output[group_id] = input[global_idx];
}
示例#9
0
文件: cfg_rpc.c 项目: kiryu/kamailio
static void rpc_get(rpc_t* rpc, void* c)
{
    str	group, var;
    void	*val;
    unsigned int	val_type;
    int	ret;
    unsigned int	*group_id;

    if (rpc->scan(c, "SS", &group, &var) < 2)
        return;

    if (get_group_id(&group, &group_id)) {
        rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\"");
        return;
    }

    ret = cfg_get_by_name(ctx, &group, group_id, &var,
                          &val, &val_type);
    if (ret < 0) {
        rpc->fault(c, 400, "Failed to get the variable");
        return;

    } else if (ret > 0) {
        rpc->fault(c, 400, "Variable exists, but it is not readable via RPC interface");
        return;
    }

    switch (val_type) {
    case CFG_VAR_INT:
        rpc->add(c, "d", (int)(long)val);
        break;

    case CFG_VAR_STRING:
        rpc->add(c, "s", (char *)val);
        break;

    case CFG_VAR_STR:
        rpc->add(c, "S", (str *)val);
        break;

    case CFG_VAR_POINTER:
        rpc->printf(c, "%p", val);
        break;

    }

}
示例#10
0
int client_match_work(tw_lpid client_id, char* workid) {
    int match = 1;
    //char group_name[MAX_LENGTH_GROUP];
    //char lp_type_name[MAX_LENGTH_GROUP];
    //codes_mapping_get_lp_info(clientid, group_name, grp_id, lp_type_id, lp_type_name, grp_rep_id, offset);

    int group_id = 0;
    group_id = get_group_id(client_id);
    if (group_id == 1) {  //remote client
    	gchar **seg = g_strsplit(workid, "_", 3);
    	int taskid = atoi(seg[1]);
    	if (taskid != 5) {
    		match = 0;
    	}
    }
    return match;
}
示例#11
0
文件: cfg_rpc.c 项目: kiryu/kamailio
static void rpc_del_delayed(rpc_t* rpc, void* c)
{
    str	group, var;
    unsigned int	*group_id;

    if (rpc->scan(c, "SS", &group, &var) < 2)
        return;

    if (get_group_id(&group, &group_id) || !group_id) {
        rpc->fault(c, 400, "Wrong group syntax. Use \"group[id]\"");
        return;
    }

    if (cfg_del_delayed(ctx, &group, group_id, &var)) {
        rpc->fault(c, 400, "Failed to delete the value");
        return;
    }
}
示例#12
0
文件: cfg_rpc.c 项目: kiryu/kamailio
static void rpc_del_group_inst(rpc_t* rpc, void* c)
{
    str	group;
    unsigned int	*group_id;

    if (rpc->scan(c, "S", &group) < 1)
        return;

    if (get_group_id(&group, &group_id) || !group_id) {
        rpc->fault(c, 400, "Wrong group syntax. Use \"group[id]\"");
        return;
    }

    if (cfg_del_group_inst(ctx, &group, *group_id)) {
        rpc->fault(c, 400, "Failed to delete the group instance");
        return;
    }
}
示例#13
0
文件: cfg_rpc.c 项目: kiryu/kamailio
static void rpc_set_now_int(rpc_t* rpc, void* c)
{
    str	group, var;
    int	i;
    unsigned int	*group_id;

    if (rpc->scan(c, "SSd", &group, &var, &i) < 3)
        return;

    if (get_group_id(&group, &group_id)) {
        rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\"");
        return;
    }

    if (cfg_set_now_int(ctx, &group, group_id, &var, i)) {
        rpc->fault(c, 400, "Failed to set the variable");
        return;
    }
}
示例#14
0
文件: cfg_rpc.c 项目: kiryu/kamailio
static void rpc_set_delayed_string(rpc_t* rpc, void* c)
{
    str	group, var;
    char	*ch;
    unsigned int	*group_id;

    if (rpc->scan(c, "SSs", &group, &var, &ch) < 3)
        return;

    if (get_group_id(&group, &group_id)) {
        rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\"");
        return;
    }

    if (cfg_set_delayed_string(ctx, &group, group_id, &var, ch)) {
        rpc->fault(c, 400, "Failed to set the variable");
        return;
    }
}
示例#15
0
inline global float* GetPValuesPtr(global float* pValuesOfWeights, int uLayersCount, int maxULayerSize, int kLayerIndex)
{
    int ijValueIndex = get_group_id(0);
    return pValuesOfWeights + (ijValueIndex * uLayersCount * maxULayerSize) + (kLayerIndex * maxULayerSize);
}
示例#16
0
kernel void ComputeGradientsRTLR_V0_CPU(
    global float* pValuesOfWeights
    , int uLayersCount
    , int maxULayerSize
    , int p_i_j_l_LayerIndex_0_0
    , int p_i_j_l_LayerSize_0_0
    , global float$* weights_0_0
    , int p_i_j_l_LayerIndex_1_0
    , int p_i_j_l_LayerSize_1_0
    , global float$* weights_1_0
    , int p_i_j_l_LayerIndex_2_0
    , int p_i_j_l_LayerSize_2_0
    , global float$* weights_2_0
    , int p_i_j_l_LayerIndex_3_0
    , int p_i_j_l_LayerSize_3_0
    , global float$* weights_3_0
    , int p_i_j_k_LayerSize_0
    , global float* netDerivValues_0
    , int p_i_j_l_LayerIndex_0_1
    , int p_i_j_l_LayerSize_0_1
    , global float$* weights_0_1
    , int p_i_j_l_LayerIndex_1_1
    , int p_i_j_l_LayerSize_1_1
    , global float$* weights_1_1
    , int p_i_j_l_LayerIndex_2_1
    , int p_i_j_l_LayerSize_2_1
    , global float$* weights_2_1
    , int p_i_j_l_LayerIndex_3_1
    , int p_i_j_l_LayerSize_3_1
    , global float$* weights_3_1
    , int p_i_j_k_LayerSize_1
    , global float* netDerivValues_1
    , int p_i_j_l_LayerIndex_0_2
    , int p_i_j_l_LayerSize_0_2
    , global float$* weights_0_2
    , int p_i_j_l_LayerIndex_1_2
    , int p_i_j_l_LayerSize_1_2
    , global float$* weights_1_2
    , int p_i_j_l_LayerIndex_2_2
    , int p_i_j_l_LayerSize_2_2
    , global float$* weights_2_2
    , int p_i_j_l_LayerIndex_3_2
    , int p_i_j_l_LayerSize_3_2
    , global float$* weights_3_2
    , int p_i_j_k_LayerSize_2
    , global float* netDerivValues_2
    , int p_i_j_l_LayerIndex_0_3
    , int p_i_j_l_LayerSize_0_3
    , global float$* weights_0_3
    , int p_i_j_l_LayerIndex_1_3
    , int p_i_j_l_LayerSize_1_3
    , global float$* weights_1_3
    , int p_i_j_l_LayerIndex_2_3
    , int p_i_j_l_LayerSize_2_3
    , global float$* weights_2_3
    , int p_i_j_l_LayerIndex_3_3
    , int p_i_j_l_LayerSize_3_3
    , global float$* weights_3_3
    , int p_i_j_k_LayerSize_3
    , global float* netDerivValues_3
    , int iLayerIndex
    , global float* inputs
    , int inputsSize // + bias (null) = 1, inputs: size
    , global float* outputs
    , global float* desiredOutputs
    , local float* tmpGradients // size = local size
    , global float* gradients
    , global float* gradientSums)
{
    int localId = get_local_id(0);
    int localSize = get_local_size(0);

    int ijValueIndex = get_group_id(0);
    int iValueIndex = ijValueIndex / inputsSize;
    int jValueIndex = ijValueIndex % inputsSize;

    tmpGradients[localId] = 0.0f;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Local size ~ avg uLayerSize
    for (int kLayerIndex = 0; kLayerIndex < uLayersCount; kLayerIndex++)
    {
        int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex);
        bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null;

        int block = kLayerSize / localSize + (kLayerSize % localSize != 0 ? 1 : 0);
        int kValueIndex = localId * block;
        int max = kValueIndex + block;
        if (max > kLayerSize) max = kLayerSize;
        while (kValueIndex < max)
        {
            float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f;

            int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex);
            int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex);
            global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex);
            sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex);
            float p = netDerivValues[kValueIndex] * sum;
            GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p;

            if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p;

            kValueIndex++;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (gradients != null || gradientSums != null)
    {
        ComputeGradinetsRTLR_SetGradients(tmpGradients, gradients, gradientSums);
    }

    /*int pValuesOfWeightsSize2 = uLayersCount * maxULayerSize;
    int block = pValuesOfWeightsSize2 / localSize + (pValuesOfWeightsSize2 % localSize != 0 ? 1 : 0);
    int kLayerAndValueIndex = localId * block;
    int max = kLayerAndValueIndex + block;
    if (max > pValuesOfWeightsSize2) max = pValuesOfWeightsSize2;
    while (kLayerAndValueIndex < max)
    {
        int kLayerIndex = kLayerAndValueIndex / maxULayerSize;
        int kValueIndex = kLayerAndValueIndex % maxULayerSize;
        int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex);
        
        if (kValueIndex < kLayerSize)
        {
            bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null;

            float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f;

            int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex);
            int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex);
            global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex);
            sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            
            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }
            
            global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex);
            float p = netDerivValues[kValueIndex] * sum;
            GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p;
            
            if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p;
        }

        kLayerAndValueIndex++;
    }*/
}
示例#17
0
uint na)
{ 
int i, j;
double Ystx = 0; 
__local double *y = 0; 
double switcher; 
double neg_switcher; 

// Thread index
int tx = get_local_id(0); 

// Thread index 
int gx = get_global_id(0); 

// Block index 
int bx = get_group_id(0); 

A = A + offA; 

__global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; 
int NumBLperNB = NB / BLOCK_SIZE; 
d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); 

__local double Bs[BLOCK_SIZE*BLOCK_SIZE]; 
__local double workspace[BLOCK_SIZE];	// workspace used to store the current working column 

// load A 
_Pragma("unroll")
for (i = 0; i < BLOCK_SIZE; i++)
{ 
	if (tx <= i && i + bx*BLOCK_SIZE < na)
   uint offsetB,
   uint offsetC)
{
  float rC[4][4]  = { {(float)0} };
  float rA[1][4];
  float rB[1][4];


  A += offsetA;
  B += offsetB;
  C+=offsetC;

  __local float lA[1056];
  __local float lB[1056];

  uint gidx = get_group_id(0);
  uint gidy = get_group_id(1);
  uint idx = get_local_id(0);
  uint idy = get_local_id(1);

  uint idt = 16*idy + idx;
  uint idxT = idt % 16;
  uint idyT = idt / 16;

  A +=  gidx*64*lda+ idxT + idyT*lda;
  B +=  gidy*64*ldb+ idxT + idyT*ldb;


  uint block_k = K >> 4;
  do
  {
示例#19
0
文件: groups.c 项目: cran/rcqp
Group *
ComputeGroupExternally(Group *group)
{
  int i;
  int size = group->my_corpus->size;
  int cutoff_freq = group->cutoff_frequency;

  char temporary_name[TEMP_FILENAME_BUFSIZE];
  FILE *fd;
  FILE *pipe;
  char sort_call[CL_MAX_LINE_LENGTH];

  /* ---------------------------------------------------------------------- */

  if ((fd = open_temporary_file(temporary_name)) == NULL) {
    perror("Error while opening temporary file");
    cqpmessage(Warning, "Can't open temporary file");
    return group;
  }

  for (i = 0; i < size; i++) {
    fprintf(fd, "%d %d\n", get_group_id(group, i, 0), get_group_id(group, i, 1)); /* (source ID, target ID) */
  }
  fclose(fd);

  /* construct sort call */
  sprintf(sort_call, ExternalGroupingCommand, temporary_name);
  if (GROUP_DEBUG)
   Rprintf( "Running grouping sort: \n\t%s\n",
            sort_call);
  if ((pipe = popen(sort_call, "r")) == NULL) {
    perror("Failure opening grouping pipe");
    cqpmessage(Warning, "Can't open grouping pipe:\n%s\n"
               "Disable external grouping by\n"
               "  set UseExternalGrouping off;", 
               sort_call);
  }
  else {
    int freq, p1, p2, tokens;
#define GROUP_REALLOC 16

    while ((tokens = fscanf(pipe, "%d%d%d", &freq, &p1, &p2)) == 3) {
      if (freq > cutoff_freq) {
        if ((group->nr_cells % GROUP_REALLOC) == 0) {
          if (group->count_cells == NULL) {
            group->count_cells = 
              (ID_Count_Mapping *)cl_malloc(GROUP_REALLOC *
                                         sizeof(ID_Count_Mapping));
          }
          else {
            group->count_cells = 
              (ID_Count_Mapping *)cl_realloc(group->count_cells,
                                          (group->nr_cells + GROUP_REALLOC) *
                                          sizeof(ID_Count_Mapping));
          }
          assert(group->count_cells);
        }

        group->count_cells[group->nr_cells].s = p1;
        group->count_cells[group->nr_cells].t = p2;
        group->count_cells[group->nr_cells].freq = freq;

        group->nr_cells = group->nr_cells + 1;
      }
    }

    if (tokens != EOF) {
     Rprintf( "Warning: could not reach EOF of temporary file!\n");
    }

    pclose(pipe);
  }

  if (GROUP_DEBUG) {
   Rprintf( "Keeping temporary file %s -- delete manually\n",
            temporary_name);
  }
  else if (unlink(temporary_name) != 0) {
    perror(temporary_name);
   Rprintf( "Can't remove temporary file %s -- \n\tI will continue, "
            "but you should remove that file.\n", temporary_name);
  }
  
  return group;
}
示例#20
0
文件: groups.c 项目: cran/rcqp
Group *
ComputeGroupInternally(Group *group)
{
  ID_Count_Mapping node;
  ID_Count_Mapping *result;

  int i;
  size_t nr_nodes;
  int percentage, new_percentage; /* for ProgressBar */
  int size = group->my_corpus->size;

  /* ---------------------------------------------------------------------- */

  nr_nodes = 0;
  
  if (progress_bar)
    progress_bar_clear_line();
  percentage = -1;

  EvaluationIsRunning = 1;

  for (i = 0; i < size; i++) {
    if (! EvaluationIsRunning)
      break;                    /* user abort (Ctrl-C) */

    if (progress_bar) {
      new_percentage = floor(0.5 + (100.0 * i) / size);
      if (new_percentage > percentage) {
        percentage = new_percentage;
        progress_bar_percentage(1, 2, percentage);
      }
    }

    node.s = get_group_id(group, i, 0);       /* source ID */
    node.t = get_group_id(group, i, 1);       /* target ID */
    node.freq = 0;
  
    result = binsert_g(&node,
                       (void **) &(group->count_cells),
                       &nr_nodes,
                       sizeof(ID_Count_Mapping),
                       compare_st_cells);

    result->freq++;
  }

  if (EvaluationIsRunning) {
    group->nr_cells = sum_freqs(group->count_cells, nr_nodes, group->cutoff_frequency);
    
    if (progress_bar)
      progress_bar_clear_line();
    
    if (group->nr_cells < nr_nodes)
      group->count_cells = 
        cl_realloc(group->count_cells, (group->nr_cells * sizeof(ID_Count_Mapping)));
  }
  else {
    cqpmessage(Warning, "Group operation aborted by user.");
    if (which_app == cqp) install_signal_handler();
    free_group(&group);         /* sets return value to NULL to indicate failure */
  }
  EvaluationIsRunning = 0;
    
  return group;
}
	__kernel void TRIPLE_DGEMM_UPDATE_192_96_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
{ 
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 48 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2

	//Work group size is [24, 2]
	//global work size is [48*number of blocks, 4]
	//each work item in each work group is responsible for 12 elements (1/4) in that row
	//each work group is responsible for 24 by 24 macro tile;

	////////////// -invA11*invA12
	const uint gidx = get_group_id(0); 
	const uint gidy = get_group_id(1); 
	const uint idx = get_local_id(0); 
	const uint idy = get_local_id(1); 

	//uint page = gidx / 2;//0-1 for 192; 0 for 96
	//const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block
	const uint page_block = gidx / 4; //#index of page_block; 4 WG per page; 4 WG per page_block

	__global double *A, *B, *C; 
	__local double lA[24][48]; 
	__local double lB[48][24]; 
	double privateC[12] = { (double)0 }; 

	//decide invA11 location for each page
	//each workgroup loads half of A (left or right)
	//A = d_dinvA + page*NB*NB + gidx%2*(blk/2);
	A = d_dinvA + page_block*NB*NB + gidx % 4 * (blk / 4); 

	//decide invA12 (B) location for each page
	//actually it was saved in invA21 from last kernel
	//each workgroup loads half of B (up or down)
	//B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB;
	B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk / 4)*NB; 

	//decide invA12 location for each page
	//each workgroup writes 1/4 of C
	//C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
	C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; 

	//read A and B into LDS no transpose operated here
	//each work item loads a half row of A and half column of B
	//idx 0-23 idy 0-1

	uint block_k = blk / 48;  //thus we need 2 iterations here
	do{
		barrier(CLK_LOCAL_MEM_FENCE); 
		lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; 
		lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; 
		lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB];
		lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB];
		lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB];
		lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB];
		lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB];
		lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB];
		lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB];
		lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB];
		lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB];
		lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB];
		lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB];
		lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB];
		lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB];
		lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB];
		lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB];
		lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB];
		lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB];
		lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB];
		lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB];
		lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB];
		lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB];
		lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB];

		lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; 
		lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1]; 
		lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];
		lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];
		lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];
		lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];
		lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];
		lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];
		lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];
		lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];
		lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];
		lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];
		lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];
		lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];
		lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];
		lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];
		lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];
		lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];
		lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];
		lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];
		lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];
		lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];
		lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];
		lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];
		barrier(CLK_LOCAL_MEM_FENCE); 

		//do math

		uint i = 0; 

		do{
			privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);
			privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);
			privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);
			privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);
			privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);
			privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);
			privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);
			privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);
			privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);
			privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);
			privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); 
			privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); 
			i = i + 1; 
		} while (i < 48); 

		A += 48 * NB; 
		B += 48; 

	} while (--block_k>0); 

	uint i = 0; 
	do{
		C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; 
		i = i + 1; 
	} while (i < 12); 


}
	__kernel void TRIPLE_DGEMM_UPDATE_192_24_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
{ 
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 24 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4

	//Work group size is [24, 2]
	//global work size is [96*number of blocks, 2]
	//each work item in each work group is responsible for 12 elements (half) in that row
	//each work group is responsible for one gemm;


	////////////// A12*invA22
	const uint gidx = get_group_id(0); 
	const uint gidy = get_group_id(1); 
	const uint idx = get_local_id(0); 
	const uint idy = get_local_id(1); 

	const uint page = gidx % npages; //0-3 for 192; 0-1 for 96
	const uint page_block = page / 4; //4 pages per page block



	__global double *B, *C; 
	__local double lA[24][24]; 
	__local double lB[24][24]; 
	double privateC[12] = { (double)0 }; 

	//decide A12 location for each page
	Ain = Ain + offAin; 
	Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; 

	//decide invA22 (B) location for each page
	B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk + blk; 

	//decide invA12 location for each page
	C = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; 

	//read A and B into LDS no transpose operated here
	//each work iteam loads half a row
	lA[idx][0 + idy * 12] = Ain[idx + idy * 12 * lda]; 
	lA[idx][1 + idy * 12] = Ain[idx + lda + idy * 12 * lda]; 
	lA[idx][2 + idy * 12] = Ain[idx + lda * 2 + idy * 12 * lda]; 
	lA[idx][3 + idy * 12] = Ain[idx + lda * 3 + idy * 12 * lda]; 
	lA[idx][4 + idy * 12] = Ain[idx + lda * 4 + idy * 12 * lda]; 
	lA[idx][5 + idy * 12] = Ain[idx + lda * 5 + idy * 12 * lda]; 
	lA[idx][6 + idy * 12] = Ain[idx + lda * 6 + idy * 12 * lda]; 
	lA[idx][7 + idy * 12] = Ain[idx + lda * 7 + idy * 12 * lda]; 
	lA[idx][8 + idy * 12] = Ain[idx + lda * 8 + idy * 12 * lda]; 
	lA[idx][9 + idy * 12] = Ain[idx + lda * 9 + idy * 12 * lda]; 
	lA[idx][10 + idy * 12] = Ain[idx + lda * 10 + idy * 12 * lda]; 
	lA[idx][11 + idy * 12] = Ain[idx + lda * 11 + idy * 12 * lda]; 

	lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB]; 
	lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB]; 
	lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB]; 
	lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB]; 
	lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB]; 
	lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB]; 
	lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB]; 
	lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB]; 
	lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB]; 
	lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB]; 
	lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB]; 
	lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB]; 
	barrier(CLK_LOCAL_MEM_FENCE); 

	//do math

	uint i = 0; 

	do{ 
		privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); 
		privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); 
		privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); 
		privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); 
		privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); 
		privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); 
		privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); 
		privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); 
		privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); 
		privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); 
		privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); 
		privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); 
		i = i + 1; 
	} while (i < 24); 

	i = 0; 
	do{ 
		C[NB*idy * 12 + NB*i + idx] = privateC[i]; 
		i = i + 1; 
	} while (i < 12); 
}
示例#23
0
std::string marathon_app::get_group_id() const
{
	return get_group_id(get_id());
}
  uint offsetC)
{
	float rC[2][4] = { (float)0 };
	float rA[1][2];
	float rB[1][4];


	A += offsetA;
	B += offsetB;
	C += offsetC;

	__local float lA[528];//16*32+16
	__local float lB[1040];//16*64+16

	uint gidx = M / 64;//get_group_id(0);
	uint gidy = get_group_id(1);
	uint idx = get_local_id(0);
	uint idy = get_local_id(1);


	int CurrentOffSetA = gidx * 64 + idx;

	A += gidx * 64 + idx + idy*lda;
	B += gidy * 64 + idx + idy*ldb;


	uint block_k = K >> 4;
	do
	{
		__local float* plA = lA + idy * 33 + idx;
		__local float* plB = lB + idy * 65 + idx;
示例#25
0
kernel void convolute(int4 imagesize, global unsigned char *input,
                      global unsigned char *output, global kernf *filterG) {
  int4 gid = (int4)(get_global_id(0)*CONV_UNROLL,  get_global_id(1),  get_global_id(2),  0);
  int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0);
  int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0);
  // First (?) pixel to process with this kernel
  int4 pixelid = gid;

  // Starting offset of the first pixel to process
  int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2;
  int i,j;

  int dx,dy,dz;

  /* MAD performs a single convolution operation for each kernel,
     using the current 'raw' value as the input image
     'ko' as an instance of an unrolled convolution filter
     'pos' as the X-offset for each of the unrolled convolution filters
     Note that all the if statements dependent only on static values -
     meaning that they can be optimized away by the compiler
  */
#define MAD(ko,pos) {if(CONV_UNROLL>ko) {    \
      if(pos-ko >= 0 && pos-ko < kernsize) {    \
        val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]);   \
      }}}
#define MADS(pos) {if(pos<kernsize) { \
    raw=input[imoffset2+pos];       \
    MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \
    MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \
    MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \
    MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \
    MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \
    }}

  kernf val[CONV_UNROLL];
  for(j=0;j<CONV_UNROLL;j++)
    val[j]=(kernf)(0.0);

  int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
  local kernf filter[kernsize*kernsize*kernsize];

  /* Copy global filter to local memory */
  event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0);
  wait_group_events(1, &event);

  if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 ||
     gid.s1 + kernsize > imagesize.s1 ||
     gid.s2 + kernsize > imagesize.s2) return;

  for(dz=0;dz<kernsize;dz++)
    for(dy=0;dy<kernsize;dy++)  {
      int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels;
      int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1;
      unsigned char raw;

      /* kernsize + convolution_unroll < 42 */
      MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5);
      MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11);
      MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17);
      MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23);
      MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29);
      MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35);
      MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41);
    }

  for(j=0;j<CONV_UNROLL;j++) {
    kernstore( convert_kernuc(val[j]), imoffset+j, output);
  }
}
	__kernel void TRIPLE_DGEMM_UPDATE_192_48_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
{\n
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 48 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2

	//Work group size is [24, 2]
	//global work size is [96*number of blocks, 4]
	//each work item in each work group is responsible for 12 elements (1/4) in that row
	//each work group is responsible for 24 by 24 macro tile;

	////////////// A12*invA22
	const uint gidx = get_group_id(0);\n
	const uint gidy = get_group_id(1);\n
	const uint idx = get_local_id(0);\n
	const uint idy = get_local_id(1);\n

	//uint page = gidx / 2;//0-1 for 192; 0 for 96
	const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block
	const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block

	__global double *B, *C; \n
	__local double lA[24][48]; \n
	__local double lB[48][24]; \n
	double privateC[12] = { (double)0 }; \n

	//decide A12 location for each page
	//each workgroup loads half of A (left or right)
	Ain = Ain + offAin; \n
	Ain += page_block*NB*lda + page_block*NB + page*blk * 2 * lda + page*blk * 2 + blk*lda + gidx % 2 * (blk / 2); \n

	//decide invA22 (B) location for each page
	//each workgroup loads half of B (up or down)
	B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n

	//decide invA12 location for each page;
	//Actually this will be stored in invA21 temporarily
	//each workgroup writes 1/4 of C
	C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n

	//read A and B into LDS no transpose operated here
	//each work item loads a half row of A and half column of B
	//idx 0-23 idy 0-1
	lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n
	lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n
	lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n
	lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n
	lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n
	lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n
	lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n
	lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n
	lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n
	lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n
	lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n
	lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n
	lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n
	lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n
	lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n
	lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n
	lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n
	lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n
	lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n
	lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n
	lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n
	lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n
	lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n
	lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n

	lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n
	lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n
	lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n
	lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n
	lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n
	lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n
	lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n
	lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n
	lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n
	lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n
	lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n
	lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n
	lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n
	lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n
	lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n
	lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n
	lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n
	lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n
	lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n
	lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n
	lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n
	lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n
	lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n
	lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n
	barrier(CLK_LOCAL_MEM_FENCE); \n

	//do math

	uint i = 0; \n

	do{\n
		privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n
		privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n
		privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n
		privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n
		privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n
		privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n
		privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n
		privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n
		privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n
		privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n
		privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n
		privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n
		i = i + 1; \n
	} while (i < 48); \n

	i = 0; \n
	do{\n
		C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n
		i = i + 1; \n
	} while (i < 12); \n
}\n
	__kernel void TRIPLE_DGEMM_UPDATE_192_12_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
{
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 12 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8

	//Work group size is [12]
	//global work size is [96*number of blocks]
	//each work item in each work group is responsible for every element in that row
	//each work group is responsible for one gemm;\


	////////////// A12*invA22
	const uint gidx = get_group_id(0);
	const uint idx = get_local_id(0);

	const uint page = gidx % npages;
	const uint page_block = page / 8;//8 pages per page block
	const uint page_index_in_block = page % 8;


	__global double *B, *C;
	__local double lA[12][12];
	__local double lB[12][12];
	double privateC[12] = { (double)0 };

	//decide A12 location for each page
	Ain = Ain + offAin;
	Ain += (page*blk * 2 + blk) * lda + page * 2 * blk;

	//decide invA22 (B) location for each page
	B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk + blk;

	//decide invA12 location for each page
	C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk;

	//read A and B into LDS no transpose operated here
	lA[idx][0] = Ain[idx];
	lA[idx][1] = Ain[idx + lda];
	lA[idx][2] = Ain[idx + lda * 2];
	lA[idx][3] = Ain[idx + lda * 3];
	lA[idx][4] = Ain[idx + lda * 4];
	lA[idx][5] = Ain[idx + lda * 5];
	lA[idx][6] = Ain[idx + lda * 6];
	lA[idx][7] = Ain[idx + lda * 7];
	lA[idx][8] = Ain[idx + lda * 8];
	lA[idx][9] = Ain[idx + lda * 9];
	lA[idx][10] = Ain[idx + lda * 10];
	lA[idx][11] = Ain[idx + lda * 11];

	lB[idx][0] = B[idx];
	lB[idx][1] = B[idx + NB];
	lB[idx][2] = B[idx + NB * 2];
	lB[idx][3] = B[idx + NB * 3];
	lB[idx][4] = B[idx + NB * 4];
	lB[idx][5] = B[idx + NB * 5];
	lB[idx][6] = B[idx + NB * 6];
	lB[idx][7] = B[idx + NB * 7];
	lB[idx][8] = B[idx + NB * 8];
	lB[idx][9] = B[idx + NB * 9];
	lB[idx][10] = B[idx + NB * 10];
	lB[idx][11] = B[idx + NB * 11];
	barrier(CLK_LOCAL_MEM_FENCE);

	//do math

	uint i = 0;

	do{
		privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);
		privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);
		privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);
		privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);
		privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);
		privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);
		privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);
		privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);
		privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);
		privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);
		privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);
		privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);
		//mem_fence(CLK_LOCAL_MEM_FENCE);
		i = i + 1;
	} while (i < 12);

	i = 0;
	do{
		C[NB*i + idx] = privateC[i];
		i = i + 1;
	} while (i < 12);

	////////////// -invA11*invA12
	barrier(CLK_GLOBAL_MEM_FENCE);
	//A is moving to invA11
	__global double *A;
	A = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2) * NB + (page % 4) * 2 * blk;
	//both B and C are pointing at invA12
	B = C;

	//read A and B into LDS no transpose operated here
	lA[idx][0] = A[idx];
	lA[idx][1] = A[idx + NB];
	lA[idx][2] = A[idx + NB * 2];
	lA[idx][3] = A[idx + NB * 3];
	lA[idx][4] = A[idx + NB * 4];
	lA[idx][5] = A[idx + NB * 5];
	lA[idx][6] = A[idx + NB * 6];
	lA[idx][7] = A[idx + NB * 7];
	lA[idx][8] = A[idx + NB * 8];
	lA[idx][9] = A[idx + NB * 9];
	lA[idx][10] = A[idx + NB * 10];
	lA[idx][11] = A[idx + NB * 11];

	lB[idx][0] = B[idx];
	lB[idx][1] = B[idx + NB];
	lB[idx][2] = B[idx + NB * 2];
	lB[idx][3] = B[idx + NB * 3];
	lB[idx][4] = B[idx + NB * 4];
	lB[idx][5] = B[idx + NB * 5];
	lB[idx][6] = B[idx + NB * 6];
	lB[idx][7] = B[idx + NB * 7];
	lB[idx][8] = B[idx + NB * 8];
	lB[idx][9] = B[idx + NB * 9];
	lB[idx][10] = B[idx + NB * 10];
	lB[idx][11] = B[idx + NB * 11];
	barrier(CLK_LOCAL_MEM_FENCE);

	//do math

	i = 0;
	privateC[0] = 0;
	privateC[1] = 0;
	privateC[2] = 0;
	privateC[3] = 0;
	privateC[4] = 0;
	privateC[5] = 0;
	privateC[6] = 0;
	privateC[7] = 0;
	privateC[8] = 0;
	privateC[9] = 0;
	privateC[10] = 0;
	privateC[11] = 0;
	do{
		privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);
		privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);
		privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);
		privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);
		privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);
		privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);
		privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);
		privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);
		privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);
		privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);
		privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);
		privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);
		//mem_fence(CLK_LOCAL_MEM_FENCE);
		i = i + 1;
	} while (i < 12);

	i = 0;
	do{
		C[NB*i + idx] = -1 * privateC[i];
		i = i + 1;
	} while (i < 12);

}
示例#28
0
void step_bodies(
		struct Body * bodies,
		struct Pair * pairs,
		unsigned int * map,
		float dt,
		unsigned int num_bodies, // in
		float * velocity_ratio, // in/out
		float * mass_center, // in
		float mass, // in
		unsigned int * number_escaped // out
		)
{
	/* work group */
	int local_block = num_bodies / get_num_groups(0);

	unsigned int i_group0 = get_group_id(0) * local_block;
	unsigned int i_group1 = i_group0 + local_block;

	if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies;

	/* work item */
	int block = (i_group1 - i_group0) / get_local_size(0);

	unsigned int i_local0 = i_group0 + get_local_id(0) * block;
	unsigned int i_local1 = i_local0 + block;

	if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1;

	/*
	   printf("local_block = %i\n", local_block);
	   printf("block = %i\n", block);
	   */
	/*
	   printf("i_local0 = %i\n", i_local0);
	   printf("i_local1 = %i\n", i_local1);
	   */
	/* copy data for work group */
	//__local struct Pair local_pairs[NUM_PAIRS];
	//__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS];

	//event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0);
	//wait_group_events(1, &e0);

	//event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0);
	//wait_group_events(1, &e1);

	/* */
	float f[3];

	//__local struct BodyMap * pbm = 0;
	//struct BodyMap * pbm = 0;


	Body * pb = 0;

	for(unsigned int b = i_local0; b < i_local1; b++)
	{
		//pbm = local_bodymaps + b;
		//pbm = bodymaps + b;

		pb = bodies + b;

		if(pb->alive == 0)
		{
			//puts("body dead");
			continue;
		}

		f[0] = 0;
		f[1] = 0;
		f[2] = 0;

		for(unsigned int i = 0; i < num_bodies; i++)
		{
			if(b == i) continue;

			//__local struct Pair * pp = &local_pairs[pbm->pair[p]];
			Pair * pp = pairs + map[b * num_bodies + i];

			if(pp->_M_alive == 0) continue;


			if(pp->b0 == b)
			{
				f[0] -= pp->u[0] * pp->f;
				f[1] -= pp->u[1] * pp->f;
				f[2] -= pp->u[2] * pp->f;
			}
			else if(pp->b1 == b)
			{
				f[0] += pp->u[0] * pp->f;
				f[1] += pp->u[1] * pp->f;
				f[2] += pp->u[2] * pp->f;
			}
			else
			{
				assert(0);
			}
		}

		float dv[3];

		if(0)
		{
			dv[0] = dt * f[0] / pb->mass;
			dv[1] = dt * f[1] / pb->mass;
			dv[2] = dt * f[2] / pb->mass;
		}
		else
		{
			dv[0] = dt * pb->f[0] / pb->mass;
			dv[1] = dt * pb->f[1] / pb->mass;
			dv[2] = dt * pb->f[2] / pb->mass;
		}
		//print(pb->f);

		if(
				(!feq(pb->f[0], f[0])) ||
				(!feq(pb->f[1], f[1])) ||
				(!feq(pb->f[2], f[2]))
		  )
		{
			print(f);
			print(pb->f);
			abort();
		}

		assert(std::isfinite(pb->mass));
		assert(std::isfinite(dt));
		assert(std::isfinite(pb->f[0]));
		assert(std::isfinite(pb->f[1]));
		assert(std::isfinite(pb->f[2]));

		// reset accumulating force
		pb->f[0] = 0;
		pb->f[1] = 0;
		pb->f[2] = 0;


		float e = 0.01;

		float rat[3];
		rat[0] = fabs(dv[0] / pb->v[0]);
		rat[1] = fabs(dv[1] / pb->v[1]);
		rat[2] = fabs(dv[2] / pb->v[2]);

		// atomic
		if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0];
		if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1];
		if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2];

		if(0)
		{
			if(
					((std::isfinite(rat[0])) && (rat[0] > e)) ||
					((std::isfinite(rat[1])) && (rat[1] > e)) ||
					((std::isfinite(rat[2])) && (rat[2] > e))
			  )
			{
				printf("% 12f % 12f % 12f\n",
						rat[0],
						rat[1],
						rat[2]);
			}
		}

		pb->v[0] += dv[0];
		pb->v[1] += dv[1];
		pb->v[2] += dv[2];

		pb->x[0] += dt * pb->v[0];
		pb->x[1] += dt * pb->v[1];
		pb->x[2] += dt * pb->v[2];

		// distance from mass center
		float r[3];
		r[0] = pb->x[0] - mass_center[0];
		r[1] = pb->x[1] - mass_center[1];
		r[2] = pb->x[2] - mass_center[2];

		float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);

		float escape_speed2 = 2.0 * 6.67e-11 * mass / d;

		float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2];

		// dot product of velocity and displacement vector
		float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2];

		if(s2 > (escape_speed2)) // speed exceeds escape speed
		{
			if(dot > 0.0) // parallel componenet points away from mass_center
			{
				// atomic
				(*number_escaped)++;
				//printf("escape!\n");
			}
		}
	}
}
示例#29
0
void fs_private_dev(void){
	// install a new /dev directory
	if (arg_debug)
		printf("Mounting tmpfs on /dev\n");

	// create DRI_DIR
	// keep a copy of dev directory
	mkdir_attr(RUN_DEV_DIR, 0755, 0, 0);
	if (mount("/dev", RUN_DEV_DIR, NULL, MS_BIND|MS_REC, NULL) < 0)
		errExit("mounting /dev");

	// create DEVLOG_FILE
	int have_devlog = 0;
	struct stat s;
	if (stat("/dev/log", &s) == 0) {
		have_devlog = 1;
		FILE *fp = fopen(RUN_DEVLOG_FILE, "w");
		if (!fp)
			have_devlog = 0;
		else {
			fprintf(fp, "\n");
			fclose(fp);
			if (mount("/dev/log", RUN_DEVLOG_FILE, NULL, MS_BIND|MS_REC, NULL) < 0)
				errExit("mounting /dev/log");
		}
	}

	// mount tmpfs on top of /dev
	if (mount("tmpfs", "/dev", "tmpfs", MS_NOSUID | MS_STRICTATIME | MS_REC,  "mode=755,gid=0") < 0)
		errExit("mounting /dev");
	fs_logger("tmpfs /dev");

	// optional devices: sound, video cards etc...
	deventry_mount();

	// bring back /dev/log
	if (have_devlog) {
		FILE *fp = fopen("/dev/log", "w");
		if (fp) {
			fprintf(fp, "\n");
			fclose(fp);
			if (mount(RUN_DEVLOG_FILE, "/dev/log", NULL, MS_BIND|MS_REC, NULL) < 0)
				errExit("mounting /dev/log");
			fs_logger("clone /dev/log");
		}
	}

	// bring forward the current /dev/shm directory if necessary
	if (arg_debug)
		printf("Process /dev/shm directory\n");
	process_dev_shm();

	if (mount(RUN_RO_DIR, RUN_DEV_DIR, "none", MS_BIND, "mode=400,gid=0") < 0)
		errExit("disable run dev directory");

	// create default devices
	create_char_dev("/dev/zero", 0666, 1, 5); // mknod -m 666 /dev/zero c 1 5
	fs_logger("mknod /dev/zero");
	create_char_dev("/dev/null", 0666, 1, 3); // mknod -m 666 /dev/null c 1 3
	fs_logger("mknod /dev/null");
	create_char_dev("/dev/full", 0666, 1, 7); // mknod -m 666 /dev/full c 1 7
	fs_logger("mknod /dev/full");
	create_char_dev("/dev/random", 0666, 1, 8); // Mknod -m 666 /dev/random c 1 8
	fs_logger("mknod /dev/random");
	create_char_dev("/dev/urandom", 0666, 1, 9); // mknod -m 666 /dev/urandom c 1 9
	fs_logger("mknod /dev/urandom");
	create_char_dev("/dev/tty", 0666,  5, 0); // mknod -m 666 /dev/tty c 5 0
	fs_logger("mknod /dev/tty");
#if 0
	create_dev("/dev/tty0", "mknod -m 666 /dev/tty0 c 4 0");
	create_dev("/dev/console", "mknod -m 622 /dev/console c 5 1");
#endif

	// pseudo-terminal
	mkdir_attr("/dev/pts", 0755, 0, 0);
	fs_logger("mkdir /dev/pts");
	fs_logger("create /dev/pts");
	create_char_dev("/dev/pts/ptmx", 0666, 5, 2); //"mknod -m 666 /dev/pts/ptmx c 5 2");
	fs_logger("mknod /dev/pts/ptmx");
	create_link("/dev/pts/ptmx", "/dev/ptmx");

// code before github issue #351
	// mount -vt devpts -o newinstance -o ptmxmode=0666 devpts //dev/pts
//	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,  "newinstance,ptmxmode=0666") < 0)
//		errExit("mounting /dev/pts");


	// mount /dev/pts
	gid_t ttygid = get_group_id("tty");
	char *data;
	if (asprintf(&data, "newinstance,gid=%d,mode=620,ptmxmode=0666", (int) ttygid) == -1)
		errExit("asprintf");
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,  data) < 0)
		errExit("mounting /dev/pts");
	free(data);
	fs_logger("clone /dev/pts");

	// stdin, stdout, stderr
#if 0
	create_link("/proc/self/fd", "/dev/fd");
	create_link("/proc/self/fd/0", "/dev/stdin");
	create_link("/proc/self/fd/1", "/dev/stdout");
	create_link("/proc/self/fd/2", "/dev/stderr");
#endif

	// symlinks for DVD/CD players
	if (stat("/dev/sr0", &s) == 0) {
		create_link("/dev/sr0", "/dev/cdrom");
		create_link("/dev/sr0", "/dev/cdrw");
		create_link("/dev/sr0", "/dev/dvd");
		create_link("/dev/sr0", "/dev/dvdrw");
	}
}