Example #1
//******************************************************************************
// Private Interface
//******************************************************************************
MEMORY_POOL::SEGMENT* MEMORY_POOL::GetNewSegment()
{
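    // Allocate a new segment and align its usable memory to the block size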
    SEGMENT* NewSegment = (SEGMENT*)malloc( AllocationSize );
    NewSegment->Memory = (void*)alignup( (u8*)(NewSegment) + sizeof(SEGMENT), BlockSize );
    NewSegment->FreeBlock = (BLOCK*)NewSegment->Memory;
    NewSegment->NextFreeSegment = null;
    NewSegment->Left = null;
    NewSegment->Right = null;
    NewSegment->Key = (u64)(NewSegment->Memory);
    SegmentTree.Insert(NewSegment);

    BLOCK* Block = null;
    BLOCK* NextBlock = null;
    int NumBlocks = ( AllocationSize - alignup( sizeof(SEGMENT), BlockSize ) ) / BlockSize;
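    // Link every block of the new segment into a singly linked free list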
    for( int i = 0; i < NumBlocks - 1; i++ ) {
        Block     = (BLOCK*)( (u8*)(NewSegment->Memory) + BlockSize*( i ) );
        NextBlock = (BLOCK*)( (u8*)(NewSegment->Memory) + BlockSize*( i + 1 ) );
        Block->Next = NextBlock;
    }
    Block = (BLOCK*)( (u8*)(NewSegment->Memory) + BlockSize*( NumBlocks - 1 ) );
    Block->Next = null;
    
    AddToFreeList(NewSegment);

    NumSegments++;

    return NewSegment;
}
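Every example on this page relies on an alignup() helper that rounds a size or address up to the next multiple of an alignment; its actual definition belongs to each project and is not shown here. As a point of reference only, a minimal sketch (hypothetical names, assuming a positive alignment) might look like this:

#include <stddef.h>
#include <stdint.h>

/* Round a value up to the next multiple of alignment (alignment > 0). */
static inline size_t alignup_size(size_t value, size_t alignment)
{
    return ((value + alignment - 1) / alignment) * alignment;
}

/* The same idea applied to an address, as the pool code above does. */
static inline void *alignup_addr(const void *p, uintptr_t alignment)
{
    uintptr_t v = (uintptr_t)p;
    return (void *)(((v + alignment - 1) / alignment) * alignment);
}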
Example #2
inline kernel_call transpose_kernel(const cl::CommandQueue &queue, size_t width, size_t height, const cl::Buffer &in, const cl::Buffer &out) {
    std::ostringstream o;
    const auto dev = qdev(queue);
    kernel_common<T>(o, dev);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = 128;
    {
        const auto local_size = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        const auto workgroup = dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o << "__kernel void transpose("
      << "__global const real2_t *input, __global real2_t *output, uint width, uint height) {\n"
      << "  const size_t "
      << "    global_x = get_global_id(0), global_y = get_global_id(1),\n"
      << "    local_x = get_local_id(0), local_y = get_local_id(1),\n"
      << "    group_x = get_group_id(0), group_y = get_group_id(1),\n"
      << "    block_size = " << block_size << ",\n"
      << "    target_x = local_y + group_y * block_size,\n"
      << "    target_y = local_x + group_x * block_size;\n"
      << "  const bool range = global_x < width && global_y < height;\n"
        // local memory
      << "  __local real2_t block[" << (block_size * block_size) << "];\n"
        // copy from input to local memory
      << "  if(range)\n"
      << "    block[local_x + local_y * block_size] = input[global_x + global_y * width];\n"
        // wait until the whole block is filled
      << "  barrier(CLK_LOCAL_MEM_FENCE);\n"
        // transpose local block to target
      << "  if(range)\n"
      << "    output[target_x + target_y * height] = block[local_x + local_y * block_size];\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "transpose");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(width));
    kernel.setArg(3, static_cast<cl_uint>(height));

    // Round the NDRange up to a multiple of the work-group size; the last block may not be completely filled.
    size_t r_w = alignup(width, block_size);
    size_t r_h = alignup(height, block_size);

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(r_w, r_h),
        cl::NDRange(block_size, block_size));
}
Example #3
// TODO: check
void* rpallocEx(rpheap_t _heap, size_t _cb, size_t _align) {

    if (_heap == NULL)
        _heap = default_heap;

    if (_cb == 0)
        _cb = DEFAULT_ALIGN;
        
    RP_HEAP_HEADER* rhhHeap = _heap;

    // initial check (still not guaranteed that there is room for the allocation)
    if (rhhHeap->cbFree < _cb)
        return errno = ENOMEM, NULL;

    const void* const pHeapEnd = (char*)rhhHeap->pBase + rhhHeap->cbHeap;

    // local loop vars
    register RP_BLOCK_HEADER* rbhCurBlock;
    const size_t cbPotentialBlock = alignup(_cb, _align);
    
    // loop control vars
    register void* pCurBlock = rhhHeap->pBase;

    for (; pCurBlock < pHeapEnd; pCurBlock = rbhCurBlock->pNext) {
        rbhCurBlock = pCurBlock;
    
        register char* pPotentialBlock = (char*)((uintptr_t)rbhCurBlock + rbhCurBlock->cbBlock);

        // check if we can align up
        pPotentialBlock = alignup(pPotentialBlock, _align);
        // if there isn't room
        register void* pVirtualMax = rbhCurBlock->pNext == NULL ? pHeapEnd : rbhCurBlock->pNext;
        if (pPotentialBlock > pVirtualMax 
         || pPotentialBlock + cbPotentialBlock > pVirtualMax)
            continue;

        pCurBlock = pPotentialBlock;
        break;
    };

    if (pCurBlock >= pHeapEnd)
        return errno = ENOMEM, NULL;

    // put new header at location
    register RP_BLOCK_HEADER* rbhNewBlock = pCurBlock;
    rbhNewBlock->cbBlock = cbPotentialBlock;
    rbhNewBlock->pNext = rbhCurBlock->pNext;

    // update old references
    rbhCurBlock->pNext = rbhNewBlock;

    // hand the newly placed block back to the caller
    return rbhNewBlock;
}
Example #4
inline kernel_call bluestein_mul(const cl::CommandQueue &queue, size_t n, size_t batch, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, uint stride) {\n"
      << "  const size_t x = get_global_id(0), y = get_global_id(1);\n"
      << "  if(x < stride) {\n"
      << "    const size_t off = x + stride * y;"
      << "    output[off] = mul(data[off], exp[x]);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(n));

    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t threads = alignup(n, wg);

    std::ostringstream desc;
    desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
Example #5
//******************************************************************************
MEMORY_POOL::MEMORY_POOL(u32 _BlockSize, u32 _AllocationSize)
{
    BlockSize      = max(sizeof(BLOCK), _BlockSize);
    AllocationSize = _AllocationSize;
    FreeSegments   = null;
    NumBlocks      = 0;
    NumSegments    = 0;
    
    // Even this is far too generous of an assertion
    assert( BlockSize < AllocationSize / 2 );
    
    // Round the clamped block size up to a multiple of the block header size
    BlockSize = alignup( BlockSize, sizeof(BLOCK) );
    DataSize  = AllocationSize - alignup( sizeof(SEGMENT), BlockSize );

    GetNewSegment();
}
Example #6
//*****************************************************************************
void Memcpy(void* dest, const void* src, size_t size)
{
    char* Dest;
    char* Src;
    char* DestEnd;
    u64* Dest2;
    u64* Src2;
    u64* DestEnd2;

    // Copy single bytes until the destination is 8-byte aligned
    // (but never past the end of the buffer, in case size is very small)
    Dest = (char*)dest;
    Src = (char*)src;
    DestEnd = (char*)alignup(dest, 8);
    if (DestEnd > (char*)dest + size) {
        DestEnd = (char*)dest + size;
    }
    while(Dest < DestEnd) {
        *Dest++ = *Src++;
    }

    // Do large copies (8 bytes at a time)
    Dest2 = (u64*)Dest;
    Src2 = (u64*)Src;
    DestEnd2 = (u64*)aligndown((char*)dest + size, 8);
    while(Dest2 < DestEnd2) {
        *Dest2++ = *Src2++;
    }

    // Finish up the tail (any remaining bytes that are not 8-byte aligned)
    Dest = (char*)Dest2;
    Src = (char*)Src2;
    DestEnd = (char*)dest + size;
    while(Dest < DestEnd) {
        *Dest++ = *Src++;
    }
}
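Memcpy() above also depends on an aligndown() counterpart to find the last 8-byte boundary inside the destination range; again the real definition is project-specific. A minimal sketch under the same assumptions (hypothetical name, alignment greater than zero) would be:

#include <stdint.h>

/* Round an address down to the previous multiple of alignment (alignment > 0). */
static inline void *aligndown_addr(const void *p, uintptr_t alignment)
{
    return (void *)(((uintptr_t)p / alignment) * alignment);
}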
Example #7
inline kernel_call radix_kernel(bool once, const cl::CommandQueue &queue, size_t n, size_t batch, bool invert, pow radix, size_t p, const cl::Buffer &in, const cl::Buffer &out) {
    std::ostringstream o;
    o << std::setprecision(25);
    const auto device = qdev(queue);
    kernel_common<T>(o, device);
    mul_code(o, invert);
    twiddle_code<T>(o);

    const size_t m = n / radix.value;
    kernel_radix<T>(o, radix, invert);

    auto program = build_sources(qctx(queue), o.str(), "-cl-mad-enable -cl-fast-relaxed-math");
    cl::Kernel kernel(program, "radix");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(p));
    kernel.setArg(3, static_cast<cl_uint>(m));

    const size_t wg_mul = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);
    //const size_t max_cu = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
    //const size_t max_wg = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    size_t wg = wg_mul;
    //while(wg * max_cu < max_wg) wg += wg_mul;
    //wg -= wg_mul;
    const size_t threads = alignup(m, wg);

    std::ostringstream desc;
    desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch << ", threads=" << m << "(" << threads << "), wg=" << wg << "}";

    return kernel_call(once, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
Example #8
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
Example #9
inline kernel_call bluestein_mul_in(const cl::CommandQueue &queue, bool inverse, size_t batch, size_t radix, size_t p, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);
    twiddle_code<T>(o);

    o << "__kernel void bluestein_mul_in("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "uint radix, uint p, uint out_stride) {\n"
      << "  const size_t\n"
      << "    thread = get_global_id(0), threads = get_global_size(0),\n"
      << "    batch = get_global_id(1),\n"
      << "    element = get_global_id(2);\n"
      << "  if(element < out_stride) {\n"
      << "    const size_t\n"
      << "      in_off = thread + batch * radix * threads + element * threads,\n"
      << "      out_off = thread * out_stride + batch * out_stride * threads + element;\n"
      << "    if(element < radix) {\n"
      << "      real2_t w = exp[element];"
      << "      if(p != 1) {\n"
      << "        const int sign = " << (inverse ? "+1" : "-1") << ";\n"
      << "        ulong a = (ulong)element * (thread % p);\n"
      << "        ulong b = (ulong)radix * p;\n"
      << "        real2_t t = twiddle(2 * sign * M_PI * (a % (2 * b)) / b);\n"
      << "        w = mul(w, t);\n"
      << "      }\n"
      << "      output[out_off] = mul(data[in_off], w);\n"
      << "    } else\n"
      << "      output[out_off] = (real2_t)(0,0);"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_in");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(radix));
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));

    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t stride_pad = alignup(stride, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, stride_pad), cl::NDRange(1, 1, wg));
}
Example #10
std::vector<size_t> partitioning_scheme<dummy>::get(size_t n,
	const std::vector<cl::CommandQueue> &queue)
{
    if (!is_set) {
	weight = device_vector_perf;
	is_set = true;
    }

    std::vector<size_t> part;
    part.reserve(queue.size() + 1);
    part.push_back(0);

    if (queue.size() > 1) {
	std::vector<double> cumsum;
	cumsum.reserve(queue.size() + 1);
	cumsum.push_back(0);

	for(auto q = queue.begin(); q != queue.end(); q++) {
	    cl::Context context = q->getInfo<CL_QUEUE_CONTEXT>();
	    cl::Device  device  = q->getInfo<CL_QUEUE_DEVICE>();

	    auto dw = device_weight.find(device());

	    double w = (dw == device_weight.end()) ?
		(device_weight[device()] = weight(context, device)) :
		dw->second;

	    cumsum.push_back(cumsum.back() + w);
	}

	for(uint d = 1; d < queue.size(); d++)
	    part.push_back(
		    std::min(n,
			alignup(static_cast<size_t>(n * cumsum[d] / cumsum.back()))
			)
		    );
    }

    part.push_back(n);
    return part;
}
Example #11
inline kernel_call bluestein_mul_out(const cl::CommandQueue &queue, size_t batch, size_t p, size_t radix, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul_out("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "real_t div, uint p, uint in_stride, uint radix) {\n"
      << "  const size_t\n"
      << "    i = get_global_id(0), threads = get_global_size(0),\n"
      << "    b = get_global_id(1),\n"
      << "    l = get_global_id(2);\n"
      << "  if(l < radix) {\n"
      << "    const size_t\n"
      << "      k = i % p,\n"
      << "      j = k + (i - k) * radix,\n"
      << "      in_off = i * in_stride + b * in_stride * threads + l,\n"
      << "      out_off = j + b * threads * radix + l * p;\n"
      << "    output[out_off] = mul(data[in_off] * div, exp[l]);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_out");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg<T>(3, static_cast<T>(1) / stride);
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));
    kernel.setArg(6, static_cast<cl_uint>(radix));

    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t radix_pad = alignup(radix, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, radix_pad), cl::NDRange(1, 1, wg));
}
Example #12
/*
 *    _ malloc() point
 *   /
 *  /          _ aligned point
 * /          /
 * +----+-----+---------------------------------------
 * |    |     |
 * +----+--|--+---------------------------------------
 * ^       |
 * |       | embedded address of malloc().
 * +-------+
 * 
 */
void *
ri_mem_alloc_aligned(size_t sz, uint32_t align)
{
    uint64_t  size;
    void     *p;
    void     *aligned;
    uint64_t  diff;

    assert(align > 0);
    assert(align % 16 == 0);

    size = alignup((uint64_t)sz, align);

    size += 8;   /* room to embed the malloc() address */

    p       = malloc(size);
    aligned = alignptr(p, align);
    if (aligned == p) {
        aligned = (char *)aligned + align;  /* leave room for the embedded address */
    }

    diff = (uint64_t)((char *)aligned - (char *)p);
    /*
    printf("malloc  point : %p\n", p);
    printf("aligned point : %p\n", aligned);
    printf("diff          : %llu\n", diff);
    */


    /*
     * Embed malloc() address
     */
    *((uint64_t *)aligned - 1) = (uintptr_t)p;

    assert(aligned != NULL);

    return aligned;
}
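Because ri_mem_alloc_aligned() stores the original malloc() pointer in the eight bytes just below the address it returns, the matching release routine only has to read that slot back. A hypothetical companion (name assumed; it is not part of the example above) could be:

#include <stdint.h>
#include <stdlib.h>

void
ri_mem_free_aligned(void *aligned)
{
    /* Recover the malloc() pointer embedded just below the aligned block. */
    void *p = (void *)(uintptr_t)(*((uint64_t *)aligned - 1));

    free(p);
}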
Example #13
std::vector<size_t> partitioning_scheme<dummy>::get(size_t n,
        const std::vector<backend::command_queue> &queue)
{
    static const bool once = init_weight_function();
    (void)once; // do not warn about unused variable

    std::vector<size_t> part;
    part.reserve(queue.size() + 1);
    part.push_back(0);

    if (queue.size() > 1) {
        std::vector<double> cumsum;
        cumsum.reserve(queue.size() + 1);
        cumsum.push_back(0);

        for(auto q = queue.begin(); q != queue.end(); q++) {
            auto dev_id = backend::get_device_id(*q);
            auto dw = device_weight.find(dev_id);

            double w = (dw == device_weight.end()) ?
                (device_weight[dev_id] = weight(*q)) :
                dw->second;

            cumsum.push_back(cumsum.back() + w);
        }

        for(unsigned d = 1; d < queue.size(); d++)
            part.push_back(
                    std::min(n,
                        alignup(static_cast<size_t>(n * cumsum[d] / cumsum.back()))
                        )
                    );
    }

    part.push_back(n);
    return part;
}
Example #14
int main(int argc, char **argv) {
  int help=0;
  int arg=1;

  gasnet_handlerentry_t htable[] = { 
    { hidx_ping_shorthandler,  ping_shorthandler  },
    { hidx_pong_shorthandler,  pong_shorthandler  },
    { hidx_ping_medhandler,    ping_medhandler    },
    { hidx_pong_medhandler,    pong_medhandler    },
    { hidx_ping_longhandler,   ping_longhandler   },
    { hidx_pong_longhandler,   pong_longhandler   },

    { hidx_ping_shorthandler_flood,  ping_shorthandler_flood  },
    { hidx_pong_shorthandler_flood,  pong_shorthandler_flood  },
    { hidx_ping_medhandler_flood,    ping_medhandler_flood    },
    { hidx_pong_medhandler_flood,    pong_medhandler_flood    },
    { hidx_ping_longhandler_flood,   ping_longhandler_flood   },
    { hidx_pong_longhandler_flood,   pong_longhandler_flood   },

    { hidx_done_shorthandler,  done_shorthandler  }
  };

  GASNET_Safe(gasnet_init(&argc, &argv));

  mynode = gasnet_mynode();
  numnode = gasnet_nodes();

  arg = 1;
  while (argc > arg) {
    if (!strcmp(argv[arg], "-p")) {
#if GASNET_PAR
      pollers = test_thread_limit(atoi(argv[arg+1])+1)-1;
      arg += 2;
#else
      if (0 == mynode) {
        fprintf(stderr, "testam %s\n", GASNET_CONFIG_STRING);
        fprintf(stderr, "ERROR: The -p option is only available in the PAR configuration.\n");
        fflush(NULL);
      }
      sleep(1);
      gasnet_exit(1);
#endif
    } else if (!strcmp(argv[arg], "-in")) {
      insegment = 1;
      ++arg;
    } else if (!strcmp(argv[arg], "-out")) {
      insegment = 0;
      ++arg;
    } else if (!strcmp(argv[arg], "-c")) {
      crossmachinemode = 1;
      ++arg;
    } else if (!strcmp(argv[arg], "-src-noop")) {
      src_mode = SRC_NOOP;
      ++arg;
    } else if (!strcmp(argv[arg], "-src-generate")) {
      src_mode = SRC_GENERATE;
      ++arg;
    } else if (!strcmp(argv[arg], "-src-memcpy")) {
      src_mode = SRC_MEMCPY;
      ++arg;
    } else if (argv[arg][0] == '-') {
      help = 1;
      ++arg;
    } else break;
  }

  if (argc > arg) { iters = atoi(argv[arg]); ++arg; }
  if (!iters) iters = 1000;
  if (argc > arg) { maxsz = atoi(argv[arg]); ++arg; }
  if (!maxsz) maxsz = 2*1024*1024;
  if (argc > arg) { TEST_SECTION_PARSE(argv[arg]); ++arg; }

  GASNET_Safe(gasnet_attach(htable, sizeof(htable)/sizeof(gasnet_handlerentry_t),
                            TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET));
#if GASNET_PAR
  #define PAR_USAGE \
               "  The -p option gives the number of polling threads, specified as\n" \
               "    a non-negative integer argument (default is no polling threads).\n"
#else
  #define PAR_USAGE ""
#endif
  test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n"
               "  The '-in' or '-out' option selects whether the requestor's\n"
               "    buffer is in the GASNet segment or not (default is 'in').\n"
               PAR_USAGE
               "  The '-src-*' options select treatment of the payload buffer used for\n"
               "    Medium and Long AMs, as follows:\n"
               "      -src-noop:      no per-operation initialization (default)\n"
               "      -src-generate:  initialized (w/o memory reads) on each AM injection\n"
               "      -src-memcpy:    initialized using memcpy() on each AM injection\n"
               "  The -c option enables cross-machine pairing (default is nearest neighbor).\n");
  if (help || argc > arg) test_usage();

  TEST_PRINT_CONDUITINFO();

  if (insegment) {
    myseg = TEST_MYSEG();
  } else {
    char *space = test_malloc(alignup(maxsz,PAGESZ) + PAGESZ);
    myseg = alignup_ptr(space, PAGESZ);
  }

  maxmed = MIN(maxsz, gasnet_AMMaxMedium());
  maxlongreq = MIN(maxsz, gasnet_AMMaxLongRequest());
  maxlongrep = MIN(maxsz, gasnet_AMMaxLongReply());

  if (src_mode == SRC_MEMCPY) {
    zero_buffer = test_calloc(maxsz, 1);
  }

  if (crossmachinemode) {
    if ((numnode%2) && (mynode == numnode-1)) {
      sender = 1;
      peer = mynode;
    } else {
      gasnet_node_t half = numnode / 2;
      sender = (mynode < half);
      peer = sender ? (mynode + half) : (mynode - half);
    }
  } else {
    peer = mynode ^ 1;
    sender = mynode % 2 == 0;
    if (peer == numnode) {
      peer = mynode;
    }
  }

  recvr = !sender || (peer == mynode);

  // Long Request and Reply (distinct for loopback)
  reply_addr = TEST_SEG(peer);
  request_addr = (peer == mynode) ? (void*)((uintptr_t)reply_addr + alignup(maxsz,SIZEOF_GASNET_REGISTER_VALUE_T))
                                  : reply_addr;

  BARRIER();

#if GASNET_PAR
  #define PAR_FMT "  %i extra recvr polling threads\n"
  #define PAR_ARG ,pollers
#else
  #define PAR_FMT /*empty*/
  #define PAR_ARG /*empty*/
#endif
  if (mynode == 0) {
      printf("Running %i iterations of %s AM performance with:\n"
             "  local addresses %sside the segment%s\n"
             "  %s\n"
             PAR_FMT
             "  ...\n",
             iters,
             (crossmachinemode ? "cross-machine ": ""),
             (insegment ? "in" : "out"),
             (insegment ? " (default)" : ""),
             ((src_mode == SRC_NOOP)     ? "no payload initialization (default)"
             :(src_mode == SRC_GENERATE) ? "payload initialized by computation"
                                         : "payload initialized using memcpy()")
             PAR_ARG
            );
      printf("   Msg Sz  Description                             Total time   Avg. time   Bandwidth\n"
             "   ------  -----------                             ----------   ---------   ---------\n");
      fflush(stdout);
  }
#if GASNET_PAR
  TEST_SET_WAITMODE(pollers+1);
  if (pollers)
    test_createandjoin_pthreads(pollers+1,doAll,NULL,0);
  else
#endif
    doAll(NULL);

  MSG("done.");

  gasnet_exit(0);
  return 0;
}
Example #15
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "Sorting is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = detail::offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;");
    detail::scan(queue[0], offset, offset, 0, false, plus);

    /***** Kernel 1 *****/
    auto krn1 = is_cpu(queue[0]) ?
        detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn1.push_arg(count);
    krn1.push_arg(offset);
    krn1.push_arg(ivals(0));
    krn1.push_arg(offset_val);
    krn1.push_arg(key_sum);
    krn1.push_arg(pre_sum);

    krn1.config(num_blocks, NT);
    krn1(queue[0]);

    /***** Kernel 2 *****/
    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));

    auto krn2 = is_cpu(queue[0]) ?
        detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn2.push_arg(num_blocks);
    krn2.push_arg(key_sum);
    krn2.push_arg(pre_sum);
    krn2.push_arg(post_sum);
    krn2.push_arg(work_per_thread);

    krn2.config(1, NT);
    krn2(queue[0]);

    /***** Kernel 3 *****/
    auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]);

    krn3.push_arg(count);
    krn3.push_arg(key_sum);
    krn3.push_arg(post_sum);
    krn3.push_arg(offset);
    krn3.push_arg(offset_val);

    krn3.config(num_blocks, NT);
    krn3(queue[0]);

    /***** resize okeys and ovals *****/
    int out_elements;
    offset.read(queue[0], count - 1, 1, &out_elements, true);
    ++out_elements;

    boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements));
    ovals.resize(ivals.queue_list(), out_elements);

    /***** Kernel 4 *****/
    auto krn4 = detail::key_value_mapping<K, V>(queue[0]);

    krn4.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn4));
    boost::fusion::for_each(okeys, do_push_arg(krn4));
    krn4.push_arg(ovals(0));
    krn4.push_arg(offset);
    krn4.push_arg(offset_val);

    krn4(queue[0]);

    return out_elements;
}
Example #16
        ell(
                const std::vector<backend::command_queue> &q,
                size_t nrows, size_t ncols,
                const PtrRange &ptr,
                const ColRange &col,
                const ValRange &val,
                bool fast = true
           ) :
            q(q[0]), n(nrows), m(ncols), nnz(boost::size(val)),
            ell_pitch(alignup(nrows, 16U)), csr_nnz(0)
        {
            precondition(q.size() == 1,
                    "sparse::ell is only supported for single-device contexts");

            if (fast) {
                convert(ptr, col, val);
                return;
            }

            /* 1. Get optimal ELL widths for local and remote parts. */
            // Speed of ELL relative to CSR:
            const double ell_vs_csr = 3.0;

            // Find maximum widths for local and remote parts:
            int max_width = 0;
            for(size_t i = 0; i < n; ++i)
                max_width = std::max(max_width, static_cast<int>(ptr[i+1] - ptr[i]));

            // Build width distribution histogram.
            std::vector<size_t> hist(max_width + 1, 0);
            for(size_t i = 0; i < n; ++i)
                ++hist[ptr[i+1] - ptr[i]];

            // Estimate optimal width for ELL part of the matrix.
            ell_width = max_width;
            {
                size_t rows = n;
                for(int i = 0; i < max_width; ++i) {
                    rows -= hist[i]; // Number of rows wider than i.
                    if (ell_vs_csr * rows < n) {
                        ell_width = i;
                        break;
                    }
                }
            }

            if (ell_width == 0) {
                csr_nnz = nnz;

                csr_ptr = backend::device_vector<Col>(q[0], n + 1,   &ptr[0]);
                csr_col = backend::device_vector<Col>(q[0], csr_nnz, &col[0]);
                csr_val = backend::device_vector<Val>(q[0], csr_nnz, &val[0]);

                return;
            }

            // Count nonzeros in CSR part of the matrix.
            for(int i = ell_width + 1; i <= max_width; ++i)
                csr_nnz += hist[i] * (i - ell_width);

            /* 3. Split the input matrix into ELL and CSR submatrices. */
            std::vector<Col> _ell_col(ell_pitch * ell_width, static_cast<Col>(-1));
            std::vector<Val> _ell_val(ell_pitch * ell_width);
            std::vector<Ptr> _csr_ptr;
            std::vector<Col> _csr_col;
            std::vector<Val> _csr_val;

            if (csr_nnz) {
                _csr_ptr.resize(n + 1);
                _csr_col.resize(csr_nnz);
                _csr_val.resize(csr_nnz);

                _csr_ptr[0] = 0;
                for(size_t i = 0; i < n; ++i) {
                    Ptr w = ptr[i+1] - ptr[i];
                    _csr_ptr[i+1] = _csr_ptr[i] + static_cast<Ptr>(w > ell_width ? w - ell_width : 0);
                }
            }


            for(size_t i = 0; i < n; ++i) {
                int w = 0;
                Ptr csr_head = csr_nnz ? _csr_ptr[i] : 0;
                for(Ptr j = ptr[i], e = ptr[i+1]; j < e; ++j, ++w) {
                    Col c = col[j];
                    Val v = val[j];

                    if (w < ell_width) {
                        _ell_col[i + w * ell_pitch] = c;
                        _ell_val[i + w * ell_pitch] = v;
                    } else {
                        _csr_col[csr_head] = c;
                        _csr_val[csr_head] = v;
                        ++csr_head;
                    }
                }
            }

            ell_col = backend::device_vector<Col>(q[0], ell_pitch * ell_width, _ell_col.data());
            ell_val = backend::device_vector<Val>(q[0], ell_pitch * ell_width, _ell_val.data());

            if (csr_nnz) {
                csr_ptr = backend::device_vector<Col>(q[0], n + 1,   _csr_ptr.data());
                csr_col = backend::device_vector<Col>(q[0], csr_nnz, _csr_col.data());
                csr_val = backend::device_vector<Val>(q[0], csr_nnz, _csr_val.data());
            }
        }
Example #17
int main(int argc, char **argv) {
    /* call startup */
    GASNET_Safe(gasnet_init(&argc, &argv));

    /* parse arguments */
    arg = 1;
    while (argc > arg) {
      if (!strcmp(argv[arg], "-in")) {
        insegment = 1;
        ++arg;
      } else if (!strcmp(argv[arg], "-out")) {
        insegment = 0;
        ++arg;
      } else if (!strcmp(argv[arg], "-f")) {
        firstlastmode = 1;
        ++arg;
      } else if (!strcmp(argv[arg], "-a")) {
        fullduplexmode = 1;
        ++arg;
      } else if (!strcmp(argv[arg], "-p")) {
        do_puts = 1; numflavors++;
        ++arg;
      } else if (!strcmp(argv[arg], "-g")) {
        do_gets = 1; numflavors++;
        ++arg;
      } else if (!strcmp(argv[arg], "-s")) {
        do_amshort = 1; numflavors++;
        ++arg;
      } else if (!strcmp(argv[arg], "-m")) {
        do_ammedium = 1; numflavors++;
        ++arg;
      } else if (!strcmp(argv[arg], "-l")) {
        do_amlong = 1; numflavors++;
        ++arg;
      } else if (!strcmp(argv[arg], "-b")) {
        do_bulk = 1;
        ++arg;
      } else if (!strcmp(argv[arg], "-n")) {
        do_nonbulk = 1; 
        ++arg;
      } else if (!strcmp(argv[arg], "-v")) {
        do_value = 1; 
        ++arg;
      } else if (!strcmp(argv[arg], "-i")) {
        do_implicit = 1; numsync++;
        ++arg;
      } else if (!strcmp(argv[arg], "-e")) {
        do_explicit = 1; numsync++;
        ++arg;
      } else if (!strcmp(argv[arg], "-k")) {
        do_blocking = 1; numsync++;
        ++arg;
      } else if (argv[arg][0] == '-') {
        help = 1;
        ++arg;
      } else break;
    }
    if (fullduplexmode && firstlastmode) help = 1;
    if (argc > arg+3) help = 1;

    if (argc > arg) { iters = atoi(argv[arg]); arg++; }
    if (!iters) iters = 10;
    if (argc > arg) { maxdepth = atoi(argv[arg]); arg++; }
    if (!maxdepth) maxdepth = 1024; /* 1024 default */
    if (argc > arg) { maxsz = atoi(argv[arg]); arg++; }
    if (!maxsz) maxsz = 2*1024*1024; /* 2 MB default */

    #ifdef GASNET_SEGMENT_EVERYTHING
      if (maxsz > TEST_SEGSZ) { MSG("maxsz must be <= %lu on GASNET_SEGMENT_EVERYTHING",(unsigned long)TEST_SEGSZ); gasnet_exit(1); }
    #endif
    GASNET_Safe(gasnet_attach(htable, sizeof(htable)/sizeof(gasnet_handlerentry_t), 
                              TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET));
    test_init("testqueue",1,"[-in|-out|-a|-f] (iters) (maxdepth) (maxsz)\n"
               "  The 'in' or 'out' option selects whether the initiator-side\n"
               "  memory is in the GASNet segment or not (default is not).\n"
               "  The -a option enables full-duplex mode, where all nodes send.\n"
               "  The -f option enables 'first/last' mode, where the first node\n"
               "  sends to the last, while all other nodes sit idle.\n"
               "  Test types to run: (defaults to everything)\n"
               "   -p : puts\n"
               "   -g : gets\n"
               "   -s : AMShort\n"
               "   -m : AMMedium\n"
               "   -l : AMLong\n"
               "   -n : Test non-bulk put/gets\n"
               "   -b : Test bulk put/gets\n"
               "   -v : Test value-based put/gets\n"
               "   -i : Test implicit-handle put/gets\n"
               "   -e : Test explicit-handle put/gets\n"
               "   -k : Test blocking put/gets\n");
    if (help) test_usage();

    min_payload = 1;
    max_payload = maxsz;

    if (numflavors == 0) { /* default to all */
      do_puts = 1; 
      do_gets = 1; 
      do_amshort = 1;
      do_ammedium = 1;
      do_amlong = 1;
    }
    if (numsync == 0) { /* default to all */
      do_implicit = 1;
      do_explicit = 1;
      do_blocking = 1;
    }
    if (!do_bulk && !do_nonbulk && !do_value) {
      do_bulk = 1;
      do_nonbulk = 1;
      do_value = 1;
    }

    if (!do_implicit && !do_explicit && !do_blocking) {
      do_implicit = 1;
      do_explicit = 1;
      do_blocking = 1;
    }

    if (max_payload < min_payload) {
      printf("ERROR: maxsz must be >= %i\n",min_payload);
      gasnet_exit(1);
    }

    /* get SPMD info */
    myproc = gasnet_mynode();
    numprocs = gasnet_nodes();

    if (!firstlastmode) {
      /* Only allow 1 or even number for numprocs */
      if (numprocs > 1 && numprocs % 2 != 0) {
        MSG0("WARNING: This test requires a unary or even number of nodes. Test skipped.\n");
        gasnet_exit(0); /* exit 0 to prevent false negatives in test harnesses for smp-conduit */
      }
    }
    
    /* Setting peer thread rank */
    if (firstlastmode) {
      peerproc = numprocs-1;
      iamsender = (myproc == 0);
      iamrecver = (myproc == numprocs-1);
      multisender = 0;
    } else if (numprocs == 1) {
      peerproc = 0;
      iamsender = 1;
      iamrecver = 1;
      multisender = 0;
    } else { 
      peerproc = (myproc % 2) ? (myproc - 1) : (myproc + 1);
      iamsender = (fullduplexmode || myproc % 2 == 0);
      iamrecver = (fullduplexmode || !iamsender);
      multisender = (fullduplexmode || numprocs >= 4);
    }
    multisender = 1; /* messes up output on some systems */

    
    myseg = TEST_SEG(myproc);
    tgtmem = TEST_SEG(peerproc);

    if (insegment) {
	msgbuf = (void *) myseg;
    } else {
	alloc = (void *) test_calloc(maxsz+PAGESZ,1); /* use calloc to prevent valgrind warnings */
        msgbuf = (void *) alignup(((uintptr_t)alloc), PAGESZ); /* ensure page alignment of base */
    }
    assert(((uintptr_t)msgbuf) % PAGESZ == 0);

    MSG0("Running %squeue test with local addr %sside segment, iters=%i, maxdepth=%i, sz: %i...%i", 
      firstlastmode ? "first/last " : (fullduplexmode ? "full-duplex ": ""),
      insegment ? "in" : "out", 
      iters, 
      maxdepth,
      min_payload, max_payload);
    MSG0("x-axis: queue depth, y-axis: message size, injection time in microseconds\n");
    BARRIER();

    handles = (gasnet_handle_t *) test_malloc(sizeof(gasnet_handle_t) * maxdepth);
    vghandles = (gasnet_valget_handle_t *) test_malloc(sizeof(gasnet_valget_handle_t) * maxdepth);

    do_bulkputgets();
    do_nonbulkputgets();
    do_valueputgets();
    do_blockingputgets();
    do_amtests();

    BARRIER();
    test_free(handles);
    if (!insegment) {
	test_free(alloc);
    }

    gasnet_exit(0);

    return 0;

}
Example #18
int main(int argc, char **argv) {
  int help=0;
  int arg=1;

  GASNET_Safe(gex_Client_Init(&myclient, &myep, &myteam, "testam", &argc, &argv, 0));

  mynode = gex_TM_QueryRank(myteam);
  numnode = gex_TM_QuerySize(myteam);

  arg = 1;
  while (argc > arg) {
    if (!strcmp(argv[arg], "-p")) {
#if GASNET_PAR
      pollers = test_thread_limit(atoi(argv[arg+1])+1)-1;
      arg += 2;
#else
      if (0 == mynode) {
        fprintf(stderr, "testam %s\n", GASNET_CONFIG_STRING);
        fprintf(stderr, "ERROR: The -p option is only available in the PAR configuration.\n");
        fflush(NULL);
      }
      sleep(1);
      gasnet_exit(1);
#endif
    } else if (!strcmp(argv[arg], "-in")) {
      insegment = 1;
      ++arg;
    } else if (!strcmp(argv[arg], "-out")) {
      insegment = 0;
      ++arg;
    } else if (!strcmp(argv[arg], "-c")) {
      crossmachinemode = 1;
      ++arg;
    } else if (!strcmp(argv[arg], "-sync-req")) {
      asynclc = 0;
      lc_opt = GEX_EVENT_NOW;
      ++arg;
    } else if (!strcmp(argv[arg], "-async-req")) {
      asynclc = 1;
      lc_opt = GEX_EVENT_GROUP;
      ++arg;
    } else if (!strcmp(argv[arg], "-fp")) {
      use_np = 0;
      ++arg;
    } else if (!strcmp(argv[arg], "-np-cb")) {
      use_np = 1;
      np_cbuf = 1;
      ++arg;
    } else if (!strcmp(argv[arg], "-np-gb")) {
      use_np = 1;
      np_cbuf = 0;
      ++arg;
    } else if (!strcmp(argv[arg], "-src-noop")) {
      src_mode = SRC_NOOP;
      ++arg;
    } else if (!strcmp(argv[arg], "-src-generate")) {
      src_mode = SRC_GENERATE;
      ++arg;
    } else if (!strcmp(argv[arg], "-src-memcpy")) {
      src_mode = SRC_MEMCPY;
      ++arg;
    } else if (argv[arg][0] == '-') {
      help = 1;
      ++arg;
    } else break;
  }

  if (argc > arg) { iters = atoi(argv[arg]); ++arg; }
  if (!iters) iters = 1000;
  if (argc > arg) { maxsz = atoi(argv[arg]); ++arg; }
  if (!maxsz) maxsz = 2*1024*1024;
  if (argc > arg) { TEST_SECTION_PARSE(argv[arg]); ++arg; }

  GASNET_Safe(gex_Segment_Attach(&mysegment, myteam, TEST_SEGSZ_REQUEST));
  GASNET_Safe(gex_EP_RegisterHandlers(myep, htable, sizeof(htable)/sizeof(gex_AM_Entry_t)));

#if GASNET_PAR
  #define PAR_USAGE \
               "  The -p option gives the number of polling threads, specified as\n" \
               "    a non-negative integer argument (default is no polling threads).\n"
#else
  #define PAR_USAGE ""
#endif
  test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n"
               "  The '-in' or '-out' option selects whether the requestor's\n"
               "    buffer is in the GASNet segment or not (default is 'in').\n"
               PAR_USAGE
               "  The '-sync-req' or '-async-req' option selects synchronous or asynchronous\n"
               "    local completion of Medium and Long Requests (default is synchronous).\n"
               "  The '-fp', '-np-gb' or '-np-cb' option selects Fixed- or Negotiated-Payload\n"
               "    for Medium and Long AMs, as follows:\n"
               "      -fp:     Fixed-Payload (default)\n"
               "      -np-gb:  Negotiated-Payload with GASNet-provided buffer\n"
               "      -np-cb:  Negotiated-Payload with client-provided buffer\n"
               "  The '-src-*' options select treatment of the payload buffer used for\n"
               "    Medium and Long AMs, as follows:\n"
               "      -src-noop:      no per-operation initialization (default)\n"
               "      -src-generate:  initialized (w/o memory reads) on each AM injection\n"
               "      -src-memcpy:    initialized using memcpy() on each AM injection\n"
               "  The -c option enables cross-machine pairing (default is nearest neighbor).\n");
  if (help || argc > arg) test_usage();

  TEST_PRINT_CONDUITINFO();

  if (insegment) {
    myseg = TEST_MYSEG();
  } else {
    char *space = test_malloc(alignup(maxsz,PAGESZ) + PAGESZ);
    myseg = alignup_ptr(space, PAGESZ);
  }

  if (src_mode == SRC_MEMCPY) {
    zero_buffer = test_calloc(maxsz, 1);
  }

  np_lc_opt = np_cbuf ? lc_opt : NULL;

  if (crossmachinemode) {
    if ((numnode%2) && (mynode == numnode-1)) {
      sender = 1;
      peer = mynode;
    } else {
      gex_Rank_t half = numnode / 2;
      sender = (mynode < half);
      peer = sender ? (mynode + half) : (mynode - half);
    }
  } else {
    peer = mynode ^ 1;
    sender = mynode % 2 == 0;
    if (peer == numnode) {
      peer = mynode;
    }
  }

  gex_Event_t *tmp_lc_opt = use_np ? np_lc_opt : lc_opt;
  gex_Flags_t flags = use_np ? ( np_cbuf ? GEX_FLAG_AM_PREPARE_LEAST_CLIENT
                                         : GEX_FLAG_AM_PREPARE_LEAST_ALLOC) : 0;
  maxmedreq  = MIN(maxsz, gex_AM_MaxRequestMedium(myteam,peer,tmp_lc_opt,flags,0));
  maxmedrep  = MIN(maxsz, gex_AM_MaxReplyMedium  (myteam,peer,GEX_EVENT_NOW,flags,0));
  maxlongreq = MIN(maxsz, gex_AM_MaxRequestLong  (myteam,peer,tmp_lc_opt,flags,0));
  maxlongrep = MIN(maxsz, gex_AM_MaxReplyLong    (myteam,peer,GEX_EVENT_NOW,flags,0));

  recvr = !sender || (peer == mynode);

  // Long Request and Reply (distinct for loopback)
  reply_addr = TEST_SEG(peer);
  request_addr = (peer == mynode) ? (void*)((uintptr_t)reply_addr + alignup(maxsz,SIZEOF_GEX_RMA_VALUE_T))
                                  : reply_addr;

  BARRIER();

#if GASNET_PAR
  #define PAR_FMT "  %i extra recvr polling threads\n"
  #define PAR_ARG ,pollers
#else
  #define PAR_FMT /*empty*/
  #define PAR_ARG /*empty*/
#endif
  if (mynode == 0) {
      printf("Running %i iterations of %s AM performance with:\n"
             "  local addresses %sside the segment%s\n"
             "  %ssynchronous LC for Requests%s\n"
             "  %s\n"
             "  %s\n"
             PAR_FMT
             "  ...\n",
             iters,
             (crossmachinemode ? "cross-machine ": ""),
             (insegment ? "in" : "out"),
             (insegment ? " (default)" : ""),
             (asynclc ? "a": ""),
             (asynclc ? "": " (default)"),
             (!use_np   ? "fixed-Payload (default)"
              :(np_cbuf ? "negotiated-Payload with client-provided buffer"
                        : "negotiated-Payload with GASNet-provided buffer")),
             ((src_mode == SRC_NOOP)     ? "no payload initialization (default)"
             :(src_mode == SRC_GENERATE) ? "payload initialized by computation"
                                         : "payload initialized using memcpy()")
             PAR_ARG
            );
      printf("   Msg Sz  Description                             Total time   Avg. time   Bandwidth\n"
             "   ------  -----------                             ----------   ---------   ---------\n");
      fflush(stdout);
  }
#if GASNET_PAR
  TEST_SET_WAITMODE(pollers+1);
  if (pollers)
    test_createandjoin_pthreads(pollers+1,doAll,NULL,0);
  else
#endif
    doAll(NULL);

  MSG("done.");

  gasnet_exit(0);
  return 0;
}
Example #19
int main(int argc, char **argv) {
  int help=0;
  int arg=1;

  gasnet_handlerentry_t htable[] = { 
    { hidx_ping_shorthandler,  ping_shorthandler  },
    { hidx_pong_shorthandler,  pong_shorthandler  },
    { hidx_ping_medhandler,    ping_medhandler    },
    { hidx_pong_medhandler,    pong_medhandler    },
    { hidx_ping_longhandler,   ping_longhandler   },
    { hidx_pong_longhandler,   pong_longhandler   },

    { hidx_ping_shorthandler_flood,  ping_shorthandler_flood  },
    { hidx_pong_shorthandler_flood,  pong_shorthandler_flood  },
    { hidx_ping_medhandler_flood,    ping_medhandler_flood    },
    { hidx_pong_medhandler_flood,    pong_medhandler_flood    },
    { hidx_ping_longhandler_flood,   ping_longhandler_flood   },
    { hidx_pong_longhandler_flood,   pong_longhandler_flood   },

    { hidx_done_shorthandler,  done_shorthandler  }
  };

  GASNET_Safe(gasnet_init(&argc, &argv));

  mynode = gasnet_mynode();
  numnode = gasnet_nodes();

  arg = 1;
  while (argc > arg) {
    if (!strcmp(argv[arg], "-p")) {
#if GASNET_PAR
      pollers = atoi(argv[arg+1]);
      arg += 2;
#else
      if (0 == mynode) {
        fprintf(stderr, "testam %s\n", GASNET_CONFIG_STRING);
        fprintf(stderr, "ERROR: The -p option is only available in the PAR configuration.\n");
        fflush(NULL);
      }
      sleep(1);
      gasnet_exit(1);
#endif
    } else if (!strcmp(argv[arg], "-in")) {
      insegment = 1;
      ++arg;
    } else if (!strcmp(argv[arg], "-out")) {
      insegment = 0;
      ++arg;
    } else if (!strcmp(argv[arg], "-c")) {
      crossmachinemode = 1;
      ++arg;
    } else if (argv[arg][0] == '-') {
      help = 1;
      ++arg;
    } else break;
  }

  if (argc > arg) { iters = atoi(argv[arg]); ++arg; }
  if (!iters) iters = 1000;
  if (argc > arg) { maxsz = atoi(argv[arg]); ++arg; }
  if (!maxsz) maxsz = 2*1024*1024;
  if (argc > arg) { TEST_SECTION_PARSE(argv[arg]); ++arg; }

  GASNET_Safe(gasnet_attach(htable, sizeof(htable)/sizeof(gasnet_handlerentry_t),
                            TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET));
#if GASNET_PAR
  test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n"
               "  The '-in' or '-out' option selects whether the requestor's\n"
               "    buffer is in the GASNet segment or not (default is 'in').\n"
               "  The -p option gives the number of polling threads, specified as\n"
               "    a non-negative integer argument (default is no polling threads).\n"
               "  The -c option enables cross-machine pairing (default is nearest neighbor).\n");
#else
  test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n"
               "  The '-in' or '-out' option selects whether the requestor's\n"
               "    buffer is in the GASNet segment or not (default is 'in').\n"
               "  The -c option enables cross-machine pairing (default is nearest neighbor).\n");
#endif
  if (help || argc > arg) test_usage();

  TEST_PRINT_CONDUITINFO();

  if (insegment) {
    myseg = TEST_MYSEG();
  } else {
    char *space = test_malloc(alignup(maxsz,PAGESZ) + PAGESZ);
    myseg = alignup_ptr(space, PAGESZ);
  }

  maxmed = MIN(maxsz, gasnet_AMMaxMedium());
  maxlongreq = MIN(maxsz, gasnet_AMMaxLongRequest());
  maxlongrep = MIN(maxsz, gasnet_AMMaxLongReply());

  if (crossmachinemode) {
    if ((numnode%2) && (mynode == numnode-1)) {
      sender = 1;
      peer = mynode;
    } else {
      gasnet_node_t half = numnode / 2;
      sender = (mynode < half);
      peer = sender ? (mynode + half) : (mynode - half);
    }
  } else {
    peer = mynode ^ 1;
    sender = mynode % 2 == 0;
    if (peer == numnode) {
      peer = mynode;
    }
  }

  recvr = !sender || (peer == mynode);

  peerseg = TEST_SEG(peer);

  BARRIER();

  if (mynode == 0) {
      printf("Running %sAM performance test with %i iterations"
#if GASNET_PAR
             " and %i extra recvr polling threads"
#endif
             "...\n",
             (crossmachinemode ? "cross-machine ": ""),
             iters
#if GASNET_PAR
             ,pollers
#endif
            );
      printf("   Msg Sz  Description                             Total time   Avg. time   Bandwidth\n"
             "   ------  -----------                             ----------   ---------   ---------\n");
      fflush(stdout);
  }
#if GASNET_PAR
  if (pollers)
    test_createandjoin_pthreads(pollers+1,doAll,NULL,0);
  else
#endif
    doAll(NULL);

  MSG("done.");

  gasnet_exit(0);
  return 0;
}
Example #20
void scan(
        backend::command_queue    const &queue,
        backend::device_vector<T> const &input,
        backend::device_vector<T>       &output,
        T init,
        bool exclusive,
        Oper
        )
{
    precondition(
            input.size() == output.size(),
            "Wrong output size in inclusive_scan"
            );

    backend::select_context(queue);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue) ? NT_cpu : NT_gpu;
    const int NT2 = 2 * NT;

    int do_exclusive = exclusive ? 1 : 0;

    const size_t count         = input.size();
    const size_t num_blocks    = (count + NT2 - 1) / NT2;
    const size_t scan_buf_size = alignup(num_blocks, NT2);

    backend::device_vector<T> pre_sum1(queue, scan_buf_size);
    backend::device_vector<T> pre_sum2(queue, scan_buf_size);
    backend::device_vector<T> post_sum(queue, scan_buf_size);

    // Kernel0
    auto krn0 = is_cpu(queue) ?
        block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        block_inclusive_scan<NT_gpu, T, Oper>(queue);

    krn0.push_arg(count);
    krn0.push_arg(input);
    krn0.push_arg(init);
    krn0.push_arg(pre_sum1);
    krn0.push_arg(pre_sum2);
    krn0.push_arg(do_exclusive);

    krn0.config(num_blocks, NT);

    krn0(queue);

    // Kernel1
    auto krn1 = is_cpu(queue) ?
        intra_block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        intra_block_inclusive_scan<NT_gpu, T, Oper>(queue);

    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));
    krn1.push_arg(num_blocks);
    krn1.push_arg(post_sum);
    krn1.push_arg(pre_sum1);
    krn1.push_arg(init);
    krn1.push_arg(work_per_thread);

    krn1.config(1, NT);

    krn1(queue);

    // Kernel2
    auto krn2 = is_cpu(queue) ?
        block_addition<NT_cpu, T, Oper>(queue) :
        block_addition<NT_gpu, T, Oper>(queue);

    krn2.push_arg(count);
    krn2.push_arg(input);
    krn2.push_arg(output);
    krn2.push_arg(post_sum);
    krn2.push_arg(pre_sum2);
    krn2.push_arg(init);
    krn2.push_arg(do_exclusive);

    krn2.config(num_blocks * 2, NT);

    krn2(queue);
}