Exemplo n.º 1
0
/*
 * SPU entry point: pulls a control block from main memory by DMA, uses the
 * address stored in it to pull a data array, and checks that the array
 * satisfies the Fibonacci recurrence.
 *
 * speid, envp: unused.
 * argp:        main-memory address of the control block.
 *
 * Returns 0 if every checked entry equals the sum of its two predecessors,
 * 1 if no MFC tag could be reserved or an entry breaks the recurrence.
 */
int main(unsigned long long speid __attribute__ ((unused)),
	 unsigned long long argp,
	 unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag_id;
  int idx;

  /* Every DMA below is issued under this one reserved tag. */
  tag_id = mfc_tag_reserve();
  if (tag_id == MFC_TAG_INVALID) {
    printf("ERROR: unable to reserve a tag\n");
    return 1;
  }

  /*
   * mfc_get arguments, in order: local store destination, main memory
   * source address, number of bytes, tag, and two trailing values that
   * are simply passed as 0 here.
   */
  mfc_get(&cb, argp, sizeof(cb), tag_id, 0, 0);

  /* Select our tag (bit position == tag number) and wait for the block. */
  mfc_write_tag_mask(1<<tag_id);
  mfc_read_tag_status_all();

  /* Start fetching the data array the control block points at ... */
  mfc_get(data, cb.addr, DATA_BUFFER_SIZE, tag_id, 0, 0);

  /* ... and report that address before waiting on the transfer. */
  printf("Address received through control block = 0x%llx\n", cb.addr);

  /* Wait for the data array; the tag mask is not rewritten here. */
  mfc_read_tag_status_all();

  /* Entries 0 and 1 are taken as given; every later one must be the sum
   * of the two before it. Stop at the first violation. */
  for (idx = 2; idx < DATA_BUFFER_ENTRIES; idx++) {
    if (data[idx] == data[idx-1] + data[idx-2])
      continue;

    printf("ERROR: fibonacci sequence error at entry %d. Expected %d, Got %d\n",
	   idx, data[idx-1] + data[idx-2], data[idx]);
    return 1;
  }

  return 0;
}
Exemplo n.º 2
0
/*
 * SPU worker: loads the program data block `pd` from main memory, runs one
 * of two phases on the trail buffer, and writes `pd` back.
 *
 * spe_id:          its low five bits are used as the DMA tag (0..31).
 * program_data_ea: main-memory address `pd` is read from and written to.
 * env:             not used.
 *
 * When pd.collisiondata > 0, the first collisiondata*8 buffer entries are
 * fetched, handed to reduce_trailsmod, reduce_trails2mod and find_collmod,
 * and stored back. Otherwise the buffer is filled by generate_trailsmod in
 * steps of 256 entries, each step being written out as two 128-entry puts.
 *
 * Always returns 0, after waiting for all DMA issued under the tag.
 */
int main2mod(unsigned long long spe_id, unsigned long long program_data_ea, unsigned long long env) 
{
	unsigned tagid = spe_id&31;
	/* tagid can be 31; shifting an unsigned 1 avoids the signed overflow
	 * that 1<<31 would be. */
	const unsigned tagmask = 1u << tagid;
	uint32 i,j;

	// get program data
	mfc_get(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0);
	mfc_write_tag_mask(tagmask);
	mfc_read_tag_status_all();

	// precompute partial working states based on ihv & partial msg block
	pre_compute(pd.ihv1, pd.ihv2, pd.m1, pd.m2);

	if (pd.collisiondata > 0)
	{
		j = pd.collisiondata*8;
		vec_uint32* bufferptr = &buffer[j];

		// get the trail buffer, 128 entries per DMA command; the last
		// command may reach past entry j because the size is fixed
		for (i = 0; i < j; i += 128)
			mfc_get(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0);
		mfc_write_tag_mask(tagmask);
		mfc_read_tag_status_all();

		// process collision trails over [&buffer[0], bufferptr)
		reduce_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr);
		reduce_trails2mod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr);
		find_collmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, &buffer[0], bufferptr);

		// store the trail buffer, mirroring the chunking of the get
		for (i = 0; i < j; i += 128)
			mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0);
		mfc_write_tag_mask(tagmask);
		mfc_read_tag_status_all();
	} else {
		// fill the trail buffer in steps and do intermediate DMA transfers
		vec_uint32* bufferptr = &buffer[0];
		for (i = 0; i < BUFFERSIZE; i += 256)
		{
			// generate up to &buffer[i+256], then queue that step's
			// 256 entries as two 128-entry puts (not waited on here)
			bufferptr = generate_trailsmod(pd.ihv1, pd.ihv2mod, pd.m1, pd.m2, &pd.astart, bufferptr, &buffer[i+256]);
			mfc_put(&buffer[i], &pd.buffer[i], sizeof(vec_uint32)*128, tagid, 0, 0);
			mfc_put(&buffer[i+128], &pd.buffer[i+128], sizeof(vec_uint32)*128, tagid, 0, 0);
		}
	}
	// transfer the current program data back
	mfc_put(&pd, program_data_ea, sizeof(spu_program_data), tagid, 0, 0);

	// wait for every outstanding transfer under our tag
	mfc_write_tag_mask(tagmask);
	mfc_read_tag_status_all();
	return 0;
}
Exemplo n.º 3
0
/*
 * Issues DMA gets that copy `size` bytes from main-memory address `from`
 * into local store at `to`, all under `tag`. The request is cut into
 * 16384-byte commands plus one final shorter command for any remainder.
 * Nothing is issued when size <= 0. The function only queues the commands;
 * it does not wait for them.
 */
void getlarge( void* to, unsigned long from, int size, int tag ) {
	enum { CHUNK_BYTES = 16384 };
	unsigned long ls_cursor = (unsigned long)to;
	unsigned long ea_cursor = (unsigned long)from;

	/* Full-size pieces first; both cursors advance in lock step. */
	for ( ; size >= CHUNK_BYTES; size -= CHUNK_BYTES ) {
		mfc_get((void*)ls_cursor, (unsigned long)ea_cursor, CHUNK_BYTES, tag, 0, 0);
		ls_cursor += CHUNK_BYTES;
		ea_cursor += CHUNK_BYTES;
	}

	/* Whatever is left is shorter than one full piece. */
	if ( size > 0 )
		mfc_get((void*)ls_cursor, (unsigned long)ea_cursor, size, tag, 0, 0);
}
Exemplo n.º 4
0
void triad()
{
	int i, j, n;
	
	vector float s = spu_splats(args.scalar);
	
	n = SIZE * sizeof(float);

	for (i = 0; (i + SIZE) < args.N; i += SIZE) {
		mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();

		for (j = 0; j < (SIZE / 4); ++j)
			ls3[j] = spu_madd(s, ls2[j], ls1[j]);

		mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0);
	}
		
	mfc_write_tag_mask(1 << TAG);
	mfc_read_tag_status_all();

	if (unlikely(i < args.N)) {
		/* 
		 * args.N - i will be smaller than SIZE at this point so
		 * it is safe to do a DMA transfer.
		 * We need to make sure that size is a multiple of 16.
		 */
		n = ((args.N - i) * sizeof(float)) & (~127);

		mfc_get((void *)&ls1[0], (unsigned int )&args.b[i], n, TAG, 0, 0);
		mfc_get((void *)&ls2[0], (unsigned int )&args.c[i], n, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();
		
		/* n must be divisible by 4. */
		for (j = 0; j < ((args.N - i) / 4); ++j)
			ls3[j] = spu_madd(s, ls2[j], ls1[j]);
		
		mfc_put((void *)&ls3[0], (unsigned int )&args.a[i], n, TAG, 0, 0);
		mfc_write_tag_mask(1 << TAG);
		mfc_read_tag_status_all();
	}

	/* 
	 * At this point it may be that i is still smaller than args.N if the length
	 * was not divisible by the number of SPUs times 16.
	 */
}
/*
 * SPU entry point: reserves a DMA tag, fetches a ppu_data_t descriptor from
 * main memory and calls process_frame once per frame it announces.
 *
 * speid: only used in debug output.
 * argp:  main-memory address of the descriptor.
 * envp:  its low 32 bits are used as the byte count of the descriptor DMA.
 *
 * Returns 0 on success, -1 if no tag could be reserved.
 */
int main(unsigned long long speid, unsigned long long argp, unsigned long long envp)
{
	ppu_data_t ppu_data __attribute__ ((aligned(16)));
	int frame;

	tag_id = mfc_tag_reserve();
	if (tag_id == MFC_TAG_INVALID){
		printf("SPU: ERROR can't allocate tag ID\n");
		return -1;
	}

	/* Fetch via DMA the structure holding the pointers, the number of
	 * frames and the spe_id. */
	dprintf("SPU: am intrat in spu %llx %lu %llx\n",
			speid, sizeof(ppu_data_t), envp);
	mfc_get((void*)&ppu_data, argp, (uint32_t)envp, tag_id, 0, 0);
	waittag(tag_id);

	dprintf("SPU: speid:%llx got struct\n", speid);
	dprintf("SPU: speid:%llx id:%02d input:%p big_img:%p num_frms:%d\n",
			speid, ppu_data.spe_id, ppu_data.input, ppu_data.big_image,
			ppu_data.num_frames);
	/* Counts as a use of speid, whatever dprintf expands to. */
	(void)speid;

	/* The descriptor is handed to process_frame by value. */
	frame = 0;
	while (frame < ppu_data.num_frames) {
		process_frame(ppu_data, frame);
		++frame;
	}

	return 0;
}
Exemplo n.º 6
0
/*
 * SPU thread entry: fetches 16 bytes from `ea`, subtracts 0x20 from every
 * byte that compares greater than 'a' - 1, writes the 16 bytes back to
 * `ea`, then writes the 32-bit value 1 to `outptr` as a completion flag.
 *
 * arg3 and arg4 are not used. The thread ends through spu_thread_exit(0).
 */
int main(uint64_t ea, uint64_t outptr, uint64_t arg3, uint64_t arg4)
{
	/* A vector variable serves as the DMA buffer. */
	volatile vec_uchar16 v;
	vec_uchar16 threshold, above, delta;
	uint32_t done_flag __attribute__((aligned(16))) = 1;

	/* Pull the 16 input bytes. */
	mfc_get(&v, ea, 16, TAG, 0, 0);
	wait_for_completion();

	/* Per byte: compare against 'a' - 1 ... */
	threshold = spu_splats((unsigned char)('a'-1));
	above = spu_cmpgt(v, threshold);

	/* ... and keep 0x20 only in the lanes the comparison selected. */
	delta = spu_splats((unsigned char)0x20) & above;

	/* Lower-case letters become capitals. */
	v = v - delta;

	/* Hand the converted bytes back to the PPE. */
	mfc_put(&v, ea, 16, TAG, 0, 0);
	wait_for_completion();

	/* Tell the PPE program that the work is done. */
	mfc_put(&done_flag, outptr, 4, TAG, 0, 0);
	wait_for_completion();

	/* Properly exit the thread. */
	spu_thread_exit(0);
	return 0;
}
Exemplo n.º 7
0
/*
 * Scales one image block down and writes the result to img->dst.
 *
 * For every output line the function fetches SCALE_FACTOR consecutive
 * source lines from img->src by DMA, averages four of them vertically
 * (v1..v4) into `temp`, averages SCALE_FACTOR horizontally adjacent pixels
 * per channel into `output`, and DMA-puts that line to the destination.
 * Each get and each put is waited on before the code moves on.
 *
 * The three line buffers are allocated at entry and freed before return;
 * the allocation results are not checked.
 */
void process_image_simple(struct image* img){
	unsigned char *input, *output, *temp;
	unsigned int addr1, addr2, i, j, k, r, g, b;
	int block_nr = img->block_nr;
	vector unsigned char *v1, *v2, *v3, *v4, *v5 ;

	/* input: SCALE_FACTOR source lines; output: one scaled line;
	 * temp: one full-width line holding the vertical average.
	 * NOTE(review): the second malloc_align argument is presumably the
	 * log2 of the alignment (16 bytes) - confirm. */
	input = malloc_align(NUM_CHANNELS * SCALE_FACTOR * img->width, 4);
	output = malloc_align(NUM_CHANNELS * img->width / SCALE_FACTOR, 4);
	temp = malloc_align(NUM_CHANNELS * img->width, 4);

	/* Vector views of the first four source lines and of temp. Exactly
	 * four lines are addressed, whatever SCALE_FACTOR is. */
	v1 = (vector unsigned char *) &input[0];
	v2 = (vector unsigned char *) &input[1 * img->width * NUM_CHANNELS];
	v3 = (vector unsigned char *) &input[2 * img->width * NUM_CHANNELS];
	v4 = (vector unsigned char *) &input[3 * img->width * NUM_CHANNELS];
	v5 = (vector unsigned char *) temp;

	/* Destination address of this block inside the output image: image
	 * start, plus a line offset and a column offset derived from
	 * block_nr. */
	addr2 = (unsigned int)img->dst; //start of image
	addr2 += (block_nr / NUM_IMAGES_HEIGHT) * img->width * NUM_CHANNELS * 
		img->height / NUM_IMAGES_HEIGHT; //start line of spu block
	addr2 += (block_nr % NUM_IMAGES_WIDTH) * NUM_CHANNELS *
		img->width / NUM_IMAGES_WIDTH;

	for (i=0; i<img->height / SCALE_FACTOR; i++){
		//get SCALE_FACTOR source lines for output line i and wait for them
		addr1 = ((unsigned int)img->src) + i * img->width * NUM_CHANNELS * SCALE_FACTOR;
		mfc_get(input, addr1, SCALE_FACTOR * img->width * NUM_CHANNELS, MY_TAG, 0, 0);
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();

		//compute the scaled line: vertical average, 16 bytes at a time
		for (j = 0; j < img->width * NUM_CHANNELS / 16; j++){
			v5[j] = spu_avg(spu_avg(v1[j], v2[j]), spu_avg(v3[j], v4[j]));
		}
		//horizontal average: one output pixel per SCALE_FACTOR input pixels
		for (j=0; j < img->width; j+=SCALE_FACTOR){
			r = g = b = 0;
			for (k = j; k < j + SCALE_FACTOR; k++) {
				r += temp[k * NUM_CHANNELS + 0];
				g += temp[k * NUM_CHANNELS + 1];
				b += temp[k * NUM_CHANNELS + 2];
			}
			r /= SCALE_FACTOR;
			b /= SCALE_FACTOR;
			g /= SCALE_FACTOR;

			output[j / SCALE_FACTOR * NUM_CHANNELS + 0] = (unsigned char) r;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 1] = (unsigned char) g;
			output[j / SCALE_FACTOR * NUM_CHANNELS + 2] = (unsigned char) b;
		}

		//put the scaled line back and wait before the buffers are reused
		mfc_put(output, addr2, img->width / SCALE_FACTOR * NUM_CHANNELS, MY_TAG, 0, 0);
		addr2 += img->width * NUM_CHANNELS; //advance one full-width line in the destination
		mfc_write_tag_mask(1 << MY_TAG);
		mfc_read_tag_status_all();
	}

	free_align(temp);
	free_align(input);
	free_align(output);
}
Exemplo n.º 8
0
/*
 * Returns element n of the prime table, going through a local cache of
 * four-int vectors.
 *
 * A request is served from the cache when
 * primeCacheStart < n < primeCacheStart + primeCacheSize; note the strict
 * lower bound, so n == primeCacheStart counts as a miss. On a miss the
 * cache is refilled by DMA starting at the group of four that contains n,
 * and primeCacheStart is moved to that group.
 */
int cacheGetPrime(int n)
{
    const int cached = (n < primeCacheStart + primeCacheSize) && (n > primeCacheStart);

    if (!cached)
    {
        // Fetch: reserve a tag, load CACHE_PRIME_SIZE*16 bytes, release it.
        uint32_t tag = mfc_tag_reserve();
        uint32_t size = CACHE_PRIME_SIZE*16;

        // Source address of the four-int group holding n (4 bytes per entry).
        unsigned long long EA = setup.vPrimes + (n - n%4) * 4;

        mfc_get(&primeCacheData, EA, size, tag, 0, 0);
        mfc_write_tag_mask(1 << tag);
        mfc_read_tag_status_all();
        mfc_tag_release(tag);

        primeCacheStart = n - (n % 4);
    }

    // Vector index from the distance to the cache start, lane from n%4.
    return spu_extract(primeCacheData[(n - primeCacheStart) / 4], n%4);
}
Exemplo n.º 9
0
Arquivo: spu_dma.c Projeto: ALaDyn/psc
/*
 * Starts the first preload: obtains a tag from get_tagid() into
 * tag_preload, resets tag_store to 0 and queues a DMA get of `size` bytes
 * from `ea` into `ls` under that tag. The transfer is not waited on here.
 */
void first_preload_particle(volatile void *ls, unsigned long long ea, unsigned long size)
{
  tag_preload = get_tagid();
  tag_store = 0;

  mfc_get(ls, ea, size, tag_preload, 0, 0);
}
Exemplo n.º 10
0
Arquivo: spu.c Projeto: vitalyvch/MPAC
/*
 * SPU entry point: reads three values from the inbound mailbox (repeat
 * count, data size, function code), fetches `data size` bytes from
 * program_data_ea into a local array, then copies that array byte by byte
 * into a second local array `rep` times.
 *
 * The function code is read but not acted upon. Always returns 0.
 */
int main(unsigned long long spe_id, unsigned long long program_data_ea,unsigned long long env)
{
	char array[MAX] __attribute__((aligned(128)));
	char arr[MAX];
	unsigned long int array_size = 32768;
	unsigned long int rep;
	unsigned long int data_size;
	unsigned int tag = 1;
	unsigned int byte_size, chunk_size, k;
	unsigned int count, count1;
	unsigned int transfered_size, dest_inc, add_inc;
	int func, dma_count;

	spu_write_decrementer(0);

	/* Parameters arrive in this fixed order. */
	rep = spu_read_in_mbox();
	data_size = spu_read_in_mbox();
	func = spu_read_in_mbox();

	byte_size = data_size;
	k = byte_size - MAX;
	chunk_size = byte_size;

	/* One DMA get for the whole chunk, then wait on its tag. */
	mfc_get(array, (unsigned int)program_data_ea, chunk_size, tag, 0, 0);
	mfc_write_tag_mask(1<<tag);
	mfc_read_tag_status_any();

	/* Repeated copy; destination indices wrap at array_size. */
	for (count = 0; count < rep; count++) {
		for (count1 = 0; count1 < chunk_size; count1++) {
			arr[count1%array_size] = array[count1];
		}
	}

	return 0;
}
Exemplo n.º 11
0
  // Computes out = in0 * in1 + in2 element-wise for this SPE's share of the
  // rows. The rows are split across nspe SPEs; the first rows % nspe ranks
  // each take one extra row. Every row is fetched with three DMA gets,
  // combined in local store and written back with one DMA put.
  void compute()
  {
    // Share of the work for this rank.
    int my_rows = rows / nspe + (rank < rows % nspe);
    int offset  = rank * (rows / nspe) + std::min(rank, rows % nspe);

#if DEBUG
    printf("Compute (%d/%d %d, %d) %d/%d\n", my_rows, rows, offset,
	   cols, rank, nspe);
#endif
    int tag = 23;
    const uint64_t row_bytes = cols * sizeof(float);

    // Main-memory cursors, starting at this rank's first row.
    uint64_t pin0 = in0 + offset * cols * sizeof(float);
    uint64_t pin1 = in1 + offset * cols * sizeof(float);
    uint64_t pin2 = in2 + offset * cols * sizeof(float);
    uint64_t pout = out + offset * cols * sizeof(float);

    // One backing array carved into three input rows and one output row.
    float  buf[4*cols];
    float* buf0 = buf;
    float* buf1 = buf0 + cols;
    float* buf2 = buf1 + cols;
    float* buf3 = buf2 + cols;

    int r = 0;
    while (r < my_rows)
    {
      mfc_get(buf0, pin0, row_bytes, tag, 0, 0);
      mfc_get(buf1, pin1, row_bytes, tag, 0, 0);
      mfc_get(buf2, pin2, row_bytes, tag, 0, 0);

      pin0 += row_bytes;
      pin1 += row_bytes;
      pin2 += row_bytes;

      // This wait covers the three gets and the previous row's put, which
      // share the tag.
      mfc_write_tag_mask(1<<tag);
      mfc_read_tag_status_all();

      for (int c=0; c<cols; ++c)
	buf3[c] = buf0[c] * buf1[c] + buf2[c];

      mfc_put(buf3, pout, row_bytes, tag, 0, 0);
      pout += row_bytes;

      ++r;
    }

    // Drain the last put.
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();
  }
Exemplo n.º 12
0
/*
 * Queues DMA gets for a block of the B matrix: rows seg_i .. seg_i+stsize-1,
 * starting at column seg_j, stsize floats per row. Row k of the block lands
 * in BM0[k]. All commands use `tag`; nothing is waited on here.
 */
void get_Bmatrix_segments ( int tag, int seg_i, int seg_j ) {
	int src_row, dst_row;

	for (src_row = seg_i, dst_row = 0; src_row < (seg_i+stsize); src_row++, dst_row++) {
		mfc_get((void*)&(BM0[dst_row][0]), (unsigned long )&((*pB_matrix)[src_row][seg_j]), stsize*sizeof(float), tag, 0, 0);
	}
}
Exemplo n.º 13
0
/**
 * Read-only small DMA get.
 *
 * On the SPU (or with USE_LIBSPE2) this queues an mfc_get of `size` bytes
 * from `ea` into `ls` under `tag` and returns `ls`; the transfer is not
 * waited on here. On any other build no copy is made: `ea` is already a
 * host address, so it is returned as a pointer and `ls` is left untouched.
 *
 * @param ls   local destination buffer (only written on the SPU path)
 * @param ea   source address
 * @param size number of bytes
 * @param tag  DMA tag
 * @param tid  ignored; 0 is always passed to mfc_get
 * @param rid  ignored; 0 is always passed to mfc_get
 * @return pointer through which the data can be read
 */
void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	mfc_get(ls,ea,size,tag,0,0);
	return ls;
#else
	/* Convert through uintptr_t (from <stdint.h>, like uint64_t): going
	 * through uint32_t cut the upper half off 64-bit host addresses. */
	return (void*)(uintptr_t)ea;
#endif
}
Exemplo n.º 14
0
/*
 * Queues DMA gets that refill the circular input buffer of `side` from
 * main memory.
 *
 * The number of vectors pulled is the smallest of: free slots in the
 * buffer, vectors not yet pulled from main memory, and MAX_DMA_SIZE. When
 * that run would pass the end of the buffer it is split in two gets, the
 * second one starting at slot 0. A freshly reserved tag is stored in
 * md[am].held_tag[side] and the count in md[am].num_waiting[side].
 *
 * Returns without side effects on the bookkeeping when there is nothing to
 * pull at the head, or when no tag can be reserved. Nothing is waited on.
 */
void pull(int side){
  int free_slots = num_free_in_buffer(side);
  int left_in_mm = mcb[am].data_size[side] - md[am].num_pulled[side];
  int num_pull, head, room_to_end, first_pull, tag;
  unsigned int ls_dest;

  /* num_pull = min(free_slots, left_in_mm), then capped by MAX_DMA_SIZE */
  if (free_slots < left_in_mm)
    num_pull = free_slots;
  else
    num_pull = left_in_mm;
  if (!(num_pull < MAX_DMA_SIZE))
    num_pull = MAX_DMA_SIZE;

  /* contiguous room between the head index and the end of the buffer */
  head = spu_extract(md[am].idx[side][HEAD],0);
  room_to_end = mcb[am].buffer_size[side] - head;
  if (num_pull < room_to_end)
    first_pull = num_pull;
  else
    first_pull = room_to_end;

  if (first_pull == 0)
    return;

  ls_dest = (unsigned int) &md[am].buffer[side][head];

  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
    return;
  md[am].held_tag[side] = tag;

  /* first run: from the head towards the end of the buffer */
  mfc_get((void*)ls_dest,
	  mcb[am].block_addr[side],
	  first_pull * sizeof(vector signed int),
	  md[am].held_tag[side],
	  0,0);
  mcb[am].block_addr[side] += first_pull * sizeof(vector signed int);

  /* second run: whatever is left wraps around to slot 0 */
  if (first_pull < num_pull) {
    int second_pull = num_pull - first_pull;

    ls_dest = (unsigned int) &md[am].buffer[side][0];
    mfc_get((void*)ls_dest,
	    mcb[am].block_addr[side],
	    second_pull * sizeof(vector signed int),
	    md[am].held_tag[side],
	    0,0);
    mcb[am].block_addr[side] += second_pull * sizeof(vector signed int);
  }

  md[am].num_waiting[side] = num_pull;
}
Exemplo n.º 15
0
/**
 * Fetches the argument block from main memory, waits for it, and spreads
 * its contents over the module's descriptors.
 *
 * argv.arg[0..2] provide the base addresses for conc, wind and diff; buff
 * shares arg[0] with conc. argv.arg[5].u32[0] is the common length. All
 * three slots of conc/wind/diff/buff, the three entries of each DMA list
 * and the four shuffle entries end up with that length.
 *
 * @param argvp main-memory address of the argument block
 * @param dt    receives argv.arg[3].dbl
 * @param size  receives argv.arg[4].dbl
 * @param block receives argv.arg[5].u32[1]
 */
void get_transport_argv(uint64_t argvp, real_t *dt, real_t *size, uint32_t *block)
{
    int k;

    mfc_get(&argv, argvp, sizeof(spe_argv_t), GET_ARG_TAG_MASK, 0, 0);
    wait_for_dma(GET_ARG_TAG_MASK);

    /* Slot 0 is filled straight from the argument block. */
    conc[0].ea_base = argv.arg[0].u64;
    wind[0].ea_base = argv.arg[1].u64;
    diff[0].ea_base = argv.arg[2].u64;
    buff[0].ea_base = argv.arg[0].u64;

    conc[0].length  = argv.arg[5].u32[0];
    wind[0].length  = conc[0].length;
    diff[0].length  = conc[0].length;
    buff[0].length  = conc[0].length;

    /* Slots 1 and 2 are copies of slot 0. */
    for (k = 1; k <= 2; k++)
    {
        conc[k].ea_base = conc[0].ea_base;
        wind[k].ea_base = wind[0].ea_base;
        diff[k].ea_base = diff[0].ea_base;
        buff[k].ea_base = buff[0].ea_base;

        conc[k].length  = conc[0].length;
        wind[k].length  = wind[0].length;
        diff[k].length  = diff[0].length;
        buff[k].length  = buff[0].length;
    }

    /* DMA list entry 0 follows slot 0; entries 1 and 2 both follow slot 1. */
    clist[0].length = conc[0].length;
    wlist[0].length = wind[0].length;
    dlist[0].length = diff[0].length;

    for (k = 1; k <= 2; k++)
    {
        clist[k].length = conc[1].length;
        wlist[k].length = wind[1].length;
        dlist[k].length = diff[1].length;
    }

    shuffle[0].length = conc[0].length;
    shuffle[1].length = wind[0].length;
    shuffle[2].length = diff[0].length;
    shuffle[3].length = buff[0].length;

    /* Scalar outputs. */
    *dt     = argv.arg[3].dbl;
    *size   = argv.arg[4].dbl;
    *block  = argv.arg[5].u32[1];
}
Exemplo n.º 16
0
/*
 * SPU thread entry: loads the shared `spu` structure from arg1, then
 * serves frame requests until a zero arrives on signal channel 1.
 *
 * For every non-zero value read from the signal channel the structure is
 * re-fetched, draw_frame is called with that value, and a response of 1 is
 * sent. arg2..arg4 are not used. The thread ends via spu_thread_exit(0).
 */
int main(uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) {
    uint32_t buffer_ea;

    /* Initial copy of the data structure. */
    spu_ea = arg1;
    mfc_get(&spu, spu_ea, sizeof(spustr_t), TAG, 0, 0);
    wait_for_completion(TAG);

    for (;;) {
        /* A screen address, or 0 to stop. */
        buffer_ea = spu_read_signal1();
        if (buffer_ea == 0)
            break;

        /* Refresh the structure before each frame. */
        mfc_get(&spu, spu_ea, sizeof(spustr_t), TAG, 0, 0);
        wait_for_completion(TAG);

        draw_frame(buffer_ea);

        send_response(1);
        wait_for_completion(TAG);
    }

    /* Properly exit the thread. */
    spu_thread_exit(0);
    return 0;
}
Exemplo n.º 17
0
/*
 * SPU entry point for the fractal renderer.
 *
 * The argv argument is populated with the address that the PPE provided,
 * from the 4th argument to spe_context_run(). A struct spe_args is fetched
 * from it under tag 0, render_fractal is run eight times with offsets of
 * 0 and 7/8 down to 1/8 of args.fractal.delta, a partly filled point
 * buffer is flushed if there is one, and call/tick statistics are printed.
 *
 * speid and envp are not used. Always returns 0.
 */
int main(uint64_t speid, uint64_t argv, uint64_t envp)
{
    struct spe_args args __attribute__((aligned(SPE_ALIGN)));

    mfc_get(&args, argv, sizeof(args), 0, 0, 0);

    mfc_write_tag_mask(1 << 0);
    mfc_read_tag_status_all();

    /* Reset the counters and start the decrementer from all ones; the
     * elapsed ticks are derived from it at the end. */
    cmap_calls = 0;
    dma_puts = 0;
    spu_write_decrementer(-1);

    // Run multiple renders with offsets.  Should be factored into render_fractal()
    render_fractal(&args.fractal, args.thread_idx, args.n_threads, 0.);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 7 / 8);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 3 / 4);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 5 / 8);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta / 2);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta * 3 / 8);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta / 4);
    render_fractal(&args.fractal, args.thread_idx, args.n_threads,
                   args.fractal.delta / 8);

    // Send remaining points: a buffer holds 2048 points, so a non-zero
    // remainder means the last buffer is only partly filled.
    if(fill%2048) {
        // select the last buffer used
        int f = fill / 2048;
        // the whole 16384-byte buffer is sent; the count follows by mailbox
        mfc_put(&points[f*2048], (uint)args.fractal.pointbuf[f], 16384, 0, 0, 0);
        // Block for completion
        mfc_write_tag_mask(1<<0);
        mfc_read_tag_status_all();
        // Send a message with top bit set to indicate final item.
        // The bit is built from an unsigned 1: 1<<31 overflows a signed int.
        spu_write_out_intr_mbox((1u<<31)|f);
        // Send another message indicating count
        spu_write_out_intr_mbox(fill%2048);
        ++dma_puts;
    }

    // Report some stats; the decrementer counts down from -1, so this is
    // the number of ticks elapsed since it was written above.
    uint ticks = -1 - spu_read_decrementer();
    printf("cmap calls %d ticks %u calls/tick %f\n",
           cmap_calls, ticks, (double)cmap_calls/ticks );
    printf("dma puts %d\n", dma_puts);

    return 0;
}
Exemplo n.º 18
0
/**
 * Fetches the argument block from main memory, waits for it, and unpacks
 * it: both conc descriptors get the base address from argv.arg[0] and a
 * length of NSPEC, TIME and DT come from arg[1] and arg[2].
 *
 * @param argvp main-memory address of the argument block
 * @param rows  receives argv.arg[3].u32[0]
 */
void get_chemistry_argv(uint64_t argvp, uint32_t* rows)
{
    int slot;

    mfc_get(&argv, argvp, sizeof(spe_argv_t), GET_ARG_TAG_MASK, 0, 0);
    wait_for_dma(GET_ARG_TAG_MASK);

    /* Slot 0 comes from the argument block ... */
    conc[0].ea_base = argv.arg[0].u64;
    conc[0].length  = NSPEC;

    /* ... and every further slot (just slot 1) mirrors it. */
    for (slot = 1; slot < 2; slot++)
    {
        conc[slot].ea_base = conc[0].ea_base;
        conc[slot].length  = conc[0].length;
    }

    TIME  = argv.arg[1].dbl;
    DT    = argv.arg[2].dbl;
    *rows = argv.arg[3].u32[0];
}
Exemplo n.º 19
0
/*
 * SPU entry point of the image scaler.
 *
 * argp carries the number of SPUs and envp the processing mode. In each
 * round the SPU reads NUM_STREAMS / num_spus image addresses from its
 * inbound mailbox, fetches each `struct image` by DMA, processes it with
 * the routine selected by the mode (unknown modes fall back to the simple
 * routine) and finally posts DONE to the interrupting outbound mailbox.
 *
 * A zero read from the mailbox ends the program with return value 0.
 */
int main(uint64_t speid, uint64_t argp, uint64_t envp){
	struct image my_image __attribute__ ((aligned(16)));
	unsigned int data[NUM_STREAMS];
	unsigned int num_spus = (unsigned int)argp;
	unsigned int i, num_images;
	int mode = (int)envp;

	(void)speid; /* silences the unused-parameter warning */

	for (;;) {
		/* Collect this round's addresses; NUM_STREAMS is assumed to
		 * be a multiple of num_spus. */
		num_images = 0;
		for (i = 0; i < NUM_STREAMS / num_spus; i++){
			/* spin until the mailbox holds an entry */
			while (spu_stat_in_mbox() == 0)
				;
			data[i] = spu_read_in_mbox();
			if (data[i] == 0)
				return 0;
			num_images++;
		}

		for (i = 0; i < num_images; i++){
			/* Bring in the image descriptor and wait for it. */
			mfc_get(&my_image, data[i], sizeof(struct image), MY_TAG, 0, 0);
			mfc_write_tag_mask(1 << MY_TAG);
			mfc_read_tag_status_all();

			switch(mode){
				case MODE_2LINES:
					process_image_2lines(&my_image);
					break;
				case MODE_DOUBLE:
					process_image_double(&my_image);
					break;
				case MODE_DMALIST:
					process_image_dmalist(&my_image);
					break;
				case MODE_SIMPLE:
				default:
					process_image_simple(&my_image);
					break;
			}
		}

		/* Report the end of the round. */
		data[0] = DONE;
		spu_write_out_intr_mbox(data[0]);
	}
}
/*
 * Resets the group cache: zeroes the key and insert-position vectors of
 * every group (and the length counters when GET_CACHE_STATS is defined),
 * loads the first value entry from myCellOGRCoreArgs.upchoose by DMA and
 * replicates it into every other entry.
 *
 * NOTE(review): no tag mask is written here before the wait; presumably it
 * has been set for DMA_ID elsewhere - confirm.
 */
static void cleargroups(void)
{
  unsigned g;
  unsigned entry;

  g = 0;
  while (g < GROUPS_COUNT)
  {
    group_keysvectors[g] = spu_splats((u16) 0);
    group_insertpos[g]   = spu_splats((u32) 0);
#ifdef GET_CACHE_STATS
    group_length[g]      = 0;
#endif
    g++;
  }

  /* With all keys zeroed every entry refers to group 0, so each entry is
     filled with the real data for group 0: fetch it once ... */
  mfc_get(group_values[0][0], myCellOGRCoreArgs.upchoose, GROUP_ELEMENTS * 2, DMA_ID, 0, 0);
  mfc_read_tag_status_all();

  /* ... then copy it to the remaining GROUPS_COUNT * GROUPS_LENGTH - 1
     entries, all addressed through the first row. */
  for (entry = 1; entry < GROUPS_COUNT * GROUPS_LENGTH; entry++)
  {
    memcpy(group_values[0][entry], group_values[0][0], GROUP_ELEMENTS * 2);
  }
}
Exemplo n.º 21
0
/**
 * Fetches the argument block from main memory under tag 31, waits for it,
 * and unpacks it. The whole call is bracketed by the metrics.comm timer.
 *
 * Both conc descriptors receive the base address from argv.arg[0] and a
 * length of NSPEC; TIME and DT come from arg[1] and arg[2].
 *
 * @param argvp main-memory address of the argument block
 * @param rows  receives argv.arg[3].u32[0]
 */
void get_chemistry_argv(uint64_t argvp, uint32_t* rows)
{
    enum { ARGV_TAG = 31 };

    timer_start(&metrics.comm);

    mfc_get(&argv, argvp, sizeof(spe_argv_t), ARGV_TAG, 0, 0);
    wait_for_dma(ARGV_TAG);

    /* Base addresses: slot 1 mirrors slot 0. */
    conc[0].ea_base = argv.arg[0].u64;
    conc[1].ea_base = conc[0].ea_base;

    /* Lengths: likewise. */
    conc[0].length  = NSPEC;
    conc[1].length  = conc[0].length;

    /* Scalars. */
    TIME  = argv.arg[1].dbl;
    DT    = argv.arg[2].dbl;
    *rows = argv.arg[3].u32[0];

    timer_stop(&metrics.comm);
}
Exemplo n.º 22
0
/*
 * Loads spu_arguments from main memory (tag 0), derives this SPU's channel
 * range, presets the size field of every sample DMA list entry and prints
 * a short banner.
 *
 * argp: main-memory address of the argument structure; only its low
 *       32 bits are used.
 *
 * The channel range is [first_channel, last_channel): NR_CHANNELS split
 * evenly over NR_SPUS by spu_id. The buffer-size lines are printed by
 * SPU 0 only.
 */
static void init(unsigned long long argp) {
  mfc_get(&spu_arguments, (unsigned) argp, sizeof(spu_arguments), 0, 0, 0);
  mfc_write_tag_mask(1 << 0);
  mfc_read_tag_status_all();

  first_channel =  spu_arguments.spu_id      * NR_CHANNELS / NR_SPUS;
  last_channel  = (spu_arguments.spu_id + 1) * NR_CHANNELS / NR_SPUS;

  /* Every list entry moves one samples[0][0] element. */
  for(int i=0; i<NR_STATIONS; i++) {
    samples_dma_list[i].size = sizeof(samples[0][0]);
  }

  if(spu_arguments.spu_id == 0) {
    /* sizeof yields size_t, while %ld expects long and %p expects void *;
       the casts make the arguments match the format strings. */
    printf("SPU sample dma size = %ld bytes\n", (long) sizeof(samples[0][0]));
    printf("SPU in buffers = %ld KB @ %p, out buffers = %ld B @ %p\n", (long) (sizeof(samples) / 1024), (void *) samples, (long) sizeof(visibilities), (void *) visibilities);
  }

  printf("I am spu %d, calculating channels %3d - %3d\n", spu_arguments.spu_id, first_channel, last_channel);
}
Exemplo n.º 23
0
/*
 * Loads the convolution kernel into local store and, if requested,
 * transforms it to the frequency domain once.
 *
 * fc:       parameters; elements, ea_kernel and transform_kernel are read.
 * p_kernel: local store destination for the kernel.
 * obj:      FFT object handed to cml_ccfft1d_ip_f.
 * buf:      scratch buffer handed to cml_ccfft1d_ip_f.
 *
 * The transfer is elements * 2 * sizeof(float) bytes, issued under tag 31
 * and waited on before returning.
 *
 * NOTE(review): the FFT runs on the global `coeff`, not on p_kernel;
 * presumably callers pass coeff as p_kernel - confirm.
 */
void initialize(
  Fastconv_params* fc,
  void*            p_kernel, 
  fft1d_f*         obj,
  void*            buf)
{
  unsigned int size = fc->elements*2*sizeof(float);

  // The kernel matches the input and output size
  mfc_get(p_kernel, fc->ea_kernel, size, 31, 0, 0);
  // Unsigned 1: shifting a signed 1 by 31 overflows int.
  mfc_write_tag_mask(1u<<31);
  mfc_read_tag_status_all();

  if (fc->transform_kernel)
  {
    // Perform the forward FFT on the kernel, in place.  This only need 
    // be done once -- subsequent calls will utilize the same kernel.
    cml_ccfft1d_ip_f(obj, (float*)coeff, CML_FFT_FWD, buf);
  }
}
Exemplo n.º 24
0
Arquivo: spu_dma.c Projeto: ALaDyn/psc
/*
 * Blocking DMA get: copies `size` bytes from main-memory address `ea` into
 * local store at `ls` and returns once the transfer has completed.
 *
 * The local address, the main-memory address and the size must each be a
 * multiple of 16; this is enforced with assert. A tag is taken from
 * get_tagid() for the duration of the call and handed back with
 * put_tagid().
 */
void spu_dma_get(volatile void *ls, unsigned long long ea, unsigned long size)
{
  int tagid;

  /* 16-byte alignment of both ends and a 16-byte multiple as size. */
  assert(((unsigned long)ls & 15) == 0);
  assert((ea & 15) == 0);
  assert((size & 15) == 0);

  /* A negative value from get_tagid() is treated as fatal. */
  tagid = get_tagid();
  assert(tagid >= 0);

  mfc_get(ls, ea, size, tagid, 0, 0);
  wait_tagid(tagid);

  put_tagid(tagid);
}
Exemplo n.º 25
0
/*
 * SPU entry point of the kernel dispatcher: loads `args` from argp, then
 * executes commands read from the inbound mailbox until the exit command
 * arrives.
 *
 * After every command other than exit - including unrecognised ones, which
 * are reported on stderr - SPU2_MSG_SPU_TO_PPU_DONE is written to the
 * outbound mailbox. Returns 0 on the exit command.
 */
int main(ull id, ull argp, ull envp)
{
	unsigned int cmd;

	/* Fetch the argument block and wait for it. */
	mfc_get(&args, argp, sizeof(args), TAG, 0, 0);
	mfc_write_tag_mask(1 << TAG);
	mfc_read_tag_status_all();

	for (;;) {
		cmd = spu_read_in_mbox();

		/* Exit is checked before dispatch; no DONE is sent for it. */
		if (unlikely(SPU2_MSG_PPU_TO_SPU_EXIT == cmd))
			return 0;

		switch (cmd) {
		case SPU2_MSG_PPU_TO_SPU_DO_TRIAD:
			triad();
			break;
		case SPU2_MSG_PPU_TO_SPU_DO_ADD:
			add();
			break;
		case SPU2_MSG_PPU_TO_SPU_DO_SCALE:
			scale();
			break;
		case SPU2_MSG_PPU_TO_SPU_DO_COPY:
			copy();
			break;
		default:
			fprintf(stderr, " [SPU]: Invalid command received in mailbox\n");
		}

		spu_write_out_mbox(SPU2_MSG_SPU_TO_PPU_DONE);
	}
}
Exemplo n.º 26
0
/*
 * SPU entry point of the DMA-list example.
 *
 * Fetches a control block from argp, then walks the assigned chunks
 * NUM_LIST_ELEMENTS at a time: a DMA list gathers the chunks into
 * local_buffer_in, process_data_simd produces local_buffer_out, and a
 * second DMA list scatters the result back to main memory. Every transfer
 * is waited on before the next step.
 *
 * speid and envp are unused; argp is the main-memory address of the
 * control block. Returns 0 on success, 1 if no MFC tag could be reserved.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp, 
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned int tag;
  unsigned long long in_addr, out_addr;
  unsigned int i, num_chunks;
  mfc_list_element_t* dma_list_in;
  unsigned int tmp_addr;

#ifdef USE_TIMER
  uint64_t start, time_working;
    
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* First, we reserve a MFC tag for use */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* Place the input DMA list inside the input buffer itself, at its tail:
   * the address is "end of buffer" minus "size of the list".
   * NOTE(review): if local_buffer_in has type float* (or float[]), both
   * offsets below are scaled by sizeof(float) through pointer arithmetic -
   * confirm the declared type of local_buffer_in. */
  tmp_addr = (unsigned int)((local_buffer_in + sizeof(float)*CHUNK_SIZE * NUM_LIST_ELEMENTS) - 
      (sizeof (mfc_list_element_t) * NUM_LIST_ELEMENTS));
  dma_list_in = (mfc_list_element_t*) (tmp_addr);

  /* issue DMA transfer to get the control block information from 
   * system memory */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);

  /* wait for the DMA get to complete */ 
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();

  /* calculate the number of blocks (chunks) that this spe is assigned 
   * to process */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  /*
   * This is the main loop. It goes through the num_chunks chunks of data,
   * NUM_LIST_ELEMENTS at a time. Each list element moves CHUNK_SIZE floats.
   * Data is moved into local store, processed, and written back to system
   * memory, NUM_LIST_ELEMENTS chunks per loop iteration.
   */
  for (i = 0; i <num_chunks; i+= NUM_LIST_ELEMENTS)
  {
    /* set the in_addr and out_addr variables, we will use these for
     * issuing DMA get and put commands */
    in_addr = control_block.in_addr + (i * CHUNK_SIZE * sizeof (float));
    out_addr = control_block.out_addr + (i * CHUNK_SIZE * sizeof (float));

    /* fill the dma list with the appropriate lower 32bit effective address and size for
     * each dma list element. This dma list is used to gather the input data 
     * from system memory */
    fill_dma_list (dma_list_in, NUM_LIST_ELEMENTS, in_addr, CHUNK_SIZE * sizeof(float)); 

    /* issue a DMA get list command to gather the NUM_LIST_ELEMENTS chunks of data from system memory.
     * The data will be gathered into local buffer local_buffer_in */
    mfc_getl (local_buffer_in, in_addr, dma_list_in, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), tag, 0, 0);

    /* wait for the DMA get list command to complete */
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    /* invoke process_data_simd to work on the data that's just been moved into local store */
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE * NUM_LIST_ELEMENTS);

    /* fill the dma list with the appropriate lower 32 bit ea and size for each
     * dma list element. This dma list is used to scatter the output data to system memory.
     * dma_list_out is not a local of this function; it is declared elsewhere. */
    fill_dma_list (dma_list_out, NUM_LIST_ELEMENTS, out_addr, CHUNK_SIZE * sizeof(float)); 

    /* issue the DMA put list command to scatter the result from local memory to 
    * different places in system memory */
    mfc_putl (local_buffer_out, out_addr, dma_list_out, NUM_LIST_ELEMENTS * sizeof(mfc_list_element_t), 
        tag, 0, 0);

    /* wait for the DMA put list to complete */
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

  }

#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  printf ("SPE time_working = %lld\n", time_working);
#endif /* USE_TIMER */

  return 0;
}
Exemplo n.º 27
0
void process_data_simd (float* buf_in, float* buf_out, unsigned int size)
{
  unsigned int i;
  vector float *vbuf_in, *vbuf_out;
  vector float v1 = (vector float) {1.0f, 1.0f, 1.0f, 1.0f};
  vbuf_in = (vector float*) buf_in;
  vbuf_out = (vector float*) buf_out;

  for (i = 0; i < (size / 4); i++)
  {
   vbuf_out[i] = spu_add (vbuf_in[i], v1); 
  }
}

/*
 * SPU entry point of the single-buffer example.
 *
 * Fetches a control block from argp, then handles the assigned data one
 * chunk of CHUNK_SIZE floats at a time: DMA get into local_buffer_in,
 * process_data_simd into local_buffer_out, DMA put back to main memory.
 * Every transfer is waited on before the next step starts.
 *
 * speid and envp are unused; argp is the main-memory address of the
 * control block. Returns 0 on success, 1 if no MFC tag could be reserved.
 */
int main(unsigned long long speid __attribute__ ((unused)),
         unsigned long long argp,
         unsigned long long envp __attribute__ ((unused)))
{
  unsigned long long in_addr, out_addr;
  unsigned int tag;
  int chunk, num_chunks;

#ifdef USE_TIMER
  uint64_t start, time_working;
  spu_slih_register (MFC_DECREMENTER_EVENT, spu_clock_slih);
  spu_clock_start();
  start = spu_clock_read();
#endif /* USE_TIMER */

  /* All transfers below share one reserved MFC tag. */
  tag = mfc_tag_reserve();
  if (tag == MFC_TAG_INVALID)
  {
    printf ("SPU ERROR, unable to reserve tag\n");
    return 1;
  }

  /* Bring in the control block and wait for it. */
  mfc_get (&control_block, argp, sizeof (control_block_t), tag, 0, 0);
  mfc_write_tag_mask (1 << tag);
  mfc_read_tag_status_all ();

  /* Only whole chunks are processed. */
  num_chunks = control_block.num_elements_per_spe/CHUNK_SIZE;

  chunk = 0;
  while (chunk < num_chunks)
  {
    /* Main-memory addresses of this chunk's input and output. */
    in_addr = control_block.in_addr + (chunk* CHUNK_SIZE * sizeof(float));
    out_addr = control_block.out_addr + (chunk * CHUNK_SIZE * sizeof(float));

    /* Fetch the chunk ... */
    mfc_get (local_buffer_in, in_addr, CHUNK_SIZE * sizeof(float),
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    /* ... work on it in local store ... */
    process_data_simd (local_buffer_in, local_buffer_out, CHUNK_SIZE);

    /* ... and write the result back, waiting so that the buffers are free
     * for the next pass. */
    mfc_put (local_buffer_out, out_addr, CHUNK_SIZE * sizeof(float),
        tag, 0, 0);
    mfc_write_tag_mask (1 << tag);
    mfc_read_tag_status_all ();

    chunk++;
  }

#ifdef USE_TIMER
  time_working = (spu_clock_read() - start);
  spu_clock_stop();
  printf ("SPE time_working = %lld\n", time_working);
#endif /* USE_TIMER */

  return 0;
}
Exemplo n.º 28
0
///Blocking get of a small, possibly unaligned region. It should not be used frequently, only for small data.
///The data is staged in an aligned temporary buffer at the same low-4-bit offset as ea, waited for, and then
///copied byte by byte to ls. size must be below 32 bytes (asserted). Always returns 0.
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
{
	
	btAssert(size<32);
	
	///the data starts at offset (ea & 15), i.e. up to 15, and is up to 31 bytes long, so as many as
	///46 bytes of staging space are touched; 32 bytes were not enough for that
	ATTRIBUTE_ALIGNED16(char	tmpBuffer[48]);

	char* mainMem = (char*)ea;
	char* localStore = (char*)ls;
	uint32_t i;
	

	///make sure last 4 bits are the same, for cellDmaSmallGet
	uint32_t last4BitsOffset = ea & 0x0f;
	char* tmpTarget = tmpBuffer + last4BitsOffset;
	
#if defined (__SPU__) || defined (USE_LIBSPE2)
	
	int remainingSize = size;

//#define FORCE_cellDmaUnalignedGet 1
#ifdef FORCE_cellDmaUnalignedGet
	cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
#else
	char* remainingTmpTarget = tmpTarget;
	uint64_t remainingEa = ea;

	///split the request into transfers of 16, 8, 4, 2 or 1 bytes, largest first
	///NOTE(review): the pieces are issued at whatever offset ea has; confirm that the MFC accepts
	///each piece size at that alignment
	while (remainingSize)
	{
		switch (remainingSize)
		{
		case 1:
		case 2:
		case 4:
		case 8:
		case 16:
			{
				///a natural size: one transfer finishes the job
				mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0);
				remainingSize=0;
				break;
			}
		default:
			{
				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
				///largest natural size strictly below what is left; sizes reaching this branch are
				///at least 3, so actualSize is always set to 2 or more
				int actualSize = 0;

				if (remainingSize > 16)
					actualSize = 16;
				else
					if (remainingSize >8)
						actualSize=8;
					else
						if (remainingSize >4)
							actualSize=4;
						else
							if (remainingSize >2)
								actualSize=2;
				mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0);
				remainingSize-=actualSize;
				remainingTmpTarget+=actualSize;
				remainingEa += actualSize;
			}
		}
	}
#endif//FORCE_cellDmaUnalignedGet

#else
	//no DMA engine on this build: copy straight from the source address into the staging buffer
#ifdef USE_MEMCPY
		memcpy(tmpTarget,mainMem,size);
#else
		for ( i=0;i<size;i++)
		{
			tmpTarget[i] = mainMem[i];
		}
#endif //USE_MEMCPY

#endif

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i=0; btLikely( i<size );i++)
	{
		localStore[i] = tmpTarget[i];
	}

	return 0;
}
Exemplo n.º 29
0
Arquivo: spu_dma.c Projeto: ALaDyn/psc
/*
 * Issue a DMA get of `size` bytes from effective address `ea` into the
 * buffer `ls`, tagged with `tag_preload`.
 *
 * wait_tagid(tag_preload) is called before the transfer is issued --
 * presumably so that any earlier transfer on the same tag has finished
 * before the tag is reused (confirm against wait_tagid's definition).
 *
 * This function does not wait for the transfer it issues, so on return the
 * contents of `ls` are not yet guaranteed to be valid.
 */
void
loop_preload_particle(volatile void *ls, unsigned long long ea, unsigned long size)
{
  wait_tagid(tag_preload);
  mfc_get(ls, ea, size, tag_preload, 0, 0);
}
Exemplo n.º 30
0
/*
 * SPE entry point: block-wise sort / merge of `record` arrays that live in
 * main memory. All DMA traffic in this function uses tag 31.
 *
 * The work description is fetched from argp.ull into the file-level `args`
 * (128 bytes), then args.sortType selects one of three modes:
 *
 *   0 - for every block index l in [args.i_initial, args.i_final): load
 *       bsize records from args.inA, run phase1() on them and store the
 *       bsize resulting records at the same block index in args.out.
 *   1 - repeated pairwise merging. For p0 = 1, 2, 4, ... while
 *       p0 < args.blocks, adjacent runs of p0 blocks are merged. Source and
 *       destination arrays are swapped at the start of every pass; the
 *       first pass reads args.inA and writes args.out.
 *   2 - one single merge of two runs of args.blocks/2 blocks each, starting
 *       at block args.i_initial, read from args.inA and written to args.out.
 *
 * Any other sortType does nothing. The function always returns 0.
 *
 * The NEVER-guarded code is a scalar reference version (it emits the record
 * with the larger key first). Assuming NEVER is not defined, the compiled
 * path calls phase1 / phase22 / phase21 instead -- presumably optimized
 * equivalents; confirm against their definitions.
 */
int main(unsigned long long speid, addr64 argp, addr64 envp) 
{
	unsigned long long dummy;

	int l ;
	int p0, p1 ;
	/* i1/i2: current input block index of the first/second run; i3: output block index */
	int i1, i2, i3 ;
	/* j1/j2/j3: record position inside the local buffers that go with i1/i2/i3 */
	int j1, j2, j3 ;
	/* speid and envp are not used by any live code below; these assignments merely reference them */
	dummy = envp.ull ;
	dummy = speid ;


	// get arguments
	mfc_get((void*)&args, argp.ull, 128, 31, 0, 0);
	waitfor_matrix_io  ( 31 );


//	printf("SPE(%lld): Data received is: %d %d %d %d\n", speid, (int)args.inA
//	, (int)args.out, (int)args.i_initial, (int)args.i_final );
//	printf("SPE(%lld): Data received is: %x %x %d %d\n", speid, (int)args.inA
//	, (int)args.out, (int)args.i_initial, (int)args.i_final );
//	printf("SPE(%lld): size= %d \n", speid, (int)args.out-(int)args.inA );
//	fflush(stdout);



	if ( args.sortType == 0 ) {
		/* Mode 0: process each block of bsize records independently. */
		for( l=args.i_initial; l<args.i_final; l++ ) {
			/* fetch block l of the input array into darrayA0 */
			getlarge( (void*)&darrayA0, (unsigned long)(args.inA)+(l*bsize*sizeof(record)), bsize*sizeof(record), 31 );
			waitfor_matrix_io  ( 31 );

			array0 = &darrayC0 ;  array1 = &darrayA0 ;
# ifdef NEVER
			/* scalar reference: bottom-up merge passes over the block, swapping the two buffers each pass */
			{
			int j, k, k2 ;
			for(k=2; k<=bsize; k<<=1 ) {  k2 = k/2 ;
				array2 = array0 ;  array0 = array1 ;  array1 = array2 ;

				for(j=0; j<bsize; j+=k ) {
					j1=j; j2=j+k2;  j3=j;
					while ( j1<j+k2 && j2<j+k ) {
						if ( (*array0)[j1].key > (*array0)[j2].key ) {
							(*array1)[j3] = (*array0)[j1]; j3++; j1++;
						} else {
							(*array1)[j3] = (*array0)[j2]; j3++; j2++;
						}
					}
					while ( j1<j+k2 ) {
						(*array1)[j3] = (*array0)[j1]; j3++; j1++; 
						}
					while ( j2<j+k  ) { 
						(*array1)[j3] = (*array0)[j2]; j3++; j2++;
						}
				}

			}
			}
# else
# ifdef NEVER
			array1 = phase1C(array0,array1,bsize);
# else
			/* phase1 returns the buffer that is written back below */
			array1 = phase1(array0,array1);
# endif
# endif
			
			/* store the buffer selected by array1 as block l of the output array */
			putlarge( (void*)&((*array1)[0]), (unsigned long)(args.out)+(l*bsize*sizeof(record)), bsize*sizeof(record), 31 );
			waitfor_matrix_io  ( 31 );
		}
	} else if ( args.sortType == 1 ) {
		/* Mode 1: repeated pairwise merge passes with doubling run length p0. */
		arrayA = (unsigned long)args.out;
		arrayB = (unsigned long)args.inA;

		for( p0=1; p0<args.blocks; p0<<=1 ) {
			/* swap source (arrayA) and destination (arrayB); after the first swap arrayA is args.inA */
			arrayC=arrayA; arrayA=arrayB; arrayB=arrayC; 
		for( p1=args.i_initial; p1<args.i_final; p1+=(p0*2) ) {
			/* merge run [p1, p1+p0) with run [p1+p0, p1+2*p0); output starts at block p1 */
			i1 = p1 ;  i2 = p1+p0;  i3 = p1;
	
			/* load the first block of each run; both transfers share the single wait below */
			getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 );
			array0 = &darrayA0 ; j1=0;
			getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 );
			array1 = &darrayB0 ; j2=0;
			waitfor_matrix_io  ( 31 );
			array2 = &darrayC0 ; j3=0;
		
			/* both runs still have blocks left */
			while ( i1<(p1+p0) && i2<(p1+2*p0) ) {
# ifdef NEVER
				if ( (*array0)[j1].key > (*array1)[j2].key ) {
					(*array2)[j3] = (*array0)[j1]; j3++; j1++;
				} else {
					(*array2)[j3] = (*array1)[j2]; j3++; j2++;
				}
# else
# ifdef NEVER
				phase22C(array0,array1,array2,&j1,&j2,&j3,bsize);
# else
				/* advances j1/j2/j3 through the pointers; the checks below react to whichever reached bsize */
				phase22(array0,array1,array2,&j1,&j2,&j3,bsize);
# endif
# endif
				/* first input buffer consumed: move to the next block of run 1, refill unless the run is done */
				if ( j1>=bsize ) {
					i1++;
					if ( i1<(p1+p0) ) {
						getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j1=0;
					}
				}
				/* second input buffer consumed: same for run 2 */
				if ( j2>=bsize ) {
					i2++;
					if ( i2<(p1+2*p0) ) {
						getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j2=0;
					}
				}
				/* output buffer full: write it to block i3 of the destination and start a new output block */
				if ( j3>=bsize ) {
					if ( i3<(p1+2*p0) ) {
						putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j3=0;
					}
					i3++;
				}
			}
			
			/* run 2 is exhausted: drain what is left of run 1 */
			while ( i1<(p1+p0) ) {
# ifdef NEVER
				(*array2)[j3] = (*array0)[j1]; j3++; j1++;
# else
# ifdef NEVER
				phase21C(array0,array2,&j1,&j3,bsize);
# else
				phase21(array0,array2,&j1,&j3,bsize);
# endif
# endif
				if ( j1>=bsize ) {
					i1++;
					if ( i1<(p1+p0) ) {
						getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j1=0;
					}
				}
				if ( j3>=bsize ) {
					if ( i3<(p1+2*p0) ) {
						putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j3=0;
					}
					i3++;
				}
			}
			
			/* run 1 is exhausted: drain what is left of run 2 */
			while ( i2<(p1+2*p0) ) {
# ifdef NEVER
				(*array2)[j3] = (*array1)[j2]; j3++; j2++;
# else
# ifdef NEVER
				phase21C(array1,array2,&j2,&j3,bsize);
# else
				phase21(array1,array2,&j2,&j3,bsize);
# endif
# endif
				if ( j2>=bsize ) {
					i2++;
					if ( i2<(p1+2*p0) ) {
						getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j2=0;
					}
				}
				if ( j3>=bsize ) {
					if ( i3<(p1+2*p0) ) {
						putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j3=0;
					}
					i3++;
				}
			}
		
		}}
	
	} else if ( args.sortType == 2 ) {
			/* Mode 2: a single merge, always from args.inA to args.out (no buffer swapping). */
			arrayA = (unsigned long)args.inA;
			arrayB = (unsigned long)args.out;
	
			/* two runs of args.blocks/2 blocks each, the first starting at block args.i_initial */
			p0 = args.blocks/2 ;  p1 = args.i_initial ;
			i1 = p1;  i2 = p1+p0;  i3 = p1;
	
			/* load the first block of each run; both transfers share the single wait below */
			getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 );
			array0 = &darrayA0 ; j1=0;
			getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 );
			array1 = &darrayB0 ; j2=0;
			waitfor_matrix_io  ( 31 );
			array2 = &darrayC0 ; j3=0;
		
			/* both runs still have blocks left */
			while ( i1<(p1+p0) && i2<(p1+2*p0) ) {
# ifdef NEVER
				if ( (*array0)[j1].key > (*array1)[j2].key ) {
					(*array2)[j3] = (*array0)[j1]; j3++; j1++;
				} else {
					(*array2)[j3] = (*array1)[j2]; j3++; j2++;
				}
# else
# ifdef NEVER
				phase22C(array0,array1,array2,&j1,&j2,&j3,bsize);
# else
				phase22(array0,array1,array2,&j1,&j2,&j3,bsize);
# endif
# endif
				/* refill an input buffer once it is consumed, unless its run is done */
				if ( j1>=bsize ) {
					i1++;
					if ( i1<(p1+p0) ) {
						getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j1=0;
					}
				}
				if ( j2>=bsize ) {
					i2++;
					if ( i2<(p1+2*p0) ) {
						getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j2=0;
					}
				}
				/* output buffer full: write it to block i3 of the destination */
				if ( j3>=bsize ) {
					if ( i3<(p1+2*p0) ) {
						putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j3=0;
					}
					i3++;
				}
			}
			
			/* run 2 is exhausted: drain what is left of run 1 */
			while ( i1<(p1+p0) ) {
# ifdef NEVER
				(*array2)[j3] = (*array0)[j1]; j3++; j1++;
# else
# ifdef NEVER
				phase21C(array0,array2,&j1,&j3,bsize);
# else
				phase21(array0,array2,&j1,&j3,bsize);
# endif
# endif
				if ( j1>=bsize ) {
					i1++;
					if ( i1<(p1+p0) ) {
						getlarge( (void*)&darrayA0, (unsigned long)(arrayA)+(i1*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j1=0;
					}
				}
				if ( j3>=bsize ) {
					if ( i3<(p1+2*p0) ) {
						putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j3=0;
					}
					i3++;
				}
			}
			
			/* run 1 is exhausted: drain what is left of run 2 */
			while ( i2<(p1+2*p0) ) {
# ifdef NEVER
				(*array2)[j3] = (*array1)[j2]; j3++; j2++;
# else
# ifdef NEVER
				phase21C(array1,array2,&j2,&j3,bsize);
# else
				phase21(array1,array2,&j2,&j3,bsize);
# endif
# endif
				if ( j2>=bsize ) {
					i2++;
					if ( i2<(p1+2*p0) ) {
						getlarge( (void*)&darrayB0, (unsigned long)(arrayA)+(i2*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j2=0;
					}
				}
				if ( j3>=bsize ) {
					if ( i3<(p1+2*p0) ) {
						putlarge( (void*)&darrayC0, (unsigned long)(arrayB)+(i3*bsize*sizeof(record)), bsize*sizeof(record), 31 );
						waitfor_matrix_io  ( 31 );  j3=0;
					}
					i3++;
				}
			}

	}



	return 0;
}