/* Example 1 */
/*
 * Map an ActiveBlock onto the 32x32 screen block at (bx, by) and start the
 * DMA transfers: write the previously held block back to main memory and
 * fetch the new block's pixels, both on DMA tag `tag`.
 *
 * block  - quadtree/screen block being activated; its pixel pointer is
 *          redirected into `active`'s local-store buffer.
 * active - local-store buffer pair (current_dma/new_dma blit lists) reused
 *          across blocks; active->ea_copy caches the last mapped address.
 * tag    - MFC DMA tag group used for both transfers (caller waits on it).
 */
void blockActivater(Block* block, ActiveBlock* active, int tag)
{
	unsigned int bx=block->bx, by=block->by;
	/* effective address of the block's top-left pixel: 32 lines down per
	 * block row, 128 bytes (32 pixels * 4 bytes) across per block column */
	unsigned long long ea = screen.address + screen.bytes_per_line*by*32+bx*128;

	block->pixels = (vec_uint4*) ((void*)&active->pixels[0]);

	/* buffer already holds this screen block -- no transfer needed */
	if (active->ea_copy == ea) {
//		printf("re-using same ea %llx in %x -> %x\n", ea, block, active);
		return;
	}
	
	active->ea_copy = ea;

	unsigned long stride = screen.bytes_per_line;
	unsigned int lines = 32;

	/////////////

	/* split the 64-bit effective address for the MFC interface */
	unsigned long eah = ea >> 32;
	unsigned long eal = ((unsigned long) (ea&0xffffffff));

	build_blit_list(active->new_dma, eal, stride);

	unsigned long old_size = active->current_length;
	unsigned long half_new_size = lines * 4;	/* half the list, in bytes */
	unsigned long new_size = half_new_size * 2;
	unsigned long store_new_size = new_size;

	unsigned long eal_old = (unsigned long) ((void*)active->current_dma);
	unsigned long eal_new = (unsigned long) ((void*)active->new_dma);

	// if this is an unused block, then we have no data to blit out
	// so to avoid branches, split the read block in half
//TODO: why do this fix work???
	unsigned long is_new = 0; //cmp_eq(old_size, 0);
	unsigned long cmd = if_then_else(is_new, MFC_GETL_CMD, MFC_PUTLF_CMD);
	eal_old = if_then_else(is_new, eal_new+half_new_size, eal_old);
	new_size = if_then_else(is_new, half_new_size, new_size);
	old_size = if_then_else(is_new, half_new_size, old_size);

#ifdef DEBUG_2
	/* NOTE(review): specifiers fixed to match the unsigned long and pointer
	 * arguments -- the previous %d/%x/%lx-on-pointer forms were undefined
	 * behavior per the C printf contract. */
	printf("old_size %lu, is_new %lu store_new %lu\n",
		(unsigned long) active->current_length, is_new&1, store_new_size);
	printf("DMA[%02lX]: ls=%p eah=%lx list=%lx, size=%lu, tag=%d\n",
		cmd, (void*)&active->pixels[0], eah, eal_old, old_size, tag);
	printf("DMA[%02X]: ls=%p eah=%lx list=%lx, size=%lu, tag=%d\n",
		MFC_GETLF_CMD, (void*)&active->pixels[0], eah, eal_new, new_size, tag);
#endif

	/* put the old contents out (fenced list PUT), then fetch the new block
	 * with a fenced list GET on the same tag so ordering is preserved */
	spu_mfcdma64(&active->pixels[0],eah,eal_old,old_size,tag, cmd);
	spu_mfcdma64(&active->pixels[0],eah,eal_new,new_size,tag, MFC_GETLF_CMD);

	// update the buffer pointers (swap the double-buffered blit lists)
	active->current_length = store_new_size;
	vec_uint4* t = active->current_dma; 
	active->current_dma = active->new_dma; 
	active->new_dma = t;
	active->eah = eah;
}
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* Undo the Paeth filter for 4-byte pixels using SSE2.
    *
    * Each output pixel d is predicted from its left neighbour a and the two
    * pixels above it:
    *   prev: c b
    *   row:  a d
    * The predictor is whichever of a, b, c lies closest to p = a+b-c.
    *
    * The first pixel has no left context, so it should use the Up filter
    * (p = b).  Rather than special-casing it, we seed a and c with zero --
    * that makes the main loop's p = a+b-c collapse to b on iteration one.
    * Concretely: b and d start at zero, and the loop rotates them into c
    * and a before loading fresh values.
    */
   png_debug(1, "in png_read_filter_row_paeth4_sse2");

   const __m128i zero = _mm_setzero_si128();
   __m128i up = zero;          /* b: pixel directly above d            */
   __m128i cur = zero;         /* d: pixel currently being rebuilt     */
   __m128i upleft;             /* c: pixel above-left of d             */
   __m128i left;               /* a: reconstructed pixel left of d     */

   int remaining = row_info->rowbytes;
   while (remaining > 0) {
      /* Widen to 16-bit lanes so the pc arithmetic cannot overflow. */
      upleft = up;
      left = cur;
      up  = _mm_unpacklo_epi8(load4(prev), zero);
      cur = _mm_unpacklo_epi8(load4(row ), zero);

      __m128i pa = _mm_sub_epi16(up, upleft);    /* p-a == a+b-c-a == b-c */
      __m128i pb = _mm_sub_epi16(left, upleft);  /* p-b == a+b-c-b == a-c */
      __m128i pc = _mm_add_epi16(pa, pb);        /* p-c == (b-c)+(a-c)    */

      pa = abs_i16(pa);   /* |p-a| */
      pb = abs_i16(pb);   /* |p-b| */
      pc = abs_i16(pc);   /* |p-c| */

      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Ties resolve in Paeth's favoured order: a, then b, then c. */
      __m128i predictor =
         if_then_else(_mm_cmpeq_epi16(smallest, pa), left,
         if_then_else(_mm_cmpeq_epi16(smallest, pb), up,
                                                     upleft));

      /* `_epi8` addition gives the modulo-256 wrap the format requires. */
      cur = _mm_add_epi8(cur, predictor);
      store4(row, _mm_packus_epi16(cur, cur));

      prev += 4;
      row  += 4;
      remaining -= 4;
   }
}
/* Example 3 */
void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // Reverse the Paeth filter with SSE2, bpp bytes per pixel.
    //
    // Pixel d is predicted from its left neighbour a and the two pixels
    // above it:
    //   prev: c b
    //   row:  a d
    // The prediction is whichever of a, b, c is nearest to p = a+b-c.
    //
    // The first pixel has no left context and must behave like the Up
    // filter (p = b).  Seeding b and d with zero achieves that for free:
    // they rotate into c and a at the top of the first iteration, so
    // p = a+b-c degenerates to b.
    const __m128i zero = _mm_setzero_si128();
    __m128i above = zero;   // b: pixel above the one being rebuilt
    __m128i here  = zero;   // d: pixel being rebuilt
    __m128i diag;           // c: pixel above-left
    __m128i left;           // a: reconstructed pixel to the left

    for (int remaining = row_info->rowbytes; remaining > 0;
         prev += bpp, row += bpp, remaining -= bpp) {
        // 16-bit lanes keep the pc computation free of overflow.
        diag = above;
        left = here;
        above = _mm_unpacklo_epi8(load<bpp>(prev), zero);
        here  = _mm_unpacklo_epi8(load<bpp>(row ), zero);

        __m128i pa = _mm_sub_epi16(above, diag);  // p-a == a+b-c-a == b-c
        __m128i pb = _mm_sub_epi16(left, diag);   // p-b == a+b-c-b == a-c
        __m128i pc = _mm_add_epi16(pa, pb);       // p-c == (b-c)+(a-c)

        pa = abs_i16(pa);   // |p-a|
        pb = abs_i16(pb);   // |p-b|
        pc = abs_i16(pc);   // |p-c|

        __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

        // Ties break in Paeth's canonical order: a beats b beats c.
        __m128i predictor =
            if_then_else(_mm_cmpeq_epi16(smallest, pa), left,
            if_then_else(_mm_cmpeq_epi16(smallest, pb), above,
                                                        diag));

        // `_epi8` addition wraps modulo 256 as the PNG spec requires.
        here = _mm_add_epi8(here, predictor);
        store<bpp>(row, _mm_packus_epi16(here, here));
    }
}
/* Example 4 */
//! Runs the complete multipole approximation pipeline on the main thread
//! only: an M2M bottom-up pass, a WSPD traversal (M2L + direct evaluation),
//! an L2L top-down pass, and finally L2P evaluation with force collection
//! over `nodePointPartition`.  Non-main threads return without doing any
//! work.  All functors are read from this thread's local context.
void FMEMultipoleKernel::multipoleApproxSingleThreaded(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	if (isMainThread())
	{									
		tree.bottom_up_traversal(					// do a bottom up traversal M2M pass
			if_then_else(tree.is_leaf_condition(),	// if the current node is a leaf
				p2m_function(localContext),			// then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext)			// else shift the coefficents of all children to center of the inner node
			)
		)(tree.root());
	
		tree.forall_well_separated_pairs(				// do a wspd traversal M2L direct eval
			pair_vice_versa(m2l_function(localContext)),// M2L for a well-separated pair
			p2p_function(localContext),					// direct evaluation
			p2p_function(localContext)					// direct evaluation
		)(tree.root());
	
		tree.top_down_traversal(						// top down traversal 
			if_then_else( tree.is_leaf_condition(),		// if the node is a leaf
				do_nothing(),							// then do nothing, we will deal with this case later
				l2l_function(localContext)				// else shift the nodes local coeffs to the children
			)
		)(tree.root());// start at the root 

		// evaluate all leaves and store the forces in the threads array
		for_loop(nodePointPartition,				// loop over points
			func_comp(								// composition of two statements
				l2p_function(localContext),			// evaluate the forces due to the local expansion in the corresponding leaf
				collect_force_function				// collect the forces of all threads with the following options:
				<
					COLLECT_REPULSIVE_FACTOR | 		// multiply by the repulsive factor stored in the global options
					COLLECT_TREE_2_GRAPH_ORDER |	// threads data is stored in quadtree leaf order, transform it into graph order
					COLLECT_ZERO_THREAD_ARRAY		// reset threads array
				>(localContext)
			)
		);
	};
};
/* Example 5 */
//! The final approximation algorithm which runs the WSPD parallel without
//! storing it in the threads' subtrees.  Phases, separated by sync()
//! barriers: (1) parallel M2M over each thread's tree partition, then the
//! top of the tree on the main thread; (2) main thread stores the WSPD
//! pairs / direct pairs / direct nodes; (3) M2L and direct (D2D/ND) passes
//! over the stored lists; (4) on-the-fly WSPD M2L + direct evaluation per
//! thread partition; (5) L2L top-down (main thread above the fence,
//! then each partition); (6) L2P evaluation and force collection.
void FMEMultipoleKernel::multipoleApproxFinal(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	// big multithreaded bottom up traversal.
	for_tree_partition(								// for all roots in the threads tree partition
		tree.bottom_up_traversal(					// do a bottom up traversal 
			if_then_else(tree.is_leaf_condition(),	// if the current node is a leaf
				p2m_function(localContext),			// then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext)			// else shift the coefficents of all children to center of the inner node
			)
		)
	);
	sync();
	// top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal(					// start a bottom up traversal 
			if_then_else(tree.is_leaf_condition(),	// if the current node is a leaf
				p2m_function(localContext),			// then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext)			// else shift the coefficents of all children to center of the inner node
			),
			not_condition(tree.is_fence_condition()))(tree.root());// start at the root, stop when the fence to the threads is reached

		tree.forall_well_separated_pairs(	// do a wspd traversal
			tree.StoreWSPairFunction(),		// store the ws pairs in the WSPD
			tree.StoreDirectPairFunction(), // store the direct pairs
			tree.StoreDirectNodeFunction(),	// store the direct nodes
			not_condition(tree.is_fence_condition()))(tree.root());
	};
	// wait for the main thread to finish
	sync();

	// M2L pass with the WSPD for the result of the single threaded pass above
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
	
	// D2D pass and store in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
	for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));

	// wait until all local coeffs and all direct forces are computed
	sync();

	// the rest of the WSPD can be done on the fly by the thread
	for_tree_partition(	
		tree.forall_well_separated_pairs(					// do a wspd traversal
			pair_vice_versa(m2l_function(localContext)),	// M2L for a well-separated pair
			p2p_function(localContext),						// direct evaluation
			p2p_function(localContext)						// direct evaluation
		)
	);	
	// wait until all local coeffs and all direct forces are computed
	sync();

	// big multithreaded top down traversal. top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal(						// top down traversal L2L pass
			if_then_else( tree.is_leaf_condition(),		// if the node is a leaf
				do_nothing(),							// then do nothing, we will deal with this case later
				l2l_function(localContext)				// else shift the nodes local coeffs to the children
			),
			not_condition(tree.is_fence_condition())	// stop when the fence to the threads is reached 
		)(tree.root());									// start at the root, 
	};
	// wait for the top of the tree
	sync();
	
	for_tree_partition(								// for all roots in the threads tree partition L2L pass
		tree.top_down_traversal(					// do a top down traversal 
			if_then_else( tree.is_leaf_condition(),	// if the node is a leaf
				do_nothing(),						// then do nothing, we will deal with this case later
				l2l_function(localContext)			// else shift the nodes local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync(); 
	// evaluate all leaves and store the forces in the threads array (Note we can store them in the global array but then we have to use random access)
	// we can start immediately to collect the forces because we evaluated before point by point 
	for_loop(nodePointPartition,				// loop over threads points
		func_comp(								// composition of two statements
			l2p_function(localContext),			// evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function				// collect the forces of all threads with the following options:
			<
				COLLECT_REPULSIVE_FACTOR | 		// multiply by the repulsive factor stored in the global options
				COLLECT_TREE_2_GRAPH_ORDER |	// threads data is stored in quadtree leaf order, transform it into graph order
				COLLECT_ZERO_THREAD_ARRAY		// reset threads array
			>(localContext)
		)
	);
};