Пример #1
0
void lab4check(char* folderPath, char* testName) {

	LARGE_INTEGER frequency;
	LARGE_INTEGER t1, t2;
	QueryPerformanceFrequency(&frequency);
	QueryPerformanceCounter(&t1);

	std::string path(folderPath);
	std::string fileName;

	printf("     read %s\n", testName);
	CCharBuffer text(fileName.assign(path).append(testName).c_str());

	printf("     check %s:\n", testName);
	CCharBuffer patterns[4*mLen*3*tests];
	GeneratePatterns(text, patterns);

	CSuffixArray sArr(text);

	const unsigned int cc = tests, bc = 3*tests, ac = mLen*bc, kt[4] = { 1, 1, 3, 1 }, taskMask[4] = { 1, 0, 0, 0 };

	for(unsigned int i=0; i<mLen; i++) {
		printf("        length:\t%u\n", m[i]);

		printf("          suffix tree:");
		for(unsigned int k=0; k<4; k++) if((taskMask[k] != 0) &&((k!=3) || (i<3))) {
			QueryPerformanceCounter(&t1);
			for(unsigned int l=0; l<kt[k]; l++)
				for(unsigned int j=0; j<tests; j++)
					binarySearch(sArr, patterns[k*ac+i*bc+l*cc+j]);
			QueryPerformanceCounter(&t2);
			printf("\t%.2lf ms", (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart);
		}
		printf("\n");

	}
	printf("\n");
}
Пример #2
0
  void ff(TextureQuantizeRAW,format,A)(RESOURCEINFO &texo, RESOURCEINFO &texd, ULONG *texs, ULONG *texr, int level, int l) {
    /* square dimension of this surface-level */
    /* square area of this surface-level */
    const int lv = (1 << l);
    const int av = lv * lv;

    /* ------------------------------------------------------------------------------------------------------- */
    const int NORMALS_SCALEBYLEVEL = ::NORMALS_SCALEBYLEVEL;
    const int  ALPHAS_SCALEBYLEVEL =  ::ALPHAS_SCALEBYLEVEL;
    const float colorgamma       = ::colorgamma;
    const float alphacontrast    = ::alphacontrast;
    const float colorgammainv    = ::colorgammainv;
    const float alphacontrastinv = ::alphacontrastinv;

    int iwidth  = texo.Width;
    int iheight = texo.Height;
    int owidth  = texd.Width;
    int oheight = texd.Height;
    int cwidth  = owidth;
    int cheight = oheight;

    /* get the data back to the CPU */

#if	(TCOMPRESS_CHANNELS(format) == 4)
    /* ABGR -> ARGB */ cwidth = (cwidth +  0) >> 0; /* 1x LONG to 1x LONG */
#elif	(TCOMPRESS_CHANNELS(format) == 3)
    /* -BGR -> -RGB */ cwidth = (cwidth +  1) >> 1; /* 1x LONG to 1x SHORT */
#elif	(TCOMPRESS_CHANNELS(format) == 2)
    /* LA-- -> AL-- */ cwidth = (cwidth +  3) >> 2; /* 1x LONG to 1x CHAR */
#elif	(TCOMPRESS_CHANNELS(format) == 1)
    /* A--- -> A--- */ cwidth = (cwidth + 31) >> 5; /* 8x LONG to 1x CHAR */
#else
#error
#endif

    /* ensure tile ability (bit on overhead for non-4 resolutions) */
    owidth  = (owidth  + (TX - 1)) & (~(TX - 1));
    oheight = (oheight + (TY - 1)) & (~(TY - 1));

    assert((owidth  & (TX - 1)) == 0);
    assert((oheight & (TY - 1)) == 0);

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
    /* get a two-dimensional extend over the whole output (without re-cast to LONG),
     * then get a tile-extend over that one ()
     */
    Concurrency::extent<2> ee(oheight, owidth);
    Concurrency::tiled_extent<TY, TX> te(ee);

    Concurrency::array_view<const unsigned int, 2> sArr(iheight, iwidth, (const unsigned int *)texs);
    Concurrency::array_view<      unsigned int, 2> dArr(cheight, cwidth, (      unsigned int *)texr);

    Concurrency::parallel_for_each(te /*dArr.extent.tile<TY, TX>(osize)*/, [=](tiled_index<TY, TX> elm) restrict(amp) {
      typedef type accu[DIM];

      /* tile static memory */
//    tile_static UTYPE bTex[2][TY][TX];
      tile_static int   bTex[2][TY][TX];
      tile_static type  fTex[2][TY][TX][DIM];

//    const int y = elm.global[0] - ly;
//    const int x = elm.global[1] - lx;
      const int y = elm.tile[0] * TY;
      const int x = elm.tile[1] * TX;
      const int ly = elm.local[0];
      const int lx = elm.local[1];
#else
    array_view<const unsigned int, 2> sArr(iheight, iwidth, (const unsigned int *)texs);
    array_view<      unsigned int, 2> dArr(cheight, cwidth, (      unsigned int *)texr, true);

    for (int groupsy = 0; groupsy < (owidth  / TY); groupsy++)
    for (int groupsx = 0; groupsx < (oheight / TX); groupsx++) {
      typedef type accu[DIM];

      /* tile static memory */
//    UTYPE bTex[2][TY][TX];
      int   bTex[2][TY][TX];
      type  fTex[2][TY][TX][DIM];

    for (int tiley = 0; tiley < TY; tiley++)
    for (int tilex = 0; tilex < TX; tilex++)
    {
      const int y = groupsy * TY;
      const int x = groupsx * TX;
      const int ly = tiley;
      const int lx = tilex;
#endif
      /* generate this level's 4x4-block from the original surface */
      {
	const int yl = ((y + ly) << l);
	const int xl = ((x + lx) << l);

	accu tt; tt[0] = tt[1] = tt[2] = tt[3] = tt[4] = tt[5] = tt[6] = tt[7] = 0;

	/* access all pixels this level's 4x4-block represents in
	 * the full dimension original surface (high quality mip-mapping)
	 */
	for (int oy = 0; oy < lv; oy += 1)
	for (int ox = 0; ox < lv; ox += 1) {
	  /* assume seamless tiling: wrap pixels around */
	  const int posx = (xl + ox) % iwidth;
	  const int posy = (yl + oy) % iheight;

	  const ULONG &t = sArr(posy, posx);

	  Accu(tt, t);	// +=
	}

	/* build average of each channel */
	Norm(fTex[0][ly][lx], tt, av, level, l);
      }

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      tile_static accu tr; tr[(ly * TX + lx) & 7] = 0;

      tile_static_memory_fence(elm.barrier);
//    elm.barrier.wait_with_tile_static_memory_fence();
#else
      }

      accu tr = {0};
#endif

      /* runs on only 1 thread per tile (reduction) */
#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      if (elm.local == index<2>(0, 0))
#endif
      {
	/* analyze this level's 4x4-block */
	for (int ly = 0; ly < TY; ly += 1)
	for (int lx = 0; lx < TX; lx += 1) {
	  Look(fTex[0][ly][lx], tr);
	}
      }

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      tile_static_memory_fence(elm.barrier);
//    elm.barrier.wait_with_tile_static_memory_fence();
#else
      for (int tiley = 0; tiley < TY; tiley++)
      for (int tilex = 0; tilex < TX; tilex++)
      {
	const int y = groupsy;
	const int x = groupsx;
	const int ly = tiley;
	const int lx = tilex;
#endif

      /* generate this level's 4x4-block from the original surface */
      {
	/* build average of each channel an join */
	ULONG t;

	Code(fTex[0][ly][lx], tr, (A > 2 ? 4 : (A > 1 ? 10 : (A > 0 ? 5 : 6)))); t =
	Qunt(fTex[0][ly][lx], tr, (A > 2 ? 4 : (A > 1 ? 10 : (A > 0 ? 5 : 6))));

	/* write the result */

	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#if	(TCOMPRESS_CHANNELS(format) == 4)
	/* ABGR -> RGBA */
	bTex[0][ly][lx] = t;
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	(TCOMPRESS_CHANNELS(format) == 3)
	/* -BGR -> RGB- */
	bTex[0][ly][lx] = t;
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	(TCOMPRESS_CHANNELS(format) == 2)
	/* AL-- -> LA-- */
	bTex[0][ly][lx] = t;
#else
#error
#endif
      }

      /* put this level's 4x4-block into the destination surface */
      {
	/* assume seamless tiling: wrap pixels around */
	const int posx = (x + lx) % owidth;
	const int posy = (y + ly) % oheight;

	/* convert unaligned output location to "int"-space output location */
	const int linear = ((posy * owidth) + posx) * 1;
	const int lposx = (linear << 0) % (cwidth << 0);
	const int lposy = (linear << 0) / (cwidth << 0);

	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#if	(TCOMPRESS_CHANNELS(format) <= 4)
	/* ABGR -> ARGB */
	if (sizeof(UTYPE) == 4) {
	  /* every single thread */
	  {
	    int t0 = bTex[0][ly][lx + 0];

	    /* write combining */
	    unsigned int val = (ULONG)t0;

	    /* write out all of an "int" */
	    dArr(lposy, lposx) = val;
	  }
	}
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	(TCOMPRESS_CHANNELS(format) <= 3)
	/* -BGR -> -RGB */
	if (sizeof(UTYPE) == 2) {
	  /* every second thread */
	  if (!(elm.local[1] & 1)) {
	    int t0 = bTex[0][ly][lx + 0];
	    int t1 = bTex[0][ly][lx + 1];

	    /* write combining */
	    unsigned int val = (ULONG)((t1 << 16) + (t0 << 0));

	    /* write out all of an "int" */
	    dArr(lposy, lposx) = val;
	  }
	}
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	(TCOMPRESS_CHANNELS(format) <= 2)
	/* --YX -> XY-- */
	/* LA-- -> AL-- */
	if (sizeof(UTYPE) == 1) {
	  /* every fourth thread */
	  if (!(elm.local[1] & 3)) {
	    int t0 = bTex[0][ly][lx + 0];
	    int t1 = bTex[0][ly][lx + 1];
	    int t2 = bTex[0][ly][lx + 2];
	    int t3 = bTex[0][ly][lx + 3];

	    /* write combining */
	    unsigned int val = (ULONG)((t3 << 24) + (t2 << 16) + (t1 << 8) + (t0 << 0));

	    /* write out all of an "int" */
	    dArr(lposy, lposx) = val;
	  }
	}
#else
#error
#endif
      }

//    dTex += 0;
#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
    });

    dArr.synchronize();
#else
    }}
Пример #3
0
  void ff(TextureCompressDXT,format,coding,fiting)(RESOURCEINFO &texo, RESOURCEINFO &texd, ULONG *texs, ULONG *texr, int level, int l, int blocksize, int flags) {
    /* square dimension of this surface-level */
    /* square area of this surface-level */
    const int lv = (1 << l);
    const int av = lv * lv;

    /* ------------------------------------------------------------------------------------------------------- */
    const int NORMALS_SCALEBYLEVEL = ::NORMALS_SCALEBYLEVEL;
    const int  ALPHAS_SCALEBYLEVEL =  ::ALPHAS_SCALEBYLEVEL;
    const float colorgamma       = ::colorgamma;
    const float alphacontrast    = ::alphacontrast;
    const float colorgammainv    = ::colorgammainv;
    const float alphacontrastinv = ::alphacontrastinv;

    int iwidth  = texo.Width;
    int iheight = texo.Height;
    int owidth  = texd.Width;
    int oheight = texd.Height;
    int cwidth  = owidth;
    int cheight = oheight;

    /* get the data back to the CPU */
    cheight = (cheight + 3) / 4;	/* 4x4 LONG ... */
    cwidth  = (cwidth  + 3) / 4;	/* 4x4 LONG ... */
    cwidth *= 2 * blocksize;		/* ... to 2x|4x LONG */

    /* ensure tile ability (bit on overhead for non-4 resolutions) */
    owidth  = (owidth  + (TX - 1)) & (~(TX - 1));
    oheight = (oheight + (TY - 1)) & (~(TY - 1));

    assert((owidth  & (TX - 1)) == 0);
    assert((oheight & (TY - 1)) == 0);

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
    /* constant buffer array */
    Concurrency::array_view<const SingleColourLookup_CCR, 2> lArr(2, 256, (const SingleColourLookup_CCR *)::lookup_34_56_ccr);
    Concurrency::array_view<const   IndexBlockLookup_CCR, 2> yArr(4, 8,   (const   IndexBlockLookup_CCR *)::lookup_c34a57_ccr);

    /* get a two-dimensional extend over the whole output (without re-cast to LONG),
     * then get a tile-extend over that one ()
     */
    Concurrency::extent<2> ee(oheight, owidth);
    Concurrency::tiled_extent<TY, TX> te(ee);

    Concurrency::array_view<const unsigned int, 2> sArr(iheight, iwidth, (const unsigned int *)texs);
    Concurrency::array_view<      unsigned int, 2> dArr(cheight, cwidth, (      unsigned int *)texr);

    Concurrency::parallel_for_each(te /*dArr.extent.tile<TY, TX>(osize)*/, [=](tiled_index<TY, TX> elm) restrict(amp) {
      typedef type accu[DIM];

      /* tile static memory */
//    tile_static UTYPE bTex[2][TY*TX];
      tile_static type  fTex[2][TY*TX][DIM];
      tile_static int   iTex[2][TY*TX][DIM];

      /* generate this level's 4x4-block from the original surface */
//    const int y = elm.global[0] - ly;
//    const int x = elm.global[1] - lx;
      const int y = elm.tile[0] * TY;
      const int x = elm.tile[1] * TX;
      const int ly = elm.local[0];
      const int lx = elm.local[1];
      const int lxy = ly * TX + lx;
#else
    Concurrency::array_view<const SingleColourLookup_CCR, 2> lArr(2, 256, (const SingleColourLookup_CCR *)::lookup_34_56_ccr);

    Concurrency::array_view<const unsigned int, 2> sArr(iheight, iwidth, (const unsigned int *)texs);
    Concurrency::array_view<      unsigned int, 2> dArr(cheight, cwidth, (      unsigned int *)texr, true);

    for (int groupsy = 0; groupsy < (owidth  / TY); groupsy++)
    for (int groupsx = 0; groupsx < (oheight / TX); groupsx++) {
      typedef type accu[DIM];

      /* tile static memory */
//    UTYPE bTex[2][TY*TX];
      type  fTex[2][TY*TX][DIM];
      int   iTex[2][TY*TX][DIM];

      for (int tiley = 0; tiley < TY; tiley++)
      for (int tilex = 0; tilex < TX; tilex++)
      {
	const int y = groupsy * TY;
	const int x = groupsx * TX;
	const int ly = tiley;
	const int lx = tilex;
	const int lxy = ly * TX + lx;
#endif

      {
	const int yl = ((y + ly) << l);
	const int xl = ((x + lx) << l);

	accu tt = {0};

	/* access all pixels this level's 4x4-block represents in
	 * the full dimension original surface (high quality mip-mapping)
	 */
	for (int oy = 0; oy < lv; oy += 1) {
	for (int ox = 0; ox < lv; ox += 1) {
	  /* assume seamless tiling: wrap pixels around */
	  const int posx = (xl + ox) % iwidth;
	  const int posy = (yl + oy) % iheight;

	  const ULONG &t = sArr(posy, posx);

	  Accu(tt, t);	// +=
	}
	}

	/* build average of each channel */
	Norm(fTex[0][lxy], tt, av, level, l);
      }

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      tile_static accu tr; tr[lxy & 7] = 0;

      tile_static_memory_fence(elm.barrier);
//    elm.barrier.wait_with_tile_static_memory_fence();
#else
      }

      accu tr = {0};
#endif

      /* runs on only 1 thread per tile (reduction) */
#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      if (elm.local == index<2>(0, 0))
#endif
      {
	/* analyze this level's 4x4-block */
	for (int lxy = 0; lxy < TY*TX; lxy += 1) {
	  Look(fTex[0][lxy], tr);
	}
      }

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      tile_static_memory_fence(elm.barrier);
//    elm.barrier.wait_with_tile_static_memory_fence();
#else
      for (int tiley = 0; tiley < TY; tiley++)
      for (int tilex = 0; tilex < TX; tilex++)
      {
	const int y = groupsy;
	const int x = groupsx;
	const int ly = tiley;
	const int lx = tilex;
	const int lxy = ly * TX + lx;
#endif

      /* generate this level's 4x4-block from the original surface */
      {
	/* build average of each channel an join */
	Code (fTex[0][lxy], tr,
	  (TCOMPRESS_CHANNELS(format) +
	  (TCOMPRESS_GREYS   (format) ? 2 : 0)) == 2 ? 8 :
	  (TCOMPRESS_SWIZZL  (format) ? 6 : 5));
	Range(iTex[0][lxy],
	      fTex[0][lxy]);

#if	(TCOMPRESS_SWIZZL(format))
	/* swizzle ABGR -> AGBR */
        {
	  int swap =        iTex[0][lxy][1];
	  iTex[0][lxy][1] = iTex[0][lxy][2];
	  iTex[0][lxy][2] = swap           ;
	}
#endif

	/* write the result */

	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#if	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 4)
	/* ABGR -> RGBA */
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 3)
	/* -BGR -> RGB- */
	iTex[0][lxy][0] = 0xFF;
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 2)
	/* --YX -> XY-- */
	/* AL-- -> LA-- */
#if	(format == TCOMPRESS_XYz)
	iTex[0][lxy][0] = iTex[0][lxy][2],  // Y
	iTex[1][lxy][0] = iTex[0][lxy][3];  // X
#else
	iTex[0][lxy][0] = iTex[0][lxy][0],  // A
	iTex[1][lxy][0] = iTex[0][lxy][1];  // Z
#endif
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 1)
	  /* -Z-- -> Z--- */
	  /* A--- -> A--- */
	  /* -LLL -> L--- */
#if	(format == TCOMPRESS_a  )
	iTex[0][lxy][0] = iTex[0][lxy][0];  // A
#elif	(format == TCOMPRESS_A  )
	iTex[0][lxy][0] = iTex[0][lxy][0];  // A
#elif	(format == TCOMPRESS_xyZ)
	iTex[0][lxy][0] = iTex[0][lxy][1];  // Z
#else
	iTex[0][lxy][0] = iTex[0][lxy][3];  // X
#endif
#else
#error
#endif
      }

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
      tile_static_memory_fence(elm.barrier);
//    elm.barrier.wait_with_tile_static_memory_fence();

#define local_is(a,b) elm.local == index<2>(a, b)
#else
      }

      for (int tiley = 0; tiley < TY; tiley++)
      for (int tilex = 0; tilex < TX; tilex++)
      {
	const int y = groupsy;
	const int x = groupsx;
	const int ly = tiley;
	const int lx = tilex;
	const int lxy = ly * TX + lx;

#define local_is(a,b) ((ly == a) && (lx == b))
#endif

      /* put this level's 4x4-block into the destination surface */
      {
	/* round down */
	int posx = (x + lx) >> 2;
	int posy = (y + ly) >> 2;

	/* first and second block */
	unsigned int b[2][2];

        /* compress to DXT1/DXT3/DXT5/ATI1/ATI2 */
#define	sflgs	TCOMPRESS_COLOR(format) ? SQUISH_METRIC_PERCEPTUAL : SQUISH_METRIC_UNIFORM,	\
		TCOMPRESS_TRANS(format),							\
		                fiting

	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#if	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 4) ||		\
	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 3)
	/* 1x LONG per block for DXT1, 2x for the others */

#if	(coding == 1)
	{ posx <<= 0;
	  squish::CompressColorBtc1(elm.barrier, lxy, iTex[0], 0xFFFF, b[1], sflgs, yArr, lArr);

	  dArr(posy, posx + 0) = b[1][0];
	  dArr(posy, posx + 1) = b[1][1];
	}
#elif	(coding == 2)
	{ posx <<= 1;
	  squish::CompressAlphaBtc2(elm.barrier, lxy, iTex[0], 0xFFFF, b[0]       , yArr      );
	  squish::CompressColorBtc2(elm.barrier, lxy, iTex[0], 0xFFFF, b[1], sflgs, yArr, lArr);

	  dArr(posy, posx + 0) = b[0][0];
	  dArr(posy, posx + 1) = b[0][1];
	  dArr(posy, posx + 2) = b[1][0];
	  dArr(posy, posx + 3) = b[1][1];
	}
#elif	(coding == 3)
	{ posx <<= 1;
	  squish::CompressAlphaBtc3(elm.barrier, lxy, iTex[0], 0xFFFF, b[0]       , yArr      );
	  squish::CompressColorBtc3(elm.barrier, lxy, iTex[0], 0xFFFF, b[1], sflgs, yArr, lArr);

	  dArr(posy, posx + 0) = b[0][0];
	  dArr(posy, posx + 1) = b[0][1];
	  dArr(posy, posx + 2) = b[1][0];
	  dArr(posy, posx + 3) = b[1][1];
	}
#else
#error
#endif
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 1)
	/* 1x LONG for ATI1 */

#if	(coding == 4)
	{ posx <<= 0;
	  squish::CompressAlphaBtc3(elm.barrier, lxy, iTex[0], 0xFFFF, b[0]       , yArr      );

	  dArr(posy, posx + 0) = b[0][0];
	  dArr(posy, posx + 1) = b[0][1];
	}
#else
#error
#endif
	/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#elif	((TCOMPRESS_CHANNELS(format) + (TCOMPRESS_GREYS(format) ? 2 : 0)) == 2)
	/* 2x LONG for ATI2 */

#if	(coding == 5)
	{ posx <<= 1;
	  squish::CompressAlphaBtc3(elm.barrier, lxy, iTex[0], 0xFFFF, b[0]       , yArr      );
	  squish::CompressAlphaBtc3(elm.barrier, lxy, iTex[1], 0xFFFF, b[1]       , yArr      );

	  dArr(posy, posx + 0) = b[0][0];
	  dArr(posy, posx + 1) = b[0][1];
	  dArr(posy, posx + 2) = b[1][0];
	  dArr(posy, posx + 3) = b[1][1];
	}
#else
#error
#endif
#else
#error
#endif

#undef	sflgs

//	elm.barrier.wait();

        /* advance pointer of compressed blocks */
//      wTex += blocksize;
//      dTex += blocksize;
      }

#if	defined(SQUASH_USE_AMP) && !defined(SQUASH_USE_AMP_DEBUG)
//    elm.barrier.wait();

//    dTex += 0;
    });

    dArr.synchronize();
#else
    }}