Example #1
0
// Compute the mask/size globals for the table map. Three extra mask bits
// beyond what num_elem needs keep the table roughly 12.5% full.
PRIVATE void calculate_table_mask(unsigned int num_elem)
{
	int extra_bits;

	// Grow an all-ones mask (1, 3, 7, ...) until it is no longer below num_elem.
	for(size_table = 1; size_table < num_elem; )
		size_table = (size_table << 1) + 1;

	// Widen the mask by three more bits.
	for(extra_bits = 3; extra_bits > 0; extra_bits--)
		size_table = (size_table << 1) + 1;

	// The bit-table mask is one bit wider than the table mask.
	size_bit_table = (size_table << 1) + 1;

	// Store the width in bits of each mask (index of highest set bit, plus one).
	_BitScanReverse(&first_bit_size_table, size_table);
	_BitScanReverse(&first_bit_size_bit_table, size_bit_table);
	first_bit_size_table++;
	first_bit_size_bit_table++;

	// Replicate both masks into every slot of the 4-entry arrays.
	for(extra_bits = 3; extra_bits >= 0; extra_bits--)
	{
		size_table_see2[extra_bits] = size_table;
		size_bit_table_see2[extra_bits] = size_bit_table;
	}
}
Example #2
0
// Scans u0 first and falls back to u1 when u0 has no set bit. A highest set
// bit at position p yields 26 - p for u0 and 53 - p for u1.
// NOTE(review): when both words are zero the scan leaves the index
// unspecified, so callers presumably guarantee a set bit -- confirm.
int
first_one01( unsigned int u0, unsigned int u1 )
{
    unsigned long msb;

    if ( ! _BitScanReverse( &msb, u0 ) ) {
        _BitScanReverse( &msb, u1 );
        return 53 - msb;
    }
    return 26 - msb;
}
Example #3
0
// Scans u1 first and falls back to u2 when u1 has no set bit. A highest set
// bit at position p yields 53 - p for u1 and 80 - p for u2.
// NOTE(review): when both words are zero the scan leaves the index
// unspecified, so callers presumably guarantee a set bit -- confirm.
int
first_one12( unsigned int u1, unsigned int u2 )
{
    unsigned long msb;

    if ( ! _BitScanReverse( &msb, u1 ) ) {
        _BitScanReverse( &msb, u2 );
        return 80 - msb;
    }
    return 53 - msb;
}
// Rounds s up to the nearest power of two.
//   - 0 returns 0
//   - a power of two (including 2^31) returns itself
//   - values above 2^31 cannot be rounded up within 32 bits and return 0
//
// The idea, as an assembly sketch:
//mov eax,ecx; or pop eax;
//bsr ecx,eax;	//index of the highest set bit
//bsf edx,eax;	//index of the lowest set bit
//jz _nothing_set;
//cmp edx,ecx	//zero (no carry) if equal, negative (since edx<=ecx) otherwise
//adc ecx,0;	//add the carry bit;
//
//mov eax,1;	//generate the next number 1<<cl
//shl eax,cl;
//_nothing_set:	//eax = 0 (input value) or 1<<cl
//ret
u32 next_pow2(u32 s)
{
	unsigned long idxR,idxF;
	if (0==_BitScanReverse(&idxR,s))
		return s;	// no bit set: s is 0 and is returned unchanged

	// The lowest set bit must come from a *forward* scan. Scanning in reverse
	// twice would make both indices always equal and the value never round up.
	_BitScanForward(&idxF,s);

	//idxR==idxF -> single bit set, already a power of 2
	//idxR>idxF  -> more bits set, round up to the next bit position
	if (idxF!=idxR)
		idxR++;

	// The next power of two would be 2^32, which does not fit; report 0.
	if (idxR>=32)
		return 0;

	// Shift an unsigned one so that bit 31 can be produced without overflow.
	return (u32)1<<idxR;
}
Example #5
0
// log2 - floor of the base-2 logarithm of x; yields -1 when x is zero.
inline int log2(size_t x) {
    if (!x)
        return -1;
#if defined(__GNUC__)
#   ifdef REALM_PTR_64
    const int leading_zeros = __builtin_clzll(x);
    return 63 - leading_zeros;
#   else
    const int leading_zeros = __builtin_clz(x);
    return 31 - leading_zeros;
#   endif
#elif defined(_WIN32)
    // The intrinsic writes the index of the highest set bit into msb.
    unsigned long msb = 0;
#   ifdef REALM_PTR_64
    _BitScanReverse64(&msb, x);
#   else
    _BitScanReverse(&msb, x);
#   endif
    return static_cast<int>(msb);
#else // neither __GNUC__ nor _WIN32: count how often x can be halved
    int shifts = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++shifts;
    return shifts;
#endif
}
// Populates the dialog controls from m_Props: key signature, time signature
// numerator and denominator, tempo, transpose and comments.
// Returns TRUE (focus is not set to a control here).
BOOL CSongPropsDlg::OnInitDialog() 
{
	CDialog::OnInitDialog();

	SetIcon(theApp.LoadIcon(IDR_MAINFRAME), 0);

	CPatchGeneralDlg::InitNoteCombo(m_KeySig);
	m_KeySig.SetCurSel(m_Props.m_Key);
	m_TimeSigNumer.SetVal(m_Props.m_Meter.m_Numerator);

	// Fill the denominator list with the power-of-two unit strings.
	CStringArrayEx	sUnit;
	CPartBassDlg::GetPowerOfTwoStrings(sUnit, 
		CSong::CMeter::MIN_UNIT_EXP, CSong::CMeter::MAX_UNIT_EXP);
	int	nUnits = sUnit.GetSize();
	for (int iUnit = 0; iUnit < nUnits; iUnit++)
		m_TimeSigDenom.AddString(sUnit[iUnit]);

	// Select the entry whose index is log2 of the denominator. The scan
	// reports failure for a zero denominator and leaves the index
	// unspecified; in that case nothing is selected rather than using garbage.
	DWORD	iSelUnit;
	if (_BitScanReverse(&iSelUnit, m_Props.m_Meter.m_Denominator))
		m_TimeSigDenom.SetCurSel(iSelUnit);

	m_Tempo.SetVal(m_Props.m_Tempo);
	m_Transpose.SetVal(m_Props.m_Transpose);
	m_Comments.SetWindowText(m_Props.m_Comments);

	EnableToolTips();

	return TRUE;  // return TRUE unless you set the focus to a control
	              // EXCEPTION: OCX Property Pages should return FALSE
}
Example #7
0
// Number of zero bits above the highest set bit of x; 32 when x is zero.
inline unsigned int CountLeadingZeros(unsigned int x)
{
	unsigned long msbIndex;
	const bool anyBitSet = _BitScanReverse(&msbIndex,x) != 0;
	return anyBitSet ? 31 - msbIndex : 32;
}
Example #8
0
// Returns the zero-based index of the highest set bit of a.
// Precondition: a != 0 (every branch is undefined or non-terminating for 0).
Size findLastBit(Size a) {
#ifdef __GNUC__
#ifdef __X64__
    return sizeof(a)*8 - 1 - __builtin_clzl(a);
#else
    return sizeof(a)*8 - 1 - __builtin_clz(a);
#endif
#elif defined(_MSC_VER)
    unsigned long pos;
#ifdef __X64__
    _BitScanReverse64(&pos, a);
#else
    _BitScanReverse(&pos, a);
#endif
    // The intrinsic already yields the bit index. Subtracting it from the
    // width would give the leading-zero count, unlike the other branches.
    return pos;
#else
    // Very naive implementation: shift left until the top bit is set.
    Size c = sizeof(a)*8 - 1;
    // Build the mask in Size so the shift is not an int shift, which
    // overflows for wide Size types.
    const Size mask = static_cast<Size>(1) << c;
    while(!(a & mask)) {
        a <<= 1;
        c--;
    }
    return c;
#endif
}
Example #9
0
// Maps the highest set bit of u2, at position p, to 80 - p.
// NOTE(review): the scan result is not checked, so u2 is presumably
// non-zero on entry -- confirm against callers.
int
first_one2( unsigned int u2 )
{
    unsigned long msb;
    _BitScanReverse( &msb, u2 );

    const unsigned long result = 80 - msb;
    return result;
}
Example #10
0
// Maps the highest set bit of u1, at position p, to 53 - p.
// NOTE(review): the scan result is not checked, so u1 is presumably
// non-zero on entry -- confirm against callers.
int
first_one1( unsigned int u1 )
{
    unsigned long msb;
    _BitScanReverse( &msb, u1 );

    const unsigned long result = 53 - msb;
    return result;
}
Example #11
0
// Find-last-set: one-based position of the highest set bit of value,
// or 0 when no bit is set.
int flsl(long value)
{
	unsigned long msb = 0;

	if (!_BitScanReverse(&msb, value))
		return 0;
	return msb + 1;
}
Example #12
0
// Returns the index of the highest set bit of data+1, i.e. floor(log2(data+1)).
// NOTE(review): data == 0xFFFFFFFF wraps data+1 to zero, for which the
// bit scan result is unspecified.
inline unsigned int BitCountNeededToEncode(unsigned int data)
{
	const unsigned int biased = data+1;
#if defined(_X360)
	return (32 - CountLeadingZeros(biased)) - 1;
#else
	unsigned long msbIndex;
	_BitScanReverse(&msbIndex,biased);
	return msbIndex;
#endif
}
Example #13
0
// Returns floor(log2(length)) - 2, which for length >= 4 is the largest n
// with (length >> n) >= 4. Asserts and returns 0 when length has no set bit.
static int GetMinimumPower(int length)
{
	unsigned long highest_bit;
	const bool found = _BitScanReverse(&highest_bit, length) != 0;
	if (found)
		return highest_bit - 2;

	assert(false);
	return 0;
}
Example #14
0
/* returns the bit width of v: index of the highest set bit plus one, 0 for v == 0 */
uint32_t bits(const uint32_t v) {
    if (v == 0) {
        return 0;
    }
#ifdef _MSC_VER
    unsigned long top;
    _BitScanReverse(&top, v);
    return top + 1;
#else
    /* assume GCC-like compiler if not microsoft */
    return 32 - __builtin_clz(v);
#endif
}
Example #15
0
// Counts the consecutive one bits at the top of the byte x.
static inline unsigned count_leading_ones(boost::uint8_t x)
{
	// Invert so leading ones become leading zeros, move the byte to the top
	// of the word and fill the low 24 bits with ones so a set bit always exists.
	const boost::uint32_t inverted = ~x;
	const boost::uint32_t probe = (inverted<<24) | 0x00FFFFFF;
#ifdef _MSC_VER
	unsigned long msb;
	_BitScanReverse(&msb, (unsigned long)probe);
	return 31 - msb;
#else
	return __builtin_clz(probe);
#endif
}
Example #16
0
// Returns ceil(log2(x)) for x >= 1.
// x == 0 wraps x-1 to all ones and yields 32 on both code paths.
hv_uint32_t hv_min_max_log2(hv_uint32_t x) {
  // ceil(log2(1)) == 0. Handled up front because x-1 == 0 is an undefined
  // operand for both bit-scan primitives used below.
  if (x == 1) return 0;
#if HV_MSVC
  // finds ceil(log2(x)) as (index of the highest set bit of x-1) + 1, which
  // matches the GCC branch; scanning x itself over-counts exact powers of two
  // http://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
  // http://msdn.microsoft.com/en-us/library/fbxyd7zd%28v=VS.80%29.aspx
  unsigned long z = 0;
  _BitScanReverse(&z, (unsigned long) (x-1));
  return (hv_uint32_t) (z+1);
#else
  return (hv_uint32_t) ((8 * sizeof(unsigned int)) - __builtin_clz(x-1));
#endif // HV_MSVC
}
Example #17
0
/* Counts the leading zero bits of val; yields 32 when val is zero. */
CPU_DATA  CPU_CntLeadZeros (CPU_DATA  val)
{
    DWORD  msb_pos;


    if (val != 0u) {
        _BitScanReverse(&msb_pos, (DWORD)val);
        return (31u - (CPU_DATA)msb_pos);
    }

    return (32u);
}
/* called with interrupts disabled (via CLI) from an arbitrary location inside HAL.DLL */
/* Walks eight 32-bit words, read at a stride of four ULONGs, from the */
/* highest down, and returns (word index * 32 + highest set bit) for the */
/* first non-zero word, or -1 when all eight are zero. */
static __inline LONG
ApicHighestVector(PULONG bitmap) {
  int word;
  ULONG bit;
  for (word = 7; word >= 0; word--) {
    ULONG value = bitmap[word * 4];
    if (!value)
      continue;
    _BitScanReverse(&bit, value);
    return (word << 5) | bit;
  }
  return -1;
}
Example #19
0
/*
 * Floor of the binary logarithm of val: exact for powers of two,
 * rounded down for everything else. val must be positive.
 */
static matras_id_t
matras_log2(matras_id_t val)
{
	assert(val > 0);
#ifdef WIN32
	unsigned long msb = 0;
	unsigned char found = _BitScanReverse(&msb, val);
	assert(found); (void)found;
	return (matras_id_t)msb;
#else
	const size_t width = sizeof(unsigned int) * CHAR_BIT;
	return width - __builtin_clz((unsigned int) val) - 1;
#endif
}
Example #20
0
        // Count the number of leading zero bits in x.
        int Clz(size_t x)
        {
#ifdef _MSC_VER
            unsigned long msb = 0;
# ifdef XXLIB_64BIT
            _BitScanReverse64(&msb, x);
            const unsigned long top = 63;
# else
            _BitScanReverse(&msb, x);
            const unsigned long top = 31;
# endif
            return top - msb;
#else
# ifdef XXLIB_64BIT
            const int zeros = __builtin_clzl(x);
# else
            const int zeros = __builtin_clz(x);
# endif
            return zeros;
#endif
        }
// Builds a three-level binary tree of seven nodes and checks
// LeastCommonAncestor for several node pairs.
TEST_F(AncestorTreeTest, add_node)
{
	// Scratch check of the intrinsic: print the highest set bit index of 1.
	unsigned long depth = 0;
	const unsigned long mask = 1;
	_BitScanReverse( &depth, mask);
	std::cout << depth << std::endl;

	directed::AncestorTree<int> tree;

	// nodes[0] is depth 0, nodes[1..2] depth 1, nodes[3..6] depth 2.
	directed::AncestorNode<int> * nodes[7];
	for (int value = 0; value < 7; ++value)
		nodes[value] = tree.BuildNode(value);

	tree.SetRoot(nodes[0]);

	// Node k receives children 2k+1 (left) and 2k+2 (right).
	for (int parent = 0; parent < 3; ++parent)
	{
		nodes[parent]->SetLeft(nodes[2 * parent + 1]);
		nodes[parent]->SetRight(nodes[2 * parent + 2]);
	}

	ASSERT_EQ(nodes[1], tree.LeastCommonAncestor(nodes[3], nodes[4]));
	ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[3], nodes[2]));
	ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[0], nodes[6]));
	ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[1], nodes[2]));
	ASSERT_EQ(nodes[2], tree.LeastCommonAncestor(nodes[5], nodes[6]));
	ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[4], nodes[5]));
	ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[3], nodes[6]));
}
Example #22
0
// Count Leading Zeroes Word: writes the number of leading zero bits of
// gpr[rS] (32 for a zero source) to gpr[rA], and updates the condition
// register with that result when instr.rc is set.
static void
cntlzw(ThreadState *state, Instruction instr)
{
   const uint32_t source = state->gpr[instr.rS];
   unsigned long zeros;

   if (_BitScanReverse(&zeros, source)) {
      // The scan gives the index of the highest set bit; convert to a count.
      zeros = 31 - zeros;
   } else {
      zeros = 32;
   }

   state->gpr[instr.rA] = zeros;

   if (instr.rc) {
      updateConditionRegister(state, zeros);
   }
}
Example #23
0
//Count the zero bits above the highest set bit. Deliberately undefined if value == 0
inline unsigned countLeadingUnsetBits(unsigned value)
{
    dbgassertex(value != 0);
#if defined(__GNUC__)
    return __builtin_clz(value);
#elif defined (_WIN32)
    unsigned long msb;
    _BitScanReverse(&msb, value);
    return (unsigned)((sizeof(unsigned)*8)-1 - msb);
#else
    //Portable fallback: walk a single-bit probe down from the top bit
    const unsigned width = sizeof(unsigned)*8;
    unsigned zeros = 0;
    for (unsigned probe = 1U << (width-1); zeros < width && !(value & probe); probe >>= 1)
        zeros++;
    return zeros;
#endif
}
Example #24
0
//One-based position of the highest set bit (bit count up to and including it).  Undefined if value == 0
inline unsigned getMostSignificantBit(unsigned value)
{
    dbgassertex(value != 0);
#if defined(__GNUC__)
    return (sizeof(unsigned)*8) - __builtin_clz(value);
#elif defined (_WIN32)
    unsigned long msb;
    _BitScanReverse(&msb, value);
    return (unsigned)msb+1;
#else
    //Portable fallback: walk a single-bit probe down from the top bit
    unsigned width = sizeof(unsigned)*8;
    for (unsigned probe = 1U << (width-1); probe != 0; probe >>= 1, width--)
    {
        if (value & probe)
            return width;
    }
    return 0;
#endif
}
Example #25
0
// Returns the zero-based index of the most significant set bit of n.
// Precondition: n != 0 (__builtin_clz is undefined for zero; the MSVC
// path returns 0 in that case).
uint32_t
msb_idx_u32(uint32_t n)
{
#if defined( _MSC_VER )

  // Use a correctly typed local for the out parameter instead of casting a
  // uint32_t* to unsigned long*, which violates aliasing rules and would
  // be a size mismatch wherever unsigned long is not 32 bits wide.
  unsigned long index = 0;

  _BitScanReverse(&index,n);

  return (uint32_t)index;

#elif defined( __GNUC__ )

  // 31 - clz(n), written as an xor since clz(n) is in [0, 31].
  return __builtin_clz(n) ^ 31;

#else

#error "No msb_index()"

#endif
}
Example #26
0
/* Counts the leading zero bits of v, using a compiler intrinsic where one */
/* is available and a branch-light binary search otherwise. */
static INLINE unsigned clz(u32 v)
{
#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER)
  return __builtin_clz(v);
#elif defined(_MSC_VER)
  unsigned long msb;

  _BitScanReverse(&msb, v);

  return 31 ^ msb;
#else
  /* Masks selecting the top 16, 8, 4, 2 and 1 bits in turn. */
  static const unsigned top_bits[5] = {
    0xFFFF0000, 0xFF000000, 0xF0000000, 0xC0000000, 0x80000000
  };
  unsigned zeros = 0;
  unsigned k;

  for (k = 0; k < 5; k++) {
    /* Step is 16, 8, 4, 2, 1; it applies only when the masked bits are all zero. */
    const unsigned step = !(v & top_bits[k]) << (4 - k);
    v <<= step;
    zeros += step;
  }

  return(zeros);
#endif
}
// Formats Duration as a fraction string "numer/denom" when it is, within
// m_Epsilon, an integer multiple of some m_Unit[iUnit] / 2^iDenom; otherwise
// (including Duration == 0) falls back to printf-style "%g" formatting.
// The first matching denominator/unit pair wins.
CString CDurationComboBox::DurationToString(double Duration)
{
	CString	s;
	if (Duration) {
		// Try denominators 1, 2, 4, ... in increasing order.
		for (int iDenom = 0; iDenom < DENOMINATORS; iDenom++) {
			int denom = 1 << iDenom;
			for (int iUnit = 0; iUnit < UNITS; iUnit++) {
				double	divisor = m_Unit[iUnit] / denom;
				double	r = fabs(fmod(Duration, divisor));
//				printf("%g %g\n", divisor, r);
				// Match when the remainder is near zero or near a full divisor.
				if (r < m_Epsilon || fabs(r - divisor) < m_Epsilon) {
					int	numer = round(Duration / divisor);
					DWORD	dots = 0;
					if (SHOW_DOTS) {
						// Numerators of the form 2^k - 1 (3, 7, 15, ...) are
						// rewritten as 1/DotDenom followed by trailing dots.
						if (numer > 2 && IsPowerOfTwo(numer + 1)) {
							int	DotDenom = denom / ((numer + 1) / 2);
							if (DotDenom) {	// avoid divide by zero
								// dots = log2(denom / DotDenom)
								_BitScanReverse(&dots, denom / DotDenom);
								denom = DotDenom;
								numer = 1;
							}
						}
					}
					s.Format(_T("%d/%d"), numer, denom);
					// For units other than the first, insert a "T" at index 0,
					// or at index 1 when numer is negative (i.e. after the sign).
					// NOTE(review): "T" presumably marks a triplet unit -- confirm.
					if (iUnit)
						s.Insert(numer < 0, _T("T"));
					for (DWORD iDot = 0; iDot < dots; iDot++)
						s += '.';
					return(s);	// early out
				}
			}
		}
	}
	s.Format(_T("%g"), Duration);
	return(s);
}
Example #28
0
File: tty.c Project: hghazal/node
static int uv_tty_write_bufs(uv_tty_t* handle, uv_buf_t bufs[], int bufcnt,
    DWORD* error) {
  /* We can only write 8k characters at a time. Windows can't handle */
  /* much more characters in a single console write anyway. */
  WCHAR utf16_buf[8192];
  DWORD utf16_buf_used = 0;
  int i;

#define FLUSH_TEXT()                                                \
  do {                                                              \
    if (utf16_buf_used > 0) {                                       \
      uv_tty_emit_text(handle, utf16_buf, utf16_buf_used, error);   \
      utf16_buf_used = 0;                                           \
    }                                                               \
  } while (0)

  /* Cache for fast access */
  unsigned char utf8_bytes_left = handle->utf8_bytes_left;
  unsigned int utf8_codepoint = handle->utf8_codepoint;
  unsigned char previous_eol = handle->previous_eol;
  unsigned char ansi_parser_state = handle->ansi_parser_state;

  /* Store the error here. If we encounter an error, stop trying to do i/o */
  /* but keep parsing the buffer so we leave the parser in a consistent */
  /* state. */
  *error = ERROR_SUCCESS;

  EnterCriticalSection(&uv_tty_output_lock);

  for (i = 0; i < bufcnt; i++) {
    uv_buf_t buf = bufs[i];
    unsigned int j;

    for (j = 0; j < buf.len; j++) {
      unsigned char c = buf.base[j];

      /* Run the character through the utf8 decoder We happily accept non */
      /* shortest form encodings and invalid code points - there's no real */
      /* harm that can be done. */
      if (utf8_bytes_left == 0) {
        /* Read utf-8 start byte */
        DWORD first_zero_bit;
        unsigned char not_c = ~c;
#ifdef _MSC_VER /* msvc */
        if (_BitScanReverse(&first_zero_bit, not_c)) {
#else /* assume gcc */
        if (first_zero_bit = __builtin_clzl(not_c), c != 0) {
#endif
          if (first_zero_bit == 7) {
            /* Ascii - pass right through */
            utf8_codepoint = (unsigned int) c;

          } else if (first_zero_bit <= 5) {
            /* Multibyte sequence */
            utf8_codepoint = (0xff >> (8 - first_zero_bit)) & c;
            utf8_bytes_left = (char) (6 - first_zero_bit);

          } else {
            /* Invalid continuation */
            utf8_codepoint = UNICODE_REPLACEMENT_CHARACTER;
          }

        } else {
          /* 0xff -- invalid */
          utf8_codepoint = UNICODE_REPLACEMENT_CHARACTER;
        }

      } else if ((c & 0xc0) == 0x80) {
	// Micro-benchmark comparing several ways of computing an integer log2
	// over the values 0..Count-1. Each section fills a result vector, times
	// the loop with clock() and prints the elapsed clock ticks.
	// Always returns 0 (Error is never modified).
	int perf()
	{
		int Error = 0;
		std::size_t const Count(100000000);

		// glm::log2 on scalar int
		{
			std::vector<int> Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(int i = 0; i < static_cast<int>(Count); ++i)
				Result[i] = glm::log2(static_cast<int>(i));

			std::clock_t End = clock();

			printf("glm::log2<int>: %ld clocks\n", End - Begin);
		}

		// glm::log2 on ivec4
		{
			std::vector<glm::ivec4> Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(int i = 0; i < static_cast<int>(Count); ++i)
				Result[i] = glm::log2(glm::ivec4(i));

			std::clock_t End = clock();

			printf("glm::log2<ivec4>: %ld clocks\n", End - Begin);
		}

#		if GLM_HAS_BITSCAN_WINDOWS
		// Hand-inlined _BitScanReverse per component into an unsigned long
		// temporary vector, then converted to ivec4.
		{
			std::vector<glm::ivec4> Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(std::size_t i = 0; i < Count; ++i)
			{
				glm::tvec4<unsigned long, glm::defaultp> Tmp(glm::uninitialize);
				_BitScanReverse(&Tmp.x, i);
				_BitScanReverse(&Tmp.y, i);
				_BitScanReverse(&Tmp.z, i);
				_BitScanReverse(&Tmp.w, i);
				Result[i] = glm::ivec4(Tmp);
			}

			std::clock_t End = clock();

			printf("glm::log2<ivec4> inlined: %ld clocks\n", End - Begin);
		}


		// Same scan, written straight into an unsigned long vector (no conversion).
		{
			std::vector<glm::tvec4<unsigned long, glm::defaultp> > Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(std::size_t i = 0; i < Count; ++i)
			{
				_BitScanReverse(&Result[i].x, i);
				_BitScanReverse(&Result[i].y, i);
				_BitScanReverse(&Result[i].z, i);
				_BitScanReverse(&Result[i].w, i);
			}

			std::clock_t End = clock();

			printf("glm::log2<ivec4> inlined no cast: %ld clocks\n", End - Begin);
		}


		// Same scan, written into ivec4 components through a pointer reinterpret_cast.
		{
			std::vector<glm::ivec4> Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(std::size_t i = 0; i < Count; ++i)
			{
				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].x), i);
				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].y), i);
				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].z), i);
				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].w), i);
			}

			std::clock_t End = clock();

			printf("glm::log2<ivec4> reinterpret: %ld clocks\n", End - Begin);
		}
#		endif//GLM_HAS_BITSCAN_WINDOWS

		// glm::log2 on scalar float
		{
			std::vector<float> Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(std::size_t i = 0; i < Count; ++i)
				Result[i] = glm::log2(static_cast<float>(i));

			std::clock_t End = clock();

			printf("glm::log2<float>: %ld clocks\n", End - Begin);
		}

		// glm::log2 on vec4
		{
			std::vector<glm::vec4> Result;
			Result.resize(Count);

			std::clock_t Begin = clock();

			for(int i = 0; i < static_cast<int>(Count); ++i)
				Result[i] = glm::log2(glm::vec4(i));

			std::clock_t End = clock();

			printf("glm::log2<vec4>: %ld clocks\n", End - Begin);
		}

		return Error;
	}
Example #30
0
/*
==================
==================
*/
// Shades one block of 16 pixels, handled as four 4-wide SIMD rows.
//
//   i_buffer      - element offset of the block in the depth and colour buffers
//   coverage_mask - 16-bit mask, one bit per pixel, 4 bits per row
//   bazza         - integer per-pixel values; rows [0] and [1] are scaled by
//                   shader_input.r_area below (row [2] is not read here)
//   shader_input  - interpolation data and buffers; z_max is written back
//
// Steps: depth test and depth write, early exit when no pixel passes,
// interpolation of colour and texture coordinates, mip level selection,
// texture fetch, and a masked saturating add into the colour buffer.
void pixel_shader(

	const unsigned __int32 i_buffer,
	const unsigned __int32 coverage_mask,
	const __m128i bazza[3][4],
	shader_input_& shader_input
) {

	static const __m128 zero = set_zero();
	static const __m128 half = set_all(0.5f);
	static const __m128 one = set_all(1.0f);
	static const __m128 two = one + one;
	static const __m128 three = two + one;
	static const __m128i zero_int = set_zero_si128();
	static const __m128 colour_clamp = broadcast(load_s(255.0f));


	// Bit i is set when pixel i passes the depth test below.
	unsigned __int32 depth_mask = 0x0;

	// Convert the first two integer rows of bazza to float and scale by r_area.
	__m128 w_screen[2][4];
	w_screen[0][0] = convert_float(bazza[0][0]) * shader_input.r_area;
	w_screen[0][1] = convert_float(bazza[0][1]) * shader_input.r_area;
	w_screen[0][2] = convert_float(bazza[0][2]) * shader_input.r_area;
	w_screen[0][3] = convert_float(bazza[0][3]) * shader_input.r_area;

	w_screen[1][0] = convert_float(bazza[1][0]) * shader_input.r_area;
	w_screen[1][1] = convert_float(bazza[1][1]) * shader_input.r_area;
	w_screen[1][2] = convert_float(bazza[1][2]) * shader_input.r_area;
	w_screen[1][3] = convert_float(bazza[1][3]) * shader_input.r_area;

	// Per-pixel depth: z_delta[X] * w0 + z_delta[Y] * w1 + z_delta[Z].
	__m128 z_screen[4];
	z_screen[0] = (shader_input.z_delta[X] * w_screen[0][0]) + (shader_input.z_delta[Y] * w_screen[1][0]) + shader_input.z_delta[Z];
	z_screen[1] = (shader_input.z_delta[X] * w_screen[0][1]) + (shader_input.z_delta[Y] * w_screen[1][1]) + shader_input.z_delta[Z];
	z_screen[2] = (shader_input.z_delta[X] * w_screen[0][2]) + (shader_input.z_delta[Y] * w_screen[1][2]) + shader_input.z_delta[Z];
	z_screen[3] = (shader_input.z_delta[X] * w_screen[0][3]) + (shader_input.z_delta[Y] * w_screen[1][3]) + shader_input.z_delta[Z];

	// Disabled alternative depth computation, kept for reference.
	{
		//if (shader_input.is_test) {

		//	__m128 x = convert_float(set_all(shader_input.x));
		//	__m128 y = convert_float(set_all(shader_input.y));
		//	y += set_all(0.5f);
		//	x += set_all(0.5f);
		//	x += set(0.0f, 1.0f, 2.0f, 3.0f);

		//	__m128 y_block[4];
		//	y_block[0] = y;
		//	y_block[1] = y + one;
		//	y_block[2] = y + two;
		//	y_block[3] = y + three;

		//	__m128 z_interpolant[3];
		//	z_interpolant[X] = set_all(shader_input.depth_interpolants[X]);
		//	z_interpolant[Y] = set_all(shader_input.depth_interpolants[Y]);
		//	z_interpolant[Z] = set_all(shader_input.depth_interpolants[Z]);

		//	z_screen[0] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[0]) + z_interpolant[Z];
		//	z_screen[1] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[1]) + z_interpolant[Z];
		//	z_screen[2] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[2]) + z_interpolant[Z];
		//	z_screen[3] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[3]) + z_interpolant[Z];
		//}
	}

	// Look up a per-row lane mask from each 4-bit nibble of coverage_mask.
	__m128i pixel_mask[4];
	pixel_mask[0] = load_mask[(coverage_mask >> 0) & 0xf];
	pixel_mask[1] = load_mask[(coverage_mask >> 4) & 0xf];
	pixel_mask[2] = load_mask[(coverage_mask >> 8) & 0xf];
	pixel_mask[3] = load_mask[(coverage_mask >> 12) & 0xf];

	// Current depth buffer contents for the 16 pixels.
	__m128 z_buffer[4];
	z_buffer[0] = load(shader_input.depth_buffer + i_buffer + 0);
	z_buffer[1] = load(shader_input.depth_buffer + i_buffer + 4);
	z_buffer[2] = load(shader_input.depth_buffer + i_buffer + 8);
	z_buffer[3] = load(shader_input.depth_buffer + i_buffer + 12);

	// Depth test: a covered pixel passes when its new depth value is greater
	// than the stored one.
	__m128i z_mask[4];
	z_mask[0] = (z_screen[0] > z_buffer[0]) & pixel_mask[0];
	z_mask[1] = (z_screen[1] > z_buffer[1]) & pixel_mask[1];
	z_mask[2] = (z_screen[2] > z_buffer[2]) & pixel_mask[2];
	z_mask[3] = (z_screen[3] > z_buffer[3]) & pixel_mask[3];


	// Pack the four row results into one 16-bit pass mask.
	depth_mask |= store_mask(z_mask[0]) << 0;
	depth_mask |= store_mask(z_mask[1]) << 4;
	depth_mask |= store_mask(z_mask[2]) << 8;
	depth_mask |= store_mask(z_mask[3]) << 12;


	// Choose between the new and the stored depth per lane according to z_mask.
	__m128 z_write[4];
	z_write[0] = blend(z_screen[0], z_buffer[0], z_mask[0]);
	z_write[1] = blend(z_screen[1], z_buffer[1], z_mask[1]);
	z_write[2] = blend(z_screen[2], z_buffer[2], z_mask[2]);
	z_write[3] = blend(z_screen[3], z_buffer[3], z_mask[3]);

	// Reduce the 16 written depth values to one scalar with min_vec and
	// publish it as shader_input.z_max.
	// NOTE(review): the variables are named "max" but the reduction uses
	// min_vec -- confirm which is intended.
	{
		__m128 z_max;
		z_max = z_write[0];
		z_max = min_vec(z_write[1], z_max);
		z_max = min_vec(z_write[2], z_max);
		z_max = min_vec(z_write[3], z_max);

		// Horizontal reduction across the four lanes via three rotations.
		__m128 z_out = z_max;
		z_max = rotate_left(z_max);
		z_out = min_vec(z_max, z_out);
		z_max = rotate_left(z_max);
		z_out = min_vec(z_max, z_out);
		z_max = rotate_left(z_max);
		z_out = min_vec(z_max, z_out);

		shader_input.z_max = store_s(z_out);
	}


	// Write the depth values back.
	store(z_write[0], shader_input.depth_buffer + i_buffer + 0);
	store(z_write[1], shader_input.depth_buffer + i_buffer + 4);
	store(z_write[2], shader_input.depth_buffer + i_buffer + 8);
	store(z_write[3], shader_input.depth_buffer + i_buffer + 12);


	// Early out: no pixel passed the depth test, so nothing needs shading.
	if (depth_mask == 0x0) {
		return;
	}


	// Transform the two screen-space weights with the barycentric[0] and
	// barycentric[1] coefficient triples.
	__m128 screen_barry[2][4];
	screen_barry[0][0] = (w_screen[0][0] * shader_input.barycentric[0][X]) + (w_screen[1][0] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z];
	screen_barry[0][1] = (w_screen[0][1] * shader_input.barycentric[0][X]) + (w_screen[1][1] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z];
	screen_barry[0][2] = (w_screen[0][2] * shader_input.barycentric[0][X]) + (w_screen[1][2] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z];
	screen_barry[0][3] = (w_screen[0][3] * shader_input.barycentric[0][X]) + (w_screen[1][3] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z];

	screen_barry[1][0] = (w_screen[0][0] * shader_input.barycentric[1][X]) + (w_screen[1][0] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z];
	screen_barry[1][1] = (w_screen[0][1] * shader_input.barycentric[1][X]) + (w_screen[1][1] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z];
	screen_barry[1][2] = (w_screen[0][2] * shader_input.barycentric[1][X]) + (w_screen[1][2] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z];
	screen_barry[1][3] = (w_screen[0][3] * shader_input.barycentric[1][X]) + (w_screen[1][3] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z];

	// Scale the weights by the reciprocal of the interpolated depth value.
	// NOTE(review): presumably the perspective-correction step -- confirm.
	__m128 r_depth[4];
	r_depth[0] = reciprocal(z_screen[0]);
	r_depth[1] = reciprocal(z_screen[1]);
	r_depth[2] = reciprocal(z_screen[2]);
	r_depth[3] = reciprocal(z_screen[3]);

	__m128 w_clip[2][4];
	w_clip[0][0] = screen_barry[0][0] * r_depth[0];
	w_clip[0][1] = screen_barry[0][1] * r_depth[1];
	w_clip[0][2] = screen_barry[0][2] * r_depth[2];
	w_clip[0][3] = screen_barry[0][3] * r_depth[3];

	w_clip[1][0] = screen_barry[1][0] * r_depth[0];
	w_clip[1][1] = screen_barry[1][1] * r_depth[1];
	w_clip[1][2] = screen_barry[1][2] * r_depth[2];
	w_clip[1][3] = screen_barry[1][3] * r_depth[3];

	// Interpolate the colour attribute, clamp each channel to [0, 255],
	// truncate to integers and pack as red | green << 8 | blue << 16.
	__m128i colour_out[4];
	{
		const vertex4_* gradients = shader_input.gradients[ATTRIBUTE_COLOUR];

		__m128 red_float[4];
		red_float[0] = (gradients[R].x * w_clip[0][0]) + (gradients[R].y * w_clip[1][0]) + gradients[R].z;
		red_float[1] = (gradients[R].x * w_clip[0][1]) + (gradients[R].y * w_clip[1][1]) + gradients[R].z;
		red_float[2] = (gradients[R].x * w_clip[0][2]) + (gradients[R].y * w_clip[1][2]) + gradients[R].z;
		red_float[3] = (gradients[R].x * w_clip[0][3]) + (gradients[R].y * w_clip[1][3]) + gradients[R].z;

		__m128 green_float[4];
		green_float[0] = (gradients[G].x * w_clip[0][0]) + (gradients[G].y * w_clip[1][0]) + gradients[G].z;
		green_float[1] = (gradients[G].x * w_clip[0][1]) + (gradients[G].y * w_clip[1][1]) + gradients[G].z;
		green_float[2] = (gradients[G].x * w_clip[0][2]) + (gradients[G].y * w_clip[1][2]) + gradients[G].z;
		green_float[3] = (gradients[G].x * w_clip[0][3]) + (gradients[G].y * w_clip[1][3]) + gradients[G].z;

		__m128 blue_float[4];
		blue_float[0] = (gradients[B].x * w_clip[0][0]) + (gradients[B].y * w_clip[1][0]) + gradients[B].z;
		blue_float[1] = (gradients[B].x * w_clip[0][1]) + (gradients[B].y * w_clip[1][1]) + gradients[B].z;
		blue_float[2] = (gradients[B].x * w_clip[0][2]) + (gradients[B].y * w_clip[1][2]) + gradients[B].z;
		blue_float[3] = (gradients[B].x * w_clip[0][3]) + (gradients[B].y * w_clip[1][3]) + gradients[B].z;

		red_float[0] = min_vec(max_vec(red_float[0], zero), colour_clamp);
		red_float[1] = min_vec(max_vec(red_float[1], zero), colour_clamp);
		red_float[2] = min_vec(max_vec(red_float[2], zero), colour_clamp);
		red_float[3] = min_vec(max_vec(red_float[3], zero), colour_clamp);

		green_float[0] = min_vec(max_vec(green_float[0], zero), colour_clamp);
		green_float[1] = min_vec(max_vec(green_float[1], zero), colour_clamp);
		green_float[2] = min_vec(max_vec(green_float[2], zero), colour_clamp);
		green_float[3] = min_vec(max_vec(green_float[3], zero), colour_clamp);

		blue_float[0] = min_vec(max_vec(blue_float[0], zero), colour_clamp);
		blue_float[1] = min_vec(max_vec(blue_float[1], zero), colour_clamp);
		blue_float[2] = min_vec(max_vec(blue_float[2], zero), colour_clamp);
		blue_float[3] = min_vec(max_vec(blue_float[3], zero), colour_clamp);

		__m128i red_int[4];
		red_int[0] = convert_int_trunc(red_float[0]);
		red_int[1] = convert_int_trunc(red_float[1]);
		red_int[2] = convert_int_trunc(red_float[2]);
		red_int[3] = convert_int_trunc(red_float[3]);

		__m128i green_int[4];
		green_int[0] = convert_int_trunc(green_float[0]);
		green_int[1] = convert_int_trunc(green_float[1]);
		green_int[2] = convert_int_trunc(green_float[2]);
		green_int[3] = convert_int_trunc(green_float[3]);

		__m128i blue_int[4];
		blue_int[0] = convert_int_trunc(blue_float[0]);
		blue_int[1] = convert_int_trunc(blue_float[1]);
		blue_int[2] = convert_int_trunc(blue_float[2]);
		blue_int[3] = convert_int_trunc(blue_float[3]);

		colour_out[0] = red_int[0] | (green_int[0] << 8) | (blue_int[0] << 16);
		colour_out[1] = red_int[1] | (green_int[1] << 8) | (blue_int[1] << 16);
		colour_out[2] = red_int[2] | (green_int[2] << 8) | (blue_int[2] << 16);
		colour_out[3] = red_int[3] | (green_int[3] << 8) | (blue_int[3] << 16);
	}

	// Scalar-addressable copies of the interpolated texture coordinates.
	float4_ u_table[4];
	float4_ v_table[4];


	// Interpolate the texture coordinate attribute and store it in the tables.
	{
		const vertex4_* gradients = shader_input.gradients[ATTRIBUTE_TEXCOORD];

		__m128 u_axis[4];
		u_axis[0] = (gradients[U].x * w_clip[0][0]) + (gradients[U].y * w_clip[1][0]) + gradients[U].z;
		u_axis[1] = (gradients[U].x * w_clip[0][1]) + (gradients[U].y * w_clip[1][1]) + gradients[U].z;
		u_axis[2] = (gradients[U].x * w_clip[0][2]) + (gradients[U].y * w_clip[1][2]) + gradients[U].z;
		u_axis[3] = (gradients[U].x * w_clip[0][3]) + (gradients[U].y * w_clip[1][3]) + gradients[U].z;

		__m128 v_axis[4];
		v_axis[0] = (gradients[V].x * w_clip[0][0]) + (gradients[V].y * w_clip[1][0]) + gradients[V].z;
		v_axis[1] = (gradients[V].x * w_clip[0][1]) + (gradients[V].y * w_clip[1][1]) + gradients[V].z;
		v_axis[2] = (gradients[V].x * w_clip[0][2]) + (gradients[V].y * w_clip[1][2]) + gradients[V].z;
		v_axis[3] = (gradients[V].x * w_clip[0][3]) + (gradients[V].y * w_clip[1][3]) + gradients[V].z;

		store_u(u_axis[0], u_table[0].f);
		store_u(u_axis[1], u_table[1].f);
		store_u(u_axis[2], u_table[2].f);
		store_u(u_axis[3], u_table[3].f);

		store_u(v_axis[0], v_table[0].f);
		store_u(v_axis[1], v_table[1].f);
		store_u(v_axis[2], v_table[2].f);
		store_u(v_axis[3], v_table[3].f);
	}

	const texture_handler_& texture_handler = *shader_input.texture_handler;

	// Texture coordinate differences across the block, in texels: .x compares
	// lane 3 with lane 0 of row 0, .y compares row 3 with row 0 at lane 0.
	float2_ du;
	du.x = (u_table[0].f[3] - u_table[0].f[0]) * (float)texture_handler.width;
	du.y = (u_table[3].f[0] - u_table[0].f[0]) * (float)texture_handler.width;

	float2_ dv;
	dv.x = (v_table[0].f[3] - v_table[0].f[0]) * (float)texture_handler.height;
	dv.y = (v_table[3].f[0] - v_table[0].f[0]) * (float)texture_handler.height;

	// Mip level = floor(log2(1 + rounded, biased texel area)), clamped to the
	// available levels. The "1 +" guarantees a non-zero scan operand.
	// NOTE(review): whether this abs resolves to a float or an int overload
	// depends on the headers in scope -- confirm.
	float area = abs((du.x * dv.y) - (du.y * dv.x))  * shader_input.mip_level_bias;
	unsigned long area_int = 1 + (unsigned long)(area + 0.5f);
	__int32 i_mip_floor;
	// The __int32 result is written through an unsigned long* cast.
	_BitScanReverse((unsigned long*)&i_mip_floor, area_int);

	i_mip_floor = max(i_mip_floor, 0);
	i_mip_floor = min(i_mip_floor, texture_handler.n_mip_levels - 1);

	// Dimensions of the selected mip level.
	const __int32 width = texture_handler.width >> i_mip_floor;
	const __int32 height = texture_handler.height >> i_mip_floor;
	const __int32 shift = texture_handler.width_shift - i_mip_floor;

	const __m128i texture_width_int = set_all(width);
	const __m128 texture_width = convert_float(set_all(width));
	const __m128 texture_height = convert_float(set_all(height));
	const __m128i width_clamp = set_all(width - 1);
	const __m128i height_clamp = set_all(height - 1);
	const __m128i width_shift = load_s(shift);

	// Texture fetch: scale the coordinates to texels, truncate, clamp to the
	// level's bounds, form linear indices and gather the 16 texels one by one.
	__m128i tex_out[4];
	{
		__m128 u_axis[4];
		u_axis[0] = (load_u(u_table[0].f) * texture_width); // - half;
		u_axis[1] = (load_u(u_table[1].f) * texture_width); // - half;
		u_axis[2] = (load_u(u_table[2].f) * texture_width); // - half;
		u_axis[3] = (load_u(u_table[3].f) * texture_width); // - half;

		__m128 v_axis[4];
		v_axis[0] = (load_u(v_table[0].f) * texture_height); // - half;
		v_axis[1] = (load_u(v_table[1].f) * texture_height); // - half;
		v_axis[2] = (load_u(v_table[2].f) * texture_height); // - half;
		v_axis[3] = (load_u(v_table[3].f) * texture_height); // - half;

		__m128i u_int[4];
		u_int[0] = convert_int_trunc(u_axis[0]);
		u_int[1] = convert_int_trunc(u_axis[1]);
		u_int[2] = convert_int_trunc(u_axis[2]);
		u_int[3] = convert_int_trunc(u_axis[3]);

		__m128i v_int[4];
		v_int[0] = convert_int_trunc(v_axis[0]);
		v_int[1] = convert_int_trunc(v_axis[1]);
		v_int[2] = convert_int_trunc(v_axis[2]);
		v_int[3] = convert_int_trunc(v_axis[3]);

		u_int[0] = max_vec(min_vec(u_int[0], width_clamp), zero_int);
		u_int[1] = max_vec(min_vec(u_int[1], width_clamp), zero_int);
		u_int[2] = max_vec(min_vec(u_int[2], width_clamp), zero_int);
		u_int[3] = max_vec(min_vec(u_int[3], width_clamp), zero_int);

		v_int[0] = max_vec(min_vec(v_int[0], height_clamp), zero_int);
		v_int[1] = max_vec(min_vec(v_int[1], height_clamp), zero_int);
		v_int[2] = max_vec(min_vec(v_int[2], height_clamp), zero_int);
		v_int[3] = max_vec(min_vec(v_int[3], height_clamp), zero_int);


		// Linear texel index: u + v * width.
		__m128i i_texels[4];
		i_texels[0] = u_int[0] + (v_int[0] * texture_width_int);
		i_texels[1] = u_int[1] + (v_int[1] * texture_width_int);
		i_texels[2] = u_int[2] + (v_int[2] * texture_width_int);
		i_texels[3] = u_int[3] + (v_int[3] * texture_width_int);

		__int32 i_texels_in[4][4];
		store_u(i_texels[0], i_texels_in[0]);
		store_u(i_texels[1], i_texels_in[1]);
		store_u(i_texels[2], i_texels_in[2]);
		store_u(i_texels[3], i_texels_in[3]);

		unsigned __int32 texels_out[4][4];
		texels_out[0][0] = texture_handler.texture[i_mip_floor][i_texels_in[0][0]];
		texels_out[0][1] = texture_handler.texture[i_mip_floor][i_texels_in[0][1]];
		texels_out[0][2] = texture_handler.texture[i_mip_floor][i_texels_in[0][2]];
		texels_out[0][3] = texture_handler.texture[i_mip_floor][i_texels_in[0][3]];

		texels_out[1][0] = texture_handler.texture[i_mip_floor][i_texels_in[1][0]];
		texels_out[1][1] = texture_handler.texture[i_mip_floor][i_texels_in[1][1]];
		texels_out[1][2] = texture_handler.texture[i_mip_floor][i_texels_in[1][2]];
		texels_out[1][3] = texture_handler.texture[i_mip_floor][i_texels_in[1][3]];

		texels_out[2][0] = texture_handler.texture[i_mip_floor][i_texels_in[2][0]];
		texels_out[2][1] = texture_handler.texture[i_mip_floor][i_texels_in[2][1]];
		texels_out[2][2] = texture_handler.texture[i_mip_floor][i_texels_in[2][2]];
		texels_out[2][3] = texture_handler.texture[i_mip_floor][i_texels_in[2][3]];

		texels_out[3][0] = texture_handler.texture[i_mip_floor][i_texels_in[3][0]];
		texels_out[3][1] = texture_handler.texture[i_mip_floor][i_texels_in[3][1]];
		texels_out[3][2] = texture_handler.texture[i_mip_floor][i_texels_in[3][2]];
		texels_out[3][3] = texture_handler.texture[i_mip_floor][i_texels_in[3][3]];

		tex_out[0] = load_u(texels_out[0]);
		tex_out[1] = load_u(texels_out[1]);
		tex_out[2] = load_u(texels_out[2]);
		tex_out[3] = load_u(texels_out[3]);
	}

	// Colour write: clear the lanes that passed the depth test, then add the
	// interpolated colour and the texel into those lanes with per-byte
	// saturation. Lanes that failed keep their previous colour.
	__m128i colour_buffer[4];
	colour_buffer[0] = load(shader_input.colour_buffer + i_buffer + 0);
	colour_buffer[1] = load(shader_input.colour_buffer + i_buffer + 4);
	colour_buffer[2] = load(shader_input.colour_buffer + i_buffer + 8);
	colour_buffer[3] = load(shader_input.colour_buffer + i_buffer + 12);

	colour_buffer[0] = _mm_andnot_si128(z_mask[0], colour_buffer[0]);
	colour_buffer[1] = _mm_andnot_si128(z_mask[1], colour_buffer[1]);
	colour_buffer[2] = _mm_andnot_si128(z_mask[2], colour_buffer[2]);
	colour_buffer[3] = _mm_andnot_si128(z_mask[3], colour_buffer[3]);

	colour_buffer[0] = add_uint8_saturate(colour_buffer[0], colour_out[0] & z_mask[0]);
	colour_buffer[1] = add_uint8_saturate(colour_buffer[1], colour_out[1] & z_mask[1]);
	colour_buffer[2] = add_uint8_saturate(colour_buffer[2], colour_out[2] & z_mask[2]);
	colour_buffer[3] = add_uint8_saturate(colour_buffer[3], colour_out[3] & z_mask[3]);

	colour_buffer[0] = add_uint8_saturate(colour_buffer[0], tex_out[0] & z_mask[0]);
	colour_buffer[1] = add_uint8_saturate(colour_buffer[1], tex_out[1] & z_mask[1]);
	colour_buffer[2] = add_uint8_saturate(colour_buffer[2], tex_out[2] & z_mask[2]);
	colour_buffer[3] = add_uint8_saturate(colour_buffer[3], tex_out[3] & z_mask[3]);

	store(colour_buffer[0], shader_input.colour_buffer + i_buffer + 0);
	store(colour_buffer[1], shader_input.colour_buffer + i_buffer + 4);
	store(colour_buffer[2], shader_input.colour_buffer + i_buffer + 8);
	store(colour_buffer[3], shader_input.colour_buffer + i_buffer + 12);
}