// Mask and size for the table map. Use 12.5% full PRIVATE void calculate_table_mask(unsigned int num_elem) { int i; size_table = 1; // Generate result with all bits less than // first bit in num_elem in 1 while(size_table < num_elem) size_table = (size_table << 1) + 1; // 3 bits more into account for(i = 0; i < 3; i++) size_table = (size_table << 1) + 1; size_bit_table = (size_table << 1) + 1; //size_bit_table = (size_bit_table << 1) + 1; _BitScanReverse(&first_bit_size_bit_table, size_bit_table); _BitScanReverse(&first_bit_size_table, size_table); first_bit_size_bit_table++; first_bit_size_table++; size_table_see2[0] = size_table_see2[1] = size_table_see2[2] = size_table_see2[3] = size_table; size_bit_table_see2[0] = size_bit_table_see2[1] = size_bit_table_see2[2] = size_bit_table_see2[3] = size_bit_table; }
/*
 * Locates the highest set bit across the word pair (u0, u1).
 *
 * Returns 26 - i when u0 is non-zero, otherwise 53 - i, where i is the index
 * of the most significant set bit of the word that was scanned.
 * If both words are zero the result is undefined (the scan leaves the index
 * unset).
 */
int first_one01( unsigned int u0, unsigned int u1 )
{
  unsigned long msb;
  unsigned long base = 26;

  if ( ! _BitScanReverse( &msb, u0 ) )
    {
      /* Nothing set in u0: fall through to the second word. */
      _BitScanReverse( &msb, u1 );
      base = 53;
    }

  return base - msb;
}
/*
 * Locates the highest set bit across the word pair (u1, u2).
 *
 * Returns 53 - i when u1 is non-zero, otherwise 80 - i, where i is the index
 * of the most significant set bit of the word that was scanned.
 * If both words are zero the result is undefined (the scan leaves the index
 * unset).
 */
int first_one12( unsigned int u1, unsigned int u2 )
{
  unsigned long msb;
  unsigned long base = 53;

  if ( ! _BitScanReverse( &msb, u1 ) )
    {
      /* Nothing set in u1: fall through to the last word. */
      _BitScanReverse( &msb, u2 );
      base = 80;
    }

  return base - msb;
}
//0 returns 0, 2^31 returns 0 (overflow), works for all else values //can be implemented like //mov eax,ecx; or pop eax; //bsr ecx,eax; //bsf edx,eax; //jz _nothing_set; //cmp edx,ecx //zero (no carry) if equal, negative (since edx<=ecx) otherwise //adc ecx,0; //add the carry bit; // //mov eax,1; //generate the next number 1<<cl //shl eax,cl; //_nothing_set: //eax = 0 (input value) or 1<<cl //ret // ;) u32 next_pow2(u32 s) { unsigned long idxR,idxF; _BitScanReverse(&idxR,s); if (0==_BitScanReverse(&idxF,s)) return s; //its 0 anyway ... else { //idxR==idxF -> single bit set, power of 2 //idxR>idxF -> more bits set, round up by next idxR idxR+=((s32)idxF-(s32)idxR)>=0?0:1; } return 1<<idxR; }
// Integer base-2 logarithm, rounded down.
// Returns -1 for x == 0, otherwise the index of the most significant set bit.
// Note: on GCC-like compilers without REALM_PTR_64 the 32-bit builtin is used,
// so only the low 32 bits of x are examined.
inline int log2(size_t x)
{
    if (!x)
        return -1;

#if defined(__GNUC__)
#  ifdef REALM_PTR_64
    const int leading_zeros = __builtin_clzll(x); // returns int
    return 63 - leading_zeros;
#  else
    const int leading_zeros = __builtin_clz(x); // returns int
    return 31 - leading_zeros;
#  endif
#elif defined(_WIN32)
    unsigned long highest = 0; // the intrinsics output unsigned long
#  ifdef REALM_PTR_64
    _BitScanReverse64(&highest, x);
#  else
    _BitScanReverse(&highest, x);
#  endif
    return static_cast<int>(highest);
#else // not __GNUC__ and not _WIN32
    // Portable fallback: count how often x can be halved before reaching zero.
    int result = 0;
    for (x >>= 1; x != 0; x >>= 1)
        ++result;
    return result;
#endif
}
// Populates the song properties dialog controls from m_Props: key signature,
// time signature (numerator and power-of-two denominator), tempo, transpose
// and comments.
BOOL CSongPropsDlg::OnInitDialog()
{
	CDialog::OnInitDialog();
	SetIcon(theApp.LoadIcon(IDR_MAINFRAME), 0);

	CPatchGeneralDlg::InitNoteCombo(m_KeySig);
	m_KeySig.SetCurSel(m_Props.m_Key);
	m_TimeSigNumer.SetVal(m_Props.m_Meter.m_Numerator);

	// Fill the denominator combo with the supported power-of-two units.
	CStringArrayEx	sUnit;
	CPartBassDlg::GetPowerOfTwoStrings(sUnit, CSong::CMeter::MIN_UNIT_EXP, CSong::CMeter::MAX_UNIT_EXP);
	int	nUnits = sUnit.GetSize();
	for (int iUnit = 0; iUnit < nUnits; iUnit++)
		m_TimeSigDenom.AddString(sUnit[iUnit]);

	// The denominator is a power of two; its exponent is the combo index.
	// _BitScanReverse leaves the index undefined for a zero mask, so only
	// select an entry when the scan actually found a set bit.
	DWORD	iSelUnit = 0;
	if (_BitScanReverse(&iSelUnit, m_Props.m_Meter.m_Denominator))
		m_TimeSigDenom.SetCurSel(iSelUnit);

	m_Tempo.SetVal(m_Props.m_Tempo);
	m_Transpose.SetVal(m_Props.m_Transpose);
	m_Comments.SetWindowText(m_Props.m_Comments);
	EnableToolTips();

	return TRUE;  // return TRUE unless you set the focus to a control
	              // EXCEPTION: OCX Property Pages should return FALSE
}
// Returns the number of zero bits above the most significant set bit of x
// (0..31), or 32 when x is zero.
inline unsigned int CountLeadingZeros(unsigned int x)
{
	unsigned long highestSet;

	// The scan reports failure when no bit is set at all.
	if (!_BitScanReverse(&highestSet, x))
		return 32;

	return 31 - highestSet;
}
// Returns the zero-based index of the most significant set bit of `a`.
// All three implementations agree on that result for non-zero input.
// For a == 0 the intrinsic-based branches are undefined; the portable
// fallback returns 0.
Size findLastBit(Size a)
{
#ifdef __GNUC__
#ifdef __X64__
	return sizeof(a)*8 - 1 - __builtin_clzl(a);
#else
	return sizeof(a)*8 - 1 - __builtin_clz(a);
#endif
#elif defined(_MSC_VER)
	unsigned long pos;
#ifdef __X64__
	_BitScanReverse64(&pos, a);
#else
	_BitScanReverse(&pos, a);
#endif
	// _BitScanReverse already yields the bit index counted from the least
	// significant bit. Subtracting it from the width would produce the
	// leading-zero count and disagree with the other branches.
	return pos;
#else
	// Very naive implementation: shift left until the top bit is set.
	Size c = sizeof(a)*8 - 1;
	// Build the mask in the width of Size; a plain `1 << c` overflows int.
	const Size mask = static_cast<Size>(1) << c;
	// The c != 0 guard makes the loop terminate for a == 0.
	while(c != 0 && !(a & mask)) {
		a <<= 1;
		c--;
	}
	return c;
#endif
}
/*
 * Returns 80 - i, where i is the index of the most significant set bit of u2.
 * The result is undefined when u2 is zero (the scan leaves the index unset).
 */
int first_one2( unsigned int u2 )
{
  unsigned long msb;
  const unsigned long base = 80;

  _BitScanReverse( &msb, u2 );

  return base - msb;
}
/*
 * Returns 53 - i, where i is the index of the most significant set bit of u1.
 * The result is undefined when u1 is zero (the scan leaves the index unset).
 */
int first_one1( unsigned int u1 )
{
  unsigned long msb;
  const unsigned long base = 53;

  _BitScanReverse( &msb, u1 );

  return base - msb;
}
/*
 * Find-last-set for a long: returns the 1-based position of the most
 * significant set bit of value, or 0 when value is zero.
 */
int flsl(long value)
{
	unsigned long msb = 0;

	if (!_BitScanReverse(&msb, value)) {
		/* No bit set at all. */
		return 0;
	}

	return msb + 1;
}
// Returns floor(log2(data + 1)), i.e. the index of the most significant set
// bit of data + 1. For data of the form 2^n - 1 this is exactly n.
inline unsigned int BitCountNeededToEncode(unsigned int data)
{
	const unsigned int next = data + 1;

	// data + 1 wraps to zero for the largest unsigned value. A zero mask
	// leaves the scan index undefined, so answer that case explicitly:
	// log2(2^32) == 32.
	if ( next == 0 )
		return 32;

#if defined(_X360)
	return (32 - CountLeadingZeros(next)) - 1;
#else
	unsigned long firstBit;
	_BitScanReverse(&firstBit,next);
	return firstBit;
#endif
}
// Returns the smallest n such that (length >> n) >= 4 static int GetMinimumPower(int length) { unsigned long rightmost_bit; if (!_BitScanReverse(&rightmost_bit, length)) { assert(false); return 0; } return rightmost_bit - 2; }
/*
 * Returns the bit width of v: the number of bits needed to represent it,
 * i.e. floor(log2(v)) + 1, and 0 when v is 0.
 */
uint32_t bits(const uint32_t v)
{
#ifdef _MSC_VER
    unsigned long width = 0;
    if (v != 0) {
        /* The scan yields the index of the top set bit; the width is one more. */
        _BitScanReverse(&width, v);
        width += 1;
    }
    return width;
#else
    /* assume GCC-like compiler if not microsoft */
    if (v == 0) {
        return 0;
    }
    return 32 - __builtin_clz(v);
#endif
}
// Counts the consecutive 1 bits at the top of the byte x (0..8).
// The byte is inverted and moved to the top of a 32-bit word, so its leading
// ones become leading zeros. The low 24 bits are forced to 1, which keeps the
// word non-zero and caps the count at 8.
static inline unsigned count_leading_ones(boost::uint8_t x)
{
    const boost::uint32_t inverted = ~x;
    const boost::uint32_t probe = (inverted << 24) | 0x00FFFFFF;
#ifdef _MSC_VER
    unsigned long top_bit;
    _BitScanReverse(&top_bit, (unsigned long)probe);
    return 31 - top_bit;
#else
    return __builtin_clz(probe);
#endif
}
// Finds ceil(log2(x)): the exponent of the smallest power of two that is >= x.
// Both compiler branches scan x-1 so they agree for exact powers of two
// (e.g. x == 4 yields 2 in both). x == 1 is answered directly, because x-1
// would be 0 and neither the bit scan nor the clz builtin is defined for 0.
// x == 0 wraps to the all-ones value and yields 32.
hv_uint32_t hv_min_max_log2(hv_uint32_t x) {
#if HV_MSVC
  // http://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
  // http://msdn.microsoft.com/en-us/library/fbxyd7zd%28v=VS.80%29.aspx
  unsigned long z = 0;
  if (x == 1) return 0;
  _BitScanReverse(&z, x-1);
  return (hv_uint32_t) (z+1);
#else
  if (x == 1) return 0;
  return (hv_uint32_t) ((8 * sizeof(unsigned int)) - __builtin_clz(x-1));
#endif // HV_MSVC
}
/*
 * Counts the leading zero bits of val.
 * Returns 32 when val is 0, otherwise 31 minus the index of the most
 * significant set bit.
 */
CPU_DATA  CPU_CntLeadZeros (CPU_DATA  val)
{
    DWORD     msb_ix;
    CPU_DATA  nbr_lead_zeros;


    if (val != 0u) {
        _BitScanReverse(&msb_ix, (DWORD)val);
        nbr_lead_zeros = 31u - (CPU_DATA)msb_ix;
    } else {
        nbr_lead_zeros = 32u;                    /* No bit set at all. */
    }

    return (nbr_lead_zeros);
}
/*
 * Returns the highest bit number set in a 256-bit bitmap, or -1 if the bitmap
 * is empty. The bitmap is read as eight 32-bit words located at a stride of
 * four ULONGs (bitmap[0], bitmap[4], ..., bitmap[28]); word k holds bits
 * k*32 .. k*32+31.
 *
 * called with interrupts disabled (via CLI) from an arbitrary location inside
 * HAL.DLL
 */
static __inline LONG ApicHighestVector(PULONG bitmap)
{
	int word;
	ULONG bit;
	ULONG value;

	/* Walk from the most significant word down to the least significant. */
	for (word = 7; word >= 0; word--)
	{
		value = bitmap[word * 4];
		if (!value)
			continue;

		_BitScanReverse(&bit, value);
		return (word << 5) | bit;
	}

	return -1;
}
/*
 * Binary logarithm of val, rounded down: exact when val is a power of 2,
 * floored otherwise. val must be greater than zero (asserted).
 */
static matras_id_t
matras_log2(matras_id_t val)
{
	assert(val > 0);
#ifdef WIN32
	unsigned long top_bit = 0;
	unsigned char found = _BitScanReverse(&top_bit, val);
	assert(found);
	(void)found; /* only read by the assert */
	return (matras_id_t)top_bit;
#else
	const size_t total_bits = sizeof(unsigned int) * CHAR_BIT;
	return total_bits - __builtin_clz((unsigned int) val) - 1;
#endif
}
// Counts the leading zero bits of x.
// The 64-bit variants are used when XXLIB_64BIT is defined, the 32-bit ones
// otherwise. The result is undefined for x == 0.
int Clz(size_t x)
{
#ifndef _MSC_VER
# ifdef XXLIB_64BIT
    return __builtin_clzl(x);
# else
    return __builtin_clz(x);
# endif
#else
    // MSVC: the intrinsic reports the index of the top set bit; convert it
    // into a leading-zero count.
    unsigned long topBit = 0;
# ifdef XXLIB_64BIT
    _BitScanReverse64(&topBit, x);
    return 63 - topBit;
# else
    _BitScanReverse(&topBit, x);
    return 31 - topBit;
# endif
#endif
}
// Builds a complete three-level binary tree (nodes 0..6, node 0 as root) and
// checks LeastCommonAncestor for several node pairs.
TEST_F(AncestorTreeTest, add_node)
{
    // Bit-scan smoke check: prints the index of the highest set bit of 1.
    unsigned long depth = 0;
    unsigned long mask = 1;
    _BitScanReverse(&depth, mask);
    std::cout << depth << std::endl;

    directed::AncestorTree<int> tree;

    // Nodes are created in value order: 0 is depth 0, 1-2 are depth 1,
    // 3-6 are depth 2.
    directed::AncestorNode<int> * nodes[7];
    for (int value = 0; value < 7; ++value)
    {
        nodes[value] = tree.BuildNode(value);
    }

    tree.SetRoot(nodes[0]);

    // Node k gets children 2k+1 (left) and 2k+2 (right).
    for (int parent = 0; parent < 3; ++parent)
    {
        nodes[parent]->SetLeft(nodes[2 * parent + 1]);
        nodes[parent]->SetRight(nodes[2 * parent + 2]);
    }

    ASSERT_EQ(nodes[1], tree.LeastCommonAncestor(nodes[3], nodes[4]));
    ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[3], nodes[2]));
    ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[0], nodes[6]));
    ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[1], nodes[2]));
    ASSERT_EQ(nodes[2], tree.LeastCommonAncestor(nodes[5], nodes[6]));
    ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[4], nodes[5]));
    ASSERT_EQ(nodes[0], tree.LeastCommonAncestor(nodes[3], nodes[6]));
}
// Count Leading Zeroes Word static void cntlzw(ThreadState *state, Instruction instr) { unsigned long a; uint32_t s; s = state->gpr[instr.rS]; if (!_BitScanReverse(&a, s)) { a = 32; } else { a = 31 - a; } state->gpr[instr.rA] = a; if (instr.rc) { updateConditionRegister(state, a); } }
//Return the number of leading zeros. Deliberately undefined if value == 0 inline unsigned countLeadingUnsetBits(unsigned value) { dbgassertex(value != 0); #if defined(__GNUC__) return __builtin_clz(value); #elif defined (_WIN32) unsigned long index; _BitScanReverse(&index, value); return (unsigned)((sizeof(unsigned)*8)-1 - index); #else unsigned mask = 1U << ((sizeof(unsigned)*8)-1); unsigned i; for (i=0; i < sizeof(unsigned)*8; i++) { if (value & mask) return i; mask = mask >> 1; } return i; #endif }
//Return the number of bits including the first non-zero bit. Undefined if value == 0 inline unsigned getMostSignificantBit(unsigned value) { dbgassertex(value != 0); #if defined(__GNUC__) return (sizeof(unsigned)*8) - __builtin_clz(value); #elif defined (_WIN32) unsigned long index; _BitScanReverse(&index, value); return (unsigned)index+1; #else unsigned mask = 1U << ((sizeof(unsigned)*8)-1); unsigned i; for (i=0; i < sizeof(unsigned)*8; i++) { if (value & mask) return sizeof(unsigned)*8-i; mask = mask >> 1; } return 0; #endif }
/*
 * Returns the zero-based index of the most significant set bit of n.
 * n must be non-zero: the GCC builtin is undefined for 0. The MSVC branch
 * returns 0 for a zero input because the index is pre-initialized.
 */
uint32_t msb_idx_u32(uint32_t n)
{
#if defined( _MSC_VER )
	/*
	 * Let the intrinsic write into a real unsigned long instead of punning a
	 * uint32_t through a pointer cast, then narrow the result explicitly.
	 */
	unsigned long index = 0;
	_BitScanReverse(&index, n);
	return (uint32_t)index;
#elif defined( __GNUC__ )
	/* For a leading-zero count in 0..31, `clz ^ 31` equals `31 - clz`. */
	return __builtin_clz(n) ^ 31;
#else
#error "No msb_index()"
#endif
}
/*
 * Counts the leading zero bits of v.
 * Uses the compiler builtin or intrinsic where available, otherwise a
 * five-step binary search over the word.
 */
static INLINE unsigned clz(u32 v)
{
#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER)
	return __builtin_clz(v);
#elif defined(_MSC_VER)
	unsigned long top_bit;
	_BitScanReverse(&top_bit, v);
	return 31 ^ top_bit;
#else
	/*
	 * At each step, test whether the upper 16/8/4/2/1 bits are all zero.
	 * If so, count them and shift them out so the next, narrower test looks
	 * at the following group.
	 */
	static const u32 upper[5] = {
		0xFFFF0000, 0xFF000000, 0xF0000000, 0xC0000000, 0x80000000
	};
	unsigned zeros = 0;
	unsigned width = 16;
	unsigned step;

	for (step = 0; step < 5; step++) {
		if (!(v & upper[step])) {
			v <<= width;
			zeros += width;
		}
		width >>= 1;
	}

	return(zeros);
#endif
}
// Formats a duration as a fraction string.
//
// Searches the power-of-two denominators (1, 2, 4, ...) and the entries of
// m_Unit for the first divisor that divides Duration evenly within m_Epsilon,
// and formats the result as "numer/denom". Results found with a non-zero unit
// index get a 'T' inserted (after the sign if the numerator is negative).
// When SHOW_DOTS is enabled, a numerator of the form 2^k - 1 (greater than 2)
// is rewritten as 1 over a smaller denominator followed by trailing dots.
// A zero duration, or one with no match, is formatted with "%g".
CString CDurationComboBox::DurationToString(double Duration)
{
	CString	sResult;
	if (Duration != 0) {
		for (int iDenom = 0; iDenom < DENOMINATORS; iDenom++) {
			int	nDenom = 1 << iDenom;
			for (int iUnit = 0; iUnit < UNITS; iUnit++) {
				const double	fDivisor = m_Unit[iUnit] / nDenom;
				const double	fRemainder = fabs(fmod(Duration, fDivisor));
//				printf("%g %g\n", fDivisor, fRemainder);
				// accept remainders just above zero or just below the divisor
				const bool	bNearZero = fRemainder < m_Epsilon;
				if (!bNearZero && !(fabs(fRemainder - fDivisor) < m_Epsilon))
					continue;	// not an even multiple; try next candidate
				int	nNumer = round(Duration / fDivisor);
				DWORD	nDots = 0;
				if (SHOW_DOTS && nNumer > 2 && IsPowerOfTwo(nNumer + 1)) {
					int	nDotDenom = nDenom / ((nNumer + 1) / 2);
					if (nDotDenom) {	// avoid divide by zero
						// dot count is log2 of the denominator reduction
						_BitScanReverse(&nDots, nDenom / nDotDenom);
						nDenom = nDotDenom;
						nNumer = 1;
					}
				}
				sResult.Format(_T("%d/%d"), nNumer, nDenom);
				if (iUnit)
					sResult.Insert(nNumer < 0, _T("T"));
				for (DWORD nLeft = nDots; nLeft > 0; nLeft--)
					sResult += '.';
				return(sResult);	// early out
			}
		}
	}
	sResult.Format(_T("%g"), Duration);
	return(sResult);
}
static int uv_tty_write_bufs(uv_tty_t* handle, uv_buf_t bufs[], int bufcnt, DWORD* error) { /* We can only write 8k characters at a time. Windows can't handle */ /* much more characters in a single console write anyway. */ WCHAR utf16_buf[8192]; DWORD utf16_buf_used = 0; int i; #define FLUSH_TEXT() \ do { \ if (utf16_buf_used > 0) { \ uv_tty_emit_text(handle, utf16_buf, utf16_buf_used, error); \ utf16_buf_used = 0; \ } \ } while (0) /* Cache for fast access */ unsigned char utf8_bytes_left = handle->utf8_bytes_left; unsigned int utf8_codepoint = handle->utf8_codepoint; unsigned char previous_eol = handle->previous_eol; unsigned char ansi_parser_state = handle->ansi_parser_state; /* Store the error here. If we encounter an error, stop trying to do i/o */ /* but keep parsing the buffer so we leave the parser in a consistent */ /* state. */ *error = ERROR_SUCCESS; EnterCriticalSection(&uv_tty_output_lock); for (i = 0; i < bufcnt; i++) { uv_buf_t buf = bufs[i]; unsigned int j; for (j = 0; j < buf.len; j++) { unsigned char c = buf.base[j]; /* Run the character through the utf8 decoder We happily accept non */ /* shortest form encodings and invalid code points - there's no real */ /* harm that can be done. */ if (utf8_bytes_left == 0) { /* Read utf-8 start byte */ DWORD first_zero_bit; unsigned char not_c = ~c; #ifdef _MSC_VER /* msvc */ if (_BitScanReverse(&first_zero_bit, not_c)) { #else /* assume gcc */ if (first_zero_bit = __builtin_clzl(not_c), c != 0) { #endif if (first_zero_bit == 7) { /* Ascii - pass right through */ utf8_codepoint = (unsigned int) c; } else if (first_zero_bit <= 5) { /* Multibyte sequence */ utf8_codepoint = (0xff >> (8 - first_zero_bit)) & c; utf8_bytes_left = (char) (6 - first_zero_bit); } else { /* Invalid continuation */ utf8_codepoint = UNICODE_REPLACEMENT_CHARACTER; } } else { /* 0xff -- invalid */ utf8_codepoint = UNICODE_REPLACEMENT_CHARACTER; } } else if ((c & 0xc0) == 0x80) {
int perf() { int Error = 0; std::size_t const Count(100000000); { std::vector<int> Result; Result.resize(Count); std::clock_t Begin = clock(); for(int i = 0; i < static_cast<int>(Count); ++i) Result[i] = glm::log2(static_cast<int>(i)); std::clock_t End = clock(); printf("glm::log2<int>: %ld clocks\n", End - Begin); } { std::vector<glm::ivec4> Result; Result.resize(Count); std::clock_t Begin = clock(); for(int i = 0; i < static_cast<int>(Count); ++i) Result[i] = glm::log2(glm::ivec4(i)); std::clock_t End = clock(); printf("glm::log2<ivec4>: %ld clocks\n", End - Begin); } # if GLM_HAS_BITSCAN_WINDOWS { std::vector<glm::ivec4> Result; Result.resize(Count); std::clock_t Begin = clock(); for(std::size_t i = 0; i < Count; ++i) { glm::tvec4<unsigned long, glm::defaultp> Tmp(glm::uninitialize); _BitScanReverse(&Tmp.x, i); _BitScanReverse(&Tmp.y, i); _BitScanReverse(&Tmp.z, i); _BitScanReverse(&Tmp.w, i); Result[i] = glm::ivec4(Tmp); } std::clock_t End = clock(); printf("glm::log2<ivec4> inlined: %ld clocks\n", End - Begin); } { std::vector<glm::tvec4<unsigned long, glm::defaultp> > Result; Result.resize(Count); std::clock_t Begin = clock(); for(std::size_t i = 0; i < Count; ++i) { _BitScanReverse(&Result[i].x, i); _BitScanReverse(&Result[i].y, i); _BitScanReverse(&Result[i].z, i); _BitScanReverse(&Result[i].w, i); } std::clock_t End = clock(); printf("glm::log2<ivec4> inlined no cast: %ld clocks\n", End - Begin); } { std::vector<glm::ivec4> Result; Result.resize(Count); std::clock_t Begin = clock(); for(std::size_t i = 0; i < Count; ++i) { _BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].x), i); _BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].y), i); _BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].z), i); _BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].w), i); } std::clock_t End = clock(); printf("glm::log2<ivec4> reinterpret: %ld clocks\n", End - Begin); } # endif//GLM_HAS_BITSCAN_WINDOWS { std::vector<float> Result; 
Result.resize(Count); std::clock_t Begin = clock(); for(std::size_t i = 0; i < Count; ++i) Result[i] = glm::log2(static_cast<float>(i)); std::clock_t End = clock(); printf("glm::log2<float>: %ld clocks\n", End - Begin); } { std::vector<glm::vec4> Result; Result.resize(Count); std::clock_t Begin = clock(); for(int i = 0; i < static_cast<int>(Count); ++i) Result[i] = glm::log2(glm::vec4(i)); std::clock_t End = clock(); printf("glm::log2<vec4>: %ld clocks\n", End - Begin); } return Error; }
/* ================== ================== */ void pixel_shader( const unsigned __int32 i_buffer, const unsigned __int32 coverage_mask, const __m128i bazza[3][4], shader_input_& shader_input ) { static const __m128 zero = set_zero(); static const __m128 half = set_all(0.5f); static const __m128 one = set_all(1.0f); static const __m128 two = one + one; static const __m128 three = two + one; static const __m128i zero_int = set_zero_si128(); static const __m128 colour_clamp = broadcast(load_s(255.0f)); unsigned __int32 depth_mask = 0x0; __m128 w_screen[2][4]; w_screen[0][0] = convert_float(bazza[0][0]) * shader_input.r_area; w_screen[0][1] = convert_float(bazza[0][1]) * shader_input.r_area; w_screen[0][2] = convert_float(bazza[0][2]) * shader_input.r_area; w_screen[0][3] = convert_float(bazza[0][3]) * shader_input.r_area; w_screen[1][0] = convert_float(bazza[1][0]) * shader_input.r_area; w_screen[1][1] = convert_float(bazza[1][1]) * shader_input.r_area; w_screen[1][2] = convert_float(bazza[1][2]) * shader_input.r_area; w_screen[1][3] = convert_float(bazza[1][3]) * shader_input.r_area; __m128 z_screen[4]; z_screen[0] = (shader_input.z_delta[X] * w_screen[0][0]) + (shader_input.z_delta[Y] * w_screen[1][0]) + shader_input.z_delta[Z]; z_screen[1] = (shader_input.z_delta[X] * w_screen[0][1]) + (shader_input.z_delta[Y] * w_screen[1][1]) + shader_input.z_delta[Z]; z_screen[2] = (shader_input.z_delta[X] * w_screen[0][2]) + (shader_input.z_delta[Y] * w_screen[1][2]) + shader_input.z_delta[Z]; z_screen[3] = (shader_input.z_delta[X] * w_screen[0][3]) + (shader_input.z_delta[Y] * w_screen[1][3]) + shader_input.z_delta[Z]; { //if (shader_input.is_test) { // __m128 x = convert_float(set_all(shader_input.x)); // __m128 y = convert_float(set_all(shader_input.y)); // y += set_all(0.5f); // x += set_all(0.5f); // x += set(0.0f, 1.0f, 2.0f, 3.0f); // __m128 y_block[4]; // y_block[0] = y; // y_block[1] = y + one; // y_block[2] = y + two; // y_block[3] = y + three; // __m128 
z_interpolant[3]; // z_interpolant[X] = set_all(shader_input.depth_interpolants[X]); // z_interpolant[Y] = set_all(shader_input.depth_interpolants[Y]); // z_interpolant[Z] = set_all(shader_input.depth_interpolants[Z]); // z_screen[0] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[0]) + z_interpolant[Z]; // z_screen[1] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[1]) + z_interpolant[Z]; // z_screen[2] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[2]) + z_interpolant[Z]; // z_screen[3] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[3]) + z_interpolant[Z]; //} } __m128i pixel_mask[4]; pixel_mask[0] = load_mask[(coverage_mask >> 0) & 0xf]; pixel_mask[1] = load_mask[(coverage_mask >> 4) & 0xf]; pixel_mask[2] = load_mask[(coverage_mask >> 8) & 0xf]; pixel_mask[3] = load_mask[(coverage_mask >> 12) & 0xf]; __m128 z_buffer[4]; z_buffer[0] = load(shader_input.depth_buffer + i_buffer + 0); z_buffer[1] = load(shader_input.depth_buffer + i_buffer + 4); z_buffer[2] = load(shader_input.depth_buffer + i_buffer + 8); z_buffer[3] = load(shader_input.depth_buffer + i_buffer + 12); __m128i z_mask[4]; z_mask[0] = (z_screen[0] > z_buffer[0]) & pixel_mask[0]; z_mask[1] = (z_screen[1] > z_buffer[1]) & pixel_mask[1]; z_mask[2] = (z_screen[2] > z_buffer[2]) & pixel_mask[2]; z_mask[3] = (z_screen[3] > z_buffer[3]) & pixel_mask[3]; depth_mask |= store_mask(z_mask[0]) << 0; depth_mask |= store_mask(z_mask[1]) << 4; depth_mask |= store_mask(z_mask[2]) << 8; depth_mask |= store_mask(z_mask[3]) << 12; __m128 z_write[4]; z_write[0] = blend(z_screen[0], z_buffer[0], z_mask[0]); z_write[1] = blend(z_screen[1], z_buffer[1], z_mask[1]); z_write[2] = blend(z_screen[2], z_buffer[2], z_mask[2]); z_write[3] = blend(z_screen[3], z_buffer[3], z_mask[3]); { __m128 z_max; z_max = z_write[0]; z_max = min_vec(z_write[1], z_max); z_max = min_vec(z_write[2], z_max); z_max = min_vec(z_write[3], z_max); __m128 z_out = z_max; z_max = rotate_left(z_max); z_out = min_vec(z_max, 
z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); shader_input.z_max = store_s(z_out); } store(z_write[0], shader_input.depth_buffer + i_buffer + 0); store(z_write[1], shader_input.depth_buffer + i_buffer + 4); store(z_write[2], shader_input.depth_buffer + i_buffer + 8); store(z_write[3], shader_input.depth_buffer + i_buffer + 12); if (depth_mask == 0x0) { return; } __m128 screen_barry[2][4]; screen_barry[0][0] = (w_screen[0][0] * shader_input.barycentric[0][X]) + (w_screen[1][0] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[0][1] = (w_screen[0][1] * shader_input.barycentric[0][X]) + (w_screen[1][1] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[0][2] = (w_screen[0][2] * shader_input.barycentric[0][X]) + (w_screen[1][2] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[0][3] = (w_screen[0][3] * shader_input.barycentric[0][X]) + (w_screen[1][3] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[1][0] = (w_screen[0][0] * shader_input.barycentric[1][X]) + (w_screen[1][0] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; screen_barry[1][1] = (w_screen[0][1] * shader_input.barycentric[1][X]) + (w_screen[1][1] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; screen_barry[1][2] = (w_screen[0][2] * shader_input.barycentric[1][X]) + (w_screen[1][2] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; screen_barry[1][3] = (w_screen[0][3] * shader_input.barycentric[1][X]) + (w_screen[1][3] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; __m128 r_depth[4]; r_depth[0] = reciprocal(z_screen[0]); r_depth[1] = reciprocal(z_screen[1]); r_depth[2] = reciprocal(z_screen[2]); r_depth[3] = reciprocal(z_screen[3]); __m128 w_clip[2][4]; w_clip[0][0] = screen_barry[0][0] * r_depth[0]; w_clip[0][1] = 
screen_barry[0][1] * r_depth[1]; w_clip[0][2] = screen_barry[0][2] * r_depth[2]; w_clip[0][3] = screen_barry[0][3] * r_depth[3]; w_clip[1][0] = screen_barry[1][0] * r_depth[0]; w_clip[1][1] = screen_barry[1][1] * r_depth[1]; w_clip[1][2] = screen_barry[1][2] * r_depth[2]; w_clip[1][3] = screen_barry[1][3] * r_depth[3]; __m128i colour_out[4]; { const vertex4_* gradients = shader_input.gradients[ATTRIBUTE_COLOUR]; __m128 red_float[4]; red_float[0] = (gradients[R].x * w_clip[0][0]) + (gradients[R].y * w_clip[1][0]) + gradients[R].z; red_float[1] = (gradients[R].x * w_clip[0][1]) + (gradients[R].y * w_clip[1][1]) + gradients[R].z; red_float[2] = (gradients[R].x * w_clip[0][2]) + (gradients[R].y * w_clip[1][2]) + gradients[R].z; red_float[3] = (gradients[R].x * w_clip[0][3]) + (gradients[R].y * w_clip[1][3]) + gradients[R].z; __m128 green_float[4]; green_float[0] = (gradients[G].x * w_clip[0][0]) + (gradients[G].y * w_clip[1][0]) + gradients[G].z; green_float[1] = (gradients[G].x * w_clip[0][1]) + (gradients[G].y * w_clip[1][1]) + gradients[G].z; green_float[2] = (gradients[G].x * w_clip[0][2]) + (gradients[G].y * w_clip[1][2]) + gradients[G].z; green_float[3] = (gradients[G].x * w_clip[0][3]) + (gradients[G].y * w_clip[1][3]) + gradients[G].z; __m128 blue_float[4]; blue_float[0] = (gradients[B].x * w_clip[0][0]) + (gradients[B].y * w_clip[1][0]) + gradients[B].z; blue_float[1] = (gradients[B].x * w_clip[0][1]) + (gradients[B].y * w_clip[1][1]) + gradients[B].z; blue_float[2] = (gradients[B].x * w_clip[0][2]) + (gradients[B].y * w_clip[1][2]) + gradients[B].z; blue_float[3] = (gradients[B].x * w_clip[0][3]) + (gradients[B].y * w_clip[1][3]) + gradients[B].z; red_float[0] = min_vec(max_vec(red_float[0], zero), colour_clamp); red_float[1] = min_vec(max_vec(red_float[1], zero), colour_clamp); red_float[2] = min_vec(max_vec(red_float[2], zero), colour_clamp); red_float[3] = min_vec(max_vec(red_float[3], zero), colour_clamp); green_float[0] = min_vec(max_vec(green_float[0], 
zero), colour_clamp); green_float[1] = min_vec(max_vec(green_float[1], zero), colour_clamp); green_float[2] = min_vec(max_vec(green_float[2], zero), colour_clamp); green_float[3] = min_vec(max_vec(green_float[3], zero), colour_clamp); blue_float[0] = min_vec(max_vec(blue_float[0], zero), colour_clamp); blue_float[1] = min_vec(max_vec(blue_float[1], zero), colour_clamp); blue_float[2] = min_vec(max_vec(blue_float[2], zero), colour_clamp); blue_float[3] = min_vec(max_vec(blue_float[3], zero), colour_clamp); __m128i red_int[4]; red_int[0] = convert_int_trunc(red_float[0]); red_int[1] = convert_int_trunc(red_float[1]); red_int[2] = convert_int_trunc(red_float[2]); red_int[3] = convert_int_trunc(red_float[3]); __m128i green_int[4]; green_int[0] = convert_int_trunc(green_float[0]); green_int[1] = convert_int_trunc(green_float[1]); green_int[2] = convert_int_trunc(green_float[2]); green_int[3] = convert_int_trunc(green_float[3]); __m128i blue_int[4]; blue_int[0] = convert_int_trunc(blue_float[0]); blue_int[1] = convert_int_trunc(blue_float[1]); blue_int[2] = convert_int_trunc(blue_float[2]); blue_int[3] = convert_int_trunc(blue_float[3]); colour_out[0] = red_int[0] | (green_int[0] << 8) | (blue_int[0] << 16); colour_out[1] = red_int[1] | (green_int[1] << 8) | (blue_int[1] << 16); colour_out[2] = red_int[2] | (green_int[2] << 8) | (blue_int[2] << 16); colour_out[3] = red_int[3] | (green_int[3] << 8) | (blue_int[3] << 16); } float4_ u_table[4]; float4_ v_table[4]; { const vertex4_* gradients = shader_input.gradients[ATTRIBUTE_TEXCOORD]; __m128 u_axis[4]; u_axis[0] = (gradients[U].x * w_clip[0][0]) + (gradients[U].y * w_clip[1][0]) + gradients[U].z; u_axis[1] = (gradients[U].x * w_clip[0][1]) + (gradients[U].y * w_clip[1][1]) + gradients[U].z; u_axis[2] = (gradients[U].x * w_clip[0][2]) + (gradients[U].y * w_clip[1][2]) + gradients[U].z; u_axis[3] = (gradients[U].x * w_clip[0][3]) + (gradients[U].y * w_clip[1][3]) + gradients[U].z; __m128 v_axis[4]; v_axis[0] = 
(gradients[V].x * w_clip[0][0]) + (gradients[V].y * w_clip[1][0]) + gradients[V].z; v_axis[1] = (gradients[V].x * w_clip[0][1]) + (gradients[V].y * w_clip[1][1]) + gradients[V].z; v_axis[2] = (gradients[V].x * w_clip[0][2]) + (gradients[V].y * w_clip[1][2]) + gradients[V].z; v_axis[3] = (gradients[V].x * w_clip[0][3]) + (gradients[V].y * w_clip[1][3]) + gradients[V].z; store_u(u_axis[0], u_table[0].f); store_u(u_axis[1], u_table[1].f); store_u(u_axis[2], u_table[2].f); store_u(u_axis[3], u_table[3].f); store_u(v_axis[0], v_table[0].f); store_u(v_axis[1], v_table[1].f); store_u(v_axis[2], v_table[2].f); store_u(v_axis[3], v_table[3].f); } const texture_handler_& texture_handler = *shader_input.texture_handler; float2_ du; du.x = (u_table[0].f[3] - u_table[0].f[0]) * (float)texture_handler.width; du.y = (u_table[3].f[0] - u_table[0].f[0]) * (float)texture_handler.width; float2_ dv; dv.x = (v_table[0].f[3] - v_table[0].f[0]) * (float)texture_handler.height; dv.y = (v_table[3].f[0] - v_table[0].f[0]) * (float)texture_handler.height; float area = abs((du.x * dv.y) - (du.y * dv.x)) * shader_input.mip_level_bias; unsigned long area_int = 1 + (unsigned long)(area + 0.5f); __int32 i_mip_floor; _BitScanReverse((unsigned long*)&i_mip_floor, area_int); i_mip_floor = max(i_mip_floor, 0); i_mip_floor = min(i_mip_floor, texture_handler.n_mip_levels - 1); const __int32 width = texture_handler.width >> i_mip_floor; const __int32 height = texture_handler.height >> i_mip_floor; const __int32 shift = texture_handler.width_shift - i_mip_floor; const __m128i texture_width_int = set_all(width); const __m128 texture_width = convert_float(set_all(width)); const __m128 texture_height = convert_float(set_all(height)); const __m128i width_clamp = set_all(width - 1); const __m128i height_clamp = set_all(height - 1); const __m128i width_shift = load_s(shift); __m128i tex_out[4]; { __m128 u_axis[4]; u_axis[0] = (load_u(u_table[0].f) * texture_width); // - half; u_axis[1] = (load_u(u_table[1].f) 
* texture_width); // - half; u_axis[2] = (load_u(u_table[2].f) * texture_width); // - half; u_axis[3] = (load_u(u_table[3].f) * texture_width); // - half; __m128 v_axis[4]; v_axis[0] = (load_u(v_table[0].f) * texture_height); // - half; v_axis[1] = (load_u(v_table[1].f) * texture_height); // - half; v_axis[2] = (load_u(v_table[2].f) * texture_height); // - half; v_axis[3] = (load_u(v_table[3].f) * texture_height); // - half; __m128i u_int[4]; u_int[0] = convert_int_trunc(u_axis[0]); u_int[1] = convert_int_trunc(u_axis[1]); u_int[2] = convert_int_trunc(u_axis[2]); u_int[3] = convert_int_trunc(u_axis[3]); __m128i v_int[4]; v_int[0] = convert_int_trunc(v_axis[0]); v_int[1] = convert_int_trunc(v_axis[1]); v_int[2] = convert_int_trunc(v_axis[2]); v_int[3] = convert_int_trunc(v_axis[3]); u_int[0] = max_vec(min_vec(u_int[0], width_clamp), zero_int); u_int[1] = max_vec(min_vec(u_int[1], width_clamp), zero_int); u_int[2] = max_vec(min_vec(u_int[2], width_clamp), zero_int); u_int[3] = max_vec(min_vec(u_int[3], width_clamp), zero_int); v_int[0] = max_vec(min_vec(v_int[0], height_clamp), zero_int); v_int[1] = max_vec(min_vec(v_int[1], height_clamp), zero_int); v_int[2] = max_vec(min_vec(v_int[2], height_clamp), zero_int); v_int[3] = max_vec(min_vec(v_int[3], height_clamp), zero_int); __m128i i_texels[4]; i_texels[0] = u_int[0] + (v_int[0] * texture_width_int); i_texels[1] = u_int[1] + (v_int[1] * texture_width_int); i_texels[2] = u_int[2] + (v_int[2] * texture_width_int); i_texels[3] = u_int[3] + (v_int[3] * texture_width_int); __int32 i_texels_in[4][4]; store_u(i_texels[0], i_texels_in[0]); store_u(i_texels[1], i_texels_in[1]); store_u(i_texels[2], i_texels_in[2]); store_u(i_texels[3], i_texels_in[3]); unsigned __int32 texels_out[4][4]; texels_out[0][0] = texture_handler.texture[i_mip_floor][i_texels_in[0][0]]; texels_out[0][1] = texture_handler.texture[i_mip_floor][i_texels_in[0][1]]; texels_out[0][2] = texture_handler.texture[i_mip_floor][i_texels_in[0][2]]; 
texels_out[0][3] = texture_handler.texture[i_mip_floor][i_texels_in[0][3]]; texels_out[1][0] = texture_handler.texture[i_mip_floor][i_texels_in[1][0]]; texels_out[1][1] = texture_handler.texture[i_mip_floor][i_texels_in[1][1]]; texels_out[1][2] = texture_handler.texture[i_mip_floor][i_texels_in[1][2]]; texels_out[1][3] = texture_handler.texture[i_mip_floor][i_texels_in[1][3]]; texels_out[2][0] = texture_handler.texture[i_mip_floor][i_texels_in[2][0]]; texels_out[2][1] = texture_handler.texture[i_mip_floor][i_texels_in[2][1]]; texels_out[2][2] = texture_handler.texture[i_mip_floor][i_texels_in[2][2]]; texels_out[2][3] = texture_handler.texture[i_mip_floor][i_texels_in[2][3]]; texels_out[3][0] = texture_handler.texture[i_mip_floor][i_texels_in[3][0]]; texels_out[3][1] = texture_handler.texture[i_mip_floor][i_texels_in[3][1]]; texels_out[3][2] = texture_handler.texture[i_mip_floor][i_texels_in[3][2]]; texels_out[3][3] = texture_handler.texture[i_mip_floor][i_texels_in[3][3]]; tex_out[0] = load_u(texels_out[0]); tex_out[1] = load_u(texels_out[1]); tex_out[2] = load_u(texels_out[2]); tex_out[3] = load_u(texels_out[3]); } __m128i colour_buffer[4]; colour_buffer[0] = load(shader_input.colour_buffer + i_buffer + 0); colour_buffer[1] = load(shader_input.colour_buffer + i_buffer + 4); colour_buffer[2] = load(shader_input.colour_buffer + i_buffer + 8); colour_buffer[3] = load(shader_input.colour_buffer + i_buffer + 12); colour_buffer[0] = _mm_andnot_si128(z_mask[0], colour_buffer[0]); colour_buffer[1] = _mm_andnot_si128(z_mask[1], colour_buffer[1]); colour_buffer[2] = _mm_andnot_si128(z_mask[2], colour_buffer[2]); colour_buffer[3] = _mm_andnot_si128(z_mask[3], colour_buffer[3]); colour_buffer[0] = add_uint8_saturate(colour_buffer[0], colour_out[0] & z_mask[0]); colour_buffer[1] = add_uint8_saturate(colour_buffer[1], colour_out[1] & z_mask[1]); colour_buffer[2] = add_uint8_saturate(colour_buffer[2], colour_out[2] & z_mask[2]); colour_buffer[3] = 
add_uint8_saturate(colour_buffer[3], colour_out[3] & z_mask[3]); colour_buffer[0] = add_uint8_saturate(colour_buffer[0], tex_out[0] & z_mask[0]); colour_buffer[1] = add_uint8_saturate(colour_buffer[1], tex_out[1] & z_mask[1]); colour_buffer[2] = add_uint8_saturate(colour_buffer[2], tex_out[2] & z_mask[2]); colour_buffer[3] = add_uint8_saturate(colour_buffer[3], tex_out[3] & z_mask[3]); store(colour_buffer[0], shader_input.colour_buffer + i_buffer + 0); store(colour_buffer[1], shader_input.colour_buffer + i_buffer + 4); store(colour_buffer[2], shader_input.colour_buffer + i_buffer + 8); store(colour_buffer[3], shader_input.colour_buffer + i_buffer + 12); }