/* ================== ================== */ void Process_Fragment_4x4( __int32 w_seed[2], __int32 i_tile_in, __int32 i_buffer_in, const unsigned __int32 coverage_mask, raster_output_& raster_output, shader_input_& shader_input ) { const __int32 i_buffer = i_buffer_in + (i_tile_in * 4 * 4); __m128i bazza[3][4]; for (__int32 i_edge = 0; i_edge < 2; i_edge++) { __m128i w_row = set_all(w_seed[i_edge]); bazza[i_edge][0] = w_row + load_u(raster_output.reject_table[0][i_edge][0]); bazza[i_edge][1] = w_row + load_u(raster_output.reject_table[0][i_edge][1]); bazza[i_edge][2] = w_row + load_u(raster_output.reject_table[0][i_edge][2]); bazza[i_edge][3] = w_row + load_u(raster_output.reject_table[0][i_edge][3]); } pixel_shader(i_buffer, coverage_mask, bazza, shader_input); const __int32 i_buffer_depth_4x4 = i_buffer / (4 * 4); const __int32 i_buffer_depth_16x16 = i_buffer / (16 * 16); const __int32 i_buffer_depth_64x64 = i_buffer / (64 * 64); shader_input.depth_tiles_4x4[i_buffer_depth_4x4] = shader_input.z_max; shader_input.tile_mask_16x16 |= one_bit_64 << i_buffer_depth_16x16; shader_input.tile_mask_64x64 |= one_bit_64 << i_buffer_depth_64x64; }
void ShaderLoad::LoadData(const uint8_t* data, size_t size, YCommon::YContainers::MemBuffer* buffer) { YASSERT(YEngineData::VerifyShaderBuffer(flatbuffers::Verifier(data, size)), "Invalid Shader Data."); shader_ids = nullptr; num_shader_ids = 0; const YEngineData::Shader* shader_data = YEngineData::GetShader(data); const char* shader_name = shader_data->name()->c_str(); const auto variants = shader_data->variants(); num_shader_ids = variants->size(); shader_ids = static_cast<YRenderer::ShaderID*>( buffer->Allocate(sizeof(*shader_ids) * num_shader_ids)); YASSERT(shader_ids, "Out of memory - could not load shader data."); YRenderer::ShaderID* shader_id_iter = shader_ids; for (auto variant_iter = variants->begin(); variant_iter != variants->end(); ++variant_iter) { const char* variant_name = variant_iter->name()->c_str(); *shader_id_iter++ = YRenderer::Renderer::CreateShader( shader_name, variant_name, variant_iter->vertex_shader()->Data(), variant_iter->vertex_shader()->size(), variant_iter->pixel_shader()->Data(), variant_iter->pixel_shader()->size()); } }
/* ================== ================== */ void Process_Fragments( raster_output_& raster_output, shader_input_& shader_input ) { const __m128 zero = set_all(0.0f); shader_input.tile_mask_16x16 = 0x0; shader_input.tile_mask_64x64 = 0x0; //=============================================================================================== { const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_64x64]; for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) { raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_64x64][i_fragment]; const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16; const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff; Process_Fragment_64x64( raster_fragment.w, i_buffer, coverage_mask, raster_output, shader_input ); } } //=============================================================================================== { const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_16x16]; for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) { raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_16x16][i_fragment]; const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16; const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff; Process_Fragment_16x16( raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input ); } } //=============================================================================================== { const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_4x4]; for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) { raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_4x4][i_fragment]; const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16; const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff; Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input); } } //=============================================================================================== { //const __int32 start = raster_output_::MAX_FRAGMENTS - 1; //const __int32 end = raster_output.n_fragments[raster_output_::PARTIAL_ACCEPT_4x4]; //for (__int32 i_fragment = start; i_fragment > end; i_fragment--) { // raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::PARTIAL_ACCEPT_4x4][i_fragment]; // const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16; // const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff; // Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input); //} } //=============================================================================================== { const __int32 n_fragments = raster_output.n_fragments_COMPLETE; __int32 n_depth_fragments = 0; for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) { raster_fragment_complete_& raster_fragment = raster_output.raster_fragment_complete[i_fragment]; const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16; const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff; pixel_shader(i_buffer, coverage_mask, raster_fragment.bazza, shader_input); const __int32 i_buffer_depth_4x4 = i_buffer / (4 * 4); const __int32 i_buffer_depth_16x16 = i_buffer / (16 * 16); const __int32 i_buffer_depth_64x64 = i_buffer / (64 * 64); shader_input.depth_tiles_4x4[i_buffer_depth_4x4] = shader_input.z_max; shader_input.tile_mask_16x16 |= one_bit_64 << i_buffer_depth_16x16; shader_input.tile_mask_64x64 |= one_bit_64 << i_buffer_depth_64x64; } } //=============================================================================================== { //printf_s(" %llu ", shader_input.tile_mask_16x16); __int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_16x16); for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) { unsigned long i_tile_16x16; _BitScanForward64(&i_tile_16x16, shader_input.tile_mask_16x16); shader_input.tile_mask_16x16 ^= one_bit_64 << i_tile_16x16; const __int32 i_tile_4x4 = i_tile_16x16 * (4 * 4); __m128 depth_4x4[4]; depth_4x4[0] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (0 * 4)); depth_4x4[1] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (1 * 4)); depth_4x4[2] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (2 * 4)); depth_4x4[3] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (3 * 4)); __m128 z_max; z_max = depth_4x4[0]; z_max = min_vec(depth_4x4[1], z_max); z_max = min_vec(depth_4x4[2], z_max); z_max = min_vec(depth_4x4[3], z_max); __m128 z_out = z_max; z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); shader_input.depth_tiles_16x16[i_tile_16x16] = store_s(z_out); } } { __int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_64x64); //printf_s(" %llu ", n_tiles); for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) { unsigned long i_tile_64x64; _BitScanForward64(&i_tile_64x64, shader_input.tile_mask_64x64); shader_input.tile_mask_64x64 ^= one_bit_64 << i_tile_64x64; const __int32 i_tile_16x16 = i_tile_64x64 * (4 * 4); __m128 depth_16x16[4]; depth_16x16[0] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (0 * 4)); depth_16x16[1] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (1 * 4)); depth_16x16[2] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (2 * 4)); depth_16x16[3] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (3 * 4)); __m128 z_max; z_max = depth_16x16[0]; z_max = min_vec(depth_16x16[1], z_max); z_max = min_vec(depth_16x16[2], z_max); z_max = min_vec(depth_16x16[3], z_max); __m128 z_out = z_max; z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); shader_input.depth_tiles_64x64[i_tile_64x64] = store_s(z_out); } } }