/*
 * Count Leading Zeros (16-bit).
 *
 * x: value whose leading zero bits are counted.
 * Returns the count as a uint16_t.
 *
 * The _uint16_* and _uint32_* helpers are defined elsewhere; the step comments
 * below assume each helper performs the operation its name indicates
 * (srl = shift right logical, srlm = shift right then mask,
 * addm = add then mask) -- confirm against their definitions.
 */
static __inline uint16_t _uint16_cntlz( uint16_t x )
{
#ifdef __GNUC__
    /* Count on the value widened to 32 bits, then remove the 16 extra
     * leading zeros introduced by the widening.
     * NOTE(review): for x == 0 the result depends on what _uint32_cntlz
     * returns for a zero argument -- confirm. */
    const uint16_t wide_count = (uint16_t)_uint32_cntlz( (uint32_t)x );
    const uint32_t count      = _uint32_sub( wide_count, 16 );

    return (count);
#else
    /* Propagate the highest set bit into every lower bit position. */
    const uint16_t fill1 = _uint16_or( x,     _uint16_srl( x,     1 ) );
    const uint16_t fill2 = _uint16_or( fill1, _uint16_srl( fill1, 2 ) );
    const uint16_t fill4 = _uint16_or( fill2, _uint16_srl( fill2, 4 ) );
    const uint16_t fill8 = _uint16_or( fill4, _uint16_srl( fill4, 8 ) );

    /* After inversion only the former leading-zero positions are set. */
    const uint16_t lead = _uint16_not( fill8 );

    /* Sum the set bits in progressively wider fields (2, 4, 8, 16 bits). */
    const uint16_t sum2  = _uint16_sub( lead, _uint16_srlm( lead, 1, 0x5555 ) );
    const uint16_t sum4  = _uint16_add( _uint16_and( sum2, 0x3333 ),
                                        _uint16_srlm( sum2, 2, 0x3333 ) );
    const uint16_t sum8  = _uint16_addm( sum4, _uint16_srl( sum4, 4 ), 0x0f0f );
    const uint16_t sum16 = _uint16_addm( sum8, _uint16_srl( sum8, 8 ), 0x001f );

    return ( sum16 );
#endif
}
// Count Leading Zeros static inline uint16 _uint16_cntlz( uint16 x ) { #ifdef __GNUC__ /* On PowerPC, this will map to insn: cntlzw */ /* On Pentium, this will map to insn: clz */ uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x ); uint32 nlz = _uint32_sub( nlz32, 16 ); return (nlz); #elif _NV_OS_XBOX_ uint16 nlz32 = (uint16)_CountLeadingZeros( (uint32)x ); return _uint32_sub( nlz32, 16); #else const uint16 x0 = _uint16_srl( x, 1 ); const uint16 x1 = _uint16_or( x, x0 ); const uint16 x2 = _uint16_srl( x1, 2 ); const uint16 x3 = _uint16_or( x1, x2 ); const uint16 x4 = _uint16_srl( x3, 4 ); const uint16 x5 = _uint16_or( x3, x4 ); const uint16 x6 = _uint16_srl( x5, 8 ); const uint16 x7 = _uint16_or( x5, x6 ); const uint16 x8 = _uint16_not( x7 ); const uint16 x9 = _uint16_srlm( x8, 1, 0x5555 ); const uint16 xA = _uint16_sub( x8, x9 ); const uint16 xB = _uint16_and( xA, 0x3333 ); const uint16 xC = _uint16_srlm( xA, 2, 0x3333 ); const uint16 xD = _uint16_add( xB, xC ); const uint16 xE = _uint16_srl( xD, 4 ); const uint16 xF = _uint16_addm( xD, xE, 0x0f0f ); const uint16 x10 = _uint16_srl( xF, 8 ); const uint16 x11 = _uint16_addm( xF, x10, 0x001f ); return ( x11 ); #endif }
/*
 * Convert a 16-bit half-precision bit pattern to a 32-bit single-precision
 * bit pattern, branch-free.
 *
 * h: half-precision value as raw bits (sign 0x8000, exponent 0x7c00,
 *    mantissa 0x03ff).
 * Returns the single-precision value as raw bits.
 *
 * The _uint32_* helpers are defined elsewhere; the comments assume each one
 * performs the operation its name indicates (sub(a,b) = a - b, dec = minus
 * one, neg = two's-complement negate, andc(a,b) = a & ~b, ext = replicate
 * the sign bit, muxs(t,a,b) = a when the sign bit of t is set, else b) --
 * confirm against their definitions.
 *
 * Fixes relative to the previous revision:
 *  - The "exponent is all ones" flag was computed as
 *    h_e_mask - (h_e - 1), which is positive for every possible h_e, so the
 *    flag's sign bit was never set and Inf/NaN inputs were converted as if
 *    they were ordinary finite numbers.  It is now h_e_mask_minus_one - h_e,
 *    whose sign bit is set exactly when h_e == h_e_mask.
 *  - The mantissa is always carried into the flagged result, so a NaN input
 *    keeps a non-zero mantissa instead of collapsing to Inf.  (For Inf the
 *    mantissa is already zero, so the result is the bare exponent mask.)
 */
static inline uint32_t _half_to_float( uint16_t h )
{
    const uint32_t h_e_mask             = _uint32_li( 0x00007c00 );
    const uint32_t h_e_mask_minus_one   = _uint32_li( 0x00007bff );
    const uint32_t h_m_mask             = _uint32_li( 0x000003ff );
    const uint32_t h_s_mask             = _uint32_li( 0x00008000 );
    const uint32_t h_f_s_pos_offset     = _uint32_li( 0x00000010 );
    const uint32_t h_f_e_pos_offset     = _uint32_li( 0x0000000d );
    const uint32_t h_f_bias_offset      = _uint32_li( 0x0001c000 );
    const uint32_t f_e_mask             = _uint32_li( 0x7f800000 );
    const uint32_t f_m_mask             = _uint32_li( 0x007fffff );
    const uint32_t h_f_e_denorm_bias    = _uint32_li( 0x0000007e );
    const uint32_t h_f_m_denorm_sa_bias = _uint32_li( 0x00000008 );
    const uint32_t f_e_pos              = _uint32_li( 0x00000017 );

    /* Split the input into its exponent, mantissa and sign fields. */
    const uint32_t h_e                  = _uint32_and( h, h_e_mask );
    const uint32_t h_m                  = _uint32_and( h, h_m_mask );
    const uint32_t h_s                  = _uint32_and( h, h_s_mask );

    /* Normal numbers: re-bias the exponent and move each field into place. */
    const uint32_t h_e_f_bias           = _uint32_add( h_e, h_f_bias_offset );
    /* NOTE(review): h_m may be zero here; the count for a zero argument
     * depends on _uint32_cntlz -- confirm it is well defined. */
    const uint32_t h_m_nlz              = _uint32_cntlz( h_m );
    const uint32_t f_s                  = _uint32_sll( h_s, h_f_s_pos_offset );
    const uint32_t f_e                  = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
    const uint32_t f_m                  = _uint32_sll( h_m, h_f_e_pos_offset );

    /* Denormals: normalise the mantissa using its leading-zero count and
     * derive the matching exponent from the shift amount. */
    const uint32_t h_f_m_sa             = _uint32_sub( h_m_nlz, h_f_m_denorm_sa_bias );
    const uint32_t f_e_denorm_unpacked  = _uint32_sub( h_f_e_denorm_bias, h_f_m_sa );
    const uint32_t h_f_m                = _uint32_sll( h_m, h_f_m_sa );
    const uint32_t f_m_denorm           = _uint32_and( h_f_m, f_m_mask );
    const uint32_t f_e_denorm           = _uint32_sll( f_e_denorm_unpacked, f_e_pos );
    const uint32_t f_em_denorm          = _uint32_or( f_e_denorm, f_m_denorm );

    /* Classification flags, each carried in the sign bit. */
    const uint32_t is_e_eqz_msb         = _uint32_dec( h_e );   /* exponent == 0 */
    const uint32_t is_m_nez_msb         = _uint32_neg( h_m );   /* mantissa != 0 */
    const uint32_t is_e_flagged_msb     = _uint32_sub( h_e_mask_minus_one, h_e ); /* exponent all ones */
    const uint32_t is_denorm_msb        = _uint32_and( is_e_eqz_msb, is_m_nez_msb );
    const uint32_t is_zero_msb          = _uint32_andc( is_e_eqz_msb, is_m_nez_msb );
    const uint32_t is_zero              = _uint32_ext( is_zero_msb );

    /* Inf/NaN take the all-ones float exponent; everything else keeps the
     * re-biased one.  The mantissa is OR-ed in for both. */
    const uint32_t f_e_flagged_result   = _uint32_muxs( is_e_flagged_msb, f_e_mask, f_e );
    const uint32_t f_em_result_partial  = _uint32_or( f_e_flagged_result, f_m );

    /* Denormal inputs override the normal result; zero clears it. */
    const uint32_t f_em_denorm_result   = _uint32_muxs( is_denorm_msb, f_em_denorm, f_em_result_partial );
    const uint32_t f_em_result          = _uint32_andc( f_em_denorm_result, is_zero );
    const uint32_t f_result             = _uint32_or( f_s, f_em_result );

    return (f_result);
}
/*
 * Convert a 16-bit half-precision bit pattern to a 32-bit single-precision
 * bit pattern without branching.
 *
 * h: half-precision value as raw bits (sign 0x8000, exponent 0x7c00,
 *    mantissa 0x03ff).
 * Returns the single-precision value as raw bits.
 *
 * Candidate results are built for the normal, denormal, infinity and NaN
 * cases and the right one is chosen with sign-bit selects.  The _uint32_*
 * helpers are defined elsewhere; the comments assume each one performs the
 * operation its name indicates (sub(a,b) = a - b, dec = minus one,
 * neg = two's-complement negate, andc(a,b) = a & ~b, ext = replicate the
 * sign bit, sels(t,a,b) = a when the sign bit of t is set, else b) --
 * confirm against their definitions.
 */
uint32_t half_to_float( uint16_t h )
{
    /* Field masks for the 16-bit input and the 32-bit output. */
    const uint32_t half_sign_mask     = _uint32_li( 0x00008000 );
    const uint32_t half_exp_mask      = _uint32_li( 0x00007c00 );
    const uint32_t half_exp_below_max = _uint32_li( 0x00007bff );
    const uint32_t half_man_mask      = _uint32_li( 0x000003ff );
    const uint32_t float_exp_mask     = _uint32_li( 0x7f800000 );
    const uint32_t float_man_mask     = _uint32_li( 0x007fffff );

    /* Shift amounts and exponent bias adjustments. */
    const uint32_t sign_shift         = _uint32_li( 0x00000010 );
    const uint32_t field_shift        = _uint32_li( 0x0000000d );
    const uint32_t float_exp_shift    = _uint32_li( 0x00000017 );
    const uint32_t rebias_offset      = _uint32_li( 0x0001c000 );
    const uint32_t denorm_exp_base    = _uint32_li( 0x0000007e );
    const uint32_t denorm_shift_bias  = _uint32_li( 0x00000008 );

    /* Split the input into its three fields (still at half positions). */
    const uint32_t exp_bits  = _uint32_and( h, half_exp_mask );
    const uint32_t man_bits  = _uint32_and( h, half_man_mask );
    const uint32_t sign_bits = _uint32_and( h, half_sign_mask );

    /* Normal case: re-bias the exponent and move every field into place. */
    const uint32_t out_sign    = _uint32_sll( sign_bits, sign_shift );
    const uint32_t out_exp     = _uint32_sll( _uint32_add( exp_bits, rebias_offset ), field_shift );
    const uint32_t out_man     = _uint32_sll( man_bits, field_shift );
    const uint32_t normal_bits = _uint32_or( out_exp, out_man );

    /* Denormal case: normalise the mantissa via its leading-zero count and
     * derive the exponent from the shift that was needed.
     * NOTE(review): man_bits may be zero; the count for a zero argument
     * depends on _uint32_cntlz -- confirm it is well defined. */
    const uint32_t man_lz      = _uint32_cntlz( man_bits );
    const uint32_t norm_shift  = _uint32_sub( man_lz, denorm_shift_bias );
    const uint32_t denorm_man  = _uint32_and( _uint32_sll( man_bits, norm_shift ), float_man_mask );
    const uint32_t denorm_exp  = _uint32_sll( _uint32_sub( denorm_exp_base, norm_shift ), float_exp_shift );
    const uint32_t denorm_bits = _uint32_or( denorm_exp, denorm_man );

    /* NaN case: all-ones exponent with the input mantissa carried over. */
    const uint32_t nan_bits = _uint32_or( float_exp_mask, out_man );

    /* Classification flags, each carried in the sign bit. */
    const uint32_t exp_zero_msb = _uint32_dec( exp_bits );                     /* exponent == 0    */
    const uint32_t man_set_msb  = _uint32_neg( man_bits );                     /* mantissa != 0    */
    const uint32_t exp_max_msb  = _uint32_sub( half_exp_below_max, exp_bits ); /* exponent all 1s  */

    const uint32_t zero_msb   = _uint32_andc( exp_zero_msb, man_set_msb );
    const uint32_t inf_msb    = _uint32_andc( exp_max_msb,  man_set_msb );
    const uint32_t denorm_msb = _uint32_and(  man_set_msb,  exp_zero_msb );
    const uint32_t nan_msb    = _uint32_and(  exp_max_msb,  man_set_msb );
    const uint32_t zero_mask  = _uint32_ext( zero_msb );

    /* Start from the normal result (cleared for zero) and let each special
     * case override it in turn. */
    uint32_t magnitude = _uint32_andc( normal_bits, zero_mask );
    magnitude = _uint32_sels( denorm_msb, denorm_bits,    magnitude );
    magnitude = _uint32_sels( inf_msb,    float_exp_mask, magnitude );
    magnitude = _uint32_sels( nan_msb,    nan_bits,       magnitude );

    return _uint32_or( out_sign, magnitude );
}