/*
 * _uint16_cntlz - count leading zero bits of a 16-bit value.
 *
 * GCC-compatible builds widen the argument to 32 bits, ask _uint32_cntlz
 * for the count and subtract the 16 bits added by widening.  Other
 * compilers use a branch-free sequence built from the _uint16_* helpers.
 *
 * NOTE(review): the phase comments in the fallback assume the _uint16_*
 * helpers do what their names suggest (srl = shift right logical,
 * srlm = shift right then mask, addm = add then mask) -- confirm against
 * their definitions.
 */
static __inline uint16_t _uint16_cntlz( uint16_t x )
{
#ifdef __GNUC__
  const uint16_t wide_count   = (uint16_t)_uint32_cntlz( (uint32_t)x );
  const uint32_t narrow_count = _uint32_sub( wide_count, 16 );

  return narrow_count;
#else
  /* Phase 1: OR the value with shifted copies of itself (1, 2, 4, 8). */
  const uint16_t shifted1 = _uint16_srl( x, 1 );
  const uint16_t filled2  = _uint16_or( x, shifted1 );
  const uint16_t shifted2 = _uint16_srl( filled2, 2 );
  const uint16_t filled4  = _uint16_or( filled2, shifted2 );
  const uint16_t shifted4 = _uint16_srl( filled4, 4 );
  const uint16_t filled8  = _uint16_or( filled4, shifted4 );
  const uint16_t shifted8 = _uint16_srl( filled8, 8 );
  const uint16_t filled16 = _uint16_or( filled8, shifted8 );

  /* Phase 2: complement, leaving bits set only above the highest set bit. */
  const uint16_t lead_bits = _uint16_not( filled16 );

  /* Phase 3: add up the set bits in 2-, 4-, 8- and 16-bit groups. */
  const uint16_t odd_bits   = _uint16_srlm( lead_bits, 1, 0x5555 );
  const uint16_t pair_sums  = _uint16_sub( lead_bits, odd_bits );
  const uint16_t pairs_low  = _uint16_and( pair_sums, 0x3333 );
  const uint16_t pairs_high = _uint16_srlm( pair_sums, 2, 0x3333 );
  const uint16_t nib_sums   = _uint16_add( pairs_low, pairs_high );
  const uint16_t nib_high   = _uint16_srl( nib_sums, 4 );
  const uint16_t byte_sums  = _uint16_addm( nib_sums, nib_high, 0x0f0f );
  const uint16_t byte_high  = _uint16_srl( byte_sums, 8 );
  const uint16_t total      = _uint16_addm( byte_sums, byte_high, 0x001f );

  return total;
#endif
}
/*
 * _uint16_cntlz - count leading zero bits of a 16-bit value.
 *
 * x: value to inspect.
 * Returns the number of zero bits above the highest set bit, in the
 * range 0..16; an all-zero input yields 16 on every code path.
 *
 * GCC-compatible compilers use __builtin_clz, which typically compiles to
 * a single instruction (e.g. cntlzw on PowerPC, bsr/lzcnt on x86).  Other
 * compilers use a branch-free sequence built from the _uint16_* helpers.
 */
static inline uint16_t _uint16_cntlz( uint16_t x )
{
#ifdef __GNUC__
  /*
   * Move x into the top half of a 32-bit word so the 32-bit count equals
   * the 16-bit count.  __builtin_clz(0) is undefined, so a sentinel bit is
   * planted directly below the shifted value: it never affects a non-zero
   * x and makes x == 0 produce exactly 16, matching the fallback below.
   */
  const uint32_t x32 = ( (uint32_t)x << 16 ) | (uint32_t)0x8000u;
  const uint16_t nlz = (uint16_t)__builtin_clz( (unsigned int)x32 );

  return (nlz);
#else
  /* OR the value with shifted copies of itself (1, 2, 4, 8). */
  const uint16_t x0  = _uint16_srl(  x,  1 );
  const uint16_t x1  = _uint16_or(   x,  x0 );
  const uint16_t x2  = _uint16_srl(  x1, 2 );
  const uint16_t x3  = _uint16_or(   x1, x2 );
  const uint16_t x4  = _uint16_srl(  x3, 4 );
  const uint16_t x5  = _uint16_or(   x3, x4 );
  const uint16_t x6  = _uint16_srl(  x5, 8 );
  const uint16_t x7  = _uint16_or(   x5, x6 );
  /* Complement, then sum the remaining set bits group by group. */
  const uint16_t x8  = _uint16_not(  x7 );
  const uint16_t x9  = _uint16_srlm( x8, 1, 0x5555 );
  const uint16_t xA  = _uint16_sub(  x8, x9 );
  const uint16_t xB  = _uint16_and(  xA, 0x3333 );
  const uint16_t xC  = _uint16_srlm( xA, 2, 0x3333 );
  const uint16_t xD  = _uint16_add(  xB, xC );
  const uint16_t xE  = _uint16_srl(  xD, 4 );
  const uint16_t xF  = _uint16_addm( xD, xE, 0x0f0f );
  const uint16_t x10 = _uint16_srl(  xF, 8 );
  const uint16_t x11 = _uint16_addm( xF, x10, 0x001f );

  return ( x11 );
#endif
}
// Count Leading Zeros static inline uint16 _uint16_cntlz( uint16 x ) { #ifdef __GNUC__ /* On PowerPC, this will map to insn: cntlzw */ /* On Pentium, this will map to insn: clz */ uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x ); uint32 nlz = _uint32_sub( nlz32, 16 ); return (nlz); #elif _NV_OS_XBOX_ uint16 nlz32 = (uint16)_CountLeadingZeros( (uint32)x ); return _uint32_sub( nlz32, 16); #else const uint16 x0 = _uint16_srl( x, 1 ); const uint16 x1 = _uint16_or( x, x0 ); const uint16 x2 = _uint16_srl( x1, 2 ); const uint16 x3 = _uint16_or( x1, x2 ); const uint16 x4 = _uint16_srl( x3, 4 ); const uint16 x5 = _uint16_or( x3, x4 ); const uint16 x6 = _uint16_srl( x5, 8 ); const uint16 x7 = _uint16_or( x5, x6 ); const uint16 x8 = _uint16_not( x7 ); const uint16 x9 = _uint16_srlm( x8, 1, 0x5555 ); const uint16 xA = _uint16_sub( x8, x9 ); const uint16 xB = _uint16_and( xA, 0x3333 ); const uint16 xC = _uint16_srlm( xA, 2, 0x3333 ); const uint16 xD = _uint16_add( xB, xC ); const uint16 xE = _uint16_srl( xD, 4 ); const uint16 xF = _uint16_addm( xD, xE, 0x0f0f ); const uint16 x10 = _uint16_srl( xF, 8 ); const uint16 x11 = _uint16_addm( xF, x10, 0x001f ); return ( x11 ); #endif }
/*
 * half_add - add two numbers supplied as raw 16-bit floating-point patterns.
 *
 * x, y: operands.  The masks below split each pattern into a sign bit
 *       (0x8000), a 5-bit exponent (0x7c00) and a 10-bit mantissa (0x03ff).
 * Returns the 16-bit pattern of the result.
 *
 * The body is straight-line code: every decision is expressed through the
 * _uint16_* helpers instead of if/else.  Three special results are selected
 * at the very end: h_snan (0xfe00), c_inf (sign of a | exponent mask), and
 * a cleared result when is_diff_exactly_zero is set.
 *
 * NOTE(review): the _uint16_* helpers are defined elsewhere.  The comments
 * below assume: sels(t, p, q) yields p when the MSB of t is set, else q;
 * ext(t) broadcasts the MSB of t to all bits; neg/dec are two's-complement
 * negate / decrement; andc(p, q) is p & ~q; andsrl(p, m, sa) is (p & m) >> sa;
 * srlm(p, sa, m) is (p >> sa) & m.  Confirm against their definitions.
 */
uint16_t half_add( uint16_t x, uint16_t y ) {
  /* Format constants: field masks, shift amounts and special patterns. */
  const uint16_t one = _uint16_li( 0x0001 );
  const uint16_t msb_to_lsb_sa = _uint16_li( 0x000f );
  const uint16_t h_s_mask = _uint16_li( 0x8000 );
  const uint16_t h_e_mask = _uint16_li( 0x7c00 );
  const uint16_t h_m_mask = _uint16_li( 0x03ff );
  const uint16_t h_m_msb_mask = _uint16_li( 0x2000 );
  const uint16_t h_m_msb_sa = _uint16_li( 0x000d );
  const uint16_t h_m_hidden = _uint16_li( 0x0400 );
  const uint16_t h_e_pos = _uint16_li( 0x000a );
  const uint16_t h_e_bias_minus_one = _uint16_li( 0x000e );
  const uint16_t h_m_grs_carry = _uint16_li( 0x4000 );
  const uint16_t h_m_grs_carry_pos = _uint16_li( 0x000e );
  /* Mantissas are worked on shifted left by this many extra low bits
     (presumably guard/round/sticky, going by the "grs" naming). */
  const uint16_t h_grs_size = _uint16_li( 0x0003 );
  const uint16_t h_snan = _uint16_li( 0xfe00 );
  const uint16_t h_e_mask_minus_one = _uint16_li( 0x7bff );
  const uint16_t h_grs_round_carry = _uint16_sll( one, h_grs_size );
  const uint16_t h_grs_round_mask = _uint16_sub( h_grs_round_carry, one );

  /* Order the operands: a/b are x/y, swapped according to the sign of
     (x_e - y_e) so that a is presumably the one with the larger exponent. */
  const uint16_t x_e = _uint16_and( x, h_e_mask );
  const uint16_t y_e = _uint16_and( y, h_e_mask );
  const uint16_t is_y_e_larger_msb = _uint16_sub( x_e, y_e );
  const uint16_t a = _uint16_sels( is_y_e_larger_msb, y, x);
  const uint16_t a_s = _uint16_and( a, h_s_mask );
  const uint16_t a_e = _uint16_and( a, h_e_mask );
  const uint16_t a_m_no_hidden_bit = _uint16_and( a, h_m_mask );
  const uint16_t a_em_no_hidden_bit = _uint16_or( a_e, a_m_no_hidden_bit );
  const uint16_t b = _uint16_sels( is_y_e_larger_msb, x, y);
  const uint16_t b_s = _uint16_and( b, h_s_mask );
  const uint16_t b_e = _uint16_and( b, h_e_mask );
  const uint16_t b_m_no_hidden_bit = _uint16_and( b, h_m_mask );
  const uint16_t b_em_no_hidden_bit = _uint16_or( b_e, b_m_no_hidden_bit );

  /* Classification flags, each carried in the MSB of a 16-bit word.
     0x7bff - em wraps (MSB set) once exponent|mantissa exceeds 0x7bff,
     i.e. when the exponent field is all ones. */
  const uint16_t is_diff_sign_msb = _uint16_xor( a_s, b_s );
  const uint16_t is_a_inf_msb = _uint16_sub( h_e_mask_minus_one, a_em_no_hidden_bit );
  const uint16_t is_b_inf_msb = _uint16_sub( h_e_mask_minus_one, b_em_no_hidden_bit );
  const uint16_t is_undenorm_msb = _uint16_dec( a_e );
  const uint16_t is_undenorm = _uint16_ext( is_undenorm_msb );
  const uint16_t is_both_inf_msb = _uint16_and( is_a_inf_msb, is_b_inf_msb );
  /* NOTE(review): the invalid-operation test keys on b's sign bit, not on
     is_diff_sign_msb -- confirm this is intended. */
  const uint16_t is_invalid_inf_op_msb = _uint16_and( is_both_inf_msb, b_s );
  const uint16_t is_a_e_nez_msb = _uint16_neg( a_e );
  const uint16_t is_b_e_nez_msb = _uint16_neg( b_e );
  const uint16_t is_a_e_nez = _uint16_ext( is_a_e_nez_msb );
  const uint16_t is_b_e_nez = _uint16_ext( is_b_e_nez_msb );

  /* Restore the hidden mantissa bit (0x0400) for operands whose exponent
     field is non-zero. */
  const uint16_t a_m_hidden_bit = _uint16_and( is_a_e_nez, h_m_hidden );
  const uint16_t b_m_hidden_bit = _uint16_and( is_b_e_nez, h_m_hidden );
  const uint16_t a_m_no_grs = _uint16_or( a_m_no_hidden_bit, a_m_hidden_bit );
  const uint16_t b_m_no_grs = _uint16_or( b_m_no_hidden_bit, b_m_hidden_bit );

  /* Alignment shift for b's mantissa: the exponent difference by default,
     or a value derived from a's exponent alone when a has a non-zero
     exponent field and b does not. */
  const uint16_t diff_e = _uint16_sub( a_e, b_e );
  const uint16_t a_e_unbias = _uint16_sub( a_e, h_e_bias_minus_one );
  const uint16_t a_m = _uint16_sll( a_m_no_grs, h_grs_size );
  const uint16_t a_e_biased = _uint16_srl( a_e, h_e_pos );
  const uint16_t m_sa_unbias = _uint16_srl( a_e_unbias, h_e_pos );
  const uint16_t m_sa_default = _uint16_srl( diff_e, h_e_pos );
  const uint16_t m_sa_unbias_mask = _uint16_andc( is_a_e_nez_msb, is_b_e_nez_msb );
  const uint16_t m_sa = _uint16_sels( m_sa_unbias_mask, m_sa_unbias, m_sa_default );

  /* Shift b's mantissa right by m_sa; if any of the bits shifted out were
     set, record that in the lowest bit of b_m. */
  const uint16_t b_m_no_sticky = _uint16_sll( b_m_no_grs, h_grs_size );
  const uint16_t sh_m = _uint16_srl( b_m_no_sticky, m_sa );
  const uint16_t sticky_overflow = _uint16_sll( one, m_sa );
  const uint16_t sticky_mask = _uint16_dec( sticky_overflow );
  const uint16_t sticky_collect = _uint16_and( b_m_no_sticky, sticky_mask );
  const uint16_t is_sticky_set_msb = _uint16_neg( sticky_collect );
  const uint16_t sticky = _uint16_srl( is_sticky_set_msb, msb_to_lsb_sa);
  const uint16_t b_m = _uint16_or( sh_m, sticky );

  /* Candidate results: infinity, mantissa sum, and magnitude difference
     together with the sign that goes with it. */
  const uint16_t is_c_m_ab_pos_msb = _uint16_sub( b_m, a_m );
  const uint16_t c_inf = _uint16_or( a_s, h_e_mask );
  const uint16_t c_m_sum = _uint16_add( a_m, b_m );
  const uint16_t c_m_diff_ab = _uint16_sub( a_m, b_m );
  const uint16_t c_m_diff_ba = _uint16_sub( b_m, a_m );
  const uint16_t c_m_smag_diff = _uint16_sels( is_c_m_ab_pos_msb, c_m_diff_ab, c_m_diff_ba );
  const uint16_t c_s_diff = _uint16_sels( is_c_m_ab_pos_msb, a_s, b_s );
  const uint16_t c_s = _uint16_sels( is_diff_sign_msb, c_s_diff, a_s );

  /* Renormalise the difference: shift left by (leading zeros - 1) and lower
     the exponent by the same amount, with a separate path when that would
     take the exponent below zero.
     NOTE(review): c_m_smag_diff is zero when a_m == b_m, so _uint16_cntlz
     must give a defined result for a zero argument. */
  const uint16_t c_m_smag_diff_nlz = _uint16_cntlz( c_m_smag_diff );
  const uint16_t diff_norm_sa = _uint16_sub( c_m_smag_diff_nlz, one );
  const uint16_t is_diff_denorm_msb = _uint16_sub( a_e_biased, diff_norm_sa );
  const uint16_t is_diff_denorm = _uint16_ext( is_diff_denorm_msb );
  const uint16_t is_a_or_b_norm_msb = _uint16_neg( a_e_biased );
  const uint16_t diff_denorm_sa = _uint16_dec( a_e_biased );
  const uint16_t c_m_diff_denorm = _uint16_sll( c_m_smag_diff, diff_denorm_sa );
  const uint16_t c_m_diff_norm = _uint16_sll( c_m_smag_diff, diff_norm_sa );
  const uint16_t c_e_diff_norm = _uint16_sub( a_e_biased, diff_norm_sa );
  const uint16_t c_m_diff_ab_norm = _uint16_sels( is_diff_denorm_msb, c_m_diff_denorm, c_m_diff_norm );
  const uint16_t c_e_diff_ab_norm = _uint16_andc( c_e_diff_norm, is_diff_denorm );
  const uint16_t c_m_diff = _uint16_sels( is_a_or_b_norm_msb, c_m_diff_ab_norm, c_m_smag_diff );
  const uint16_t c_e_diff = _uint16_sels( is_a_or_b_norm_msb, c_e_diff_ab_norm, a_e_biased );

  /* Signs differ and the difference mantissa is zero: flag an exact zero. */
  const uint16_t is_diff_eqz_msb = _uint16_dec( c_m_diff );
  const uint16_t is_diff_exactly_zero_msb = _uint16_and( is_diff_sign_msb, is_diff_eqz_msb );
  const uint16_t is_diff_exactly_zero = _uint16_ext( is_diff_exactly_zero_msb );

  /* Pick sum or difference by sign agreement, then fold a carry out of
     bit 14 (h_m_grs_carry) back in: shift the mantissa right one place and
     add the carry bit to the exponent. */
  const uint16_t c_m_added = _uint16_sels( is_diff_sign_msb, c_m_diff, c_m_sum );
  const uint16_t c_e_added = _uint16_sels( is_diff_sign_msb, c_e_diff, a_e_biased );
  const uint16_t c_m_carry = _uint16_and( c_m_added, h_m_grs_carry );
  const uint16_t is_c_m_carry_msb = _uint16_neg( c_m_carry );
  const uint16_t c_e_hidden_offset = _uint16_andsrl( c_m_added, h_m_grs_carry, h_m_grs_carry_pos );
  const uint16_t c_m_sub_hidden = _uint16_srl( c_m_added, one );
  const uint16_t c_m_no_hidden = _uint16_sels( is_c_m_carry_msb, c_m_sub_hidden, c_m_added );
  const uint16_t c_e_no_hidden = _uint16_add( c_e_added, c_e_hidden_offset );

  /* When a's exponent field was zero (is_undenorm) and the result mantissa
     reached bit 13, bump the exponent by one. */
  const uint16_t c_m_no_hidden_msb = _uint16_and( c_m_no_hidden, h_m_msb_mask );
  const uint16_t undenorm_m_msb_odd = _uint16_srl( c_m_no_hidden_msb, h_m_msb_sa );
  const uint16_t undenorm_fix_e = _uint16_and( is_undenorm, undenorm_m_msb_odd );
  const uint16_t c_e_fixed = _uint16_add( c_e_no_hidden, undenorm_fix_e );

  /* Rounding: add the low h_grs_size bits back onto the mantissa; a carry
     into bit 14 increments the exponent.  Then drop the extra low bits. */
  const uint16_t c_m_round_amount = _uint16_and( c_m_no_hidden, h_grs_round_mask );
  const uint16_t c_m_rounded = _uint16_add( c_m_no_hidden, c_m_round_amount );
  const uint16_t c_m_round_overflow = _uint16_andsrl( c_m_rounded, h_m_grs_carry, h_m_grs_carry_pos );
  const uint16_t c_e_rounded = _uint16_add( c_e_fixed, c_m_round_overflow );
  const uint16_t c_m_no_grs = _uint16_srlm( c_m_rounded, h_grs_size, h_m_mask );

  /* Pack sign | exponent | mantissa, then apply the special-case selects in
     order: infinity, exact zero, invalid operation. */
  const uint16_t c_e = _uint16_sll( c_e_rounded, h_e_pos );
  const uint16_t c_em = _uint16_or( c_e, c_m_no_grs );
  const uint16_t c_normal = _uint16_or( c_s, c_em );
  const uint16_t c_inf_result = _uint16_sels( is_a_inf_msb, c_inf, c_normal );
  const uint16_t c_zero_result = _uint16_andc( c_inf_result, is_diff_exactly_zero );
  const uint16_t c_result = _uint16_sels( is_invalid_inf_op_msb, h_snan, c_zero_result );
  return (c_result); }