//-------------------------------------------------------------------------- //-------- execute --------------------------------------------------------- //-------------------------------------------------------------------------- void AssembleElemSolverAlgorithm::execute() { stk::mesh::BulkData & bulk_data = realm_.bulk_data(); // set any data const size_t activeKernelsSize = activeKernels_.size(); for ( size_t i = 0; i < activeKernelsSize; ++i ) activeKernels_[i]->setup(*realm_.timeIntegrator_); run_algorithm(bulk_data, [&](SharedMemData& smdata) { set_zero(smdata.simdrhs.data(), smdata.simdrhs.size()); set_zero(smdata.simdlhs.data(), smdata.simdlhs.size()); // call supplemental; gathers happen inside the elem_execute method for ( size_t i = 0; i < activeKernelsSize; ++i ) activeKernels_[i]->execute( smdata.simdlhs, smdata.simdrhs, smdata.simdPrereqData ); for(int simdElemIndex=0; simdElemIndex<smdata.numSimdElems; ++simdElemIndex) { extract_vector_lane(smdata.simdrhs, simdElemIndex, smdata.rhs); extract_vector_lane(smdata.simdlhs, simdElemIndex, smdata.lhs); apply_coeff(nodesPerEntity_, smdata.elemNodes[simdElemIndex], smdata.scratchIds, smdata.sortPermutation, smdata.rhs, smdata.lhs, __FILE__); } }); }
void init_decod_ld8a(struct dec_state_t * state) { /* Initialize static pointer */ state->exc = state->old_exc + PIT_MAX + L_INTERPOL; /* Static vectors to zero */ set_zero(state->old_exc, PIT_MAX+L_INTERPOL); set_zero(state->mem_syn, M); state->sharp = SHARPMIN; state->old_t0 = 60; state->gain_code = (F)0.0; state->gain_pitch = (F)0.0; lsp_decw_reset(&state->lsp_s); init_exc_err(state->cng_s.exc_err); // ? copy(lsp_reset, state->lsp_old, M); /* for G.729B */ state->seed_fer = 21845; state->past_ftyp = 1; state->seed = INIT_SEED; state->sid_sav = (F)0.; init_lsfq_noise(&state->cng_s.lsfq_s); // ? gain_past_reset(&state->gain_s); state->bad_lsf = 0; /* Initialize bad LSF indicator */ }
double chiral_condensate() { complex double q1, q2; int i0 = 0; q1 = 0.0 + I*0.0; q2 = 0.0 + I*0.0; //To calculate D^{-1}(x,x), we invert solve the equation D R = S //Where the source S is only nonzero at x and for different spinor components //The required number of sources is the number of spinor components // Source 1 set_zero(S0); S0[i0].s1 = 1.0 + I*0.0; gam5D_wilson(S, S0); cg(R1, S, ITER_MAX, DELTACG, &gam5D_SQR_wilson); //Inverting the Dirac operator on source 1 q1 += R1[i0].s1; // Source 2 set_zero(S0); S0[i0].s2 = 1.0 + I*0.0; gam5D_wilson(S, S0); cg(R2, S, ITER_MAX, DELTACG, &gam5D_SQR_wilson); //Inverting the Dirac operator on source 2 q2 += R2[i0].s2; if(fabs(cimag(q1 - q2))>sqrt(DELTACG)) { printf("\n Imaginary part of chiral condensate detected!!! \n"); }; //q1 and q2 are the diagonal components (11 and 22) of the propagator //q1 - q2 is tr(gamma_5 D^-1) return creal(q1 - q2); }
void ARingZZGMP::syzygy(const ElementType& a, const ElementType& b, ElementType& x, ElementType& y) const { M2_ASSERT(!is_zero(b)); // First check the special cases a = 0, b = 1, -1. Other cases: use gcd. if (is_zero(a)) { set_from_long(x, 1); set_zero(y); return; } if (mpz_cmp_ui(&b,1) == 0) { set_from_long(x, 1); negate(y, a); return; } if (mpz_cmp_si(&b,-1) == 0) { set_from_long(x, 1); set(y, a); return; } elem g; init(g); mpz_gcd(&g,&a,&b); divide(y,a,g); divide(x,b,g); if (mpz_sgn(&x) > 0) negate(y,y); else negate(x,x); clear(g); }
// Stores the inverse of a in result when a is a unit; here that is a copy
// of a itself.  For a non-unit, result is set to zero.
void invert(ElementType& result, const ElementType& a) const
{
  if (!is_unit(a))
    {
      set_zero(result);
      return;
    }
  set(result, a);
}
void displaySpinPropagator4d() { TIMER("displaySpinPropagator4d"); // qlat::Coordinate total_site(16, 16, 16, 32); qlat::Coordinate total_site(4, 4, 4, 8); qlat::Geometry geo; geo.init(total_site, 1); qlat::DisplayInfo(cname, fname, "geo =\n%s\n", qlat::show(geo).c_str()); std::array<double, qlat::DIMN> momtwist; momtwist[0] = 0.0; momtwist[1] = 0.0; momtwist[2] = 0.0; momtwist[3] = 0.0; const double mass = 0.1; qlat::SpinPropagator4d prop; prop.init(geo); set_zero(prop); qlat::Coordinate xgsrc(0, 0, 0, 0); qlat::Coordinate xlsrc = geo.coordinate_l_from_g(xgsrc); if (geo.is_local(xlsrc)) { qlat::set_unit(prop.get_elem(xlsrc)); } qlat::prop_spin_propagator4d(prop, mass, momtwist); qlat::Coordinate xgsnk(0, 0, 0, 0); qlat::Coordinate xlsnk = geo.coordinate_l_from_g(xgsnk); qlat::DisplayInfo(cname, fname, "xgsnk = %s .\n", qlat::show(xgsnk).c_str()); if (geo.is_local(xlsnk)) { qlat::Display(cname, fname, "prop[xgsnk] =\n%s\n", qlat::show(prop.get_elem(xlsnk)).c_str()); } }
int ddr_init( void ) { // tell dramc to configure set_val( P1MEMCCMD, 0x4 ); // set refresh period set_val( P1REFRESH, nstoclk(7800) ); // set timing para set_val( P1CASLAT, ( 3 << 1 ) ); set_val( P1T_DQSS, 0x1 ); // 0.75 - 1.25 set_val( P1T_MRD, 0x2 ); set_val( P1T_RAS, nstoclk(45) ); set_val( P1T_RC, nstoclk(68) ); unsigned int trcd = nstoclk( 23 ); set_val( P1T_RCD, trcd | (( trcd - 3 ) << 3 ) ); unsigned int trfc = nstoclk( 80 ); set_val( P1T_RFC, trfc | ( ( trfc-3 ) << 5 ) ); unsigned int trp = nstoclk( 23 ); set_val( P1T_RP, trp | ( ( trp - 3 ) << 3 ) ); set_val( P1T_RRD, nstoclk(15) ); set_val( P1T_WR, nstoclk(15) ); set_val( P1T_WTR, 0x7 ); set_val( P1T_XP, 0x2 ); set_val( P1T_XSR, nstoclk(120) ); set_val( P1T_ESR, nstoclk(120) ); // set mem cfg set_nbit( P1MEMCFG, 0, 3, 0x2 ); /* 10 column address */ /* set_nbit: 把从第bit位开始的一共len位消零,然后把这几位设为val */ set_nbit( P1MEMCFG, 3, 3, 0x3 ); /* 14 row address */ set_zero( P1MEMCFG, 6 ); /* A10/AP */ set_nbit( P1MEMCFG, 15, 3, 0x2 ); /* Burst 4 */ set_nbit( P1MEMCFG2, 0, 4, 0x5 ); set_2bit( P1MEMCFG2, 6, 0x1 ); /* 32 bit */ set_nbit( P1MEMCFG2, 8, 3, 0x3 ); /* Mobile DDR SDRAM */ set_2bit( P1MEMCFG2, 11, 0x1 ); set_one( P1_chip_0_cfg, 16 ); /* Bank-Row-Column organization */ // memory init set_val( P1DIRECTCMD, 0xc0000 ); // NOP set_val( P1DIRECTCMD, 0x000 ); // precharge set_val( P1DIRECTCMD, 0x40000 );// auto refresh set_val( P1DIRECTCMD, 0x40000 );// auto refresh set_val( P1DIRECTCMD, 0xa0000 ); // EMRS set_val( P1DIRECTCMD, 0x80032 ); // MRS set_val( MEM_SYS_CFG, 0x0 ); // set dramc to "go" status set_val( P1MEMCCMD, 0x000 ); // wait ready while( !(( read_val( P1MEMSTAT ) & 0x3 ) == 0x1)); }
/*
 * fir_new - create a new fir object.
 *
 * Allocates the object with newobject(fir_class), registers it with
 * dsp_setup() (argument 1), adds a "signal" outlet, stores n + 1 in
 * f_length and zeroes MAXSIZE entries of both f_coefs and f_ff.
 * Returns the new object.
 */
void *fir_new(long n)
{
    t_fir *x = (t_fir *)newobject(fir_class);
    t_float *coefs = x->f_coefs;
    t_float *ff = x->f_ff;

    dsp_setup((t_pxobject *)x, 1);
    outlet_new((t_object *)x, "signal");

    x->f_length = n + 1;

    /* start with an all-zero coefficient set and an empty f_ff buffer */
    set_zero(coefs, MAXSIZE);
    set_zero(ff, MAXSIZE);

    return (x);
}
void make_vector_zeroes( container & vec, const typename container::size_type & d1)
{
    // Resize to d1 elements, then apply set_zero to every element
    // (including any that were present before the resize).
    vec.resize(d1);
    for(auto && element : vec)
    {
        set_zero(element);
    }
}
// Writes the inverse of a into result and returns true when a is a unit;
// otherwise sets result to zero and returns false.
bool invert(ElementType& result,const ElementType& a) const
{
  const bool invertible = is_unit(a);
  if (invertible)
    mpq_inv(&result, &a);
  else
    set_zero(result);
  return invertible;
}
/*-------------------------------------------------------------------------- * init_decod_ld8k - Initialization of variables for the decoder section. *-------------------------------------------------------------------------- */ void init_decod_ld8k(void) { /* Initialize static pointer */ exc = old_exc + PIT_MAX + L_INTERPOL; /* Static vectors to zero */ set_zero(old_exc,PIT_MAX + L_INTERPOL); set_zero(mem_syn, M); sharp = SHARPMIN; old_t0 = 60; gain_code = (F)0.; gain_pitch = (F)0.; lsp_decw_reset(); return; }
// result = a^n.
//   * a == 0 : result is set to zero for every n (including n <= 0).
//   * n >= 0 : plain exponentiation.
//   * n <  0 : a is inverted first, then raised to |n|.
void power(ElementType& result, const ElementType& a, int n) const
{
  if (is_zero(a))
    {
      set_zero(result);
      return;
    }

  if (n >= 0)
    {
      fq_zech_pow_ui(&result, &a, n, mContext);
      return;
    }

  // |n| is formed in unsigned arithmetic: evaluating -n as an int is
  // signed overflow when n is the most negative int.
  const unsigned long magnitude = 0UL - static_cast<unsigned long>(n);
  invert(result, a);
  fq_zech_pow_ui(&result, &result, magnitude, mContext);
}
int drw_sudoku() { char *ptr; //const char ptr[40]; int i,j,row=0,col=0; //int b[9][9]={{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0},{0,0,0,0,0,0,0,0,0}}; int b[9][9]; for(i=0;i<9;i++) { for(j=0;j<9;j++) { if(row==col) b[i][j]=3; else b[i][j]=-1; ++col; } ++row; } setcolor(2); //G settextstyle(DEFAULT_FONT,VERT_DIR,4); outtextxy(110,100,"SU-DOKU"); settextstyle(DEFAULT_FONT,HORIZ_DIR,4); outtextxy(130,60,"SU-DOKU"); setcolor(4); //R rectangle(154,119,346,311); setcolor(15); //W rectangle(157,122,343,308); //1*1 GRID: //cout<<b[0][0]; row=0;col=0; settextstyle(DEFAULT_FONT,HORIZ_DIR,1); for(i=160;i<=320;i=i+20) { for(j=125;j<=285;j=j+20) { rectangle(i,j,i+20,j+20); if(row==col) { set_zero(i+3,i+17,j+3,j+17,11); setcolor(5); sprintf(ptr,"%d",b[row][col]);//**ptr=b[row][col]; outtextxy(i+7,j+7,ptr); } setcolor(15); ++row; } ++col; } //3*3 GRID: setcolor(4); //R for(i=160;i<=320;i=i+60) { for(j=125;j<=285;j=j+60) rectangle(i,j,i+60,j+60); } return 0; }
/*-------------------------------------------------------------------------- * init_decod_ld8c - Initialization of variables for the decoder section. *-------------------------------------------------------------------------- */ void init_decod_ld8c(void) { /* Initialize static pointer */ exc = old_exc + PIT_MAX + L_INTERPOL; /* Static vectors to zero */ set_zero(old_exc, PIT_MAX+L_INTERPOL); set_zero(mem_syn, M_BWD); sharp = SHARPMIN; prev_t0 = 60; prev_t0_frac = 0; gain_code = (F)0.; gain_pitch = (F)0.; lsp_decw_resete(freq_prev, prev_lsp, &prev_ma); set_zero(A_bwd_mem, M_BWDP1); set_zero(A_t_bwd_mem, M_BWDP1); A_bwd_mem[0] = (F)1.; A_t_bwd_mem[0] = (F)1.; prev_voicing = 0; prev_bfi = 0; prev_lp_mode = 0; c_fe = (F)0.; c_int = (F)1.1; /* Filter interpolation parameter */ set_zero(prev_filter, M_BWDP1); prev_filter[0] = (F)1.; prev_pitch = 30; stat_pitch = 0; set_zero(old_A_bwd, M_BWDP1); set_zero(rexp, M_BWDP1); old_A_bwd[0] = (F)1.; set_zero(old_rc_bwd, 2); gain_pit_mem = (F)0.; gain_cod_mem = (F)0.; c_muting = (F)1.; count_bfi = 0; stat_bwd = 0; /* for G.729B */ seed_fer = (INT16)21845; past_ftyp = 3; seed = INIT_SEED; sid_sav = (FLOAT)0.; init_lsfq_noise(); return; }
vector_zeroer(container & vec, const other_container & other_vec)
{
    // Empty template container: the result is empty as well.
    if(other_vec.empty())
    {
        vec.resize(0);
        return;
    }

    // Shrink/grow to a single element, zero it, then grow to the size of
    // other_vec using that zeroed element as the fill value.
    vec.resize(1);
    set_zero(vec.front());
    vec.resize(other_vec.size(),vec.front());
}
void fix_step_sigmas( T & v_sigmas, const T & v_min, const T & v_max )
{
    // Repairs the head step size, then recurses on the tails.
    //
    // A step size is replaced by one tenth of the allowed range
    // (max - min) when it is non-positive or at least as large as the
    // range itself.  The upper test previously used <=, which replaced
    // every step size narrower than the range and kept the oversized
    // ones; the indexable-container overload uses >= for the same check.
    typedef typename std::decay<decltype(v_sigmas.get_head())>::type value_type;

    value_type zero;
    set_zero(zero);

    if((v_sigmas.get_head() <= zero)
       || (v_sigmas.get_head() >= v_max.get_head() - v_min.get_head() ))
    {
        v_sigmas.get_head() = (v_max.get_head() - v_min.get_head())/10.;
    }

    fix_step_sigmas(v_sigmas.get_tail(),v_min.get_tail(),v_max.get_tail());

    return;
}
void fix_step_sigmas( T & v_sigmas )
{
    // Replaces a non-positive head step size with units_cast<...>(1.),
    // then recurses on the tail.
    using head_type = typename std::decay<decltype(v_sigmas.get_head())>::type;

    head_type zero;
    set_zero(zero);

    if(v_sigmas.get_head() <= zero)
    {
        v_sigmas.get_head() = units_cast<head_type>(1.);
    }

    fix_step_sigmas(v_sigmas.get_tail());
}
void fix_step_sigmas( const T & v_sigmas ) { typedef typename std::decay<decltype(v_sigmas[0])>::type value_type; value_type zero; set_zero(zero); for(int_type i=0; i<ssize(v_sigmas); ++i) { if( v_sigmas[i]<=zero ) v_sigmas[i] = units_cast<value_type>(1.); } return; }
void fix_step_sigmas( T & v_sigmas, const T & v_min, const T & v_max )
{
    // Makes v_sigmas usable against the bounds v_min / v_max (which must
    // have equal length):
    //   * a length mismatch resizes v_sigmas to match and zeroes all entries;
    //   * any entry that is non-positive, or at least as large as
    //     max - min, becomes (max - min)/10.
    assert(ssize(v_min)==ssize(v_max));

    using element_type = typename std::decay<decltype(v_sigmas[0])>::type;

    element_type zero;
    set_zero(zero);

    if(ssize(v_sigmas) != ssize(v_min))
    {
        v_sigmas.resize(ssize(v_min));
        for(auto & sigma : v_sigmas)
        {
            set_zero(sigma);
        }
    }

    for(int_type idx=0; idx<ssize(v_sigmas); ++idx)
    {
        if( (v_sigmas[idx]<=zero) || (v_sigmas[idx] >= v_max[idx]-v_min[idx]) )
        {
            v_sigmas[idx] = (v_max[idx]-v_min[idx])/10.;
        }
    }
}
// For every (count, alignment) pair: zero the buffer, fill `count` 32-bit
// values starting at buffer[PAD + alignment] with sk_memset32, and assert
// that the leading pad, the filled run and the trailing remainder hold the
// expected values.
//
// The three compare32 results are reported through REPORTER_ASSERT, in the
// same way test_16 reports compare16; previously they were discarded, so a
// mismatch could not fail the test.
// NOTE(review): assumes compare32 returns a bool-convertible result like
// compare16 - confirm.
static void test_32(skiatest::Reporter* reporter) {
    uint32_t buffer[TOTAL];

    for (int count = 0; count < MAX_COUNT; ++count) {
        for (int alignment = 0; alignment < MAX_ALIGNMENT; ++alignment) {
            set_zero(buffer, sizeof(buffer));

            uint32_t* base = &buffer[PAD + alignment];
            sk_memset32(base, VALUE32, count);

            REPORTER_ASSERT(reporter,
                compare32(buffer,       0,       PAD + alignment) &&
                compare32(base,         VALUE32, count) &&
                compare32(base + count, 0,       TOTAL - count - PAD - alignment));
        }
    }
}
/*
 * fprint_fermion_mat - writes the output of fermion_fp() for each unit basis
 * vector to two text files: real parts to "fmat_real.dat" and imaginary
 * parts to "fmat_imag.dat".  Each basis vector produces one line of
 * GRIDPOINTS space-separated values.
 *
 * If either file cannot be opened, a message is written to stderr and the
 * function returns without touching that file (previously a failed fopen
 * led to fprintf on a NULL stream).
 */
void fprint_fermion_mat()
{
  int i, j;
  complex double x;
  complex double basis[GRIDPOINTS];
  complex double out[GRIDPOINTS];
  complex double temp[GRIDPOINTS];
  FILE *fp;

  fp = fopen("fmat_real.dat", "w");
  if (fp == NULL)
  {
    fprintf(stderr, "fprint_fermion_mat: cannot open fmat_real.dat for writing\n");
    return;
  }

  printf("\n Output fermion determinant...\n");
  set_zero(basis);

  /* Real parts: switch on one basis component at a time */
  for(i = 0; i<GRIDPOINTS; i++)
  {
    basis[i] = 1.0;
    fermion_fp(out, temp, basis);
    for(j = 0; j < GRIDPOINTS-1; j++)
    {
      x = out[j];
      fprintf(fp, "%f ", creal(x));
    }
    fprintf(fp, "%f\n", creal(out[GRIDPOINTS-1]));
    basis[i] = 0.0;   /* restore the all-zero vector for the next pass */
  }
  fclose(fp);

  fp = fopen("fmat_imag.dat", "w");
  if (fp == NULL)
  {
    fprintf(stderr, "fprint_fermion_mat: cannot open fmat_imag.dat for writing\n");
    return;
  }

  /* Imaginary parts: same sweep, basis is all zero again at this point */
  for(i = 0; i<GRIDPOINTS; i++)
  {
    basis[i] = 1.0;
    fermion_fp(out, temp, basis);
    for(j = 0; j < GRIDPOINTS-1; j++)
    {
      x = out[j];
      fprintf(fp, "%f ", cimag(x));
    }
    fprintf(fp, "%f\n", cimag(out[GRIDPOINTS-1]));
    basis[i] = 0.0;
  }
  fclose(fp);
}
/*---------------------------------------------------------------------------* * Function vad_init * * ~~~~~~~~~~~~~~~~~~ * * * * -> Initialization of variables for voice activity detection * * * *---------------------------------------------------------------------------*/ void vad_init(struct vad_state_t * state) { /* Static vectors to zero */ set_zero(state->MeanLSF, M); /* Initialize VAD parameters */ state->MeanSE = (F)0.0; state->MeanSLE = (F)0.0; state->MeanE = (F)0.0; state->MeanSZC = (F)0.0; state->count_sil = 0; state->count_update = 0; state->count_ext = 0; state->less_count = 0; state->flag = 1; state->Min = FLT_MAX_G729; }
// For every (count, alignment) pair: zero the buffer, fill `count` 16-bit
// values starting at buffer[PAD + alignment] with sk_memset16, and assert
// that the leading pad, the filled run and the trailing remainder hold the
// expected values.
static void test_16(skiatest::Reporter* reporter) {
    uint16_t buffer[TOTAL];

    for (int count = 0; count < MAX_COUNT; ++count) {
        for (int alignment = 0; alignment < MAX_ALIGNMENT; ++alignment) {
            // fresh, fully zeroed buffer for each case
            set_zero(buffer, sizeof(buffer));

            uint16_t* base = buffer + (PAD + alignment);
            sk_memset16(base, VALUE16, count);

            // untouched prefix, filled run, untouched suffix
            REPORTER_ASSERT(reporter,
                compare16(buffer, 0, PAD + alignment) &&
                compare16(base, VALUE16, count) &&
                compare16(base + count, 0, TOTAL - count - PAD - alignment));
        }
    }
}
/*---------------------------------------------------------------------------* * Function vad_init * * ~~~~~~~~~~~~~~~~~~ * * * * -> Initialization of variables for voice activity detection * * * *---------------------------------------------------------------------------*/ void vad_init(void) { /* Static vectors to zero */ set_zero(MeanLSF, M); /* Initialize VAD parameters */ MeanSE = (float) 0.0; MeanSLE = (float) 0.0; MeanE = (float) 0.0; MeanSZC = (float) 0.0; count_sil = 0; count_update = 0; count_ext = 0; less_count = 0; flag = 1; Min = FLT_MAX_G729; return; }
/*
 * fir_set - load coefficients from an atom list.
 *
 * Zeroes MAXSIZE coefficients, then scans the first MIN(MAXSIZE, ac) atoms
 * and stores every A_FLOAT or A_LONG value, in order, at the front of
 * f_coefs.  Atoms of any other type are skipped without leaving a gap.
 * The symbol argument s is not used.
 */
void fir_set(t_fir *x, Symbol *s, int ac, Atom *av)
{
    t_float *dst = x->f_coefs;
    int limit = MIN(MAXSIZE, ac);
    int idx;
    int stored = 0;     /* number of coefficients written so far */

    set_zero(dst, MAXSIZE);

    for (idx = 0; idx < limit; idx++) {
        Atom *arg = &av[idx];

        if (arg->a_type == A_FLOAT)
            dst[stored++] = arg->a_w.w_float;
        else if (arg->a_type == A_LONG)
            dst[stored++] = (float)arg->a_w.w_long;
    }
    //x->f_length = stored;
}
// Stores the start-up values in the shared context - CCA power threshold,
// RX stream pointer and its "touched" marker, zeroed direct-current vector,
// output frame buffer and size - and then returns the result of reset().
bool init ( PSORA_RADIO_RX_STREAM pRxStream, UCHAR* output, uint out_size )
{
    // CF_11CCA
    CF_11CCA::cca_pwr_threshold() = 4 * 1000 * 1000;

    // CF_RxStream
    CF_RxStream::rxstream_pointer() = pRxStream;
    CF_RxStream::rxstream_touched() = 0;

    // CF_VecDC
    vcs& dc_vector = CF_VecDC::direct_current();
    set_zero(dc_vector);

    // CF_RxFrameBuffer
    CF_RxFrameBuffer::rx_frame_buf()      = output;
    CF_RxFrameBuffer::rx_frame_buf_size() = out_size;

    return reset ();
}
// result = a^n for an arbitrary-precision exponent n.
//   * a == 0 : result is set to zero.
//   * n <  0 : n is negated in place for the duration of the call, a is
//              inverted, and n is negated back before returning.
void power_mpz(ElementType& result, const ElementType& a, mpz_ptr n) const
{
  if (is_zero(a))
    {
      set_zero(result);
      return;
    }

  const bool negative_exponent = (mpz_sgn(n) < 0);
  if (negative_exponent)
    {
      mpz_neg(n, n);
      invert(result, a);
    }
  else
    {
      copy(result, a);
    }

  // read-only fmpz view of n for the FLINT power routine
  fmpz_t exponent;
  fmpz_init_set_readonly(exponent, n);
  fq_zech_pow(&result, &result, exponent, mContext);
  fmpz_clear_readonly(exponent);

  // restore the caller's exponent
  if (negative_exponent)
    {
      mpz_neg(n, n);
    }
}
/*
 * Demo driver: creates two random matrices and a null result matrix (size
 * argument 4), prints them, runs matrix_multiplication() and prints the
 * result with the elapsed time, then resets the result with set_zero() and
 * prints the output of matrix_multiplication_strassen().
 */
int main()
{
    matrix_t mat_a, mat_b;
    matrix_t mat_c;
    struct timeval t_begin, t_end;

    /* Operands and result */
    random_matrix(&mat_a, 4);
    random_matrix(&mat_b, 4);
    null_matrix(&mat_c, 4);

    print_matrix(mat_a);
    printf("\n");
    print_matrix(mat_b);
    printf("\n");
    print_matrix(mat_c);

    /* Timed normal multiplication */
    gettimeofday(&t_begin, 0);
    matrix_multiplication(mat_a, mat_b, mat_c);
    gettimeofday(&t_end, 0);

    printf("Normal Multiplication\n");
    print_matrix(mat_c);
    print_time_taken(t_begin, t_end);

    /* Strassen multiplication into a cleared result */
    mat_c = set_zero(mat_c);
    mat_c = matrix_multiplication_strassen(mat_a, mat_b, mat_c, 2);

    printf("Strassen Multiplication\n");
    print_matrix(mat_c);
}
int main() { // Time measurement. TimePeriod cpu_time; cpu_time.tick(); // Create space, set Dirichlet BC, enumerate basis functions. Space* space = new Space(A, B, NELEM, DIR_BC_LEFT, DIR_BC_RIGHT, P_INIT, NEQ); int ndof = Space::get_num_dofs(space); info("ndof: %d", ndof); // Initialize the weak formulation. WeakForm wf; wf.add_matrix_form(jacobian); wf.add_vector_form(residual); // Initialize the FE problem. bool is_linear = false; DiscreteProblem *dp = new DiscreteProblem(&wf, space, is_linear); // Set zero initial condition. double *coeff_vec = new double[ndof]; set_zero(coeff_vec, ndof); // Set up the solver, matrix, and rhs according to the solver selection. SparseMatrix* matrix = create_matrix(matrix_solver); Vector* rhs = create_vector(matrix_solver); Solver* solver = create_linear_solver(matrix_solver, matrix, rhs); int it = 1; bool success = false; while (1) { // Obtain the number of degrees of freedom. int ndof = Space::get_num_dofs(space); // Assemble the Jacobian matrix and residual vector. dp->assemble(coeff_vec, matrix, rhs); // Calculate the l2-norm of residual vector. double res_l2_norm = get_l2_norm(rhs); // Info for user. info("---- Newton iter %d, ndof %d, res. l2 norm %g", it, Space::get_num_dofs(space), res_l2_norm); // If l2 norm of the residual vector is within tolerance, then quit. // NOTE: at least one full iteration forced // here because sometimes the initial // residual on fine mesh is too small. if(res_l2_norm < NEWTON_TOL && it > 1) break; // Multiply the residual vector with -1 since the matrix // equation reads J(Y^n) \deltaY^{n+1} = -F(Y^n). for(int i=0; i<ndof; i++) rhs->set(i, -rhs->get(i)); // Solve the linear system. if(!(success = solver->solve())) error ("Matrix solver failed.\n"); // Add \deltaY^{n+1} to Y^n. for (int i = 0; i < ndof; i++) coeff_vec[i] += solver->get_solution()[i]; // If the maximum number of iteration has been reached, then quit. 
if (it >= NEWTON_MAX_ITER) error ("Newton method did not converge."); it++; } info("Total running time: %g s", cpu_time.accumulated()); // Test variable. info("ndof = %d.", Space::get_num_dofs(space)); if (success) { info("Success!"); return ERROR_SUCCESS; } else { info("Failure!"); return ERROR_FAILURE; } }
/*
==================
Vertex_Lighting

Adds a lighting contribution to the per-vertex colours of up to four
triangles at once, one triangle per SIMD lane.

  n_triangles          - number of valid entries in positions / colour
  vertex_light_manager - source of the light parameters (only light 0 is read)
  positions            - [triangle][vertex] input positions
  colour               - [triangle][vertex] colours, updated in place

NOTE(review): when n_triangles < 4 the remaining rows of vertex_position and
vertex_colour are not written before Transpose() - confirm that is acceptable.
==================
*/
void Vertex_Lighting(
	const __int32 n_triangles,
	const vertex_light_manager_& vertex_light_manager,
	const float4_ positions[4][3],
	float4_ colour[4][3]
) {
	static const float r_screen_scale_x = 1.0f / screen_scale_x;
	static const float r_screen_scale_y = 1.0f / screen_scale_y;

	// Broadcast constants.
	const __m128 attenuation_factor = set_all(800.0f);
	const __m128 specular_scale = set_all(100.0f);
	const __m128 diffuse_scale = set_all(20.0f);
	const __m128 zero = set_all(0.0f);
	const __m128 one = set_all(1.0f);

	__m128 r_screen_scale[2];
	r_screen_scale[X] = set_all(r_screen_scale_x);
	r_screen_scale[Y] = set_all(r_screen_scale_y);

	__m128 screen_shift[2];
	screen_shift[X] = set_all(screen_shift_x);
	screen_shift[Y] = set_all(screen_shift_y);

	__m128 clip_space_position[3][4];
	__m128 vertex_colour[3][4];

	// Load each vertex of every triangle, transpose so that each register
	// holds one component across the triangles, and undo the screen
	// shift/scale; the result is multiplied by the reciprocal of Z, which is
	// also stored as the Z component.
	for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) {

		__m128 vertex_position[4];
		for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) {
			vertex_position[i_triangle] = load_u(positions[i_triangle][i_vertex].f);
			vertex_colour[i_vertex][i_triangle] = load_u(colour[i_triangle][i_vertex].f);
		}
		Transpose(vertex_position);
		Transpose(vertex_colour[i_vertex]);

		__m128 depth = reciprocal(vertex_position[Z]);
		clip_space_position[i_vertex][X] = ((vertex_position[X] - screen_shift[X]) * r_screen_scale[X]) * depth;
		clip_space_position[i_vertex][Y] = ((vertex_position[Y] - screen_shift[Y]) * r_screen_scale[Y]) * depth;
		clip_space_position[i_vertex][Z] = depth;
	}

	// Edge vectors from vertex 0 to vertices 1 and 2.
	__m128 a[3];
	a[X] = clip_space_position[1][X] - clip_space_position[0][X];
	a[Y] = clip_space_position[1][Y] - clip_space_position[0][Y];
	a[Z] = clip_space_position[1][Z] - clip_space_position[0][Z];

	__m128 b[3];
	b[X] = clip_space_position[2][X] - clip_space_position[0][X];
	b[Y] = clip_space_position[2][Y] - clip_space_position[0][Y];
	b[Z] = clip_space_position[2][Z] - clip_space_position[0][Z];

	// Face normal = a x b, scaled by the approximate reciprocal square root
	// of its squared length.
	__m128 normal[4];
	normal[X] = (a[Y] * b[Z]) - (a[Z] * b[Y]);
	normal[Y] = (a[Z] * b[X]) - (a[X] * b[Z]);
	normal[Z] = (a[X] * b[Y]) - (a[Y] * b[X]);

	__m128 mag = (normal[X] * normal[X]) + (normal[Y] * normal[Y]) + (normal[Z] * normal[Z]);
	mag = _mm_rsqrt_ps(mag);
	normal[X] *= mag;
	normal[Y] *= mag;
	normal[Z] *= mag;

	// The loop bound is 1, so only light_sources[0] is processed.
	for (__int32 i_light = 0; i_light < 1; i_light++) {

		for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) {

			__m128 light_position[3];
			__m128 light_colour[3];
			const float intensity = vertex_light_manager.light_sources[i_light].intensity;
			for (__int32 i_axis = X; i_axis < W; i_axis++) {
				light_position[i_axis] = set_all(vertex_light_manager.light_sources[i_light].position.f[i_axis]);
				light_colour[i_axis] = set_all(vertex_light_manager.light_sources[i_light].colour.f[i_axis] * intensity);
			}

			// NOTE(review): is_valid is computed here but not read again in
			// this function.
			const __m128 extent = set_all(40.0f);
			__m128i is_valid = set_all(-1);
			is_valid &= (clip_space_position[i_vertex][X] - light_position[X]) < extent;
			is_valid &= (clip_space_position[i_vertex][Y] - light_position[Y]) < extent;
			is_valid &= (clip_space_position[i_vertex][Z] - light_position[Z]) < extent;

			// NOTE(review): the position and colour loaded from the light
			// manager above are overwritten with constants here, so the
			// values used below are position 0 and colour 100 - confirm
			// whether this is a debugging leftover.
			light_position[X] = set_all(0.0f);
			light_position[Y] = set_all(0.0f);
			light_position[Z] = set_all(0.0f);

			light_colour[X] = set_all(100.0f);
			light_colour[Y] = set_all(100.0f);
			light_colour[Z] = set_all(100.0f);

			// Vector from the light position to the vertex, scaled by the
			// approximate reciprocal of its length.
			__m128 light_ray[3];
			light_ray[X] = clip_space_position[i_vertex][X] - light_position[X];
			light_ray[Y] = clip_space_position[i_vertex][Y] - light_position[Y];
			light_ray[Z] = clip_space_position[i_vertex][Z] - light_position[Z];

			// This `mag` shadows the outer one; it holds the reciprocal
			// length of light_ray and is reused in the colour update below.
			__m128 mag = (light_ray[X] * light_ray[X]) + (light_ray[Y] * light_ray[Y]) + (light_ray[Z] * light_ray[Z]);
			mag = _mm_rsqrt_ps(mag);
			light_ray[X] *= mag;
			light_ray[Y] *= mag;
			light_ray[Z] *= mag;

			// Dot of normal and light ray, masked with the (dot > zero)
			// comparison result, then squared and scaled by mag.
			__m128 dot = (normal[X] * light_ray[X]) + (normal[Y] * light_ray[Y]) + (normal[Z] * light_ray[Z]);
			dot &= dot > zero;
			dot = (dot * dot) * mag;

			// Squared distance and a clamped attenuation term.
			// NOTE(review): `scalar` is computed but not used in the colour
			// update that follows.
			__m128 distance = set_zero();
			for (__int32 i_axis = X; i_axis < W; i_axis++) {
				__m128 d = light_position[i_axis] - clip_space_position[i_vertex][i_axis];
				distance += (d * d);
			}
			__m128 scalar = reciprocal(distance) * attenuation_factor;
			scalar = max_vec(scalar, zero);
			scalar = min_vec(scalar, one);

			// Accumulate both terms into every channel from R up to (not
			// including) A.
			for (__int32 i_channel = R; i_channel < A; i_channel++) {
				vertex_colour[i_vertex][i_channel] += dot * specular_scale * light_colour[i_channel];
				vertex_colour[i_vertex][i_channel] += mag * diffuse_scale * light_colour[i_channel];
			}
		}
	}

	// Transpose back to one register per triangle and store the colours.
	for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) {
		Transpose(vertex_colour[i_vertex]);
		for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) {
			store_u(vertex_colour[i_vertex][i_triangle], colour[i_triangle][i_vertex].f);
		}
	}
}