/*
 * Sample three envelope channels at the current step and advance the
 * per-channel key cursors.
 *
 * env  - three envelopes, one per channel.
 * k    - three key cursors; a null entry means that channel has no key
 *        left to consume. Updated in place.
 * step - three step numbers, one per channel. Updated in place.
 * val  - receives the three sampled channel values.
 *
 * The current step is step[MINDEX(step)] (presumably the smallest of the
 * three entries - confirm against the MINDEX definition). A channel whose
 * step equals the current step and whose cursor is non-null takes its
 * value from that key and moves its cursor to the next key; every other
 * channel is evaluated at time = current step / frames per second.
 *
 * Returns the step that was sampled (the value before any advancing).
 */
int itemNextStep(LWEnvelopeID *env, LWEnvKeyframeID *k, int *step, LWDVector val)
{
    int ch;
    int curstep, newstep;
    double t;

    curstep = step[MINDEX(step)];
    newstep = curstep;
    t = (double)curstep / sceneInfo->framesPerSecond;

    for (ch = 0; ch < 3; ch++) {
        /* Channel is not sitting on a key at this step: interpolate. */
        if (step[ch] != curstep || !k[ch]) {
            val[ch] = envInfo->evaluate(env[ch], t);
            continue;
        }

        /* Channel is exactly on its pending key: use it, then advance. */
        val[ch] = keyValue(env, k[ch]);
        k[ch] = envInfo->nextKey(env[ch], k[ch]);
        if (k[ch]) {
            step[ch] = keyStep(env, k[ch]);
            /*
             * Track the earliest upcoming key step among the channels
             * advanced so far. While newstep still equals curstep no
             * channel has advanced yet, so take this one unconditionally.
             */
            newstep = newstep > curstep ? MIN(newstep, step[ch]) : step[ch];
        }
    }

    /* Channels with no key left simply follow the other channels. */
    for (ch = 0; ch < 3; ch++) {
        if (!k[ch])
            step[ch] = newstep;
    }

    return curstep;
}
/**
 * Reference implementation of the matrix vector multiply
 * algorithm. Used to verify the answer. Do NOT change this function.
 *
 * For every row i in [0, SIZE) this adds the dot product of row i of
 * mat_a (addressed through MINDEX(i, j)) and vec_b onto vec_ref[i].
 * Because the result is accumulated with +=, vec_ref must contain zeros
 * on entry for it to end up holding the plain product mat_a * vec_b.
 */
static void
matvec_ref()
{
        int i, j;

        for (i = 0; i < SIZE; i++)
                for (j = 0; j < SIZE; j++)
                        /* Scalar accumulation, one column at a time. */
                        vec_ref[i] += mat_a[MINDEX(i, j)] * vec_b[j];
}
static void matvec_sse() { /* Assume that the data size is an even multiple of the 128 bit * SSE vectors (i.e. 4 floats) */ assert(!(SIZE & 0x3)); /* TASK: Implement your SSE version of the matrix-vector * multiplication here. */ /* HINT: You might find at least the following instructions * useful: * - _mm_setzero_ps * - _mm_load_ps * - _mm_hadd_ps * - _mm_cvtss_f32 * * HINT: You can create the sum of all elements in a vector * using two hadd instructions. */ __m128 dummy=_mm_setzero_ps(); for(int i=0;i<SIZE;++i){ __m128 temp=_mm_setzero_ps(); for(int j=0;j<SIZE;j+=4){ __m128 mm_vec_b=_mm_load_ps((__m128*)(vec_b+j)); __m128 mm_matr=_mm_load_ps((__m128*)(mat_a+MINDEX(i,j))); __m128 out=_mm_mul_ps(mm_vec_b,mm_matr); temp=_mm_add_ps(temp,out); // vec_c[i]+=_mm_cvtss_f32(_mm_dp_ps(mm_matr,mm_vec_b,0xf1)); } __m128 res=_mm_hadd_ps(_mm_hadd_ps(temp,dummy),dummy); vec_c[i]=_mm_cvtss_f32(res); } }
/**
 * Allocate the global buffers and initialize mat_a and vec_b with
 * "random" (deterministic) data. Every element of vec_c and vec_ref is
 * written with zero, both so that accumulation into them starts from a
 * defined value and to make sure that the kernel allocates physical
 * memory to every page before we start doing benchmarking.
 *
 * All four buffers are allocated with _mm_malloc using
 * XMM_ALIGNMENT_BYTES alignment. If any allocation fails, a message is
 * printed to stderr and the process is aborted.
 */
static void
init()
{
        int i, j;

        mat_a = _mm_malloc(sizeof(*mat_a) * SIZE * SIZE, XMM_ALIGNMENT_BYTES);
        vec_b = _mm_malloc(sizeof(*vec_b) * SIZE, XMM_ALIGNMENT_BYTES);
        vec_c = _mm_malloc(sizeof(*vec_c) * SIZE, XMM_ALIGNMENT_BYTES);
        vec_ref = _mm_malloc(sizeof(*vec_ref) * SIZE, XMM_ALIGNMENT_BYTES);

        if (!mat_a || !vec_b || !vec_c || !vec_ref) {
                fprintf(stderr, "Memory allocation failed\n");
                abort();
        }

        /* Input values are multiples of 0.25 in the range [0, 3.75]. */
        for (i = 0; i < SIZE; i++) {
                for (j = 0; j < SIZE; j++)
                        mat_a[MINDEX(i, j)] = ((7 * i + j) & 0x0F) * 0x1P-2F;
                vec_b[i] = ((i * 17) & 0x0F) * 0x1P-2F;
        }

        /* vec_c and vec_ref are pointers, so sizeof(vec_c) would only be
         * the size of a pointer; clear the full SIZE-element buffers. */
        memset(vec_c, 0, sizeof(*vec_c) * SIZE);
        memset(vec_ref, 0, sizeof(*vec_ref) * SIZE);
}