#include <xmmintrin.h>

__m128 test(__m128 s1, __m128 s2)
{
    return _mm_mul_ps(s1, s2);
}
nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; nvdwtype = fr->ntype; vdwparam = fr->nbfp; vdwtype = mdatoms->typeA; rcutoff_scalar = fr->rvdw; rcutoff = _mm_set1_ps(rcutoff_scalar); rcutoff2 = _mm_mul_ps(rcutoff,rcutoff); sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6); rvdw = _mm_set1_ps(fr->rvdw); /* Avoid stupid compiler warnings */ jnrA = jnrB = jnrC = jnrD = 0; j_coord_offsetA = 0; j_coord_offsetB = 0; j_coord_offsetC = 0; j_coord_offsetD = 0; outeriter = 0; inneriter = 0; for(iidx=0;iidx<4*DIM;iidx++)
// function that implements the kernel of the seismic modeling algorithm
void seismic_exec(float **VEL, float **PPF, float **APF, float **NPF,
                  float *seismicPulseVector, int spPosX, int spPosY,
                  int xDim, int yDim, int timeSteps)
{
    int i, j;   // spatial loop counters
    int t;      // time loop counter
#ifdef _VERBOSE
    int progressTimer = -1;
#endif

    // make sure packing _all_ the data into sets of 4 elements is ok
    assert( xDim % 4 == 0 );

#ifdef _VERBOSE
    printf("processing...\n");
    printf("point of explosion = %d, %d\n", spPosX, spPosY);
#endif

    // there are 16 XMM registers in 64 bit mode, so there is no need to spill to stack
    __m128 s_ppf, s_vel, s_actual, s_above1, s_left1, s_under1, s_right1, s_two, s_sixteen, s_sixty;
    __m128 s_above2, s_under2, s_left2, s_right2;

    float two[4]     = {2.0f, 2.0f, 2.0f, 2.0f };
    float sixteen[4] = {16.0f,16.0f,16.0f,16.0f};
    float sixty[4]   = {60.f,60.f,60.f,60.f};

    // preload XMM registers with constant values (unaligned loads: the stack arrays are not guaranteed 16-byte aligned)
    s_two     = _mm_loadu_ps( two );
    s_sixteen = _mm_loadu_ps( sixteen );
    s_sixty   = _mm_loadu_ps( sixty );

    // time loop
    for (t = 0; t < timeSteps; t++)
    {
#ifdef _VVERBOSE
        printf("----------------------------------------------\ntimestep: %d\n\n", t );
#endif
        // add pulse
        APF[spPosY][spPosX] += seismicPulseVector[t];

        for(i=2; i<(yDim-2); i++)
        {
            for(j=2 + ALIGNMENT_OFFSET; j<(xDim-2); j+=4)
            {
                s_ppf    = _mm_load_ps( &(PPF[i][j]) );
                s_vel    = _mm_load_ps( &(VEL[i][j]) );
                s_actual = _mm_load_ps( &(APF[i][j]) );

                s_left1  = _mm_load_ps( &(APF[i-1][j]) );
                s_left2  = _mm_load_ps( &(APF[i-2][j]) );
                s_right2 = _mm_load_ps( &(APF[i+2][j]) );
                s_right1 = _mm_load_ps( &(APF[i+1][j]) );
                s_above1 = _mm_loadu_ps( &(APF[i][j-1]) );
                s_under1 = _mm_loadu_ps( &(APF[i][j+1]) );

                // build the j-2 / j+2 neighbours from the aligned centre load plus a 64-bit edge load
                s_above2 = _mm_loadl_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(1, 0, 0, 0)), (__m64*)&(APF[i][j-2]) );
                s_under2 = _mm_loadh_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(0, 0, 3, 2)), (__m64*)&(APF[i][j+4]) );

                // sum elements with an offset of one
                s_under1 = _mm_add_ps( s_under1, _mm_add_ps( s_above1, _mm_add_ps( s_left1, s_right1)));
                // sum elements with an offset of two
                s_above2 = _mm_add_ps( s_left2, _mm_add_ps( s_right2, _mm_add_ps( s_under2, s_above2)));
                // multiply with 16
                s_under1 = _mm_mul_ps( s_sixteen, s_under1 );
                // subtract the offset-two sum and 60 * centre value
                s_under1 = _mm_sub_ps( _mm_sub_ps( s_under1, s_above2), _mm_mul_ps( s_sixty, s_actual ) );
                // new = vel * stencil + 2 * actual - previous
                s_under1 = _mm_add_ps( _mm_mul_ps( s_vel, s_under1), _mm_sub_ps(_mm_mul_ps( s_two, s_actual ), s_ppf) );

                // save the result
                _mm_store_ps( &(NPF[i][j]), s_under1);
#ifdef _VVERBOSE
                printf("[%d][%d]\n", i, j);
#endif
            }
#ifdef _VVERBOSE
            printf("\n");
#endif
        }

#ifdef _VERBOSE
        // shows one # at each 10% of the total processing time
        if (t/(timeSteps/10) > progressTimer )
        {
            printf("#");
            progressTimer++;
            fflush(stdout);
        }
#endif
        // switch pointers instead of copying data
        PPF = APF;
        APF = NPF;
        NPF = PPF;
    }

#ifdef _VERBOSE
    printf("\nend process!\n");
#endif
}
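/*
 * A scalar sketch of what one SIMD lane of the kernel above computes, for
 * reference only. It assumes the offset-two / 60*centre subtraction is active
 * as reconstructed above (the s_sixty constant and the offset-two sum are
 * otherwise dead). The loadl_pi/loadh_pi + shuffle pair in the kernel simply
 * builds the j-2 and j+2 neighbours without a second pair of unaligned loads.
 */
static inline float seismic_point_update(float **VEL, float **PPF, float **APF, int i, int j)
{
    const float sum1 = APF[i-1][j] + APF[i+1][j] + APF[i][j-1] + APF[i][j+1]; /* offset-one neighbours */
    const float sum2 = APF[i-2][j] + APF[i+2][j] + APF[i][j-2] + APF[i][j+2]; /* offset-two neighbours */

    /* NPF[i][j] = vel * (16*sum1 - sum2 - 60*centre) + 2*centre - previous */
    return VEL[i][j] * (16.0f * sum1 - sum2 - 60.0f * APF[i][j])
         + 2.0f * APF[i][j] - PPF[i][j];
}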
shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_ps(fr->epsfac); charge = mdatoms->chargeA; krf = _mm_set1_ps(fr->ic->k_rf); krf2 = _mm_set1_ps(fr->ic->k_rf*2.0); crf = _mm_set1_ps(fr->ic->c_rf); nvdwtype = fr->ntype; vdwparam = fr->nbfp; vdwtype = mdatoms->typeA; /* Setup water-specific parameters */ inr = nlist->iinr[0]; iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0])); iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1])); iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2])); vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; /* Avoid stupid compiler warnings */ jnrA = jnrB = jnrC = jnrD = 0; j_coord_offsetA = 0; j_coord_offsetB = 0; j_coord_offsetC = 0; j_coord_offsetD = 0; outeriter = 0; inneriter = 0; for(iidx=0;iidx<4*DIM;iidx++)
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const int filters = dt_image_flipped_filter(&piece->pipe->image);
  dt_iop_temperature_data_t *d = (dt_iop_temperature_data_t *)piece->data;
  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp != 4)
  {
    const float coeffsi[3] = {d->coeffs[0]/65535.0f, d->coeffs[1]/65535.0f, d->coeffs[2]/65535.0f};
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      int i=0;
      const uint16_t *in = ((uint16_t *)ivoid) + j*roi_out->width;
      float *out = ((float*)ovoid) + j*roi_out->width;

      // process unaligned pixels
      for ( ; i < ((4-(j*roi_out->width & 3)) & 3) ; i++,out++,in++)
        *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];

      const __m128 coeffs = _mm_set_ps(coeffsi[FC(j+roi_out->y, roi_out->x+i+3, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i+2, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i+1, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i  , filters)]);

      // process aligned pixels with SSE
      for( ; i < roi_out->width - 3 ; i+=4,out+=4,in+=4)
      {
        _mm_stream_ps(out,_mm_mul_ps(coeffs,_mm_set_ps(in[3],in[2],in[1],in[0])));
      }

      // process the rest
      for( ; i<roi_out->width; i++,out++,in++)
        *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];
    }
    _mm_sfence();
  }
  else if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp == 4)
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + j*roi_out->width;
      float *out = ((float*)ovoid) + j*roi_out->width;
      // row/column arguments of FC follow the raw-CFA branch above (row = j+roi_out->y, column = i+roi_out->x)
      for(int i=0; i<roi_out->width; i++,out++,in++)
        *out = *in * d->coeffs[FC(j+roi_out->y, i+roi_out->x, filters)];
    }
  }
  else
  {
    const int ch = piece->colors;
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int k=0; k<roi_out->height; k++)
    {
      const float *in = ((float*)ivoid) + ch*k*roi_out->width;
      float *out = ((float*)ovoid) + ch*k*roi_out->width;
      for (int j=0; j<roi_out->width; j++,in+=ch,out+=ch)
        for(int c=0; c<3; c++)
          out[c] = in[c]*d->coeffs[c];
    }
  }
  for(int k=0; k<3; k++)
    piece->pipe->processed_maximum[k] = d->coeffs[k] * piece->pipe->processed_maximum[k];
}
void BM3D_Final_Process::CollaborativeFilter(int plane,
    FLType *ResNum, FLType *ResDen,
    const FLType *src, const FLType *ref,
    const PosPairCode &code) const
{
    PCType GroupSize = static_cast<PCType>(code.size());

    // When para.GroupSize > 0, limit GroupSize up to para.GroupSize
    if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize)
    {
        GroupSize = d.para.GroupSize;
    }

    // Construct source group and reference group guided by matched pos code
    block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize);
    block_group refGroup(ref, ref_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize);

    // Initialize L2-norm of Wiener coefficients
    FLType L2Wiener = 0;

    // Apply forward 3D transform to the source group and the reference group
    d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());
    d.f[plane].fp[GroupSize - 1].execute_r2r(refGroup.data(), refGroup.data());

    // Apply empirical Wiener filtering to the source group guided by the reference group
    const FLType sigmaSquare = d.f[plane].wienerSigmaSqr[GroupSize - 1];

    auto srcp = srcGroup.data();
    auto refp = refGroup.data();
    const auto upper = srcp + srcGroup.size();

#if defined(__SSE2__)
    static const ptrdiff_t simd_step = 4;
    const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
    const ptrdiff_t simd_width = srcGroup.size() - simd_residue;

    const __m128 sgm_sqr = _mm_set_ps1(sigmaSquare);
    __m128 l2wiener_sum = _mm_setzero_ps();

    for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, refp += simd_step)
    {
        const __m128 s1 = _mm_load_ps(srcp);
        const __m128 r1 = _mm_load_ps(refp);
        const __m128 r1sqr = _mm_mul_ps(r1, r1);
        const __m128 wiener = _mm_mul_ps(r1sqr, _mm_rcp_ps(_mm_add_ps(r1sqr, sgm_sqr)));
        const __m128 d1 = _mm_mul_ps(s1, wiener);
        _mm_store_ps(srcp, d1);
        l2wiener_sum = _mm_add_ps(l2wiener_sum, _mm_mul_ps(wiener, wiener));
    }

    alignas(16) FLType l2wiener_sum_f32[4];
    _mm_store_ps(l2wiener_sum_f32, l2wiener_sum);
    L2Wiener += l2wiener_sum_f32[0] + l2wiener_sum_f32[1] + l2wiener_sum_f32[2] + l2wiener_sum_f32[3];
#endif

    for (; srcp < upper; ++srcp, ++refp)
    {
        const FLType refSquare = *refp * *refp;
        const FLType wienerCoef = refSquare / (refSquare + sigmaSquare);
        *srcp *= wienerCoef;
        L2Wiener += wienerCoef * wienerCoef;
    }

    // Apply backward 3D transform to the filtered group
    d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

    // Calculate weight for the filtered group
    // Also include the normalization factor to compensate for the amplification introduced in 3D transform
    FLType denWeight = L2Wiener <= 0 ? 1 : FLType(1) / L2Wiener;
    FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]);

    // Store the weighted filtered group to the numerator part of the final estimation
    // Store the weight to the denominator part of the final estimation
    srcGroup.AddTo(ResNum, dst_stride[plane], numWeight);
    srcGroup.CountTo(ResDen, dst_stride[plane], denWeight);
}
// This function switch on the nearest lights to an object of a certain radius void CDXLight::UpdateDynamicLights(DWORD ID, D3DVECTOR *pos, float Radius) { _MM_ALIGN16 XMMVector Pos, Acc, Square; DWORD idx,ldx, LightsInList=0; float Dist, MaxDist=0.0f; // setup the position for XMM use *(D3DVECTOR*)&Pos.d3d=*pos; // if 1st call in this rendering if(!LightsLoaded){ // Enable the lights + 1 ( light #0 is the Sun ) idx=0; // while(idx<MAX_DYNAMIC_LIGHTS && LightList[idx].CameraDistance<=DYNAMIC_LIGHT_INSIDE_RANGE){ while(idx<DynamicLights){ m_pD3DD->SetLight(idx+1, &LightList[idx++].Light); } LightsLoaded=true; } //reset object own light list for(idx=0; idx<MAX_SAMETIME_LIGHTS; idx++){ SwitchedList[idx].Distance=DYNAMIC_LIGHT_INSIDE_RANGE+1.0f; SwitchedList[idx].Index=0; } //for each light in list setup a distance list idx=0; //while(idx<MAX_DYNAMIC_LIGHTS && LightList[idx].CameraDistance<=DYNAMIC_LIGHT_INSIDE_RANGE){ while(idx<DynamicLights){ // Light defaults to OFF LightsToOn[idx]=false; // if this is a light illuminating ONLY THE OWNER, and we r not the owners, skip if(LightList[idx].Flags.OwnLight && LightList[idx].LightID!=ID){ idx++; continue; } // if this is a light illuminating NOT THE OWNER, and we r the owners, skip if(LightList[idx].Flags.NotSelfLight && LightList[idx].LightID==ID){ idx++; continue; } // get Vectors distance on X/Y/Z Axis and Square it for incoming use Acc.Xmm=_mm_sub_ps(Pos.Xmm, LightList[idx].Pos.Xmm); Square.Xmm=_mm_mul_ps(Acc.Xmm, Acc.Xmm); // Get the Distance Dist=sqrtf(Square.d3d.x+Square.d3d.y+Square.d3d.z); // If Object out of the Light range if((Dist-Radius)>LightList[idx].Light.dvRange) {idx++; continue; } // Calculation for SPOT LIGHTs cone if(LightList[idx].Light.dltType==D3DLIGHT_SPOT){ // Get the Distance btw Light & Object //float dx=Pos.d3d.x-LightList[idx].Light.dvPosition.x; //float dy=Pos.d3d.y-LightList[idx].Light.dvPosition.y; //float dz=Pos.d3d.z-LightList[idx].Light.dvPosition.z; // Calculate Horizontal and Vertical Angle btw Light & Object float ax=atan2(Acc.d3d.x, Acc.d3d.y); float ay=atan2(Acc.d3d.z,sqrtf(Square.d3d.x+Square.d3d.y)); // transform the angles in same Sign Domain of the light angles X/Y if(fabs(ax-LightList[idx].alphaX)>PI) ax+=ax>LightList[idx].alphaX ? -2*PI : 2*PI; if(fabs(ay-LightList[idx].alphaY)>PI) ay+=ax>LightList[idx].alphaY ? 
-2*PI : 2*PI; float lPhy=LightList[idx].phi; float laX=LightList[idx].alphaX; float laY=LightList[idx].alphaY; // Calculate the Angular Delta given by Object Radius float dPhi=atanf(Radius/Dist); #ifdef LIGHT_ENGINE_DEBUG REPORT_VALUE("Max :",(int)(LightList[idx].alphaX*180/PI)); REPORT_VALUE("Min :",(int)(LightList[idx].alphaY*180/PI)); #ifdef DEBUG_LOD_ID sprintf(TheLODNames[gDebugLodID], "%3.1f %3.1f", ax*180/PI, dPhi*180/PI); #endif #endif if(ax-dPhi>(laX+lPhy) || ax+dPhi<(laX-lPhy) || ay-dPhi>(laY+lPhy) || ay+dPhi<(laY-lPhy)) { idx++; continue; } } Dist-=Radius; // if List still has a slot, add immediatly the light if(LightsInList<MAX_SAMETIME_LIGHTS){ // setup new distance and index in the list SwitchedList[LightsInList].Distance=Dist; SwitchedList[LightsInList++].Index=idx; // flag the light as going to be switched on LightsToOn[idx]=true; // and update the longest in list if(Dist>MaxDist) MaxDist=Dist; } else { // else only if more near than any light in list if(Dist<MaxDist){ // setup a temporary for the new Max Distance float TempDist=0.0f; // check if lower distance than any in the object lites list for(ldx=0; ldx<MAX_SAMETIME_LIGHTS; ldx++){ // if distance is less if(SwitchedList[ldx].Distance==MaxDist){ // disable the previous light LightsToOn[SwitchedList[ldx].Index]=false; // setup new distance and index in the list SwitchedList[ldx].Distance=Dist; SwitchedList[ldx].Index=idx; // enable the new light LightsToOn[idx]=true; } // check if new longest distance if(SwitchedList[ldx].Distance>TempDist) TempDist=SwitchedList[ldx].Distance; } // update new Max Distance MaxDist=TempDist; } } idx++; } EnableMappedLights(); }
iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_ps(fr->ic->epsfac); charge = mdatoms->chargeA; nvdwtype = fr->ntype; vdwparam = fr->nbfp; vdwtype = mdatoms->typeA; vdwgridparam = fr->ljpme_c6grid; sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald); ewclj = _mm_set1_ps(fr->ic->ewaldcoeff_lj); ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj)); sh_ewald = _mm_set1_ps(fr->ic->sh_ewald); ewtab = fr->ic->tabq_coul_FDV0; ewtabscale = _mm_set1_ps(fr->ic->tabq_scale); ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale); /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ rcutoff_scalar = fr->ic->rcoulomb; rcutoff = _mm_set1_ps(rcutoff_scalar); rcutoff2 = _mm_mul_ps(rcutoff,rcutoff); sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6); rvdw = _mm_set1_ps(fr->ic->rvdw); /* Avoid stupid compiler warnings */
void PresetOutputs::PerPixelMath_sse(const PipelineContext &context) { for (int x = 0; x < gx; x++) { for (int y = 0; y < gy; y += 4) { // fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y], // rad_mesh[x][y] * 2.0f - 1.0f)); __m128 rad_mesh_scaled = _mm_sub_ps( _mm_mul_ps( _mm_load_ps(&this->rad_mesh[x][y]), _mm_set_ps1(2.0f)), _mm_set_ps1(1.0f)); __m128 zoom_mesh2 = _mm_load_ps(&this->zoom_mesh[x][y]); __m128 zoomexp_mesh2 = _mm_load_ps(&this->zoomexp_mesh[x][y]); __m128 fZoom2 = _mm_pow(zoom_mesh2, _mm_pow(zoomexp_mesh2, rad_mesh_scaled)); // fZoom2Inv = 1.0f / fZoom2; __m128 fZoomInv = _mm_rcp_ps(fZoom2); // this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f; __m128 x_mesh2 = _mm_add_ps( _mm_mul_ps( _mm_load_ps(&this->orig_x[x][y]), _mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), // CONSIDER: common sub-expression _mm_set_ps1(0.5f)); // this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y]; __m128 cx_mesh2 = _mm_load_ps(&this->cx_mesh[x][y]); __m128 sx_mesh2 = _mm_load_ps(&this->sx_mesh[x][y]); _mm_store_ps(&this->x_mesh[x][y], _mm_add_ps( _mm_div_ps( _mm_sub_ps(x_mesh2,cx_mesh2), sx_mesh2), cx_mesh2 )); // this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f; __m128 y_mesh2 = _mm_add_ps( _mm_mul_ps( _mm_load_ps(&this->orig_y[x][y]), _mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), _mm_set_ps1(0.5f)); // this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y]; __m128 cy_mesh2 = _mm_load_ps(&this->cy_mesh[x][y]); __m128 sy_mesh2 = _mm_load_ps(&this->sy_mesh[x][y]); _mm_store_ps(&this->y_mesh[x][y], _mm_add_ps( _mm_div_ps( _mm_sub_ps(y_mesh2,cy_mesh2), sy_mesh2), cy_mesh2 )); } } const float fWarpTime = context.time * this->fWarpAnimSpeed; const float fWarpScaleInv = 1.0f / this->fWarpScale; const float f[4] = { 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10), 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7), 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3), 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5) }; for (int x = 0; x < gx; x++) { for (int y = 0; y < gy; y+=4) { //float orig_x = this->orig_x[x][y]; //float orig_y = this->orig_y[x][y]; //float warp_mesh = this->warp_mesh[x][y] * 0.0035f; const __m128 orig_x2 = _mm_load_ps(&this->orig_x[x][y]); const __m128 orig_y2 = _mm_load_ps(&this->orig_y[x][y]); const __m128 warp_mesh2 = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f)); // this->x_mesh[x][y] += // (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) + // (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2]))); _mm_store_ps(&this->x_mesh[x][y], _mm_add_ps(_mm_load_ps(&this->x_mesh[x][y]), _mm_add_ps( _mm_mul_ps(warp_mesh2, _mm_sinf( _mm_add_ps( _mm_set_ps1(fWarpTime*0.333f), _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), _mm_sub_ps( _mm_mul_ps(orig_x2, _mm_set_ps1(f[0])), _mm_mul_ps(orig_y2, _mm_set_ps1(f[3])) ))))), _mm_mul_ps(warp_mesh2, _mm_cosf( _mm_sub_ps( _mm_set_ps1(fWarpTime*0.753f), _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), _mm_sub_ps( _mm_mul_ps(orig_x2, _mm_set_ps1(f[1])), _mm_mul_ps(orig_y2, _mm_set_ps1(f[2])) )))))))); // this->y_mesh[x][y] += // (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) + // (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3]))); _mm_store_ps(&this->y_mesh[x][y], _mm_add_ps(_mm_load_ps(&this->y_mesh[x][y]), _mm_add_ps( _mm_mul_ps(warp_mesh2, _mm_cosf( _mm_sub_ps( 
_mm_set_ps1(fWarpTime*0.375f), _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), _mm_add_ps( _mm_mul_ps(orig_x2, _mm_set_ps1(f[2])), _mm_mul_ps(orig_y2, _mm_set_ps1(f[1])) ))))), _mm_mul_ps(warp_mesh2, _mm_sinf( _mm_add_ps( _mm_set_ps1(fWarpTime*0.825f), _mm_mul_ps(_mm_set_ps1(fWarpScaleInv), _mm_add_ps( _mm_mul_ps(orig_x2, _mm_set_ps1(f[0])), _mm_mul_ps(orig_y2, _mm_set_ps1(f[3])) )))))))); } } for (int x = 0; x < gx; x++) { for (int y = 0; y < gy; y+=4) { // const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y]; // const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y]; const __m128 u2 = _mm_sub_ps(_mm_load_ps(&this->x_mesh[x][y]),_mm_load_ps(&this->cx_mesh[x][y])); const __m128 v2 = _mm_sub_ps(_mm_load_ps(&this->y_mesh[x][y]),_mm_load_ps(&this->cy_mesh[x][y])); // const float rot = this->rot_mesh[x][y]; // const float cos_rot = cosf(rot); // const float sin_rot = sinf(rot); __m128 sin_rot, cos_rot; _mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot); // this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y]; _mm_store_ps(&this->x_mesh[x][y], _mm_add_ps( _mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)), _mm_sub_ps(_mm_load_ps(&this->cx_mesh[x][y]), _mm_load_ps(&this->dx_mesh[x][y])) )); // this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y]; _mm_store_ps(&this->y_mesh[x][y], _mm_add_ps( _mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)), _mm_sub_ps(_mm_load_ps(&this->cy_mesh[x][y]), _mm_load_ps(&this->dy_mesh[x][y])) )); } } }
//hosts that have power-of-2 blocksizes
//and aligned memory will always end up doing
//sse processing...
void Multitap::process(float *inputs, float *outputs, unsigned long nSamples, bool replace)
{
    //no use in using SSE for fewer samples than 16!
    //host calling VST plugins with less than 16-sample buffers should be shot, tortured and shot again
    if(nSamples <= 16 || !sse)
    {
        processFPU(inputs,outputs,nSamples,replace);
        return;
    }

    //let's see if the current index is a multiple of 4
    //if it isn't, we need to process until it *IS*
    unsigned long startSize = (4 - (indexfpu & 3)) & 3;
    unsigned long blockSize = 0;

    //stupid, but who cares ;-)
    while(startSize + blockSize + 4 <= nSamples)
        blockSize += 4;

    //we'll have to process a maximum of 4 samples at the end...
    unsigned long endSize = nSamples - (startSize + blockSize);

    if(startSize)
        processFPU(inputs,outputs,startSize,replace);

    inputs  += startSize;
    outputs += startSize;

    if(blockSize)
    {
        nSamples = blockSize;
        unsigned long index = indexfpu >> 2;

#ifndef _WIN64
        _mm_empty(); // No MMX on x64
#endif

        _mm_prefetch(((char *)&delay[0]),0);
        _mm_prefetch(((char *)&amp[0]),0);

        //are the buffers 16-byte aligned??
        if ((((size_t)inputs & 15) == 0) && (((size_t)outputs & 15) == 0))
        {
            nSamples >>= 2;
            while(nSamples--)
            {
                float *x = inputs + 4;
                float *y = outputs + 4;
                _mm_prefetch((char *) x,0);
                _mm_prefetch((char *) y,0);

                buffer[index] = _mm_load_ps(inputs);

                __m128 out_sse = _mm_setzero_ps();
                for(long z=0;z<32;z+=4)
                {
                    long tmp1 = (index - delay[z+0]) & mask;
                    long tmp2 = (index - delay[z+1]) & mask;
                    long tmp3 = (index - delay[z+2]) & mask;
                    long tmp4 = (index - delay[z+3]) & mask;

                    //out += amp[z] * buffer[tmp1]
                    out_sse = _mm_add_ps(_mm_mul_ps(amp[z],buffer[tmp1]),out_sse);
                    _mm_prefetch(((char *)&buffer[tmp1]) + 16,0);
                    out_sse = _mm_add_ps(_mm_mul_ps(amp[z+1],buffer[tmp2]),out_sse);
                    _mm_prefetch(((char *)&buffer[tmp2]) + 16,0);
                    out_sse = _mm_add_ps(_mm_mul_ps(amp[z+2],buffer[tmp3]),out_sse);
                    _mm_prefetch(((char *)&buffer[tmp3]) + 16,0);
                    out_sse = _mm_add_ps(_mm_mul_ps(amp[z+3],buffer[tmp4]),out_sse);
                    _mm_prefetch(((char *)&buffer[tmp4]) + 16,0);
                }

                if(replace)
                    _mm_store_ps(outputs,out_sse);
                else
                    _mm_store_ps(outputs,_mm_add_ps(out_sse,_mm_load_ps(outputs)));

                index++;
                index &= mask;
                inputs = x;
                outputs = y;
            }
        }
        else //non-aligned buffers!
        {
void intrin_sse_mult_su3_na(su3_matrixf* aa, su3_matrixf* bb, su3_matrixf* cc) { /* XMM Variables */ __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5; xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->e[0][0]) ); xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->e[0][1]) ); xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->e[0][2]) ); xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&((bb)->e[1][0]) ); xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&((bb)->e[1][1]) ); xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&((bb)->e[1][2]) ); xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) ); xmm6 = _mm_load_ss((float *)&((aa)->e[0][1].real) ); xmm4 = _mm_load_ss((float *)&((aa)->e[1][0].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) ); xmm5 = _mm_load_ss((float *)&((aa)->e[2][0].real) ); xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x00 ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm4 = _mm_shuffle_ps( xmm4, xmm4, 0x00 ); xmm3 = _mm_mul_ps( xmm3, xmm0 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm5 = _mm_shuffle_ps( xmm5, xmm5, 0x00 ); xmm4 = _mm_mul_ps( xmm4, xmm0 ); xmm3 = _mm_add_ps( xmm3, xmm6 ); xmm7 = _mm_mul_ps( xmm7, xmm2 ); xmm5 = _mm_mul_ps( xmm5, xmm0 ); xmm4 = _mm_add_ps( xmm4, xmm7 ); xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[0][2].real) ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm7 = _mm_mul_ps( xmm7, xmm2 ); xmm5 = _mm_add_ps( xmm5, xmm6 ); xmm3 = _mm_add_ps( xmm3, xmm7 ); xmm6 = _mm_load_ss((float *)&((aa)->e[1][1].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[2][2].real) ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm7 = _mm_mul_ps( xmm7, xmm2 ); xmm4 = _mm_add_ps( xmm4, xmm6 ); xmm5 = _mm_add_ps( xmm5, xmm7 ); xmm6 = _mm_load_ss((float *)&((aa)->e[0][0].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) ); xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xb1 ); xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0xb1 ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xb1 ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm0 = _mm_xor_ps( xmm0, _sse_sgn24.xmm ); xmm1 = _mm_xor_ps( xmm1, _sse_sgn24.xmm ); xmm2 = _mm_xor_ps( xmm2, _sse_sgn24.xmm ); xmm6 = _mm_mul_ps( xmm6, xmm0 ); xmm7 = _mm_mul_ps( xmm7, xmm1 ); xmm3 = _mm_add_ps( xmm3, xmm6 ); xmm4 = _mm_add_ps( xmm4, xmm7 ); xmm6 = _mm_load_ss((float *)&((aa)->e[2][2].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm2 ); xmm7 = _mm_mul_ps( xmm7, xmm0 ); xmm5 = _mm_add_ps( xmm5, xmm6 ); xmm4 = _mm_add_ps( xmm4, xmm7 ); xmm6 = _mm_load_ss((float *)&((aa)->e[0][1].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm7 = _mm_mul_ps( xmm7, xmm0 ); xmm3 = _mm_add_ps( xmm3, xmm6 ); xmm5 = _mm_add_ps( xmm5, xmm7 ); xmm0 = _mm_load_ss((float *)&((aa)->e[0][2].imag) ); xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) ); xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x00 ); xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 ); xmm0 = _mm_mul_ps( xmm0, xmm2 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm7 = _mm_mul_ps( xmm7, xmm2 ); xmm3 = _mm_add_ps( xmm3, xmm0 ); xmm5 = _mm_add_ps( xmm5, xmm6 ); xmm4 = _mm_add_ps( xmm4, 
xmm7 ); xmm3 = _mm_xor_ps( xmm3, _sse_sgn24.xmm ); xmm4 = _mm_xor_ps( xmm4, _sse_sgn24.xmm ); xmm5 = _mm_xor_ps( xmm5, _sse_sgn24.xmm ); _mm_storel_pi((__m64 *)&((cc)->e[0][0]), xmm3 ); _mm_storel_pi((__m64 *)&((cc)->e[1][0]), xmm4 ); _mm_storel_pi((__m64 *)&((cc)->e[2][0]), xmm5 ); _mm_storeh_pi((__m64 *)&((cc)->e[0][1]), xmm3 ); _mm_storeh_pi((__m64 *)&((cc)->e[1][1]), xmm4 ); _mm_storeh_pi((__m64 *)&((cc)->e[2][1]), xmm5 ); xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->e[2][0]) ); xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->e[2][1]) ); xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->e[2][2]) ); xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 ); xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 ); xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) ); xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 ); xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) ); xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 ); xmm3 = _mm_mul_ps( xmm3, xmm0 ); xmm4 = _mm_mul_ps( xmm4, xmm1 ); xmm3 = _mm_add_ps( xmm3, xmm4 ); xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm2 ); xmm3 = _mm_add_ps( xmm3, xmm5 ); xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 ); xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) ); xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) ); xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 ); xmm0 = _mm_xor_ps( xmm0, _sse_sgn24.xmm ); xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 ); xmm1 = _mm_xor_ps( xmm1, _sse_sgn24.xmm ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 ); xmm2 = _mm_xor_ps( xmm2, _sse_sgn24.xmm ); xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) ); xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 ); xmm4 = _mm_mul_ps( xmm4, xmm0 ); xmm3 = _mm_add_ps( xmm3, xmm4 ); xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm1 ); xmm3 = _mm_add_ps( xmm3, xmm5 ); xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm2 ); xmm3 = _mm_add_ps( xmm3, xmm5 ); xmm3 = _mm_xor_ps( xmm3, _sse_sgn24.xmm ); _mm_storel_pi((__m64 *)&((cc)->e[0][2]), xmm3 ); _mm_storeh_pi((__m64 *)&((cc)->e[1][2]), xmm3 ); xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 ); xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) ); xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm1 ); xmm6 = _mm_add_ps( xmm6, xmm5 ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 ); xmm2 = _mm_xor_ps( xmm2, _sse_sgn3.xmm ); xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 ); xmm7 = _mm_mul_ps( xmm7, xmm2 ); xmm6 = _mm_add_ps( xmm6, xmm7 ); xmm7 = xmm6 ; xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE ); xmm6 = _mm_add_ps( xmm6, xmm7 ); xmm6 = _mm_xor_ps( xmm6, _sse_sgn24.xmm ); _mm_storel_pi((__m64 *)&((cc)->e[2][2]), xmm6 ); }
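/*
 * Scalar reference for the SU(3) routine above, assuming the usual MILC naming
 * convention in which "_na" means c = a * b^dagger (normal x adjoint). The types
 * below are local stand-ins that mirror the layout implied by the loads above
 * (a 3x3 array of { float real, imag; } pairs); the real su3_matrixf comes from
 * the surrounding project.
 */
typedef struct { float real, imag; } fcomplex_sketch;
typedef struct { fcomplex_sketch e[3][3]; } su3_matrixf_sketch;

static void scalar_mult_su3_na(const su3_matrixf_sketch *a,
                               const su3_matrixf_sketch *b,
                               su3_matrixf_sketch *c)
{
    for (int i = 0; i < 3; i++)
        for (int k = 0; k < 3; k++) {
            float re = 0.0f, im = 0.0f;
            for (int j = 0; j < 3; j++) {
                /* accumulate a[i][j] * conj(b[k][j]) */
                re += a->e[i][j].real * b->e[k][j].real + a->e[i][j].imag * b->e[k][j].imag;
                im += a->e[i][j].imag * b->e[k][j].real - a->e[i][j].real * b->e[k][j].imag;
            }
            c->e[i][k].real = re;
            c->e[i][k].imag = im;
        }
}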
/** Assumes input matrix has dimension n divisible by STRIDE. */ inline void squarepad_sgemm (const int n, float *A, float *B, float *C) { omp_set_num_threads(NUM_THREADS); #pragma omp parallel { __m128 mmA1, mmA2, mmA3, mmA4, mmB1, mmB2, mmB3, mmB4, mmC; __m128 mmProd1, mmProd2, mmProd3, mmProd4; __m128 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; float tempSum0[4], tempSum1[4], tempSum2[4], tempSum3[4], tempSum4[4]; float tempSum5[4], tempSum6[4], tempSum7[4]; int J_STRIDE, I_STRIDE; if (n < 512) { J_STRIDE = BLOCK_BIG; I_STRIDE = BLOCK_BIG; } else if (n < 768) { I_STRIDE = BLOCK_MED; J_STRIDE = BLOCK_BIG; } else if (n < 801) { I_STRIDE = BLOCK_MED; J_STRIDE = BLOCK_MED; } else if (n < 830) { I_STRIDE = BLOCK_MED; J_STRIDE = BLOCK_SML; } else { I_STRIDE = BLOCK_MED; J_STRIDE = BLOCK_ONE; } #pragma omp for schedule(dynamic) nowait for (int j = 0; j < n; j += J_STRIDE) for (int i = 0; i < n; i += I_STRIDE) for (int j2 = j; j2 < (j + J_STRIDE); j2++) for (int i2 = i; i2 < (i + I_STRIDE); i2 += 8) { sum0 = _mm_set1_ps(0); sum1 = _mm_set1_ps(0); sum2 = _mm_set1_ps(0); sum3 = _mm_set1_ps(0); sum4 = _mm_set1_ps(0); sum5 = _mm_set1_ps(0); sum6 = _mm_set1_ps(0); sum7 = _mm_set1_ps(0); for (int k = 0; k < n; k += K_STRIDE) { mmB1 = _mm_load_ps(B + j2*n + k); //Bload mmB2 = _mm_load_ps(B + j2*n + k + 4); mmB3 = _mm_load_ps(B + j2*n + k + 8); mmB4 = _mm_load_ps(B + j2*n + k + 12); mmA1 = _mm_load_ps(A + i2*n + k); //0ALoad mmA2 = _mm_load_ps(A + i2*n + k + 4); mmA3 = _mm_load_ps(A + i2*n + k + 8); mmA4 = _mm_load_ps(A + i2*n + k + 12); mmProd1 = _mm_mul_ps(mmA1, mmB1); //0Product mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 1)*n + k); //1A mmA2 = _mm_load_ps(A + (i2 + 1)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 1)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 1)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //0Sum mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum0 = _mm_add_ps(mmProd1, sum0); mmProd1 = _mm_mul_ps(mmA1, mmB1); //1P mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 2)*n + k); //2A mmA2 = _mm_load_ps(A + (i2 + 2)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 2)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 2)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //1S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum1 = _mm_add_ps(mmProd1, sum1); mmProd1 = _mm_mul_ps(mmA1, mmB1); //2P mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 3)*n + k); //3A mmA2 = _mm_load_ps(A + (i2 + 3)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 3)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 3)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //2S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum2 = _mm_add_ps(mmProd1, sum2); mmProd1 = _mm_mul_ps(mmA1, mmB1); //3P mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 4)*n + k); //4A mmA2 = _mm_load_ps(A + (i2 + 4)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 4)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 4)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //3S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum3 = _mm_add_ps(mmProd1, sum3); mmProd1 = _mm_mul_ps(mmA1, mmB1); //4P mmProd2 = _mm_mul_ps(mmA2, mmB2); 
mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 5)*n + k); //5A mmA2 = _mm_load_ps(A + (i2 + 5)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 5)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 5)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //4S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum4 = _mm_add_ps(mmProd1, sum4); mmProd1 = _mm_mul_ps(mmA1, mmB1); //5P mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 6)*n + k); //6A mmA2 = _mm_load_ps(A + (i2 + 6)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 6)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 6)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //5S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum5 = _mm_add_ps(mmProd1, sum5); mmProd1 = _mm_mul_ps(mmA1, mmB1); //6P mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmA1 = _mm_load_ps(A + (i2 + 7)*n + k); //7A mmA2 = _mm_load_ps(A + (i2 + 7)*n + k + 4); mmA3 = _mm_load_ps(A + (i2 + 7)*n + k + 8); mmA4 = _mm_load_ps(A + (i2 + 7)*n + k + 12); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //6S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum6 = _mm_add_ps(mmProd1, sum6); mmProd1 = _mm_mul_ps(mmA1, mmB1); //7P mmProd2 = _mm_mul_ps(mmA2, mmB2); mmProd3 = _mm_mul_ps(mmA3, mmB3); mmProd4 = _mm_mul_ps(mmA4, mmB4); mmProd1 = _mm_add_ps(mmProd1, mmProd2); //7S mmProd3 = _mm_add_ps(mmProd3, mmProd4); mmProd1 = _mm_add_ps(mmProd1, mmProd3); sum7 = _mm_add_ps(mmProd1, sum7); } sum0 = _mm_hadd_ps(sum0, sum1); sum2 = _mm_hadd_ps(sum2, sum3); sum0 = _mm_hadd_ps(sum0, sum2); sum4 = _mm_hadd_ps(sum4, sum5); sum6 = _mm_hadd_ps(sum6, sum7); sum4 = _mm_hadd_ps(sum4, sum6); mmC = _mm_load_ps(C + j2*n + i2); mmC = _mm_add_ps(mmC, sum0); _mm_store_ps(C + j2*n + i2, mmC); mmC = _mm_load_ps(C + j2*n + i2 + 4); mmC = _mm_add_ps(mmC, sum4); _mm_store_ps(C + j2*n + i2 + 4, mmC); } } }
inline vector4f operator*(const vector4f& lhs, const vector4f& rhs) { return _mm_mul_ps(lhs, rhs); }
int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int ul_NumDataPoints,
    double sample_rate
) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif
    int i;

    if (chirp_rate_ind == 0) {
        memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) );
#ifdef USE_MANUAL_CALLSTACK
        call_stack.exit();
#endif
        return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set_pd(4.0, 4.0);

    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0); // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
        const float *d = (const float *) (cx_DataArray + i);
        float *cd = (float *) (cx_ChirpDataArray + i);

        __m128d a1, a2;
        __m128 d1, d2;
        __m128 cd1, cd2;
        __m128 td1, td2;
        __m128 x;
        __m128 y;
        __m128 z;
        __m128 s;
        __m128 c;
        __m128 m;

        // load the signal to be chirped
        d1 = _mm_load_ps(d);
        d2 = _mm_load_ps(d+4);

        // calculate the input angle
        a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
        a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

        // update times for next
        di1 = _mm_add_pd(di1, DFOUR);
        di2 = _mm_add_pd(di2, DFOUR);

        // reduce the angle to the range (-0.5, 0.5)
        a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
        a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

        // convert pair of packed double into packed single
        x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));      // 3 1 2 0

        // square to the range [0, 0.25)
        y = _mm_mul_ps(x, x);

        // perform the initial polynomial approximations, Estrin's method
        z = _mm_mul_ps(y, y);
        s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F), SS3F), z),
                                  _mm_add_ps(_mm_mul_ps(y, SS2F), SS1F)),
                       x);
        c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F), CC2F), z),
                       _mm_add_ps(_mm_mul_ps(y, CC1F), ONE));

        // perform first angle doubling
        x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
        y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

        // calculate scaling factor to correct the magnitude
        m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

        // perform second angle doubling
        c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
        s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

        // correct the magnitude (final sine / cosine approximations)
        c = _mm_mul_ps(c, m);       // c3 c1 c2 c0
        s = _mm_mul_ps(s, m);

        // chirp the data
        cd1 = _mm_moveldup_ps(c);   // c1 c1 c0 c0
        cd2 = _mm_movehdup_ps(c);   // c3 c3 c2 c2
        cd1 = _mm_mul_ps(cd1, d1);  // c1.i1 c1.r1 c0.i0 c0.r0
        cd2 = _mm_mul_ps(cd2, d2);  // c3.i3 c3.r3 c2.i2 c2.r2
        d1 = _mm_shuffle_ps(d1, d1, 0xb1);
        d2 = _mm_shuffle_ps(d2, d2, 0xb1);
        td1 = _mm_moveldup_ps(s);
        td2 = _mm_movehdup_ps(s);
        td1 = _mm_mul_ps(td1, d1);
        td2 = _mm_mul_ps(td2, d2);
        cd1 = _mm_addsub_ps(cd1, td1);
        cd2 = _mm_addsub_ps(cd2, td2);

        // store chirped values
        _mm_stream_ps(cd, cd1);
        _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
        double angle = srate * i * i * 0.5;
        double s = sin(angle);
        double c = cos(angle);
        float re = cx_DataArray[i][0];
        float im = cx_DataArray[i][1];
        cx_ChirpDataArray[i][0] = re * c - im * s;
        cx_ChirpDataArray[i][1] = re * s + im * c;
    }
    analysis_state.FLOP_counter += 12.0 * ul_NumDataPoints;
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif
    return 0;
}
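/*
 * Scalar sketch of the trick used in the vector loop above: the chirp phase is
 * kept in turns (cycles), reduced to (-0.5, 0.5) with the add/subtract-2^52
 * rounding trick, a short polynomial evaluates sin/cos of a quarter of the
 * angle, and two angle-doubling steps (with the m = 2 - (c^2 + s^2) magnitude
 * correction) recover sin/cos of the full angle before the complex rotate.
 * libm sinf/cosf stand in for the SS*/CC* polynomial constants here, and the
 * function name is illustrative only.
 */
#include <math.h>

static void chirp_sample_sketch(double time_term /* srate * t^2 */,
                                float re, float im,
                                float *out_re, float *out_im)
{
    const double TWO_TO_52_SKETCH = 4503599627370496.0;   /* 2^52 */
    const float  PI_F = 3.14159265f;

    /* reduce the phase (in turns) to [-0.5, 0.5] */
    double a = time_term;
    double rounded = (a + TWO_TO_52_SKETCH) - TWO_TO_52_SKETCH;  /* round to nearest integer */
    a -= rounded;

    /* quarter-angle sin/cos; the SSE code approximates these with polynomials */
    float x = (float)a;
    float s = sinf(PI_F * 0.5f * x);
    float c = cosf(PI_F * 0.5f * x);

    /* first angle doubling */
    float c2 = c * c - s * s;
    float s2 = 2.0f * s * c;
    /* magnitude correction: 2 - (c2^2 + s2^2) is a first-order 1/(c2^2 + s2^2) */
    float m = 2.0f - (c2 * c2 + s2 * s2);
    /* second angle doubling, then scale */
    float c4 = (c2 * c2 - s2 * s2) * m;
    float s4 = (2.0f * s2 * c2) * m;

    /* rotate the complex sample by the chirp phase */
    *out_re = re * c4 - im * s4;
    *out_im = re * s4 + im * c4;
}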
void sgemm( int m_a, int n_a, float *A, float *B, float *C ) { __m128 partialRowB1,partialRowB2,partialRowB3; __m128 partialSumC1,partialSumC2,partialSumC3; __m128 partialColA1,partialColA2,partialColA3,partialColA4,partialColA5,partialColA6,partialColA7,partialColA8,partialColA9; if(m_a%36 == 0 && n_a%36 == 0){ //Optimized for 36x36 matrix for( int v = 0; v < m_a; v++){ int j; //Break into 12x12 matrices for( j = 0; j < n_a/3*3; j+= 3 ) { //load next 3 elems of row B partialRowB1 = _mm_load1_ps(B+(v+m_a*j)); partialRowB2 = _mm_load1_ps(B+(v+m_a*(j+1))); partialRowB3 = _mm_load1_ps(B+(v+m_a*(j+2))); int i; for( i = 0; i < m_a/12*12; i += 12 ) { //Load 12 elems from column 1 of A partialColA1 = _mm_loadu_ps(A+(i+m_a*j)); partialColA2 = _mm_loadu_ps(A+(i+m_a*j+4)); partialColA3 = _mm_loadu_ps(A+(i+m_a*j+8)); //load 12 elems from column 2 of A partialColA4 = _mm_loadu_ps(A+(i+m_a*(j+1))); partialColA5 = _mm_loadu_ps(A+(i+m_a*(j+1)+4)); partialColA6 = _mm_loadu_ps(A+(i+m_a*(j+1)+8)); //load 12 elems from column 3 of A partialColA7 = _mm_loadu_ps(A+(i+m_a*(j+2))); partialColA8 = _mm_loadu_ps(A+(i+m_a*(j+2)+4)); partialColA9 = _mm_loadu_ps(A+(i+m_a*(j+2)+8)); //Multiply first col of A with first elem of B & sum into respective elem of C partialSumC1 = _mm_add_ps(_mm_loadu_ps(C+(i+v*m_a)), _mm_mul_ps(partialColA1, partialRowB1)); partialSumC2 = _mm_add_ps(_mm_loadu_ps(C+(i+v*m_a+4)), _mm_mul_ps(partialColA2, partialRowB1)); partialSumC3 = _mm_add_ps(_mm_loadu_ps(C+(i+v*m_a+8)), _mm_mul_ps(partialColA3, partialRowB1)); //Multiply second col of A with second elem of B & sum into respective elem of C partialSumC1 = _mm_add_ps(partialSumC1, _mm_mul_ps(partialColA4, partialRowB2)); partialSumC2 = _mm_add_ps(partialSumC2, _mm_mul_ps(partialColA5, partialRowB2)); partialSumC3 = _mm_add_ps(partialSumC3, _mm_mul_ps(partialColA6, partialRowB2)); //Multiply last col of A with last elem of B & sum into respective C & store _mm_storeu_ps(C+i+v*m_a, _mm_add_ps(partialSumC1, _mm_mul_ps(partialColA7, partialRowB3))); _mm_storeu_ps(C+i+v*m_a+4, _mm_add_ps(partialSumC2, _mm_mul_ps(partialColA8, partialRowB3))); _mm_storeu_ps(C+i+v*m_a+8, _mm_add_ps(partialSumC3, _mm_mul_ps(partialColA9, partialRowB3))); } } } } //Handles matrices of size other than 36x36 else{ for(int v = 0; v < n_a; v++){ //goes through output column in C int m_axv = m_a*v; for( int j = 0; j < m_a; j++ ) { int jxm_a = j*m_a; partialRowB1 = _mm_load1_ps(B+(j+m_axv)); //load current elem of row in B int i; for( i = 0; i < m_a/16*16; i += 16 ) { //load next 16 col elems of A into packed sp partialColA1 = _mm_loadu_ps(A+(i+m_axv)); partialColA2 = _mm_loadu_ps(A+(i+m_axv+4)); partialColA3 = _mm_loadu_ps(A+(i+m_axv+8)); partialColA4 = _mm_loadu_ps(A+(i+m_axv+12)); //Compute part of elem in C, store in C _mm_storeu_ps((C + i + jxm_a), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a), _mm_mul_ps(partialColA1, partialRowB1))); _mm_storeu_ps((C + i + jxm_a+4), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a+4), _mm_mul_ps(partialColA2, partialRowB1))); _mm_storeu_ps((C + i + jxm_a+8), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a+8), _mm_mul_ps(partialColA3, partialRowB1))); _mm_storeu_ps((C + i + jxm_a+12), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a+12), _mm_mul_ps(partialColA4, partialRowB1))); } //fringe case for( i = m_a/16*16; i < m_a/4*4; i += 4 ) { partialColA1 = _mm_loadu_ps(A+(i+m_axv)); //load next 4 col elems of A into packed sp _mm_storeu_ps((C + i + jxm_a), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a), _mm_mul_ps(partialColA1, partialRowB1))); } //finish off matrices with dimensions %4 != 0 for( i = 
m_a/4*4; i < m_a; i++){ C[i + jxm_a] += A[i+m_axv] * B[j+m_axv]; } } } } }
// ============================================================================= // // sse3_vChirpData // version by: Alex Kan // http://tbp.berkeley.edu/~alexkan/seti/ // int sse3_ChirpData_ak( sah_complex * cx_DataArray, sah_complex * cx_ChirpDataArray, int chirp_rate_ind, double chirp_rate, int ul_NumDataPoints, double sample_rate ) { int i; #ifdef USE_MANUAL_CALLSTACK call_stack.enter("sse3_ChirpData_ak()"); #endif if (chirp_rate_ind == 0) { memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) ); #ifdef USE_MANUAL_CALLSTACK call_stack.exit(); #endif return 0; } int vEnd; double srate = chirp_rate * 0.5 / (sample_rate * sample_rate); __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate)); __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52); // main vectorised loop vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3); for (i = 0; i < vEnd; i += 4) { const float *data = (const float *) (cx_DataArray + i); float *chirped = (float *) (cx_ChirpDataArray + i); __m128d di = _mm_set1_pd(i); __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di); __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di); __m128 d1, d2; __m128 cd1, cd2; __m128 td1, td2; __m128 x; __m128 y; __m128 s; __m128 c; __m128 m; // load the signal to be chirped prefetchnta((const void *)( data+32 )); d1 = _mm_load_ps(data); d2 = _mm_load_ps(data+4); // calculate the input angle a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate); a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate); // reduce the angle to the range (-0.5, 0.5) a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal)); a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal)); // convert pair of packed double into packed single x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2)); // square to the range [0, 0.25) y = _mm_mul_ps(x, x); // perform the initial polynomial approximations s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4), SS3), y), SS2), y), SS1), x); c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3), CC2), y), CC1), y), ONE); // perform first angle doubling x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s)); y = _mm_mul_ps(_mm_mul_ps(s, c), TWO); // calculate scaling factor to correct the magnitude // m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO)); // m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO)); m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y))); // perform second angle doubling c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)); s = _mm_mul_ps(_mm_mul_ps(y, x), TWO); // correct the magnitude (final sine / cosine approximations) s = _mm_mul_ps(s, m); c = _mm_mul_ps(c, m); // chirp the data cd1 = _mm_shuffle_ps(c, c, 0x50); cd2 = _mm_shuffle_ps(c, c, 0xfa); cd1 = _mm_mul_ps(cd1, d1); cd2 = _mm_mul_ps(cd2, d2); d1 = _mm_shuffle_ps(d1, d1, 0xb1); d2 = _mm_shuffle_ps(d2, d2, 0xb1); td1 = _mm_shuffle_ps(s, s, 0x50); td2 = _mm_shuffle_ps(s, s, 0xfa); td1 = _mm_mul_ps(td1, d1); td2 = _mm_mul_ps(td2, d2); cd1 = _mm_addsub_ps(cd1, td1); cd2 = _mm_addsub_ps(cd2, td2); // store chirped values _mm_stream_ps(chirped, cd1); _mm_stream_ps(chirped+4, cd2); } _mm_sfence(); // handle tail elements with scalar code for ( ; i < ul_NumDataPoints; ++i) { double angle = srate * i * i * 0.5; double s = sin(angle); double c = cos(angle); float re = cx_DataArray[i][0]; float im = cx_DataArray[i][1]; cx_ChirpDataArray[i][0] = re * c - im * s; cx_ChirpDataArray[i][1] = re * s + im * c; } analysis_state.FLOP_counter+=12.0*ul_NumDataPoints; 
#ifdef USE_MANUAL_CALLSTACK call_stack.exit(); #endif return 0; }
void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const { f32 *src = (f32*)&mat; __m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps(); __m128 row0=_mm_setzero_ps(), row1=_mm_setzero_ps(), row2=_mm_setzero_ps(), row3=_mm_setzero_ps(); __m128 det=_mm_setzero_ps(), tmp1=_mm_setzero_ps(); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); det = _mm_shuffle_ps(det, det, 0x00); _mm_store_ss(&detv, det); Mat44 t; if(out) { src = (f32*)out->mat; } else { src = t.mat; } minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64*)(src), minor0); _mm_storeh_pi((__m64*)(src+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64*)(src+4), minor1); _mm_storeh_pi((__m64*)(src+6), minor1); minor2 = _mm_mul_ps(det, minor2); 
_mm_storel_pi((__m64*)(src+ 8), minor2); _mm_storeh_pi((__m64*)(src+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64*)(src+12), minor3); _mm_storeh_pi((__m64*)(src+14), minor3); };
void game_player_tick(player_t *pl, float dt) { if(pl->magic != 0xC4 && pl->magic != 0xC9 && pl->magic != 0x66 && pl->magic != 0x69) return; camera_t *cam = &(pl->cam); float vs = 0.12f*100.0f*dt; // trace motion v4f_t no, tno, tv; no.m = _mm_setzero_ps(); if(pl->magic != 0x66 && pl->magic != 0x69) { if(pl->vflip) { no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.x.m, _mm_set1_ps(-pl->lv.v.x*vs))); no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.y.m, _mm_set1_ps(pl->lv.v.y*vs))); no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.z.m, _mm_set1_ps(pl->lv.v.w*vs))); no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.w.m, _mm_set1_ps(pl->lv.v.z*vs))); } else { no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.x.m, _mm_set1_ps(pl->lv.v.x*vs))); no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.y.m, _mm_set1_ps(pl->lv.v.y*vs))); no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.z.m, _mm_set1_ps(pl->lv.v.z*vs))); no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.w.m, _mm_set1_ps(pl->lv.v.w*vs))); } } // add gravity pl->grav_v += 0.3f*vs; if(pl->grav_v > 2.5f) pl->grav_v = 2.5f; no.v.y = pl->grav_v*vs; // check distance v4f_t tv2; tv2.m = _mm_mul_ps(no.m, no.m); //float md = sqrtf(vx*vx + vy*vy + vz*vz + vw*vw)*vs; float md = tv2.v.x + tv2.v.y + tv2.v.z + tv2.v.w; if(md > 0.0000001f) { // normalise for direction tv.m = no.m; vec_norm(&tv); // cast a ray float r = 0.5f; tno.m = cam->o.m; int side; // trace away for(;;) { float d = trace_box(root, &tno, &tv, NULL, NULL, NULL, md + r, &side); // // apply collision // if(d < 0.0f) { // no collision. jump to point. cam->o.m = _mm_add_ps(cam->o.m, _mm_mul_ps(_mm_set1_ps(md), tv.m)); break; } else { // we've hit a plane. slide back. if(side == F_YP) { if(pl->grav_v >= 0.0f) { pl->grav_v = 0.0f; pl->grounded = 1; } } else if(side == F_YN) { if(pl->grav_v <= 0.0f) pl->grav_v = 0.0f; } float dd = md - (d - r); cam->o.m = _mm_add_ps(cam->o.m, _mm_mul_ps(_mm_set1_ps(md - dd), tv.m)); // mask out velocity. tv.a[side&3] = 0.0f; // reduce distance. md = dd; } } } }
void CAEUtil::ClampArray(float *data, uint32_t count)
{
#if !defined(HAVE_SSE) || !defined(__SSE__)
  for (uint32_t i = 0; i < count; ++i)
    data[i] = SoftClamp(data[i]);
#else
  const __m128 c1 = _mm_set_ps1(27.0f);
  const __m128 c2 = _mm_set_ps1(27.0f + 9.0f);

  /* work around invalid alignment */
  while (((uintptr_t)data & 0xF) && count > 0)
  {
    data[0] = SoftClamp(data[0]);
    ++data;
    --count;
  }

  uint32_t even = count & ~0x3;
  for (uint32_t i = 0; i < even; i+=4, data+=4)
  {
    /* tanh approx clamp */
    __m128 dt  = _mm_load_ps(data);
    __m128 tmp = _mm_mul_ps(dt, dt);
    *(__m128*)data = _mm_div_ps(
      _mm_mul_ps(dt, _mm_add_ps(c1, tmp)),
      _mm_add_ps(c2, tmp)
    );
  }

  if (even != count)
  {
    uint32_t odd = count - even;
    if (odd == 1)
      data[0] = SoftClamp(data[0]);
    else
    {
      __m128 dt;
      __m128 tmp;
      __m128 out;
      if (odd == 2)
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], 0, 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(dt, _mm_add_ps(c1, tmp)),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
      }
      else
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], data[2], 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(dt, _mm_add_ps(c1, tmp)),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
        data[2] = ((float*)&out)[2];
      }
    }
  }
#endif
}
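/*
 * Scalar form of the clamp computed by the SSE path above: a rational
 * (tanh-style) soft clamp f(x) = x * (27 + x^2) / (27 + 9*x^2). SoftClamp(),
 * which is defined elsewhere in the project, is assumed to compute something
 * equivalent for the scalar fallbacks.
 */
static inline float soft_clamp_approx(float x)
{
    const float x2 = x * x;
    return x * (27.0f + x2) / (27.0f + 9.0f * x2);
}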
M_Matrix44 M_MatrixInvert44_SSE(M_Matrix44 A) { M_Matrix44 Ainv; float *src = &A.m[0][0]; float *dst = &Ainv.m[0][0]; __m128 minor0, minor1, minor2, minor3; __m128 row0, row1, row2, row3; __m128 det, tmp1; tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)), (__m64 *)(src+4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)), (__m64 *)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)), (__m64 *)(src+6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)), (__m64 *)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1))); det = _mm_shuffle_ps(det, det, 0x00); minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64 *)(dst), minor0); _mm_storeh_pi((__m64 *)(dst+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64 *)(dst+4), minor1); _mm_storeh_pi((__m64 *)(dst+6), minor1); minor2 = _mm_mul_ps(det, minor2); _mm_storel_pi((__m64 *)(dst+8), minor2); _mm_storeh_pi((__m64 *)(dst+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64 *)(dst+12), minor3); _mm_storeh_pi((__m64 *)(dst+14), minor3); return (Ainv); }
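/*
 * Both 4x4 inverse routines above (Cramers_Inverse_SSE and M_MatrixInvert44_SSE)
 * refine the reciprocal of the determinant with one Newton-Raphson step on top
 * of the ~12-bit rcpss estimate: r' = 2*r - d*r*r, which roughly doubles the
 * number of correct bits. A scalar illustration of that step:
 */
#include <xmmintrin.h>

static inline float refined_rcp(float d)
{
    float r = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(d)));  /* ~12-bit estimate */
    return 2.0f * r - d * r * r;                          /* one Newton-Raphson iteration */
}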
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_graduatednd_data_t *data = (dt_iop_graduatednd_data_t *)piece->data;
  const int ch = piece->colors;

  const int ix= (roi_in->x);
  const int iy= (roi_in->y);
  const float iw=piece->buf_in.width*roi_out->scale;
  const float ih=piece->buf_in.height*roi_out->scale;
  const float hw=iw/2.0;
  const float hh=ih/2.0;
  const float hw_inv=1.0/hw;
  const float hh_inv=1.0/hh;
  const float v=(-data->rotation/180)*M_PI;
  const float sinv=sin(v);
  const float cosv=cos(v);
  const float filter_radie=sqrt((hh*hh)+(hw*hw))/hh;
  const float offset=data->offset/100.0*2;

  float color[3];
  hsl2rgb(color,data->hue,data->saturation,0.5);
  if (data->density < 0)
    for ( int l=0; l<3; l++ )
      color[l] = 1.0-color[l];

#if 1
  const float filter_compression = 1.0/filter_radie/(1.0-(0.5+(data->compression/100.0)*0.9/2.0))*0.5;
#else
  const float compression = data->compression/100.0f;
  const float t = 1.0f - .8f/(.8f + compression);
  const float c = 1.0f + 1000.0f*powf(4.0, compression);
#endif

  if (data->density > 0)
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, color, data, ivoid, ovoid) schedule(static)
#endif
    for(int y=0; y<roi_out->height; y++)
    {
      int k=roi_out->width*y*ch;
      const float *in = (float*)ivoid + k;
      float *out = (float*)ovoid + k;

      float length = (sinv * (-1.0+ix*hw_inv) - cosv * (-1.0+(iy+y)*hh_inv) - 1.0 + offset) * filter_compression;
      const float length_inc = sinv * hw_inv * filter_compression;

      __m128 c = _mm_set_ps(0,color[2],color[1],color[0]);
      __m128 c1 = _mm_sub_ps(_mm_set1_ps(1.0f),c);

      for(int x=0; x<roi_out->width; x++, in+=ch, out+=ch)
      {
#if 1
        // !!! approximation is ok only when highest density is 8
        // for input x = data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==8, CLIP(0.5+length) == 1.0 - it gives 0.6% of error
        const float t = 0.693147181f /* ln2 */ * (data->density * CLIP( 0.5f+length )/8.0f);
        float d1 = t*t*0.5f;
        float d2 = d1*t*0.333333333f;
        float d3 = d2*t*0.25f;
        float d = 1+t+d1+d2+d3; /* taylor series for e^x till x^4 */
        //printf("%d %d %f\n",y,x,d);
        __m128 density = _mm_set1_ps(d);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
#else
        // use fair exp2f
        __m128 density = _mm_set1_ps(exp2f(data->density * CLIP( 0.5f+length )));
#endif
        /* max(0,in / (c + (1-c)*density)) */
        _mm_stream_ps(out,_mm_max_ps(_mm_set1_ps(0.0f),_mm_div_ps(_mm_load_ps(in),_mm_add_ps(c,_mm_mul_ps(c1,density)))));
        length += length_inc;
      }
    }
  }
  else
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, color, data, ivoid, ovoid) schedule(static)
#endif
    for(int y=0; y<roi_out->height; y++)
    {
      int k=roi_out->width*y*ch;
      const float *in = (float*)ivoid + k;
      float *out = (float*)ovoid + k;

      float length = (sinv * (-1.0f+ix*hw_inv) - cosv * (-1.0f+(iy+y)*hh_inv) - 1.0f + offset) * filter_compression;
      const float length_inc = sinv * hw_inv * filter_compression;

      __m128 c = _mm_set_ps(0,color[2],color[1],color[0]);
      __m128 c1 = _mm_sub_ps(_mm_set1_ps(1.0f),c);

      for(int x=0; x<roi_out->width; x++, in+=ch, out+=ch)
      {
#if 1
        // !!! approximation is ok only when lowest density is -8
        // for input x = -data->density * CLIP( 0.5-length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==-8, CLIP(0.5-length) == 1.0 - it gives 0.6% of error
        const float t = 0.693147181f /* ln2 */ * (-data->density * CLIP( 0.5f-length )/8.0f);
        float d1 = t*t*0.5f;
        float d2 = d1*t*0.333333333f;
        float d3 = d2*t*0.25f;
        float d = 1+t+d1+d2+d3; /* taylor series for e^x till x^4 */
        __m128 density = _mm_set1_ps(d);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
#else
        __m128 density = _mm_set1_ps(exp2f(-data->density * CLIP( 0.5f-length )));
#endif
        /* max(0,in * (c + (1-c)*density)) */
        _mm_stream_ps(out,_mm_max_ps(_mm_set1_ps(0.0f),_mm_mul_ps(_mm_load_ps(in),_mm_add_ps(c,_mm_mul_ps(c1,density)))));
        length += length_inc;
      }
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
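/*
 * Scalar version of the exp2 approximation used in both branches above:
 * 2^x is computed as (e^(ln2 * x / 8))^8, with e^y expanded as a Taylor series
 * up to y^4 and the 8th power taken by squaring three times (the three
 * _mm_mul_ps self-multiplies). Per the comments in the kernel, the error stays
 * around 0.6% for |x| <= 8. The function name is illustrative only.
 */
static inline float fast_exp2_pow8(float x)
{
    const float t  = 0.693147181f * (x / 8.0f);   /* ln2 * x / 8 */
    const float d1 = t * t * 0.5f;
    const float d2 = d1 * t * 0.333333333f;
    const float d3 = d2 * t * 0.25f;
    float d = 1.0f + t + d1 + d2 + d3;            /* e^t, 4th-order Taylor */
    d *= d;   /* ^2 */
    d *= d;   /* ^4 */
    d *= d;   /* ^8 */
    return d;
}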