// Allocates an uninitialized buffer large enough for `len` elements of type TD
// through fwMalloc and returns it typed as TD*.
//
// len     - number of TD elements requested.
// returns - the pointer produced by fwMalloc, or a null pointer when `len` is
//           negative or when len * sizeof(TD) cannot be represented in Fw32u.
//
// The byte count is carried in an Fw32u, so without the range check a negative
// or very large `len` would wrap and yield a buffer far smaller than the
// caller expects.
// NOTE(review): the buffer is presumably released with the allocator's
// matching free routine - confirm against the fwMalloc documentation.
TD* SigMalloc(int len)
{
    const Fw32u maxBytes = (Fw32u)~(Fw32u)0;

    // Reject requests whose byte size would not fit in Fw32u.
    if (len < 0 || (Fw32u)len > maxBytes / sizeof(TD))
        return 0;

    Fw32u size = (Fw32u)(sizeof(TD) * (Fw32u)len);
    TD* memPtr = (TD*)fwMalloc(size);
    return memPtr;
}
static SYS_INLINE FwStatus iResizeShift_C1R(const TS *pSrc, FwiSize srcSize, int srcStep, FwiRect srcRoi, TS *pDst, int dstStep, FwiSize dstRoiSize, double xFr, double yFr, double xShift, double yShift, int interpolation) { double fEnd_x,fEnd_y; long iStart_x, iEnd_x, iStart_y, iEnd_y; int x=0,y=0; //short half_FW_WEIGHT = FW_WEIGHT/2 ; if (xFr <= 0.0 || yFr <= 0.0) return fwStsResizeFactorErr; if (interpolation != FWI_INTER_LINEAR) return fwStsInterpolationErr; FwStatus status = My_FW_ParaCheck<TS>(pSrc, srcSize, srcStep, srcRoi, pDst, dstStep, dstRoiSize, 1); if (status !=fwStsNoErr) return status; if(xShift > 0.0) { iStart_x = 0; fEnd_x = (double)(srcRoi.width-xShift) / xFr; if (fEnd_x <1) return fwStsWrongIntersectROI; if((double)dstRoiSize.width < fEnd_x) iEnd_x = dstRoiSize.width; else iEnd_x = (int)fEnd_x; } else { iStart_x =(long)( xShift / xFr); fEnd_x = (double)(srcRoi.width) / xFr; if((double)dstRoiSize.width < fEnd_x) iEnd_x = dstRoiSize.width; else iEnd_x = (int)fEnd_x; } if(yShift > 0.0) { iStart_y = 0; fEnd_y = (double)(srcRoi.height-yShift) / yFr; if (fEnd_y <1) return fwStsWrongIntersectROI; if((double)dstRoiSize.height < fEnd_y) iEnd_y = dstRoiSize.height; else iEnd_y = (int)fEnd_y; } else { iStart_y = (long)(yShift / yFr); fEnd_y = (double)(srcRoi.width) / yFr; if((double)dstRoiSize.height < fEnd_y) iEnd_y = dstRoiSize.height; else iEnd_y = (int)fEnd_y; } //use fwMalloc instead of malloc for aligned address Linear_Array *pX_Array_Value = (Linear_Array*) fwMalloc(dstRoiSize.width*sizeof(Linear_Array)); Linear_Array *pY_Array_Value = (Linear_Array*) fwMalloc(dstRoiSize.height*sizeof(Linear_Array)); //resizeshift_pixel_mapping3(srcStep,srcRoi,pDst,dstStep,xFr,yFr,xShift,yShift,pX_Array_Value, // pY_Array_Value,iStart_x,iEnd_x,iStart_y,iEnd_y); double fraction_X, fractY, one_Minus_X, one_Minus_Y, weight_shift, srcRoix, srcRoiy; int ceil_X, ceil_Y, floor_X, floor_Y; int ifraction_x, ifraction_y, ione_minus_x, ione_minus_y; srcRoix = (double)srcRoi.x + 
xShift; srcRoiy = (double)srcRoi.y + yShift; weight_shift = (double) (1<<FW_WEIGHT) ; //Pre-calculate the y coefficient. for (y = iStart_y; y < iEnd_y; y++) { floor_Y = (int)floor((double)y * yFr); ceil_Y = floor_Y + 1; //Protection for over-boundary reading if (ceil_Y >= srcRoi.height) ceil_Y = floor_Y; fractY = y*yFr - floor_Y; one_Minus_Y = 1.0 - fractY; //Shifted for integer calculation ifraction_y = (int)(fractY * weight_shift); ione_minus_y = (int)(one_Minus_Y * weight_shift); floor_Y = (int)((srcRoiy + (double)floor_Y)*(double)srcStep + 0.5); ceil_Y = (int)((srcRoiy + (double)ceil_Y)*(double)srcStep +0.5); pY_Array_Value[y].floor=floor_Y; pY_Array_Value[y].ceil=ceil_Y; // pY_Array_Value[y].fraction=fractY; // pY_Array_Value[y].one_minus_val=one_Minus_Y; pY_Array_Value[y].ifraction=(short)ifraction_y; pY_Array_Value[y].ione_Minus_Val=(short)ione_minus_y; } //Pre-calculate the x coefficient. for (x = iStart_x; x < iEnd_x; x++) { floor_X = (int)floor((double)x * xFr); //Protection for over-boundary reading ceil_X = floor_X + 1; if (ceil_X >= srcRoi.width) ceil_X = floor_X; fraction_X = x*xFr - floor_X; one_Minus_X = 1.0 - fraction_X; //Shifted for integer calculation ifraction_x = (int)(fraction_X * weight_shift); ione_minus_x = (int)(one_Minus_X * weight_shift); floor_X = (int)((srcRoix + (double)floor_X) + 0.5); ceil_X = (int)((srcRoix + (double)ceil_X) + 0.5); pX_Array_Value[x].floor=floor_X; pX_Array_Value[x].ceil=ceil_X; // pX_Array_Value[x].fraction=fraction_X; // pX_Array_Value[x].one_minus_val=one_Minus_X; pX_Array_Value[x].ifraction=(short)ifraction_x; pX_Array_Value[x].ione_Minus_Val=(short)ione_minus_x; } // stat_from_interpolation = resizeshift_interpolation_func(pSrc,srcRoi,pDst,dstStep,dstRoiSize, //xFr,yFr,xShift,yShift,pX_Array_Value,pY_Array_Value,iStart_x,iEnd_x,iStart_y,iEnd_y); //This function will be used for Fw8u type only. 
if (sizeof(TS) != 1) return fwStsErr; else //if(sizeof(TYPE) == 1) // if TYPE == Fw8u { unsigned char p1, p2, p3, p4, t1, t2; //__m128i rxmm7; //rxmm7 = _mm_set1_epi8(0); //int tempval_width=dstRoiSize.width%16; //for (y = 0; y < dstRoiSize.height; y++) //{ // // if(dstRoiSize.width>=16) // { // for (x = 0; x < dstRoiSize.width-tempval_width; x+=16) // _mm_storeu_si128 ((__m128i *)(pDst + x + y*dstStep),rxmm7); // for (;x < dstRoiSize.width; x++) *(pDst + x + y*dstStep) = 0; // } // else // { // for (x=0;x < dstRoiSize.width; x++) *(pDst + x + y*dstStep) = 0; // } //} __m128i rxmm0 , rxmm1, rxmm2, rxmm3, rxmm4, rxmm5, rxmm6, rxmm7, rxmm8; XMM128 pp1={0}, pp2={0}, pp3={0}, pp4={0}; XMM128 pIfx={0}, pIofx={0}; Fw8u *pSrc_FloorY; Fw8u *pSrc_CeilY; short half_FW_WEIGHT = FW_WEIGHT/2 ; rxmm8 = _mm_set1_epi16(half_FW_WEIGHT); for (y = iStart_y; y < iEnd_y; y++) { pSrc_CeilY = (Fw8u*)pSrc; pSrc_FloorY = (Fw8u*)pSrc; pSrc_CeilY += pY_Array_Value[y].ceil; pSrc_FloorY += pY_Array_Value[y].floor; ifraction_y = pY_Array_Value[y].ifraction; ione_minus_y = pY_Array_Value[y].ione_Minus_Val; rxmm0 = _mm_set1_epi16((short)ione_minus_y); rxmm7 = _mm_set1_epi16((short)ifraction_y); if((iEnd_x-iStart_x)>=8) { for (x = iStart_x; x <= iEnd_x-8; x+=8) // process 8 pixels in parallel { for (int xx = 0; xx < 8; xx++) // process 8 pixels in parallel { ceil_X = pX_Array_Value[xx+x].ceil; floor_X = pX_Array_Value[xx+x].floor; ifraction_x = pX_Array_Value[xx+x].ifraction; ione_minus_x = pX_Array_Value[xx+x].ione_Minus_Val; pp1.u16[xx] = (unsigned short)(*(pSrc_FloorY + floor_X)); pp2.u16[xx] = (unsigned short)(*(pSrc_FloorY + ceil_X)); pp3.u16[xx] = (unsigned short)(*(pSrc_CeilY + floor_X)); pp4.u16[xx] = (unsigned short)(*(pSrc_CeilY + ceil_X)); pIfx.u16[xx] = (unsigned short)ifraction_x; pIofx.u16[xx]= (unsigned short)ione_minus_x; } rxmm1 = _mm_load_si128(&pp1.i); rxmm2 = _mm_load_si128(&pp2.i); rxmm3 = _mm_load_si128(&pp3.i); rxmm4 = _mm_load_si128(&pp4.i); rxmm5 = _mm_load_si128(&pIfx.i); 
// ifraction_x rxmm6 = _mm_load_si128(&pIofx.i); // ione_minus_x // resize // t1 = (unsigned char)((ione_minus_x *p1 + ifraction_x *p2) >> FW_WEIGHT); rxmm1 = _mm_mullo_epi16 (rxmm1, rxmm6); // ione_minus_x *p1 rxmm2 = _mm_mullo_epi16 (rxmm2, rxmm5); // ifraction_x *p2 rxmm1 = _mm_add_epi16(rxmm1, rxmm2); rxmm1 = _mm_add_epi16(rxmm1, rxmm8); rxmm1 = _mm_srli_epi16(rxmm1, FW_WEIGHT); // t2 = (unsigned char)((ione_minus_x *p3 + ifraction_x *p4) >> FW_WEIGHT); rxmm3 = _mm_mullo_epi16 (rxmm3, rxmm6); // ione_minus_x *p1 rxmm4 = _mm_mullo_epi16 (rxmm4, rxmm5); // ifraction_x *p2 rxmm3 = _mm_add_epi16(rxmm3, rxmm4); rxmm3 = _mm_add_epi16(rxmm3, rxmm8); rxmm3 = _mm_srli_epi16(rxmm3, FW_WEIGHT); // *(pDst + x + y*dstStep) = (unsigned char)((ione_minus_y *t1 + ifraction_y * t2) >> FW_WEIGHT) rxmm1 = _mm_mullo_epi16 (rxmm1, rxmm0); // ione_minus_y * t1 rxmm3 = _mm_mullo_epi16 (rxmm3, rxmm7); // ifraction_y * t2 rxmm1 = _mm_add_epi16(rxmm1, rxmm3); rxmm1 = _mm_add_epi16(rxmm1, rxmm8); rxmm1 = _mm_srli_epi16(rxmm1, FW_WEIGHT); rxmm1 = _mm_packus_epi16(rxmm1, rxmm1); // convert to 8 bit _mm_storel_epi64((__m128i *)(pDst + x + y*dstStep), rxmm1); } for (; x < iEnd_x; x++) // for remaining pixels { ceil_X=pX_Array_Value[x].ceil; floor_X=pX_Array_Value[x].floor; ifraction_x = pX_Array_Value[x].ifraction; ione_minus_x = pX_Array_Value[x].ione_Minus_Val; p1 = *(pSrc_FloorY + floor_X); p2 = *(pSrc_FloorY + ceil_X); p3 = *(pSrc_CeilY + floor_X); p4 = *(pSrc_CeilY + ceil_X); // ione_minus_x and ifraction_x value has been shifted by FW_WEIGHT, but no sturation is needed // + half_FW_WEIGHT for rounding t1 = (Fw8u)((ione_minus_x *p1 + ifraction_x *p2 + half_FW_WEIGHT) >> FW_WEIGHT); t2 = (Fw8u)((ione_minus_x *p3 + ifraction_x *p4 + half_FW_WEIGHT) >> FW_WEIGHT); *(pDst + x + y*dstStep) = (Fw8u)((ione_minus_y*t1 + ifraction_y*t2 + half_FW_WEIGHT) >> FW_WEIGHT); } } else { for (x = iStart_x; x < iEnd_x; x++) // for remaining pixels { ceil_X=pX_Array_Value[x].ceil; 
floor_X=pX_Array_Value[x].floor; ifraction_x = pX_Array_Value[x].ifraction; ione_minus_x = pX_Array_Value[x].ione_Minus_Val; p1 = *(pSrc_FloorY + floor_X); p2 = *(pSrc_FloorY + ceil_X); p3 = *(pSrc_CeilY + floor_X); p4 = *(pSrc_CeilY + ceil_X); // ione_minus_x and ifraction_x value has been shifted by FW_WEIGHT, but no sturation is needed // + half_FW_WEIGHT for rounding t1 = (Fw8u)((ione_minus_x *p1 + ifraction_x *p2 + half_FW_WEIGHT) >> FW_WEIGHT); t2 = (Fw8u)((ione_minus_x *p3 + ifraction_x *p4 + half_FW_WEIGHT) >> FW_WEIGHT); *(pDst + x + y*dstStep) = (Fw8u)((ione_minus_y*t1 + ifraction_y*t2 + half_FW_WEIGHT) >> FW_WEIGHT); } } } }