// Multiply matrix to data vector. When encoding, it contains data in Data // and stores error correction codes in Out. When decoding it contains // broken data followed by ECC in Data and stores recovered data to Out. // We do not use this function now, everything is moved to UpdateECC. void RSCoder16::Process(const uint *Data, uint *Out) { uint ProcData[gfSize]; for (uint I = 0; I < ND; I++) ProcData[I]=Data[I]; if (Decoding) { // Replace broken data units with first available valid recovery codes. // 'Data' array must contain recovery codes after data. for (uint I=0, R=ND; I < ND; I++) if (!ValidFlags[I]) // For every broken data unit. { while (!ValidFlags[R]) // Find a valid recovery unit. R++; ProcData[I]=Data[R]; R++; } } uint H=Decoding ? NE : NR; for (uint I = 0; I < H; I++) { uint R = 0; // Result of matrix row multiplication to data. uint *MXi=MX + I * ND; for (uint J = 0; J < ND; J++) R ^= gfMul(MXi[J], ProcData[J]); Out[I] = R; } }
// Data and ECC addresses must be properly aligned for SSE. bool RSCoder16::SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize) { // Check data alignment and SSSE3 support. if ((size_t(Data) & (SSE_ALIGNMENT-1))!=0 || (size_t(ECC) & (SSE_ALIGNMENT-1))!=0 || _SSE_Version<SSE_SSSE3) return false; uint M=MX[ECCNum * ND + DataNum]; // Prepare tables containing products of M and 4, 8, 12, 16 bit length // numbers, which have 4 high bits in 0..15 range and other bits set to 0. // Store high and low bytes of resulting 16 bit product in separate tables. __m128i T0L,T1L,T2L,T3L; // Low byte tables. __m128i T0H,T1H,T2H,T3H; // High byte tables. for (uint I=0; I<16; I++) { ((byte *)&T0L)[I]=gfMul(I,M); ((byte *)&T0H)[I]=gfMul(I,M)>>8; ((byte *)&T1L)[I]=gfMul(I<<4,M); ((byte *)&T1H)[I]=gfMul(I<<4,M)>>8; ((byte *)&T2L)[I]=gfMul(I<<8,M); ((byte *)&T2H)[I]=gfMul(I<<8,M)>>8; ((byte *)&T3L)[I]=gfMul(I<<12,M); ((byte *)&T3H)[I]=gfMul(I<<12,M)>>8; } size_t Pos=0; __m128i LowByteMask=_mm_set1_epi16(0xff); // 00ff00ff...00ff __m128i Low4Mask=_mm_set1_epi8(0xf); // 0f0f0f0f...0f0f __m128i High4Mask=_mm_slli_epi16(Low4Mask,4); // f0f0f0f0...f0f0 for (; Pos+2*sizeof(__m128i)<=BlockSize; Pos+=2*sizeof(__m128i)) { // We process two 128 bit chunks of source data at once. __m128i *D=(__m128i *)(Data+Pos); // Place high bytes of both chunks to one variable and low bytes to // another, so we can use the table lookup multiplication for 16 values // 4 bit length each at once. __m128i HighBytes0=_mm_srli_epi16(D[0],8); __m128i LowBytes0=_mm_and_si128(D[0],LowByteMask); __m128i HighBytes1=_mm_srli_epi16(D[1],8); __m128i LowBytes1=_mm_and_si128(D[1],LowByteMask); __m128i HighBytes=_mm_packus_epi16(HighBytes0,HighBytes1); __m128i LowBytes=_mm_packus_epi16(LowBytes0,LowBytes1); // Multiply bits 0..3 of low bytes. Store low and high product bytes // separately in cumulative sum variables. __m128i LowBytesLow4=_mm_and_si128(LowBytes,Low4Mask); __m128i LowBytesMultSum=_mm_shuffle_epi8(T0L,LowBytesLow4); __m128i HighBytesMultSum=_mm_shuffle_epi8(T0H,LowBytesLow4); // Multiply bits 4..7 of low bytes. Store low and high product bytes separately. __m128i LowBytesHigh4=_mm_and_si128(LowBytes,High4Mask); LowBytesHigh4=_mm_srli_epi16(LowBytesHigh4,4); __m128i LowBytesHigh4MultLow=_mm_shuffle_epi8(T1L,LowBytesHigh4); __m128i LowBytesHigh4MultHigh=_mm_shuffle_epi8(T1H,LowBytesHigh4); // Add new product to existing sum, low and high bytes separately. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,LowBytesHigh4MultLow); HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,LowBytesHigh4MultHigh); // Multiply bits 0..3 of high bytes. Store low and high product bytes separately. __m128i HighBytesLow4=_mm_and_si128(HighBytes,Low4Mask); __m128i HighBytesLow4MultLow=_mm_shuffle_epi8(T2L,HighBytesLow4); __m128i HighBytesLow4MultHigh=_mm_shuffle_epi8(T2H,HighBytesLow4); // Add new product to existing sum, low and high bytes separately. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesLow4MultLow); HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesLow4MultHigh); // Multiply bits 4..7 of high bytes. Store low and high product bytes separately. __m128i HighBytesHigh4=_mm_and_si128(HighBytes,High4Mask); HighBytesHigh4=_mm_srli_epi16(HighBytesHigh4,4); __m128i HighBytesHigh4MultLow=_mm_shuffle_epi8(T3L,HighBytesHigh4); __m128i HighBytesHigh4MultHigh=_mm_shuffle_epi8(T3H,HighBytesHigh4); // Add new product to existing sum, low and high bytes separately. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesHigh4MultLow); HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesHigh4MultHigh); // Combine separate low and high cumulative sum bytes to 16-bit words. __m128i HighBytesHigh4Mult0=_mm_unpacklo_epi8(LowBytesMultSum,HighBytesMultSum); __m128i HighBytesHigh4Mult1=_mm_unpackhi_epi8(LowBytesMultSum,HighBytesMultSum); // Add result to ECC. __m128i *StoreECC=(__m128i *)(ECC+Pos); StoreECC[0]=_mm_xor_si128(StoreECC[0],HighBytesHigh4Mult0); StoreECC[1]=_mm_xor_si128(StoreECC[1],HighBytesHigh4Mult1); } // If we have non 128 bit aligned data in the end of block, process them // in a usual way. We cannot do the same in the beginning of block, // because Data and ECC can have different alignment offsets. for (; Pos<BlockSize; Pos+=2) *(ushort*)(ECC+Pos) ^= gfMul( M, *(ushort*)(Data+Pos) ); return true; }
// Apply Gauss–Jordan elimination to find inverse of decoder matrix. // We have the square NDxND matrix, but we do not store its trivial // diagonal "1" rows matching valid data, so we work with NExND matrix. // Our original Cauchy matrix does not contain 0, so we skip search // for non-zero pivot. void RSCoder16::InvertDecoderMatrix() { uint *MI=new uint[NE * ND]; // We'll create inverse matrix here. memset(MI, 0, ND * NE * sizeof(*MI)); // Initialize to identity matrix. for (uint Kr = 0, Kf = 0; Kr < NE; Kr++, Kf++) { while (ValidFlags[Kf]) // Skip trivial rows. Kf++; MI[Kr * ND + Kf] = 1; // Set diagonal 1. } // Kr is the number of row in our actual reduced NE x ND matrix, // which does not contain trivial diagonal 1 rows. // Kf is the number of row in full ND x ND matrix with all trivial rows // included. for (uint Kr = 0, Kf = 0; Kf < ND; Kr++, Kf++) // Select pivot row. { while (ValidFlags[Kf] && Kf < ND) { // Here we process trivial diagonal 1 rows matching valid data units. // Their processing can be simplified comparing to usual rows. // In full version of elimination we would set MX[I * ND + Kf] to zero // after MI[..]^=, but we do not need it for matrix inversion. for (uint I = 0; I < NE; I++) MI[I * ND + Kf] ^= MX[I * ND + Kf]; Kf++; } if (Kf == ND) break; uint *MXk = MX + Kr * ND; // k-th row of main matrix. uint *MIk = MI + Kr * ND; // k-th row of inversion matrix. uint PInv = gfInv( MXk[Kf] ); // Pivot inverse. // Divide the pivot row by pivot, so pivot cell contains 1. for (uint I = 0; I < ND; I++) { MXk[I] = gfMul( MXk[I], PInv ); MIk[I] = gfMul( MIk[I], PInv ); } for (uint I = 0; I < NE; I++) if (I != Kr) // For all rows except containing the pivot cell. { // Apply Gaussian elimination Mij -= Mkj * Mik / pivot. // Since pivot is already 1, it is reduced to Mij -= Mkj * Mik. uint *MXi = MX + I * ND; // i-th row of main matrix. uint *MIi = MI + I * ND; // i-th row of inversion matrix. uint Mik = MXi[Kf]; // Cell in pivot position. for (uint J = 0; J < ND; J++) { MXi[J] ^= gfMul(MXk[J] , Mik); MIi[J] ^= gfMul(MIk[J] , Mik); } } } // Copy data to main matrix. for (uint I = 0; I < NE * ND; I++) MX[I] = MI[I]; delete[] MI; }
// ----------------------------------------------------------------------------- int main () // ----------------------------------------------------------------------------- { // verify basic operations (mul, add, div): // a+b = b+a // a*b = b*a // a+(b+c) = (a+b)+c // a*(b*c) = (a*b)*c // a*c + b*c = (a+b)*c // a*(b/a) = b (if a != 0) gfInit(); for (int a=0; a<GF_N; a++) { for (int b=0; b<GF_N; b++) { if (gfAdd(a, b) != gfAdd(b, a)) return 1; if (gfMul(a, b) != gfMul(b, a)) return 2; if (gfMul(a, b) != gfMul(b, a)) return 2; for (int c=0; c<GF_N; c++) { if (gfAdd(a, gfAdd(b, c)) != gfAdd(gfAdd(a, b), c)) return 3; if (gfMul(a, gfMul(b, c)) != gfMul(gfMul(a, b), c)) return 4; if (gfAdd(gfMul(a, c), gfMul(b, c)) != gfMul(gfAdd(a, b), c)) return 5; } if (a != GF_0) if (gfMul(a, gfDiv(b, a)) != b) return 6; } } // verify polynomial operations: // A = BQ + R const int M = GF_N; // max deg gfExp A[M+1]; int nA; gfExp B[M+1]; int nB; gfExp Q[M+1]; int nQ; gfExp R1[M+2]; gfExp* R = R1 + 1; int nR; gfExp Z[M+1]; int nZ; gfExp Z1[2*M]; int nZ1; gfExp Z2[2*M]; int nZ2; gfExp* P = R; int nP; // alias gfExp* N = B; int nN; // alias gfExp* Y = Q; int nY; // alias gfExp Mem[8*(M+1) + 3]; // // -------------------- test polDiv, polMul, gfPolAdd(): -------------------- // for (int test=0; test<100000; test++) { // // // clear all -- should not be required: // // for (int i=0; i<=M; i++) // // A[i] = B[i] = Q[i] = R[i] = Z[i] = 0; // // R[-1] = 0; // nB = randInt(0, M); // nA = randInt(nB, M); // randPol(A, nA); // randPol(B, nB); // if (gfPolDeg(B, nB) == -1) // avoid dividing by B=0 // continue; // nB = polDiv(A, nA, B, nB, Q, &nQ, R, &nR); // nZ = gfPolMul(Q, nQ, B, nB, Z); // B * Q // if (nZ > nA) // return 7; // nZ = gfPolAdd(Z, nZ, R, nR, Z); // + R // if (! polCmp(Z, A, nZ, nA)) // return 8; // } // // -------------------- test gfPolEEA(): -------------------- // for (int test=0; test<100000; test++) { // nN = randInt(1, M); // nA = randInt(0, nN-1); // randPol(A, nA); // randPol(N, nN); // if (gfPolDeg(A, nA) == -1) // avoid dividing by A=0 // continue; // gfPolEEA(N, nN, A, nA, P, &nP, Q, &nQ, Mem); // nZ1 = gfPolMul(P, nP, N, nN, Z1); // nZ2 = gfPolMul(Q, nQ, A, nA, Z2); // if (! polCmp(Z1, Z2, nZ1, nZ2)) // return 9; // } // -------------------- test gfPolEvalSeq() against gfPolEval(): -------------------- for (int test=0; test<10000; test++) { nA = randInt(0, GF_N - 2); // limit of gfPolEvalSeq() nY = randInt(0, M); gfVec* Yv = Y; randPol(A, nA); gfExp x = GF_Z(1);//randE1(); gfPolEvalSeq(A, nA, Yv, nY, x); for (int i=0; i<=nY; i++) { gfExp y2 = gfPolEval(A, nA, x); gfExp y1 = gfV2E[Yv[nY-i]]; if (y2 != y1) return 10; x = gfMul(x, GF_Z(1)); } } return 0; }