/*
   MatMultTranspose_SeqBSTRM_5 - Zeroes zz and then, for every stored 5x5 block in
   block row i with block column c, accumulates

       z[5*c + k] += sum_{r=0..4} row_{r+1}[k] * x[5*i + r],   k = 0..4

   The value array A->spptr->as is read as five consecutive streams (row1..row5),
   each of length 5*(number of stored blocks); every stored block owns five
   consecutive entries in each stream.

   Input Parameters:
+  A  - the matrix (A->data is read as Mat_SeqBAIJ, A->spptr as Mat_SeqBSTRM)
-  xx - the input vector

   Output Parameter:
.  zz - the result vector (overwritten)

   Returns 0 on success; any nonzero PETSc error code is propagated via CHKERRQ.
*/
PetscErrorCode MatMultTranspose_SeqBSTRM_5(Mat A,Vec xx,Vec zz)
{
  Mat_SeqBAIJ     *baij = (Mat_SeqBAIJ*)A->data;
  Mat_SeqBSTRM    *strm = (Mat_SeqBSTRM*)A->spptr;
  PetscErrorCode  ierr;
  PetscScalar     *xarr,*zarr = NULL;
  PetscScalar     *xrow,*zcol;
  PetscScalar     xi1,xi2,xi3,xi4,xi5;
  const MatScalar *row1,*row2,*row3,*row4,*row5;
  PetscInt        *rowptr = baij->i,*colidx = baij->j,*cols;
  PetscInt        nbrows = baij->mbs;
  PetscInt        brow,k,nblk,stride;
  PetscInt        nonempty = 0;

  PetscFunctionBegin;
  ierr = VecSet(zz,0.0);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&xarr);CHKERRQ(ierr);
  ierr = VecGetArray(zz,&zarr);CHKERRQ(ierr);

  /* Locate the start of each of the five value streams */
  stride = 5*(rowptr[nbrows]-rowptr[0]);
  row1   = strm->as;
  row2   = row1 + stride;
  row3   = row2 + stride;
  row4   = row3 + stride;
  row5   = row4 + stride;

  for (brow=0; brow<nbrows; brow++) {
    cols = colidx + rowptr[brow];
    nblk = rowptr[brow+1] - rowptr[brow];
    if (nblk > 0) nonempty++;

    /* The five entries of x belonging to this block row */
    xrow = xarr + 5*brow;
    xi1  = xrow[0];
    xi2  = xrow[1];
    xi3  = xrow[2];
    xi4  = xrow[3];
    xi5  = xrow[4];

    PetscPrefetchBlock(cols+nblk,nblk,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(row1+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row2+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row3+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row4+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row5+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */

    for (k=0; k<nblk; k++) {
      /* Scatter this block's contribution into the z entries of its block column */
      zcol     = zarr + 5*cols[k];
      zcol[0] += row1[0]*xi1 + row2[0]*xi2 + row3[0]*xi3 + row4[0]*xi4 + row5[0]*xi5;
      zcol[1] += row1[1]*xi1 + row2[1]*xi2 + row3[1]*xi3 + row4[1]*xi4 + row5[1]*xi5;
      zcol[2] += row1[2]*xi1 + row2[2]*xi2 + row3[2]*xi3 + row4[2]*xi4 + row5[2]*xi5;
      zcol[3] += row1[3]*xi1 + row2[3]*xi2 + row3[3]*xi3 + row4[3]*xi4 + row5[3]*xi5;
      zcol[4] += row1[4]*xi1 + row2[4]*xi2 + row3[4]*xi3 + row4[4]*xi4 + row5[4]*xi5;

      /* Advance every stream to the next stored block */
      row1 += 5;
      row2 += 5;
      row3 += 5;
      row4 += 5;
      row5 += 5;
    }
  }

  ierr = VecRestoreArray(xx,&xarr);CHKERRQ(ierr);
  ierr = VecRestoreArray(zz,&zarr);CHKERRQ(ierr);
  ierr = PetscLogFlops(50.0*baij->nz - 5.0*nonempty);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   MatMult_SeqSBSTRM_5 - Zeroes zz and then accumulates a matrix-vector product
   from the stored 5x5 blocks of each block row i, using every stored block twice:

     - If the first stored block of the row has column index i, it is treated as
       the diagonal block. Only its entries v_r[k] with k >= r are read, and they
       are mirrored to supply the entries with k < r.
     - Every remaining block with block column c contributes
           z[5*c + k] += sum_r v_r[k] * x[5*i + r]     (transposed use)
           z[5*i + r] += sum_k v_r[k] * x[5*c + k]     (direct use)

   The value array A->spptr->as is read as five consecutive streams (v1..v5),
   each of length 5*(number of stored blocks); every stored block owns five
   consecutive entries in each stream.

   Input Parameters:
+  A  - the matrix (A->data is read as Mat_SeqSBAIJ, A->spptr as Mat_SeqSBSTRM)
-  xx - the input vector

   Output Parameter:
.  zz - the result vector (overwritten)

   Returns 0 on success; any nonzero PETSc error code is propagated via CHKERRQ.
*/
PetscErrorCode MatMult_SeqSBSTRM_5(Mat A,Vec xx,Vec zz)
{
  Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
  Mat_SeqSBSTRM   *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
  PetscScalar     zero = 0.0;
  PetscScalar     *z = NULL;
  PetscScalar     *x,*xb;
  const MatScalar *v1,*v2,*v3,*v4,*v5;
  PetscScalar     x1,x2,x3,x4,x5;
  PetscScalar     xr1,xr2,xr3,xr4,xr5;
  PetscScalar     sum1,sum2,sum3,sum4,sum5;
  PetscErrorCode  ierr;
  PetscInt        mbs = a->mbs,i,*aj = a->j,*ai = a->i,n,*ib,cval,j,jmin;
  PetscInt        nonzerorow = 0;
  PetscInt        slen;

  PetscFunctionBegin;
  ierr = VecSet(zz,zero);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(zz,&z);CHKERRQ(ierr);

  /* Locate the start of each of the five value streams */
  slen = 5*(ai[mbs]-ai[0]);
  v1   = sbstrm->as;
  v2   = v1 + slen;
  v3   = v2 + slen;
  v4   = v3 + slen;
  v5   = v4 + slen;

  xb = x;
  for (i=0; i<mbs; i++) {
    n           = ai[i+1] - ai[i];
    nonzerorow += (n>0);

    x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];

    /* Start from what earlier rows already scattered into this block of z */
    sum1 = z[5*i];
    sum2 = z[5*i+1];
    sum3 = z[5*i+2];
    sum4 = z[5*i+3];
    sum5 = z[5*i+4];

    jmin = 0;
    ib   = aj + ai[i];
    /* Only look at the first column index when the row actually stores a block:
       for an empty row ib points at the next row's indices, or one past the end
       of aj when no stored blocks follow, so dereferencing it would be an
       out-of-bounds read. */
    if (n > 0 && *ib == i) {     /* (diag of A)*x */
      sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
      sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
      sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
      sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v4[4]*x5;
      sum5 += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;
      v1   += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
      jmin++;
    }

    PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */
    PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */
    PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */
    PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */
    PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */

    for (j=jmin; j<n; j++) {
      cval = ib[j]*5;

      /* Transposed use of the block: scatter into the z entries of block column ib[j] */
      z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
      z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
      z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
      z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
      z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

      /* Direct use of the block: gather from the x entries of block column ib[j] */
      xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3]; xr5 = x[cval+4];

      sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3 + v1[3]*xr4 + v1[4]*xr5;
      sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3 + v2[3]*xr4 + v2[4]*xr5;
      sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3 + v3[3]*xr4 + v3[4]*xr5;
      sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3 + v4[3]*xr4 + v4[4]*xr5;
      sum5 += v5[0]*xr1 + v5[1]*xr2 + v5[2]*xr3 + v5[3]*xr4 + v5[4]*xr5;

      v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
    }

    z[5*i]   = sum1;
    z[5*i+1] = sum2;
    z[5*i+2] = sum3;
    z[5*i+3] = sum4;
    z[5*i+4] = sum5;
    xb      += 5;
  }

  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
  ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   MatMultAdd_SeqBSTRM_5 - For every processed block row t, computes

       z[5*t + r] = y[5*t + r] + sum over stored blocks (column c) of
                    sum_{k=0..4} row_{r+1}[k] * x[5*c + k],   r = 0..4

   The value array A->spptr->as is read as five consecutive streams (row1..row5),
   each of length 5*(number of stored blocks); every stored block owns five
   consecutive entries in each stream.

   When compressed-row storage is enabled (a->compressedrow.use), only the rows
   listed in compressedrow.rindex are processed; if zz is a different vector than
   yy, yy is first copied into zz so that the skipped rows carry over unchanged.

   Input Parameters:
+  A  - the matrix (A->data is read as Mat_SeqBAIJ, A->spptr as Mat_SeqBSTRM)
.  xx - the vector that is multiplied
-  yy - the vector that is added

   Output Parameter:
.  zz - the result vector (may be the same vector as yy)

   Returns 0 on success; any nonzero PETSc error code is propagated via CHKERRQ.
*/
PetscErrorCode MatMultAdd_SeqBSTRM_5(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqBAIJ     *baij = (Mat_SeqBAIJ*)A->data;
  Mat_SeqBSTRM    *strm = (Mat_SeqBSTRM*)A->spptr;
  PetscErrorCode  ierr;
  PetscBool       compressed = baij->compressedrow.use;
  PetscScalar     *xarr,*yarr,*zarr;
  PetscScalar     *xin,*yrow,*zrow;
  PetscScalar     acc1,acc2,acc3,acc4,acc5;
  PetscScalar     xc1,xc2,xc3,xc4,xc5;
  const MatScalar *row1,*row2,*row3,*row4,*row5;
  PetscInt        *rowptr,*rowmap = NULL,*col = baij->j;
  PetscInt        nrows = baij->mbs;
  PetscInt        r,k,nblk,stride,target;

  PetscFunctionBegin;
  ierr = VecGetArray(xx,&xarr);CHKERRQ(ierr);
  ierr = VecGetArray(yy,&yarr);CHKERRQ(ierr);
  if (zz == yy) {
    /* In-place update: read and write through the same array */
    zarr = yarr;
  } else {
    ierr = VecGetArray(zz,&zarr);CHKERRQ(ierr);
  }

  if (compressed) {
    if (zz != yy) {
      /* Rows absent from the compressed list are never visited below */
      ierr = PetscMemcpy(zarr,yarr,5*nrows*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    nrows  = baij->compressedrow.nrows;
    rowptr = baij->compressedrow.i;
    rowmap = baij->compressedrow.rindex;
  } else {
    rowptr = baij->i;
  }

  /* Locate the start of each of the five value streams */
  stride = 5*(rowptr[nrows]-rowptr[0]);
  row1   = strm->as;
  row2   = row1 + stride;
  row3   = row2 + stride;
  row4   = row3 + stride;
  row5   = row4 + stride;

  for (r=0; r<nrows; r++) {
    nblk = rowptr[r+1] - rowptr[r];

    /* Block row of y/z that this (possibly compressed) row maps to */
    target = compressed ? rowmap[r] : r;
    yrow   = yarr + 5*target;
    zrow   = zarr + 5*target;

    acc1 = yrow[0];
    acc2 = yrow[1];
    acc3 = yrow[2];
    acc4 = yrow[3];
    acc5 = yrow[4];

    PetscPrefetchBlock(col+nblk,nblk,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
    PetscPrefetchBlock(row1+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row2+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row3+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row4+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
    PetscPrefetchBlock(row5+5*nblk,5*nblk,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */

    for (k=0; k<nblk; k++) {
      /* Gather the five x entries of this block's column */
      xin = xarr + 5*col[k];
      xc1 = xin[0];
      xc2 = xin[1];
      xc3 = xin[2];
      xc4 = xin[3];
      xc5 = xin[4];

      acc1 += row1[0]*xc1 + row1[1]*xc2 + row1[2]*xc3 + row1[3]*xc4 + row1[4]*xc5;
      acc2 += row2[0]*xc1 + row2[1]*xc2 + row2[2]*xc3 + row2[3]*xc4 + row2[4]*xc5;
      acc3 += row3[0]*xc1 + row3[1]*xc2 + row3[2]*xc3 + row3[3]*xc4 + row3[4]*xc5;
      acc4 += row4[0]*xc1 + row4[1]*xc2 + row4[2]*xc3 + row4[3]*xc4 + row4[4]*xc5;
      acc5 += row5[0]*xc1 + row5[1]*xc2 + row5[2]*xc3 + row5[3]*xc4 + row5[4]*xc5;

      /* Advance every stream to the next stored block */
      row1 += 5;
      row2 += 5;
      row3 += 5;
      row4 += 5;
      row5 += 5;
    }
    col += nblk;

    zrow[0] = acc1;
    zrow[1] = acc2;
    zrow[2] = acc3;
    zrow[3] = acc4;
    zrow[4] = acc5;
  }

  ierr = VecRestoreArray(xx,&xarr);CHKERRQ(ierr);
  ierr = VecRestoreArray(yy,&yarr);CHKERRQ(ierr);
  if (zz != yy) {
    ierr = VecRestoreArray(zz,&zarr);CHKERRQ(ierr);
  }
  ierr = PetscLogFlops(50.0*baij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}