int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap,
                          const BN_ULONG *bp, const BN_ULONG *np,
                          const BN_ULONG *n0, int num);
    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap,
                        const BN_ULONG *bp, const BN_ULONG *np,
                        const BN_ULONG *n0, int num);

    if (sizeof(size_t) == 4) {
#if 1 || (defined(__APPLE__) && defined(__MACH__))
        if (num >= 8 && (num & 3) == 0 && (OPENSSL_ppccap_P & PPC_FPU64))
            return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
#else
        /*
         * boundary of 32 was experimentally determined on Linux 2.6.22,
         * might have to be adjusted on AIX...
         */
        if (num >= 32 && (num & 3) == 0 && (OPENSSL_ppccap_P & PPC_FPU64)) {
            sigset_t oset;
            int ret;

            sigprocmask(SIG_SETMASK, &all_masked, &oset);
            ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
            sigprocmask(SIG_SETMASK, &oset, NULL);

            return ret;
        }
#endif
    } else if ((OPENSSL_ppccap_P & PPC_FPU64))
        /*
         * this is a "must" on POWER6, but run-time detection
         * is not implemented yet...
         */
        return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);

    return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
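/*
 * A self-contained sketch of the signal-masking wrapper in the #else
 * branch above: block (nearly) all signals, run the non-reentrant FPU
 * work, then restore the caller's mask. In the original, `all_masked` is
 * a file-scope sigset_t prepared at startup; here an equivalent set is
 * rebuilt locally. `run_with_signals_blocked` and its parameters are
 * hypothetical, not part of OpenSSL.
 */
#include <signal.h>

static int run_with_signals_blocked(int (*fn)(void *), void *arg)
{
    sigset_t mask, oset;
    int ret;

    sigfillset(&mask);                     /* block everything blockable... */
    sigdelset(&mask, SIGSEGV);             /* ...but let fatal faults through */
    sigprocmask(SIG_SETMASK, &mask, &oset);
    ret = fn(arg);                         /* FPU-touching work goes here */
    sigprocmask(SIG_SETMASK, &oset, NULL); /* restore the previous mask */
    return ret;
}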
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);

    if ((OPENSSL_sparcv9cap_P & (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
        (SPARCV9_PREFER_FPU | SPARCV9_VIS1))
        return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
    else
        return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
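/*
 * The dispatch test above requires *both* SPARCV9_PREFER_FPU and
 * SPARCV9_VIS1 to be set: `(cap & (A | B)) == (A | B)` holds only when
 * every bit in the mask is present, whereas a bare `cap & (A | B)` would
 * accept either bit alone. A tiny illustration with hypothetical bit
 * values:
 */
#include <assert.h>

#define PREFER_FPU (1u << 0)               /* hypothetical bit assignments */
#define VIS1       (1u << 1)

static int has_both(unsigned cap)
{
    return (cap & (PREFER_FPU | VIS1)) == (PREFER_FPU | VIS1);
}

int main(void)
{
    assert(has_both(PREFER_FPU | VIS1));
    assert(!has_both(PREFER_FPU));         /* one bit alone is not enough */
    assert(!has_both(VIS1));
    return 0;
}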
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                         const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);

    if (!(num & 1) && num >= 6) {
        if ((num & 15) == 0 && num <= 64 &&
            (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
            (CFR_MONTMUL | CFR_MONTSQR)) {
            typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
                                          const BN_ULONG *bp,
                                          const BN_ULONG *np,
                                          const BN_ULONG *n0);
            int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap,
                                 const BN_ULONG *bp, const BN_ULONG *np,
                                 const BN_ULONG *n0);
            int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
                                  const BN_ULONG *bp, const BN_ULONG *np,
                                  const BN_ULONG *n0);
            int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
                                  const BN_ULONG *bp, const BN_ULONG *np,
                                  const BN_ULONG *n0);
            int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
                                  const BN_ULONG *bp, const BN_ULONG *np,
                                  const BN_ULONG *n0);
            static const bn_mul_mont_f funcs[4] = {
                bn_mul_mont_t4_8, bn_mul_mont_t4_16,
                bn_mul_mont_t4_24, bn_mul_mont_t4_32
            };
            bn_mul_mont_f worker = funcs[num / 16 - 1];

            if ((*worker) (rp, ap, bp, np, n0))
                return 1;
            /* retry once and fall back */
            if ((*worker) (rp, ap, bp, np, n0))
                return 1;
            return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
        }
        if ((OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3))
            return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
        else if (num >= 8 &&
                 (OPENSSL_sparcv9cap_P[0] &
                  (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
                 (SPARCV9_PREFER_FPU | SPARCV9_VIS1))
            return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
    }
    return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
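/*
 * Sketch of the worker-table selection above: the gate admits only num in
 * {16, 32, 48, 64}, so `num / 16 - 1` indexes the 4-entry table exactly.
 * The stub workers and names below are hypothetical stand-ins for the
 * bn_mul_mont_t4_* routines.
 */
#include <assert.h>
#include <stddef.h>

typedef int (*worker_f)(void);

static int w8(void)  { return 8; }         /* stands in for bn_mul_mont_t4_8 */
static int w16(void) { return 16; }        /* ... _t4_16 */
static int w24(void) { return 24; }        /* ... _t4_24 */
static int w32(void) { return 32; }        /* ... _t4_32 */

static const worker_f funcs[4] = { w8, w16, w24, w32 };

static worker_f pick(int num)
{
    /* same gate as the dispatcher: even, >= 6, multiple of 16, <= 64 */
    if (!(num & 1) && num >= 6 && (num & 15) == 0 && num <= 64)
        return funcs[num / 16 - 1];
    return NULL;
}

int main(void)
{
    assert(pick(16) == w8);                /* 16/16 - 1 == 0 */
    assert(pick(64) == w32);               /* 64/16 - 1 == 3 */
    assert(pick(20) == NULL);              /* multiple of 4 but not of 16 */
    return 0;
}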
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                          const BN_ULONG *np, const BN_ULONG *n0, int num);

    if (num < 4)
        return 0;

    if ((num & 3) == 0)
        return bn_mul4x_mont_int(rp, ap, bp, np, n0, num);

    /*
     * There used to be an [optional] call to bn_mul_mont_fpu64 here,
     * but the subroutine above is faster on contemporary processors.
     * There might still be old processors, POWER6 perhaps, where the
     * FPU code path would be faster, but there was no opportunity to
     * verify that...
     */
    return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
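/*
 * The early `return 0` above means "input shape not handled here"; the
 * caller is expected to fall back to a generic routine. A minimal sketch
 * of that calling convention, with hypothetical names (in OpenSSL the
 * generic BN Montgomery code plays the role of `portable_mul`):
 */
static int fast_mul(int num)               /* stands in for bn_mul_mont */
{
    if (num < 4)
        return 0;                          /* too small: decline the job */
    /* fast path would run here */
    return 1;
}

static int portable_mul(int num)           /* always-correct fallback */
{
    (void)num;
    return 1;
}

int mont_mul(int num)
{
    return fast_mul(num) ? 1 : portable_mul(num);
}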
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                         const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, int num);

    if (!(num & 1) && num >= 6) {
        if ((num & 15) == 0 && num <= 64 &&
            (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
            (CFR_MONTMUL | CFR_MONTSQR)) {
            typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
                                          const BN_ULONG *bp,
                                          const BN_ULONG *np,
                                          const BN_ULONG *n0);
            int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap,
                                 const BN_ULONG *bp, const BN_ULONG *np,
                                 const BN_ULONG *n0);
            int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
                                  const BN_ULONG *bp, const BN_ULONG *np,
                                  const BN_ULONG *n0);
            int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
                                  const BN_ULONG *bp, const BN_ULONG *np,
                                  const BN_ULONG *n0);
            int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
                                  const BN_ULONG *bp, const BN_ULONG *np,
                                  const BN_ULONG *n0);
            static const bn_mul_mont_f funcs[4] = {
                bn_mul_mont_t4_8, bn_mul_mont_t4_16,
                bn_mul_mont_t4_24, bn_mul_mont_t4_32
            };
            bn_mul_mont_f worker = funcs[num / 16 - 1];

            if ((*worker) (rp, ap, bp, np, n0))
                return 1;
            /* retry once and fall back */
            if ((*worker) (rp, ap, bp, np, n0))
                return 1;
            return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
        }
        if ((OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3))
            return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
        else if (num >= 8 &&
                 /*
                  * bn_mul_mont_fpu doesn't use FMADD; the flag is only used
                  * to detect when the FPU path is preferable in cases where
                  * the current heuristic is unreliable. [This works out
                  * because FMADD-capable processors on which the FPU code
                  * path is undesirable are also VIS3-capable, and the VIS3
                  * code path takes precedence.]
                  */
                 ((OPENSSL_sparcv9cap_P[0] & SPARCV9_FMADD) ||
                  (OPENSSL_sparcv9cap_P[0] &
                   (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
                  (SPARCV9_PREFER_FPU | SPARCV9_VIS1)))
            return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
    }
    return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
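/*
 * Both SPARC dispatchers above use a "retry once and fall back" pattern
 * around the T4 workers: try the fast routine, retry a single time if it
 * reports failure, then drop to a slower but reliable path. A compact
 * sketch of that shape; the function pointers are hypothetical:
 */
static int with_retry(int (*fast)(void), int (*slow)(void))
{
    if (fast())
        return 1;
    if (fast())                            /* one retry for transient failure */
        return 1;
    return slow();                         /* guaranteed code path */
}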