//---------------------------------------------------------------------
    OptimisedUtil* OptimisedUtil::_detectImplementation(void)
    {
        //
        // Selects the OptimisedUtil implementation to use:
        //
        //   * If __DO_PROFILE__ is defined, a function-local static
        //     OptimisedUtilProfiler is returned.
        //   * Otherwise, if the build has __OGRE_HAVE_SSE enabled and the CPU
        //     reports CPU_FEATURE_SSE at runtime, the SSE implementation is
        //     returned.
        //   * Otherwise the general implementation is returned.
        //
        // The choice is based on the speed test results below (averaged number
        // of CPU timestamp (RDTSC) ticks per function call):
        //
        //   Dagon SkeletonAnimation sample - softwareVertexSkinning:
        //
        //                                      Pentium 4 3.0G HT       Athlon XP 2500+     Athlon 64 X2 Dual Core 3800+
        //
        //      Shared Buffers, General C       763677                  462903              473038
        //      Shared Buffers, Unrolled SSE    210030 *best*           369762              228328 *best*
        //      Shared Buffers, General SSE     286202                  352412 *best*       302796
        //
        //      Separated Buffers, General C    762640                  464840              478740
        //      Separated Buffers, Unrolled SSE 219222 *best*           287992 *best*       238770 *best*
        //      Separated Buffers, General SSE  290129                  341614              307262
        //
        //      PosOnly, General C              388663                  257350              262831
        //      PosOnly, Unrolled SSE           139814 *best*           200323 *best*       168995 *best*
        //      PosOnly, General SSE            172693                  213704              175447
        //
        //   Another test scene of my own - softwareVertexSkinning:
        //
        //                                      Pentium 4 3.0G HT       Athlon XP 2500+
        //
        //      Shared Buffers, General C       74527                   -
        //      Shared Buffers, Unrolled SSE    22743 *best*            -
        //      Shared Buffers, General SSE     28527                   -
        //
        //
        // Note that the speed tests show the unaligned load/store instruction
        // version losing 5%-10% performance compared to the aligned load/store
        // version, even when both of them access aligned data. Thus, we should
        // use aligned load/store whenever possible.
        //
#ifdef __DO_PROFILE__

        // Profiling build: hand out the address of a single function-local
        // static profiler object.
        static OptimisedUtilProfiler msOptimisedUtilProfiler;
        return &msOptimisedUtilProfiler;

#else   // !__DO_PROFILE__

#if __OGRE_HAVE_SSE
        // SSE was compiled in; use it only if the CPU actually reports it.
        const bool cpuHasSSE =
            (PlatformInformation::getCpuFeatures() & PlatformInformation::CPU_FEATURE_SSE) != 0;
        if (cpuHasSSE)
            return _getOptimisedUtilSSE();
#endif  // __OGRE_HAVE_SSE

        // Fallback: the general implementation.
        return _getOptimisedUtilGeneral();

#endif  // __DO_PROFILE__
    }
        // Builds the list of implementations held in mOptimisedUtils.
        // The general implementation is always registered first; the SSE
        // implementation is appended only when the build enables
        // __OGRE_HAVE_SSE and the CPU reports CPU_FEATURE_SSE at runtime.
        // NOTE(review): presumably these are the candidates the profiler
        // compares against each other - confirm against the class definition.
        OptimisedUtilProfiler(void)
        {
            mOptimisedUtils.push_back(_getOptimisedUtilGeneral());

#if __OGRE_HAVE_SSE
            const bool cpuHasSSE =
                (PlatformInformation::getCpuFeatures() & PlatformInformation::CPU_FEATURE_SSE) != 0;
            if (cpuHasSSE)
                mOptimisedUtils.push_back(_getOptimisedUtilSSE());
#endif
        }
        // Builds the list of implementations held in mOptimisedUtils.
        // The general implementation is always registered first; the SSE
        // implementation is appended only when the build enables
        // __OGRE_HAVE_SSE and the CPU reports CPU_FEATURE_SSE at runtime.
        // NOTE(review): presumably these are the candidates the profiler
        // compares against each other - confirm against the class definition.
        OptimisedUtilProfiler(void)
        {
            mOptimisedUtils.push_back(_getOptimisedUtilGeneral());

#if __OGRE_HAVE_SSE
            const bool cpuHasSSE =
                (PlatformInformation::getCpuFeatures() & PlatformInformation::CPU_FEATURE_SSE) != 0;
            if (cpuHasSSE)
                mOptimisedUtils.push_back(_getOptimisedUtilSSE());
// The VFP and NEON branches below are intentionally left disabled
// (commented out); they are kept for reference only.
//#elif __OGRE_HAVE_VFP
//            if (PlatformInformation::getCpuFeatures() & PlatformInformation::CPU_FEATURE_VFP)
//            {
//                mOptimisedUtils.push_back(_getOptimisedUtilVFP());
//            }
//#elif __OGRE_HAVE_NEON
//            if (PlatformInformation::getCpuFeatures() & PlatformInformation::CPU_FEATURE_NEON)
//            {
//                mOptimisedUtils.push_back(_getOptimisedUtilNEON());
//            }
#endif
        }