Beispiel #1
0
// Benchmark driver: times benchVec() on three raw buffers of size2 scalars
// and prints the elapsed time together with a throughput figure.
//
// NOTE(review): the `return 0;` right after the first report makes the
// matrix-shaped and vector-shaped benchmarks below it unreachable. This looks
// like a deliberate short-circuit, so it is preserved unchanged.
int main(int argc, char* argv[])
{
    int size = SIZE * 8;
    int size2 = size * size;
    Scalar* a = internal::aligned_new<Scalar>(size2);
    // b gets 4 spare elements and is then shifted by one element
    // (presumably to exercise an operand with a different alignment -- TODO confirm).
    Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
    Scalar* c = internal::aligned_new<Scalar>(size2);

    // Zero every element that benchVec() will touch. benchVec() is handed
    // size2 elements, so the loop must run to size2; stopping at `size`
    // left most of the buffers uninitialized.
    for (int i=0; i<size2; ++i)
    {
        a[i] = b[i] = c[i] = 0;
    }

    BenchTimer timer;

    // Raw-pointer variant: 10 timed runs.
    timer.reset();
    for (int k=0; k<10; ++k)
    {
        timer.start();
        benchVec(a, b, c, size2);
        timer.stop();
    }
    std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
    return 0;

    // ---- Everything below is currently unreachable (see note above). ----

    // Matrix variant: try every inner size that evenly divides size2.
    for (int innersize = size; innersize>2 ; --innersize)
    {
        if (size2%innersize==0)
        {
            int outersize = size2/innersize;
            MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
            MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
            MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
            timer.reset();
            for (int k=0; k<3; ++k)
            {
                timer.start();
                benchVec(ma, mb, mc);
                timer.stop();
            }
            std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
        }
    }

    // Vector variant: the same data viewed as flat vectors of size2 elements.
    VectorXf va = Map<VectorXf>(a, size2);
    VectorXf vb = Map<VectorXf>(b, size2);
    VectorXf vc = Map<VectorXf>(c, size2);
    timer.reset();
    for (int k=0; k<3; ++k)
    {
        timer.start();
        benchVec(va, vb, vc);
        timer.stop();
    }
    std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";

    return 0;
}
int main(int argc, char *argv[])
{
  int rows = SIZE;
  int cols = SIZE;
  float density = DENSITY;

  EigenSparseMatrix sm1(rows,cols);
  DenseVector v1(cols), v2(cols);
  v1.setRandom();

  BenchTimer timer;
  for (float density = DENSITY; density>=MINDENSITY; density*=0.5)
  {
    //fillMatrix(density, rows, cols, sm1);
    fillMatrix2(7, rows, cols, sm1);

    // dense matrices
    #ifdef DENSEMATRIX
    {
      std::cout << "Eigen Dense\t" << density*100 << "%\n";
      DenseMatrix m1(rows,cols);
      eiToDense(sm1, m1);

      timer.reset();
      timer.start();
      for (int k=0; k<REPEAT; ++k)
        v2 = m1 * v1;
      timer.stop();
      std::cout << "   a * v:\t" << timer.best() << "  " << double(REPEAT)/timer.best() << " * / sec " << endl;

      timer.reset();
      timer.start();
      for (int k=0; k<REPEAT; ++k)
        v2 = m1.transpose() * v1;
      timer.stop();
      std::cout << "   a' * v:\t" << timer.best() << endl;
    }
    #endif

    // eigen sparse matrices
    {
      std::cout << "Eigen sparse\t" << sm1.nonZeros()/float(sm1.rows()*sm1.cols())*100 << "%\n";

      BENCH(asm("#myc"); v2 = sm1 * v1; asm("#myd");)
      std::cout << "   a * v:\t" << timer.best()/REPEAT << "  " << double(REPEAT)/timer.best(REAL_TIMER) << " * / sec " << endl;


      BENCH( { asm("#mya"); v2 = sm1.transpose() * v1; asm("#myb"); })

      std::cout << "   a' * v:\t" << timer.best()/REPEAT << endl;
    }
Beispiel #3
0
// Times FLAGS_loops recordings of `src` (or of nothing when `src` is NULL) and
// prints the scaled CPU time per loop followed by `name`.
//
// src        - picture to replay into the recording canvas; may be NULL, in
//              which case FLAGS_nullSize is used for both dimensions.
// name       - label printed after the timing.
// bbhFactory - forwarded to SkPictureRecorder::beginRecording() on the
//              non-SkRecord path.
static void bench_record(SkPicture* src, const char* name, SkBBHFactory* bbhFactory) {
    BenchTimer timer;
    timer.start();

    const bool hasSource = (src != NULL);
    const int width  = hasSource ? src->width()  : FLAGS_nullSize;
    const int height = hasSource ? src->height() : FLAGS_nullSize;

    for (int loop = 0; loop < FLAGS_loops; ++loop) {
        if (!FLAGS_skr) {
            // SkPicture path.
            SkPictureRecorder recorder;
            SkCanvas* canvas = recorder.beginRecording(width, height, bbhFactory, FLAGS_flags);
            if (hasSource) {
                src->draw(canvas);
            }
            if (FLAGS_endRecording) {
                // The finished picture is dropped as soon as this scope closes.
                SkAutoTUnref<SkPicture> dst(recorder.endRecording());
            }
            continue;
        }

        // Experimental SkRecord path.
        EXPERIMENTAL::SkRecording recording(width, height);
        if (hasSource) {
            src->draw(recording.canvas());
        }
        // Release and delete the SkPlayback so that recording optimizes its SkRecord.
        SkDELETE(recording.releasePlayback());
    }
    timer.end();

    const double msPerLoop = timer.fCpu / static_cast<double>(FLAGS_loops);
    printf("%f\t%s\n", scale_time(msPerLoop), name);
}
Beispiel #4
0
void bench(int nfft,bool fwd,bool unscaled=false, bool halfspec=false)
{
    typedef typename NumTraits<T>::Real Scalar;
    typedef typename std::complex<Scalar> Complex;
    int nits = NDATA/nfft;
    vector<T> inbuf(nfft);
    vector<Complex > outbuf(nfft);
    FFT< Scalar > fft;

    if (unscaled) {
        fft.SetFlag(fft.Unscaled);
        cout << "unscaled ";
    }
    if (halfspec) {
        fft.SetFlag(fft.HalfSpectrum);
        cout << "halfspec ";
    }


    std::fill(inbuf.begin(),inbuf.end(),0);
    fft.fwd( outbuf , inbuf);

    BenchTimer timer;
    timer.reset();
    for (int k=0;k<8;++k) {
        timer.start();
        if (fwd)
            for(int i = 0; i < nits; i++)
                fft.fwd( outbuf , inbuf);
        else
            for(int i = 0; i < nits; i++)
                fft.inv(inbuf,outbuf);
        timer.stop();
    }

    cout << nameof<Scalar>() << " ";
    double mflops = 5.*nfft*log2((double)nfft) / (1e6 * timer.value() / (double)nits );
    if ( NumTraits<T>::IsComplex ) {
        cout << "complex";
    }else{
        cout << "real   ";
        mflops /= 2;
    }


    if (fwd)
        cout << " fwd";
    else
        cout << " inv";

    cout << " NFFT=" << nfft << "  " << (double(1e-6*nfft*nits)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
}
    static void run()
    {
	arg1 a1;
	a1.setIdentity();
	arg2 a2;
	a2.setIdentity();

	BenchTimer timer;
	timer.reset();
	for (int k=0; k<10; ++k)
	{
	    timer.start();
	    for (int k=0; k<REPEAT; ++k)
		a2 = func::run( a1, a2 );
	    timer.stop();
	}
	cout << setprecision(4) << fixed << timer.value() << "s  " << endl;;
    }
Beispiel #6
0
// Entry point of the bench tool.
//
// Parses the command-line flags, sets up the result writers, builds the list
// of configs to run, and then times every Benchmark yielded by Iter under
// each selected config, reporting the timings through `writer`.
// Always returns 0 once the loop over benchmarks has finished.
int tool_main(int argc, char** argv) {
    SetupCrashHandler();
    SkCommandLineFlags::Parse(argc, argv);
#if SK_ENABLE_INST_COUNT
    if (FLAGS_leaks) {
        gPrintInstCount = true;
    }
#endif
    SkAutoGraphics ag;

    // First, parse some flags.
    BenchLogger logger;
    if (FLAGS_logFile.count()) {
        logger.SetLogFile(FLAGS_logFile[0]);
    }

    // The log writer is always present; the JSON writer is added only when an
    // output results file was given.
    LoggerResultsWriter logWriter(logger, FLAGS_timeFormat[0]);
    MultiResultsWriter writer;
    writer.add(&logWriter);

    SkAutoTDelete<JSONResultsWriter> jsonWriter;
    if (FLAGS_outResultsFile.count()) {
        jsonWriter.reset(SkNEW(JSONResultsWriter(FLAGS_outResultsFile[0])));
        writer.add(jsonWriter.get());
    }

    // Instantiate after all the writers have been added to writer so that we
    // call close() before their destructors are called on the way out.
    CallEnd<MultiResultsWriter> ender(writer);

    const uint8_t alpha = FLAGS_forceBlend ? 0x80 : 0xFF;
    // Translate FLAGS_forceDither[0] into the SkTriState value with the same
    // name; stays kDefault when no name matches.
    SkTriState::State dither = SkTriState::kDefault;
    for (size_t i = 0; i < 3; i++) {
        if (strcmp(SkTriState::Name[i], FLAGS_forceDither[0]) == 0) {
            dither = static_cast<SkTriState::State>(i);
        }
    }

    // Translate FLAGS_mode[0] into the BenchMode with the same name; stays
    // kNormal_BenchMode when no name matches.
    BenchMode benchMode = kNormal_BenchMode;
    for (size_t i = 0; i < SK_ARRAY_COUNT(BenchMode_Name); i++) {
        if (strcmp(FLAGS_mode[0], BenchMode_Name[i]) == 0) {
            benchMode = static_cast<BenchMode>(i);
        }
    }

    // `configs` holds indices into gConfigs.
    SkTDArray<int> configs;
    bool runDefaultConfigs = false;
    // Try user-given configs first.
    for (int i = 0; i < FLAGS_config.count(); i++) {
        for (int j = 0; j < static_cast<int>(SK_ARRAY_COUNT(gConfigs)); ++j) {
            if (0 == strcmp(FLAGS_config[i], gConfigs[j].name)) {
                *configs.append() = j;
            } else if (0 == strcmp(FLAGS_config[i], kDefaultsConfigStr)) {
                runDefaultConfigs = true;
            }
        }
    }
    // If the defaults keyword was among the requested configs, append every
    // config that is marked runByDefault.
    if (runDefaultConfigs) {
        for (int i = 0; i < static_cast<int>(SK_ARRAY_COUNT(gConfigs)); ++i) {
            if (gConfigs[i].runByDefault) {
                *configs.append() = i;
            }
        }
    }
    // Filter out things we can't run.
    if (kNormal_BenchMode != benchMode) {
        // Non-rendering configs only run in normal mode
        for (int i = 0; i < configs.count(); ++i) {
            const Config& config = gConfigs[configs[i]];
            if (Benchmark::kNonRendering_Backend == config.backend) {
                configs.remove(i, 1);
                // Step back so the entry that slid into slot i is examined too.
                --i;
            }
        }
    }

#if SK_SUPPORT_GPU
    // Drop GPU configs whose context cannot be created or whose sample count
    // exceeds what the context reports as its maximum.
    for (int i = 0; i < configs.count(); ++i) {
        const Config& config = gConfigs[configs[i]];

        if (Benchmark::kGPU_Backend == config.backend) {
            GrContext* context = gContextFactory.get(config.contextType);
            if (NULL == context) {
                SkDebugf("GrContext could not be created for config %s. Config will be skipped.\n",
                    config.name);
                configs.remove(i);
                --i;
                continue;
            }
            if (config.sampleCount > context->getMaxSampleCount()){
                SkDebugf(
                    "Sample count (%d) for config %s is not supported. Config will be skipped.\n",
                    config.sampleCount, config.name);
                configs.remove(i);
                --i;
                continue;
            }
        }
    }
#endif

    // All flags should be parsed now.  Report our settings.
    if (FLAGS_runOnce) {
        logger.logError("bench was run with --runOnce, so we're going to hide the times."
                        " It's for your own good!\n");
    }
    writer.option("mode", FLAGS_mode[0]);
    writer.option("alpha", SkStringPrintf("0x%02X", alpha).c_str());
    writer.option("antialias", SkStringPrintf("%d", FLAGS_forceAA).c_str());
    writer.option("filter", SkStringPrintf("%d", FLAGS_forceFilter).c_str());
    writer.option("dither",  SkTriState::Name[dither]);

    writer.option("rotate", SkStringPrintf("%d", FLAGS_rotate).c_str());
    writer.option("scale", SkStringPrintf("%d", FLAGS_scale).c_str());
    writer.option("clip", SkStringPrintf("%d", FLAGS_clip).c_str());

#if defined(SK_BUILD_FOR_WIN32)
    writer.option("system", "WIN32");
#elif defined(SK_BUILD_FOR_MAC)
    writer.option("system", "MAC");
#elif defined(SK_BUILD_FOR_ANDROID)
    writer.option("system", "ANDROID");
#elif defined(SK_BUILD_FOR_UNIX)
    writer.option("system", "UNIX");
#else
    writer.option("system", "other");
#endif

#if defined(SK_DEBUG)
    writer.option("build", "DEBUG");
#else
    writer.option("build", "RELEASE");
#endif

    // Set texture cache limits if non-default.
    // (-1 in FLAGS_gpuCacheBytes / FLAGS_gpuCacheCount keeps the limit the
    // context already reports.)
    for (size_t i = 0; i < SK_ARRAY_COUNT(gConfigs); ++i) {
#if SK_SUPPORT_GPU
        const Config& config = gConfigs[i];
        if (Benchmark::kGPU_Backend != config.backend) {
            continue;
        }
        GrContext* context = gContextFactory.get(config.contextType);
        if (NULL == context) {
            continue;
        }

        size_t bytes;
        int count;
        context->getResourceCacheLimits(&count, &bytes);
        if (-1 != FLAGS_gpuCacheBytes) {
            bytes = static_cast<size_t>(FLAGS_gpuCacheBytes);
        }
        if (-1 != FLAGS_gpuCacheCount) {
            count = FLAGS_gpuCacheCount;
        }
        context->setResourceCacheLimits(count, bytes);
#endif
    }

    // Run each bench in each configuration it supports and we asked for.
    Iter iter;
    Benchmark* bench;
    while ((bench = iter.next()) != NULL) {
        SkAutoTUnref<Benchmark> benchUnref(bench);
        if (SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getName())) {
            continue;
        }

        bench->setForceAlpha(alpha);
        bench->setForceAA(FLAGS_forceAA);
        bench->setForceFilter(FLAGS_forceFilter);
        bench->setDither(dither);
        bench->preDraw();

        // The bench name is written once, right before its first config runs.
        bool loggedBenchName = false;
        for (int i = 0; i < configs.count(); ++i) {
            const int configIndex = configs[i];
            const Config& config = gConfigs[configIndex];

            if (!bench->isSuitableFor(config.backend)) {
                continue;
            }

            GrContext* context = NULL;
#if SK_SUPPORT_GPU
            SkGLContextHelper* glContext = NULL;
            if (Benchmark::kGPU_Backend == config.backend) {
                context = gContextFactory.get(config.contextType);
                if (NULL == context) {
                    continue;
                }
                glContext = gContextFactory.getGLContext(config.contextType);
            }
#endif

            SkAutoTUnref<SkCanvas> canvas;
            SkAutoTUnref<SkPicture> recordFrom;
            SkPictureRecorder recorderTo;
            const SkIPoint dim = bench->getSize();

            // Non-rendering configs get neither a surface nor a canvas.
            SkAutoTUnref<SkSurface> surface;
            if (Benchmark::kNonRendering_Backend != config.backend) {
                surface.reset(make_surface(config.fColorType,
                                           dim,
                                           config.backend,
                                           config.sampleCount,
                                           context));
                if (!surface.get()) {
                    logger.logError(SkStringPrintf(
                        "Device creation failure for config %s. Will skip.\n", config.name));
                    continue;
                }

                // Pick the canvas that draw calls go to, depending on the bench mode.
                switch(benchMode) {
                    case kDeferredSilent_BenchMode:
                    case kDeferred_BenchMode:
                        canvas.reset(SkDeferredCanvas::Create(surface.get()));
                        break;
                    case kRecord_BenchMode:
                        canvas.reset(SkRef(recorderTo.beginRecording(dim.fX, dim.fY)));
                        break;
                    case kPictureRecord_BenchMode: {
                        // Record the bench once into recordFrom; the timed loop
                        // then replays that picture into recorderTo's canvas.
                        SkPictureRecorder recorderFrom;
                        bench->draw(1, recorderFrom.beginRecording(dim.fX, dim.fY));
                        recordFrom.reset(recorderFrom.endRecording());
                        canvas.reset(SkRef(recorderTo.beginRecording(dim.fX, dim.fY)));
                        break;
                    }
                    case kNormal_BenchMode:
                        canvas.reset(SkRef(surface->getCanvas()));
                        break;
                    default:
                        SkASSERT(false);
                }
            }

            if (NULL != canvas) {
                canvas->clear(SK_ColorWHITE);
                if (FLAGS_clip)   {
                    perform_clip(canvas, dim.fX, dim.fY);
                }
                if (FLAGS_scale)  {
                    perform_scale(canvas, dim.fX, dim.fY);
                }
                if (FLAGS_rotate) {
                    perform_rotate(canvas, dim.fX, dim.fY);
                }
            }

            if (!loggedBenchName) {
                loggedBenchName = true;
                writer.bench(bench->getName(), dim.fX, dim.fY);
            }

#if SK_SUPPORT_GPU
            SkGLContextHelper* contextHelper = NULL;
            if (Benchmark::kGPU_Backend == config.backend) {
                contextHelper = gContextFactory.getGLContext(config.contextType);
            }
            BenchTimer timer(contextHelper);
#else
            BenchTimer timer;
#endif

            // Per-loop time of the previous iteration, compared against the
            // current one by HasConverged().
            double previous = std::numeric_limits<double>::infinity();
            bool converged = false;

            // variables used to compute loopsPerFrame
            double frameIntervalTime = 0.0f;
            int frameIntervalTotalLoops = 0;

            bool frameIntervalComputed = false;
            int loopsPerFrame = 0;
            int loopsPerIter = 0;
            if (FLAGS_verbose) { SkDebugf("%s %s: ", bench->getName(), config.name); }
            if (!FLAGS_dryRun) {
                do {
                    // Ramp up 1 -> 2 -> 4 -> 8 -> 16 -> ... -> ~1 billion.
                    loopsPerIter = (loopsPerIter == 0) ? 1 : loopsPerIter * 2;
                    if (loopsPerIter >= (1<<30) || timer.fWall > FLAGS_maxMs) {
                        // If you find it takes more than a billion loops to get up to 20ms of runtime,
                        // you've got a computer clocked at several THz or have a broken benchmark.  ;)
                        //     "1B ought to be enough for anybody."
                        logger.logError(SkStringPrintf(
                            "\nCan't get %s %s to converge in %dms (%d loops)",
                             bench->getName(), config.name, FLAGS_maxMs, loopsPerIter));
                        break;
                    }

                    if ((benchMode == kRecord_BenchMode || benchMode == kPictureRecord_BenchMode)) {
                        // Clear the recorded commands so that they do not accumulate.
                        canvas.reset(SkRef(recorderTo.beginRecording(dim.fX, dim.fY)));
                    }

                    timer.start();
                    // Inner loop that allows us to break the run into smaller
                    // chunks (e.g. frames). This is especially useful for the GPU
                    // as we can flush and/or swap buffers to keep the GPU from
                    // queuing up too much work.
                    for (int loopCount = loopsPerIter; loopCount > 0; ) {
                        // Save and restore around each call to draw() to guarantee a pristine canvas.
                        SkAutoCanvasRestore saveRestore(canvas, true/*also save*/);

                        // Run at most loopsPerFrame loops per chunk once the
                        // frame interval is known; otherwise run all remaining
                        // loops in a single chunk.
                        int loops;
                        if (frameIntervalComputed && loopCount > loopsPerFrame) {
                            loops = loopsPerFrame;
                            loopCount -= loopsPerFrame;
                        } else {
                            loops = loopCount;
                            loopCount = 0;
                        }

                        if (benchMode == kPictureRecord_BenchMode) {
                            recordFrom->draw(canvas);
                        } else {
                            bench->draw(loops, canvas);
                        }

                        if (kDeferredSilent_BenchMode == benchMode) {
                            static_cast<SkDeferredCanvas*>(canvas.get())->silentFlush();
                        } else if (NULL != canvas) {
                            canvas->flush();
                        }

    #if SK_SUPPORT_GPU
                        // swap drawing buffers on each frame to prevent the GPU
                        // from queuing up too much work
                        if (NULL != glContext) {
                            glContext->swapBuffers();
                        }
    #endif
                    }



                    // Stop truncated timers before GL calls complete, and stop the full timers after.
                    timer.truncatedEnd();
    #if SK_SUPPORT_GPU
                    if (NULL != glContext) {
                        context->flush();
                        SK_GL(*glContext, Finish());
                    }
    #endif
                    timer.end();

                    // setup the frame interval for subsequent iterations
                    if (!frameIntervalComputed) {
                        frameIntervalTime += timer.fWall;
                        frameIntervalTotalLoops += loopsPerIter;
                        if (frameIntervalTime >= FLAGS_minMs) {
                            frameIntervalComputed = true;
                            // Scale the observed loops-per-time rate to FLAGS_minMs,
                            // with a floor of one loop per frame.
                            loopsPerFrame =
                              (int)(((double)frameIntervalTotalLoops / frameIntervalTime) * FLAGS_minMs);
                            if (loopsPerFrame < 1) {
                                loopsPerFrame = 1;
                            }
    //                        SkDebugf("  %s has %d loops in %f ms (normalized to %d)\n",
    //                                 bench->getName(), frameIntervalTotalLoops,
    //                                 timer.fWall, loopsPerFrame);
                        }
                    }

                    // Wall time per loop for this iteration.
                    const double current = timer.fWall / loopsPerIter;
                    if (FLAGS_verbose && current > previous) { SkDebugf("↑"); }
                    if (FLAGS_verbose) { SkDebugf("%.3g ", current); }
                    converged = HasConverged(previous, current, timer.fWall);
                    previous = current;
                } while (!FLAGS_runOnce && !converged);
            }
            if (FLAGS_verbose) { SkDebugf("\n"); }

            // Optionally save a snapshot of what was rendered into FLAGS_outDir[0].
            if (!FLAGS_dryRun && FLAGS_outDir.count() && Benchmark::kNonRendering_Backend != config.backend) {
                SkAutoTUnref<SkImage> image(surface->newImageSnapshot());
                if (image.get()) {
                    saveFile(bench->getName(), config.name, FLAGS_outDir[0],
                             image);
                }
            }

            if (FLAGS_runOnce) {
                // Let's not mislead ourselves by looking at Debug build or single iteration bench times!
                continue;
            }

            // Normalize to ms per 1000 iterations.
            const double normalize = 1000.0 / loopsPerIter;
            const struct { char shortName; const char* longName; double ms; } times[] = {
                {'w', "msecs",  normalize * timer.fWall},
                {'W', "Wmsecs", normalize * timer.fTruncatedWall},
                {'c', "cmsecs", normalize * timer.fCpu},
                {'C', "Cmsecs", normalize * timer.fTruncatedCpu},
                {'g', "gmsecs", normalize * timer.fGpu},
            };

            // Report only the timers whose short name appears in FLAGS_timers[0]
            // and whose value is positive.
            writer.config(config.name);
            for (size_t i = 0; i < SK_ARRAY_COUNT(times); i++) {
                if (strchr(FLAGS_timers[0], times[i].shortName) && times[i].ms > 0) {
                    writer.timer(times[i].longName, times[i].ms);
                }
            }
        }
    }
#if SK_SUPPORT_GPU
    gContextFactory.destroyContexts();
#endif
    return 0;
}
Beispiel #7
0
int main(int argc, char *argv[])
{
//   bench_sort();

  int rows = SIZE;
  int cols = SIZE;
  float density = DENSITY;

  EigenSparseMatrix sm1(rows,cols), sm2(rows,cols), sm3(rows,cols), sm4(rows,cols);

  BenchTimer timer;
  for (int nnzPerCol = NNZPERCOL; nnzPerCol>1; nnzPerCol/=1.1)
  {
    sm1.setZero();
    sm2.setZero();
    fillMatrix2(nnzPerCol, rows, cols, sm1);
    fillMatrix2(nnzPerCol, rows, cols, sm2);
//     std::cerr << "filling OK\n";

    // dense matrices
    #ifdef DENSEMATRIX
    {
      std::cout << "Eigen Dense\t" << nnzPerCol << "%\n";
      DenseMatrix m1(rows,cols), m2(rows,cols), m3(rows,cols);
      eiToDense(sm1, m1);
      eiToDense(sm2, m2);

      timer.reset();
      timer.start();
      for (int k=0; k<REPEAT; ++k)
        m3 = m1 * m2;
      timer.stop();
      std::cout << "   a * b:\t" << timer.value() << endl;

      timer.reset();
      timer.start();
      for (int k=0; k<REPEAT; ++k)
        m3 = m1.transpose() * m2;
      timer.stop();
      std::cout << "   a' * b:\t" << timer.value() << endl;

      timer.reset();
      timer.start();
      for (int k=0; k<REPEAT; ++k)
        m3 = m1.transpose() * m2.transpose();
      timer.stop();
      std::cout << "   a' * b':\t" << timer.value() << endl;

      timer.reset();
      timer.start();
      for (int k=0; k<REPEAT; ++k)
        m3 = m1 * m2.transpose();
      timer.stop();
      std::cout << "   a * b':\t" << timer.value() << endl;
    }
    #endif

    // eigen sparse matrices
    {
      std::cout << "Eigen sparse\t" << sm1.nonZeros()/(float(sm1.rows())*float(sm1.cols()))*100 << "% * "
                << sm2.nonZeros()/(float(sm2.rows())*float(sm2.cols()))*100 << "%\n";

      BENCH(sm3 = sm1 * sm2; )
      std::cout << "   a * b:\t" << timer.value() << endl;

//       BENCH(sm3 = sm1.transpose() * sm2; )
//       std::cout << "   a' * b:\t" << timer.value() << endl;
// //
//       BENCH(sm3 = sm1.transpose() * sm2.transpose(); )
//       std::cout << "   a' * b':\t" << timer.value() << endl;
// //
//       BENCH(sm3 = sm1 * sm2.transpose(); )
//       std::cout << "   a * b' :\t" << timer.value() << endl;


//       std::cout << "\n";
//
//       BENCH( sm3._experimentalNewProduct(sm1, sm2); )
//       std::cout << "   a * b:\t" << timer.value() << endl;
//
//       BENCH(sm3._experimentalNewProduct(sm1.transpose(),sm2); )
//       std::cout << "   a' * b:\t" << timer.value() << endl;
// //
//       BENCH(sm3._experimentalNewProduct(sm1.transpose(),sm2.transpose()); )
//       std::cout << "   a' * b':\t" << timer.value() << endl;
// //
//       BENCH(sm3._experimentalNewProduct(sm1, sm2.transpose());)
//       std::cout << "   a * b' :\t" << timer.value() << endl;
    }

    // eigen dyn-sparse matrices
    /*{
      DynamicSparseMatrix<Scalar> m1(sm1), m2(sm2), m3(sm3);
      std::cout << "Eigen dyn-sparse\t" << m1.nonZeros()/(float(m1.rows())*float(m1.cols()))*100 << "% * "
                << m2.nonZeros()/(float(m2.rows())*float(m2.cols()))*100 << "%\n";

//       timer.reset();
//       timer.start();
      BENCH(for (int k=0; k<REPEAT; ++k) m3 = m1 * m2;)
//       timer.stop();
      std::cout << "   a * b:\t" << timer.value() << endl;
//       std::cout << sm3 << "\n";

      timer.reset();
      timer.start();
//       std::cerr << "transpose...\n";
//       EigenSparseMatrix sm4 = sm1.transpose();
//       std::cout << sm4.nonZeros() << " == " << sm1.nonZeros() << "\n";
//       exit(1);
//       std::cerr << "transpose OK\n";
//       std::cout << sm1 << "\n\n" << sm1.transpose() << "\n\n" << sm4.transpose() << "\n\n";
      BENCH(for (int k=0; k<REPEAT; ++k) m3 = m1.transpose() * m2;)
//       timer.stop();
      std::cout << "   a' * b:\t" << timer.value() << endl;

//       timer.reset();
//       timer.start();
      BENCH( for (int k=0; k<REPEAT; ++k) m3 = m1.transpose() * m2.transpose(); )
//       timer.stop();
      std::cout << "   a' * b':\t" << timer.value() << endl;

//       timer.reset();
//       timer.start();
      BENCH( for (int k=0; k<REPEAT; ++k) m3 = m1 * m2.transpose(); )
//       timer.stop();
      std::cout << "   a * b' :\t" << timer.value() << endl;
    }*/

    // CSparse
    #ifdef CSPARSE
    {
      std::cout << "CSparse \t" << nnzPerCol << "%\n";
      cs *m1, *m2, *m3;
      eiToCSparse(sm1, m1);
      eiToCSparse(sm2, m2);

//       timer.reset();
//       timer.start();
//       for (int k=0; k<REPEAT; ++k)
      BENCH(
      {
        m3 = cs_sorted_multiply(m1, m2);
        if (!m3)
        {
          std::cerr << "cs_multiply failed\n";
//           break;
        }
//         cs_print(m3, 0);
        cs_spfree(m3);
      }
      );
//       timer.stop();
      std::cout << "   a * b:\t" << timer.value() << endl;

//       BENCH( { m3 = cs_sorted_multiply2(m1, m2); cs_spfree(m3); } );
//       std::cout << "   a * b:\t" << timer.value() << endl;
    }