/*
 * Benchmark cblas_dgemm for every MKL thread count from 1 up to
 * mkl_get_max_threads().
 *
 * For each thread count the output matrix is cleared, the product is computed
 * LOOP_COUNT times, the average time per call is printed, and the resulting
 * matrix is dumped as integers.
 *
 * M, N, P and LOOP_COUNT are defined outside this function.
 *
 * Returns 0 on success, 1 if a matrix could not be allocated.
 */
int main()
{
    double *A, *B, *C;
    int i, j, r, max_threads, size;
    double alpha, beta;
    double s_initial, s_elapsed;

    printf("Intializing data for matrix multiplication C=A*B for matrix\n\n"
            " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N);
    alpha = 1.0;
    beta = 0.0;

    printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n");
    A = ( double *)mkl_malloc(M*P*sizeof( double ),64);
    B = ( double *)mkl_malloc(N*P*sizeof( double ),64);
    C = ( double *)mkl_malloc(M*N*sizeof( double ),64);
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: can`t allocate memory for matrices.\n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf("Intializing matrix data\n\n");
    size = M*P;
    for (i = 0; i < size; ++i)
    {
        A[i] = ( double )(i+1);
    }
    size = N*P;
    for (i = 0; i < size; ++i)
    {
        B[i] = ( double )(i-1);
    }

    printf("Finding max number of threads can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf("Running from 1 to %i threads \n\n",max_threads);
    /* i is the thread count of the current run; inner loops must not touch it. */
    for (i = 1; i <= max_threads; ++i)
    {
        /* Start every run from a zeroed output matrix. */
        size = M*N;
        for (j = 0; j < size; ++j)
        {
            C[j] = 0.0;
        }

        printf("Requesting to use %i threads \n\n",i);
        mkl_set_num_threads(i);

        printf("Measuring performance of matrix product using dgemm function\n"
                " via CBLAS interface on %i threads \n\n",i);
        s_initial = dsecnd();
        for (r = 0; r < LOOP_COUNT; ++r)
        {
            /* Row-major, no transposes: A is M x P (lda P), B is P x N (ldb N), C is M x N (ldc N). */
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N);
        }
        /* Average wall time of a single dgemm call, in seconds. */
        s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

        printf("Matrix multiplication using dgemm completed \n"
                " at %.5f milliseconds using %d threads \n\n",
                (s_elapsed * 1000),i);
        printf("Output the result: \n");
        /*
         * Print with j, not i: reusing i here overwrote the thread counter
         * with M*N and made the outer loop stop (or skip counts) after the
         * first run.
         */
        size = M*N;
        for (j = 0; j < size; ++j)
        {
            printf("%i\t",(int)C[j]);
            if (j % N == N - 1)
                printf("\n");
        }
    }

    printf("Dellocating memory\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    return 0;
}
Beispiel #2
0
/*
 * Make the OpenMP thread count equal to MKL's maximum thread count.
 * Compiles to an empty function unless both _OPENMP and TH_BLAS_MKL are defined.
 */
TH_API void THInferNumThreads(void)
{
#if defined(_OPENMP) && defined(TH_BLAS_MKL)
  /*
   * With MKL and OpenMP both in use the two thread counts have to agree.
   * If they differ, MKL and our OpenMP-enabled functions keep resizing the
   * OpenMP thread pool, which hurts performance (and leaks memory with
   * GCC 5.4).
   */
  const int mklThreadCount = mkl_get_max_threads();
  omp_set_num_threads(mklThreadCount);
#endif
}
Beispiel #3
0
// --------------------
// Print the library version, the OpenCL platform version, optional
// OpenMP / MKL / ACML information, and one line per OpenCL device.
// Used in testing.
//
// The return codes of the clGet*Info queries are not inspected. All output
// buffers are therefore cleared before each query, so that a failed query
// prints an empty string or zero instead of indeterminate stack contents.
extern "C" void
magma_print_environment()
{
    magma_int_t major, minor, micro;
    magma_version( &major, &minor, &micro );
    printf( "%% clMAGMA %d.%d.%d %s\n",
            (int) major, (int) minor, (int) micro, MAGMA_VERSION_STAGE );

    // CUDA, OpenCL, OpenMP, MKL, ACML versions all printed on same line
    char device_name[1024] = "";
    char driver[1024] = "";
    // device_name is reused here to hold the platform version string.
    clGetPlatformInfo( g_runtime.get_platform(), CL_PLATFORM_VERSION, sizeof(device_name), device_name, NULL );
    printf( "%% OpenCL platform %s.", device_name );
    
#if defined(_OPENMP)
    int omp_threads = 0;
    // Every thread of the parallel region stores the same team size.
    #pragma omp parallel
    {
        omp_threads = omp_get_num_threads();
    }
    printf( " OpenMP threads %d.", omp_threads );
#else
    printf( " MAGMA not compiled with OpenMP." );
#endif

#if defined(MAGMA_WITH_MKL)
    MKLVersion mkl_version;
    mkl_get_version( &mkl_version );
    printf( " MKL %d.%d.%d, MKL threads %d.",
            mkl_version.MajorVersion,
            mkl_version.MinorVersion,
            mkl_version.UpdateVersion,
            mkl_get_max_threads() );
#endif
    
#if defined(MAGMA_WITH_ACML)
    int acml_major, acml_minor, acml_patch;
    acmlversion( &acml_major, &acml_minor, &acml_patch );
    printf( " ACML %d.%d.%d.", acml_major, acml_minor, acml_patch );
#endif

    printf( "\n" );
    
    // print devices
    int ndevices = g_runtime.get_num_devices();
    cl_device_id* devices = g_runtime.get_devices();
    for( int dev=0; dev < ndevices; ++dev ) {
        // Reset per device so values from the previous device (or the
        // platform query) are never reported for this one.
        cl_ulong mem_size = 0, alloc_size = 0;
        device_name[0] = '\0';
        driver[0] = '\0';
        clGetDeviceInfo( devices[dev], CL_DEVICE_NAME,               sizeof(device_name), device_name, NULL );
        clGetDeviceInfo( devices[dev], CL_DEVICE_GLOBAL_MEM_SIZE,    sizeof(mem_size),    &mem_size,   NULL );
        clGetDeviceInfo( devices[dev], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_size),  &alloc_size, NULL );
        clGetDeviceInfo( devices[dev], CL_DRIVER_VERSION,            sizeof(driver),      driver,      NULL );
        // Sizes are reported by OpenCL in bytes and printed in MiB.
        printf( "%% Device: %s, %.1f MiB memory, max allocation %.1f MiB, driver  %s\n",
                device_name, mem_size/(1024.*1024.), alloc_size/(1024.*1024.), driver );
    }
}
// Report how many threads the host LAPACK/BLAS will use.
//
// Built with MAGMA_WITH_MKL: MKL's maximum thread count.
// Otherwise, built with OpenMP: the team size observed inside a parallel region.
// Otherwise: 1.
magma_int_t magma_get_lapack_numthreads()
{
#if defined(MAGMA_WITH_MKL)
    magma_int_t nthreads = 1;
    nthreads = mkl_get_max_threads();
#elif defined(_OPENMP)
    magma_int_t nthreads = 1;
    // Each thread of the team stores the same value.
    #pragma omp parallel
    {
        nthreads = omp_get_num_threads();
    }
#else
    magma_int_t nthreads = 1;
#endif

    return nthreads;
}
// Record the current thread setting of each threading backend that is
// compiled in (MKL, OpenMP, OpenBLAS) and then force that backend to a single
// thread. Backends that are not compiled in keep the recorded value 1.
// NOTE(review): presumably the recorded values are restored elsewhere
// (e.g. in the destructor) -- not visible from this definition.
DisableThreadingInBlock::DisableThreadingInBlock()
  : mklNumThreads(1)
  , ompNumThreads(1)
  , openblasNumThreads(1)
{
#if defined(HAVE_MKL_H)
    // MKL: remember the maximum, then go single-threaded.
    mklNumThreads = mkl_get_max_threads();
    mkl_set_num_threads(1);
#endif

#if defined(_OPENMP)
    // OpenMP: remember the maximum, then go single-threaded.
    ompNumThreads = omp_get_max_threads();
    omp_set_num_threads(1);
#endif

#if defined(OPENBLAS_DISABLE_THREADS)
    // OpenBLAS: remember the processor count it reports, then go single-threaded.
    openblasNumThreads = goto_get_num_procs();
    openblas_set_num_threads(1);
#endif

    // Touch every member so builds without a given backend do not warn about
    // unused private members.
    (void) mklNumThreads;
    (void) ompNumThreads;
    (void) openblasNumThreads;
}
Beispiel #6
0
/*
 * Configure the file-level thread-count settings and run update_div() on
 * W, H and X.
 *
 * threads   requested thread count. Any value <= 0, or a value larger than
 *           omp_get_max_threads(), selects the OpenMP / MKL maxima instead.
 * max_iter  iteration limit forwarded to update_div().
 * verbose   verbosity flag forwarded to update_div().
 *
 * Always returns 0.
 */
int run_nmf(matrix X, matrix W, matrix H, int threads, int max_iter, int verbose)
{
    int i;
    double timers[TIMERS];

    /*
     * "<= 0" rather than "== 0": a negative request would otherwise be
     * stored verbatim as the OpenMP and MKL thread count.
     */
    if (threads <= 0 || threads > omp_get_max_threads())
    {
        omp_threads = omp_get_max_threads();
        mkl_threads = mkl_get_max_threads();
    }
    else
    {
        omp_threads = threads;
        mkl_threads = threads;
    }

    /* Every per-kernel thread setting follows the OpenMP thread count. */
    eps_threads = omp_threads;
    vecdiv_threads = omp_threads;
    vecmult_threads = omp_threads;
    sumrows_threads = omp_threads;
    sumcols_threads = omp_threads;
    coldiv_threads = omp_threads;
    rowdiv_threads = omp_threads;
    check_threads = omp_threads;

    /* Timers start at zero; update_div() receives the array. */
    for (i = 0; i < TIMERS; i++)
        timers[i] = 0;

    update_div(W,H,X,CONVERGE_THRESH,max_iter,timers,verbose);

    return 0;
}
// Allocate and initialise a dissection solver handle.
//
//   dslv_            [out] receives the address of a newly allocated
//                    dissection_solver_ptr, stored as a 64-bit integer.
//   called           counter stored in the handle; also part of the log file name.
//   real_or_complex  DISSECTION_REAL_MATRIX or DISSECTION_COMPLEX_MATRIX;
//                    selects which DissectionSolver is instantiated. Any other
//                    value creates no solver (reported when verbose).
//   nthreads         > 0  : number of threads to use.
//                    -1   : take the value from the NTHREADS environment
//                           variable, or 1 if it is unset or not a number.
//                    other: 1 thread.
//   verbose          > 0 writes diagnostics to
//                    "dissection.<pid>.<called>.log" (appended); if that file
//                    cannot be opened, diagnostics go to stderr.
DISSECTION_API void DISS_INIT(uint64_t &dslv_,
			      const int &called,
			      const int &real_or_complex,
			      const int &nthreads,
			      const int &verbose)
{
  // Default used whenever no valid thread count is supplied. Previously the
  // variable was left uninitialised for nthreads == 0, nthreads < -1 and for
  // an unparsable NTHREADS value.
  int num_threads = 1;
  dissection_solver_ptr *dslv;
  dslv_ = (uint64_t)new dissection_solver_ptr;
  dslv = (dissection_solver_ptr *)dslv_;
  dslv->real_or_complex = real_or_complex;
  dslv->quad_fact = false;
  dslv->called = called;
  dslv->symbolic = 0;
  dslv->numeric = 0;
  dslv->verbose = (verbose > 0);

  // Diagnostics stream: stderr unless a per-process log file can be opened.
  dslv->fp = stderr;
  if (dslv->verbose) {
    int pid = (int)getpid();
    char fname[256];
    fprintf(stderr, "pid = %d\n", pid);
    // Two %04d integers plus the fixed text always fit in 256 bytes.
    sprintf(fname, "dissection.%04d.%04d.log", pid, called);
    FILE *logfp = fopen(fname, "a");
    if (logfp != NULL) {
      dslv->fp = logfp;
    }
    else {
      // Keep stderr instead of storing NULL, which every later fprintf
      // on dslv->fp would dereference.
      fprintf(stderr, "%s %d : diss_init : can not open %s, using stderr\n",
              __FILE__, __LINE__, fname);
    }
  }
  if (dslv->verbose) {
    fprintf(dslv->fp, "%s %d : diss_init : called = %d\n",
            __FILE__, __LINE__,  dslv->called);
  }

  //  _called++;                   // counter for dumping matrix data to debug
#ifdef BLAS_MKL
  {
    // MKL_NUM_THREADS wins when it is set and parses as an integer;
    // otherwise use MKL's own maximum.
    const char *mkl_env = getenv("MKL_NUM_THREADS");
    if (mkl_env != NULL &&
        sscanf(mkl_env, "%d", &dslv->mkl_num_threads) == 1) {
      if (dslv->verbose) {
        fprintf(dslv->fp,
                "environmental variable MKL_NUM_THREADS = %d\n",
                dslv->mkl_num_threads);
      }
    }
    else {
      dslv->mkl_num_threads = mkl_get_max_threads();
    }
  }
  if (dslv->verbose) {
    fprintf(dslv->fp,
            "MKL_NUM_THREADS = %d\n", dslv->mkl_num_threads);
  }
#endif

  if (nthreads > 0) {
    num_threads = nthreads;
  }
  else if (nthreads == (-1)) {
    const char *nthreads_env = getenv("NTHREADS");
    int parsed = 0;
    if (nthreads_env != NULL && sscanf(nthreads_env, "%d", &parsed) == 1) {
      num_threads = parsed;
    }
  }

  switch(real_or_complex) {
  case DISSECTION_REAL_MATRIX:
    dslv->rptr = new DissectionSolver<double>(num_threads,
                                              (verbose != 0),
                                              dslv->called, dslv->fp);
    break;
  case DISSECTION_COMPLEX_MATRIX:
    dslv->cptr = new DissectionSolver<complex<double>, double>(num_threads,
                                                               (verbose != 0),
                                                               dslv->called, dslv->fp);
    break;
  default:
    if (dslv->verbose) {
      fprintf(dslv->fp, "%s %d : unknown matrix data type : %d\n",
              __FILE__, __LINE__, dslv->real_or_complex);
    }
  }
}
Beispiel #8
0
// parSymSGD driver.
// X: an MxD matrix, Y: an M vector, W and Wreal: D vectors,
// Z: an MxD matrix, B: a panels x D matrix, I: a DxD matrix.
//
// Command line: M D T C lamda r (all optional, see the usage text).
// For nt = 1, 2, 4, ... (bounded by the MKL maximum and the panel size) the
// preprocessing and gradient-descent phases are timed REPEATS times each.
//
// Returns 0 on success, 1 for the usage text or invalid arguments,
// 2 when a buffer could not be allocated.
int main(int argc, char ** argv){
    if (argc>1 && argv[1][0]=='h') {
        printf ("Usage: parSymSGD M D T C lamda r\n");
        printf ("  M: number of data points, D: dimensions, T: time iterations, C: cores;\n");
        printf ("  lamda: learning rate, r: panel size in unit of C.\n");
        return 1;
    }
    // read in the arguments: M, D, T (time iterations), C (cores), r (each panel contains r*C points)
    int M = argc>1?atoi(argv[1]):32;
    int D = argc>2?atoi(argv[2]):4;
    T = argc>3?atoi(argv[3]):10;
    int C = argc>4?atoi(argv[4]):4;
    float lamda = argc>5?atof(argv[5]):0.01;
    int r = argc>6?atoi(argv[6]):1;
    ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r);

    // Sizes feed the allocations below and C*r is used as a divisor, so
    // reject anything that is not strictly positive (atoi yields 0 for
    // non-numeric input).
    if (M <= 0 || D <= 0 || C <= 0 || r <= 0) {
        printf("Invalid arguments: M, D, C and r must be positive.\n");
        return 1;
    }

    int max_threads = mkl_get_max_threads(); // get the max number of threads

    int rc = 0;
    int rep;
    mkl_set_num_threads(1); // set the number of threads to use by mkl
    panelSz = C*r;
    panels = M/panelSz;

    int p,t;
    float *Y, *Wreal, *W, *X;
    Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE);
    Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE);
    W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE);
    X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE);
    float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE);
    float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE);
    float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE);
    float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE);
    float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE);

    if (Y==NULL || Wreal==NULL || W==NULL || X==NULL || Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I==NULL){
        printf("Memory allocation error.\n");
        // Release whichever buffers were obtained before reporting failure.
        rc = 2;
        goto cleanup;
    }

    initData(Wreal,W,X,Y, M, D,I);

    ///printf("panelSz=%d, panels=%d\n", panelSz, panels);

    // Thread counts double each round: 1, 2, 4, ...
    for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){
        omp_set_num_threads(nt);// set the number of openMP threads

        for (rep=0; rep<REPEATS; rep++){//repeat measurements
            double prepTime, gdTime, sInit;
            // preprocessing
            sInit=dsecnd();
            //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda);
            preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda);
            prepTime = (dsecnd() - sInit);
            ///dump2("Z",Z,M,D);
            ///dump2("B",B,panels,D);

            // GD
            initW(W,D);
            ///dump1("W (initial)", W, D);
            sInit=dsecnd();
            float err;
            float fixpoint = 0.0;
            for (t=0;t<T;t++){
                // Each panel covers panelSz consecutive rows of X and Z and one row of B.
                for (p=0;p<panels;p++){
                    gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I);
                    ///printf("(t=%d, p=%d) ",t,p);
                    ///dump1("W", W, D);
                    ///err=calErr(X, Ypred, Ytmp, Y, W, M, D);
                  printf("finish  one  panels     ............................  \n");
                }
            }
            gdTime = (dsecnd() - sInit);

            err=calErr(X, Ypred, Ytmp, Y, W, M, D);
            // NOTE(review): fixpoint is computed but not used afterwards, and
            // prev_err is not updated in this function -- confirm intent.
            fixpoint = err - prev_err;
            (void) fixpoint;

            // print final err. time is in milliseconds
            printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err);
        }
    }

cleanup:
    if (B) mkl_free(B);
    if (Z) mkl_free(Z);
    if (Ytmp) mkl_free(Ytmp);
    if (Ypred) mkl_free(Ypred);
    if (Y) mkl_free(Y);
    if (Wreal) mkl_free(Wreal);
    if (W) mkl_free(W);
    if (X) mkl_free(X);
    if (I) mkl_free(I);
    return rc;
}
// Evaluate the reduced stiffness matrix at reduced coordinates q.
//
//   q   input vector of length r.
//   Rq  output, written as a symmetric r x r matrix through ELT(r, row, col).
//
// The unique (upper-triangular) entries are accumulated in buffer1 as
//   free terms + linear terms (linearCoef_^T * q)
//              + quadratic terms (quadraticCoef_^T * qiqj)
// and then mirrored into Rq. qiqj and buffer1 are scratch members.
//
// When useSingleThread is set, MKL is limited to one thread (with dynamic
// adjustment off) for the duration of the call on Windows and Linux builds,
// and the previous settings are restored before returning.
void StVKReducedStiffnessMatrix::Evaluate(double * q, double * Rq)
{
  // this is same as EvaluateSubset with start=0, end=quadraticSize

  // Unoptimized reference formulation, kept for documentation:
  /*
  int i,j,k;
  int output;

  // reset to free terms
  int index = 0;
  int indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      Rq[indexEntry] = freeCoef_[index];
      index++;
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add linear terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
      {
        Rq[indexEntry] += linearCoef_[index] * q[j];
        index++;
      }
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add quadratic terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
        for(k=j; k<r; k++)
        {
          Rq[indexEntry] += quadraticCoef_[index] * q[j] * q[k];
          index++;
        }
        indexEntry++;
    }
    indexEntry += output + 1;
  }

  // make symetric
  for(output=0; output<r; output++)
    for(i=0; i<output; i++)
      Rq[ELT(r,i,output)] = Rq[ELT(r,output,i)];
  */

  if (useSingleThread)
  {
    // _WIN32 is tested as well as WIN32 (as StVKReducedInternalForces::Evaluate
    // does): WIN32 is only defined by some headers / project settings, so
    // without _WIN32 the single-thread request could be silently skipped.
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      // remember the current MKL settings so they can be restored on exit
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // reset to free terms
  memcpy(buffer1,freeCoef_,sizeof(double)*quadraticSize);

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x quadraticSize array
  cblas_dgemv(CblasColMajor, CblasTrans, 
        r, quadraticSize,
        1.0,
        linearCoef_, r,
        q, 1,
        1.0,
        buffer1, 1);

  // compute qiqj: products q[a]*q[b] for a <= b, stored row by row
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }
 
  // update Rq
  // quadraticCoef_ is quadraticSize x quadraticSize matrix
  // each column gives quadratic coef for one matrix entry
  cblas_dgemv(CblasColMajor, CblasTrans, 
        quadraticSize, quadraticSize,
        1.0,
        quadraticCoef_, quadraticSize,
        qiqj, 1,
        1.0,
        buffer1, 1);

  // unpack into a symmetric matrix: buffer1 holds the entries (i1, j1) with
  // j1 >= i1, row by row; each one is written to both (i1,j1) and (j1,i1)
  int i1=0,j1=0;
  for(int i=0; i< quadraticSize; i++)
  {
    Rq[ELT(r,i1,j1)] = buffer1[i];
    Rq[ELT(r,j1,i1)] = buffer1[i];
    j1++;
    if(j1 == r)
    {
      // row finished: the next row starts on its diagonal element
      i1++;
      j1 = i1;
    }
  }

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      // restore the MKL settings saved at the top of the call
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
Beispiel #10
0
/*
 * ai-parse entry point.
 *
 * Parses the command line, configures the number of MKL threads, validates
 * the required options and then runs the selected stage:
 *   optimize  -> optimize() and dump the model to "<modelname>.model"
 *   parse     -> load "<modelname>.model" and call parseall()
 *   train     -> accepted by validation but not implemented here
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on any validation or
 * I/O error (the check()/check_mem() macros and the explicit "goto error"
 * statements jump to the error label).
 */
int main(int argc, char** argv) {


    int maxnumit = 0;
    int maxrec = -1;

    const char *budget_type_str = NULL;
    const char *stage = NULL;
    const char *training = NULL;
    const char *dev = NULL;
    const char *path = NULL;
    const char * etransform_str = NULL;
    const char *kernel_str = NULL;
    const char *rbf_lambda_str = NULL;

#ifdef NDEBUG
    log_info("ai-parse %s (Release)", VERSION);
#else
    log_info("ai-parse %s (Debug)", VERSION);
#endif

    struct argparse_option options[] = {
        OPT_HELP(),
        //OPT_BOOLEAN('f', "force", &force, "force to do", NULL),
        OPT_INTEGER('v', "verbosity", &verbosity, "Verbosity level. Minimum (Default) 0. Increasing values increase parser verbosity.", NULL),
        OPT_STRING('o', "modelname", &modelname, "Model name", NULL),
        OPT_STRING('p', "path", &path, "CoNLL base directory including sections", NULL),
        OPT_STRING('s', "stage", &stage, "[ optimize | train | parse ]", NULL),
        OPT_INTEGER('n', "maxnumit", &maxnumit, "Maximum number of iterations by perceptron. Default is 50", NULL),
        OPT_STRING('t', "training", &training, "Training sections for optimize and train. Apply sections for parse", NULL),
        OPT_STRING('d', "development", &dev, "Development sections for optimize", NULL),
        OPT_STRING('e', "epattern", &epattern, "Embedding Patterns", NULL),
        OPT_INTEGER('l', "edimension", &edimension, "Embedding dimension", NULL),
        OPT_INTEGER('m', "maxrec", &maxrec, "Maximum number of training instance", NULL),
        OPT_STRING('x', "etransform", &etransform_str, "Embedding Transformation", NULL),
        OPT_STRING('k', "kernel", &kernel_str, "Kernel Type", NULL),
        OPT_INTEGER('a', "bias", &bias, "Polynomial kernel additive term. Default is 1", NULL),
        OPT_INTEGER('c', "concurrency", &num_parallel_mkl_slaves, "Parallel MKL Slaves. Default is 90% of all machine cores", NULL),
        OPT_INTEGER('b', "degree", &polynomial_degree, "Degree of polynomial kernel. Default is 4", NULL),
        /* trailing NULL added so this entry has the same shape as all others */
        OPT_STRING('z', "lambda", &rbf_lambda_str, "Lambda multiplier for RBF Kernel.Default value is 0.025", NULL),
        OPT_STRING('u', "budget_type", &budget_type_str, "Budget control methods. NONE|RANDOM", NULL),
        OPT_INTEGER('g', "budget_size", &budget_target, "Budget Target for budget based perceptron algorithms. Default 50K", NULL),
        OPT_END(),
    };
    struct argparse argparse;
    argparse_init(&argparse, options, usage, 0);
    argc = argparse_parse(&argparse, argc, argv);

    int max_threads = mkl_get_max_threads();
    log_info("There are max %d MKL threads", max_threads);

    /* -1 means "not given on the command line": use 90% of the MKL maximum, at least 1. */
    if (num_parallel_mkl_slaves == -1) {

        num_parallel_mkl_slaves = (int) (max_threads * 0.9);

        if (num_parallel_mkl_slaves == 0)
            num_parallel_mkl_slaves = 1;

    }

    log_info("Number of MKL Slaves is set to be %d", num_parallel_mkl_slaves);
    mkl_set_num_threads(num_parallel_mkl_slaves);

    if (1 == mkl_get_dynamic())
        log_info("Intel MKL may use less than %i threads for a large problem", num_parallel_mkl_slaves);
    else
        log_info("Intel MKL should use %i threads for a large problem", num_parallel_mkl_slaves);

    check(stage != NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0 || strcmp(stage, "parse") == 0),
            "Choose one of -s optimize, train, parse");

    check(path != NULL, "Specify a ConLL base directory using -p");

    check(edimension != 0, "Set embedding dimension using -l");

    check(modelname != NULL, "Provide model name using -o");

    if (budget_type_str != NULL) {
        if (strcmp(budget_type_str, "RANDOM") == 0 || strcmp(budget_type_str, "RANDOMIZED") == 0) {
            budget_method = RANDOMIZED;
        } else if (strcmp(budget_type_str, "NONE") == 0) {
            budget_method = NONE;

        } else {
            log_err("Unknown budget control type %s", budget_type_str);
            goto error;
        }

    } else {
        budget_method = NONE;
    }



    if (training == NULL) {
        log_warn("training section string is set to %s", DEFAULT_TRAINING_SECTION_STR);

        training = strdup(DEFAULT_TRAINING_SECTION_STR);
    }

    if (dev == NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0)) {
        log_info("development section string is set to %s", DEFAULT_DEV_SECTION_STR);

        dev = strdup(DEFAULT_DEV_SECTION_STR);
    }

    check(epattern != NULL, "Embedding pattern is required for -s optimize,train,parse");

    if (etransform_str == NULL) {
        log_info("Embedding transformation is set to be QUADRATIC");

        etransform = DEFAULT_EMBEDDING_TRANFORMATION;
    } else if (strcmp(etransform_str, "LINEAR") == 0) {
        etransform = LINEAR;
    } else if (strcmp(etransform_str, "QUADRATIC") == 0) {
        etransform = QUADRATIC;
    } else if (strcmp(etransform_str, "CUBIC") == 0) {
        etransform = CUBIC;
    } else {
        log_err("Unsupported transformation type for embedding %s", etransform_str);
        /* Abort like the other invalid-option branches instead of running
         * with whatever value etransform happened to hold. */
        goto error;
    }

    if (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0) {

        if (maxnumit <= 0) {
            log_info("maxnumit is set to %d", DEFAULT_MAX_NUMIT);

            maxnumit = DEFAULT_MAX_NUMIT;
        }
    }

    if (kernel_str != NULL) {
        if (strcmp(kernel_str, "LINEAR") == 0) {
            /* LINEAR is advertised by the error message below and handled
             * by the stage code (KLINEAR), so accept it explicitly. */
            kernel = KLINEAR;
        } else if (strcmp(kernel_str, "POLYNOMIAL") == 0) {

            /* NOTE(review): bias is filled through OPT_INTEGER above but
             * printed with %f here -- confirm its declared type. */
            log_info("Polynomial kernel will be used with bias %f and degree %d", bias, polynomial_degree);

            kernel = KPOLYNOMIAL;
        } else if (strcmp(kernel_str, "GAUSSIAN") == 0 || strcmp(kernel_str, "RBF") == 0) {

            if (rbf_lambda_str != NULL) {
                rbf_lambda = (float) atof(rbf_lambda_str);
            }

            log_info("RBF/GAUSSIAN kernel will be used with lambda %f ", rbf_lambda);

            kernel = KRBF;


        } else {
            log_err("Unsupported kernel type %s. Valid options are LINEAR, POLYNOMIAL, and RBF/GAUSSIAN", kernel_str);
            goto error;
        }
    }

    if (strcmp(stage, "optimize") == 0) {
        void *model = optimize(maxnumit, maxrec, path, training, dev, edimension);

        /* +7 = strlen(".model") + terminating NUL */
        char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7));
        check_mem(model_filename);

        sprintf(model_filename, "%s.model", modelname);

        FILE *fp = fopen(model_filename, "w");

        /* Same guard as in the parse stage: never hand a NULL stream to the dump routines. */
        check(fp != NULL, "%s could not be opened", model_filename);

        if (kernel == KLINEAR) {

            PerceptronModel pmodel = (PerceptronModel) model;

            dump_PerceptronModel(fp, edimension, pmodel->embedding_w_best, pmodel->best_numit);

            PerceptronModel_free(pmodel);
        } else if (kernel == KPOLYNOMIAL || kernel == KRBF) {
            KernelPerceptron kpmodel = (KernelPerceptron) model;

            dump_KernelPerceptronModel(fp, kpmodel);
        }

        log_info("Model is dumped into %s file", model_filename);


        fclose(fp);

        free(model_filename);

    } else if (strcmp(stage, "parse") == 0) {
        /* +7 = strlen(".model") + terminating NUL */
        char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7));
        check_mem(model_filename);

        sprintf(model_filename, "%s.model", modelname);
        FILE *fp = fopen(model_filename, "r");

        check(fp != NULL, "%s could not be opened", model_filename);

        void *model;
        if (kernel == KLINEAR)
            model = load_PerceptronModel(fp);
        else
            model = load_KernelPerceptronModel(fp);

        fclose(fp);

        check(model != NULL, "Error in loading model file");

        log_info("Model loaded from %s successfully", model_filename);

        free(model_filename);

        parseall(model, path, training, edimension);
    } else {
        log_info("Waiting for implementation");
    }



    return (EXIT_SUCCESS);
error:

    return (EXIT_FAILURE);

}
Beispiel #11
0
  // Worker loop run by thread number `thd`.
  //
  // The thread is mapped to a node, registers itself as an active worker and
  // then, until `done` becomes true, repeatedly:
  //   1. waits (sleeping or yielding) while no job newer than the last one it
  //      saw completed is posted in `jobnr`,
  //   2. joins the job on its node through the node's `participate` counter,
  //   3. takes task indices from the node's `start_cnt` and runs `func` on
  //      each of them,
  //   4. leaves the job; the worker that finds `participate` equal to 1
  //      either reopens the gate for a newer job or marks the node complete.
  // An Exception thrown by a task is copied into `ex` and stops further
  // hand-out of tasks on this node.
  void TaskManager :: Loop(int thd)
  {
    /*
    static Timer tADD("add entry counter");
    static Timer tCASready1("spin-CAS ready tick1");
    static Timer tCASready2("spin-CAS ready tick2");
    static Timer tCASyield("spin-CAS yield");
    static Timer tCAS1("spin-CAS wait");
    static Timer texit("exit zone");
    static Timer tdec("decrement");
    */
    thread_id = thd;

    int thds = GetNumThreads();

    // spread the thds threads proportionally over the num_nodes nodes
    int mynode = num_nodes * thd/thds;

    NodeData & mynode_data = *(nodedata[mynode]);



    TaskInfo ti;
    ti.nthreads = thds;
    ti.thread_nr = thd;
    // ti.nnodes = num_nodes;
    // ti.node_nr = mynode;

      
#ifdef USE_NUMA
    numa_run_on_node (mynode);
#endif
    active_workers++;
    workers_on_node[mynode]++;
    // number of the last job this worker knows to be finished
    int jobdone = 0;


#ifdef USE_MKL
    // remember MKL's maximum and use one MKL thread on this worker thread;
    // the saved value is put back when the loop ends
    auto mkl_max = mkl_get_max_threads();
    mkl_set_num_threads_local(1);
#endif

    
    while (!done)
      {
        // pick up completions recorded for this node
        if (complete[mynode] > jobdone)
          jobdone = complete[mynode];

        // nothing new posted: back off and poll again
        if (jobnr == jobdone)
          {
            // RegionTracer t(ti.thread_nr, tCASyield, ti.task_nr);            
            if(sleep)
              this_thread::sleep_for(chrono::microseconds(sleep_usecs));
            else
              {
#ifdef WIN32
                this_thread::yield();
#else  // WIN32
                sched_yield();
#endif // WIN32
              }
            continue;
          }

        {
          // RegionTracer t(ti.thread_nr, tADD, ti.task_nr);

          // Bit 0 of participate is tested as the "job active" flag; every
          // worker inside the job contributes 2 to the counter.

          // non-atomic fast check ...
          if ( (mynode_data.participate & 1) == 0) continue;

          // NOTE(review): despite its name, oldval receives the result of the
          // compound assignment, i.e. presumably the value after adding 2 --
          // only bit 0 is inspected, which the addition does not change.
          int oldval = mynode_data.participate += 2;
          if ( (oldval & 1) == 0)
            { // job not active, going out again
              mynode_data.participate -= 2;
              continue;
            }
        }

        if (startup_function) (*startup_function)();
        
        // the share of the ntasks tasks that belongs to this node
        IntRange mytasks = Range(int(ntasks)).Split (mynode, num_nodes);
          
        try
          {
            
            while (1)
              {
                // cheap check first, then claim the next index with fetch_add
                if (mynode_data.start_cnt >= mytasks.Size()) break;
		int mytask = mynode_data.start_cnt.fetch_add(1, memory_order_relaxed);
                // another worker may have claimed the last index in between
                if (mytask >= mytasks.Size()) break;
                
                ti.task_nr = mytasks.First()+mytask;
                ti.ntasks = ntasks;
                
                {
                  RegionTracer t(ti.thread_nr, jobnr, RegionTracer::ID_JOB, ti.task_nr);
                  (*func)(ti);
                }
              }

          }
        // NOTE(review): the exception is caught by value, which copies it and
        // would slice a derived type -- consider catching by const reference.
        catch (Exception e)
          {
            {
              // cout << "got exception in TM" << endl; 
              // keep a copy of the exception in ex (replacing any earlier one)
              // and push start_cnt to the end so no more tasks are handed out
              lock_guard<mutex> guard(copyex_mutex);
              delete ex;
              ex = new Exception (e);
              mynode_data.start_cnt = mytasks.Size();
            }
          }

#ifndef __MIC__
        atomic_thread_fence (memory_order_release);     
#endif // __MIC__

        if (cleanup_function) (*cleanup_function)();

        jobdone = jobnr;

        // leave the job
        mynode_data.participate-=2;

	{
	  // The exchange succeeds only when participate is exactly 1, i.e. the
	  // flag bit is set and no worker contribution is left; it clears the flag.
	  int oldpart = 1;
	  if (mynode_data.participate.compare_exchange_strong (oldpart, 0))
	    {
              if (jobdone < jobnr.load())
                { // reopen gate
                  mynode_data.participate |= 1;                  
                }
              else
                {
                  // reset the task counter (node 0 is left untouched here)
                  // and record the job as complete for this node
                  if (mynode != 0)
                    mynode_data.start_cnt = 0;
                  complete[mynode] = jobnr.load(); 
                }
	    }	      
	}
      }
    

#ifdef USE_MKL
    mkl_set_num_threads_local(mkl_max);
#endif

    workers_on_node[mynode]--;
    active_workers--;
  }
void StVKReducedInternalForces::Evaluate(double * q, double * fq)
{
/* // unoptimized version
  // reset to zero
  int i,j,k,l;
  for(l=0; l<r; l++)
    fq[l] = 0;

  // add linear terms
  int index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
    {
      fq[l] += linearCoef_[index] * q[i];
      index++;
    }

  // add quadratic terms
  index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
      for(j=i; j<r; j++)
      {
        fq[l] += quadraticCoef_[index] * q[i] * q[j];
        index++;
      }

  // add cubic terms
  index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
      for(j=i; j<r; j++)
        for(k=j; k<r; k++)
        {
          fq[l] += cubicCoef_[index] * q[i] * q[j] * q[k];
          index++;
        }
*/

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x r array
  cblas_dgemv(CblasColMajor, CblasTrans,
       r, r,
       1.0,
       linearCoef_, r,
       q, 1,
       0.0,
       fq, 1);

  // compute qiqj
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }

  // add quadratic terms
  // quadraticCoef_ is quadraticSize x r matrix
  // each column gives quadratic coef for one force vector component
  cblas_dgemv(CblasColMajor, CblasTrans,
       quadraticSize, r,
       1.0,
       quadraticCoef_, quadraticSize,
       qiqj, 1,
       1.0,
       fq, 1);

  // add cubic terms
  // cubicCoef_ is cubicSize x r matrix
  // each column gives cubicSize coef for one force vector component
  int size = quadraticSize;
  double * qiqjPos = qiqj;
  double * cubicCoefPos = cubicCoef_;
  for(int i=0; i<r; i++)
  {
    cblas_dgemv(CblasColMajor, CblasTrans,
        size, r,
        q[i],
        cubicCoefPos, cubicSize,
        qiqjPos, 1,
        1.0,
        fq, 1);

    int param = r-i;
    size -= param;
    qiqjPos += param;
    cubicCoefPos += param * (param+1) / 2;
  }

  if (addGravity)
  {
    for(int i=0; i<r; i++)
      fq[i] -= reducedGravityForce[i];
  }

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
Beispiel #13
0
/*
 * Matrix-multiply benchmark driver.
 *
 * Usage: bench dim1 dim2 dim3 iterations trials mkl_threads   (or "-h")
 *
 * Creates A (dim1 x dim2), B (dim2 x dim3) and C (dim1 x dim3), then for each
 * trial times `iterations` calls of matrix_multiply(A, B, C, mkl_threads) and
 * prints the per-trial time and the minimum over all trials.
 *
 * Defaults: MAT_DIM for every dimension (all three must be given to override),
 * MAX_ITER iterations, TRIALS trials, and MKL's maximum thread count.
 * Always returns 0.
 */
int main(int argc, char* argv[]){

    /* operands of the timed product */
    matrix A,B,C;

    int max_iter;
    if(argc > 1){
        if(!strcmp(argv[1],"-h")){
            printf("usage: bench matrix_dim1[100] matrix_dim2[100] matrix_dim3[100] iterations[100] trials[10] mkl_threads[#procs]\n");
            exit(0);
        }
    }
    if (argc > 3){
        matrix_dim[0] =  atoi(argv[1]);
        matrix_dim[1] =  atoi(argv[2]);
        matrix_dim[2] =  atoi(argv[3]);
    }
    else {
        matrix_dim[0] = MAT_DIM;
        matrix_dim[1] = MAT_DIM;
        matrix_dim[2] = MAT_DIM;
    }

    int num_trials;
    if(argc>5)
        num_trials = atoi(argv[5]);
    else
        num_trials = TRIALS;

    mkl_threads = mkl_get_max_threads();
    if (argc>6){
        int requested = atoi(argv[6]);
        /*
         * Only a positive count below the MKL maximum overrides the default.
         * atoi() returns 0 for non-numeric text, and a zero or negative value
         * must not be passed on as a thread count.
         */
        if (requested > 0 && requested < mkl_threads)
            mkl_threads = requested;
    }

    if (argc>4)
        max_iter = atoi(argv[4]);
    else
        max_iter = MAX_ITER;


    printf("mkl_threads: \t\t%i\n",mkl_threads);
    printf("matrix_dims: \t\t%i,%i,%i\n",matrix_dim[0],matrix_dim[1],matrix_dim[2]);


    create_matrix(&A, matrix_dim[0], matrix_dim[1], 1);
    create_matrix(&B, matrix_dim[1], matrix_dim[2], 1);
    create_matrix(&C, matrix_dim[0], matrix_dim[2], 1);


    /* best (smallest) trial time seen so far */
    double t_min = 1E9;
    double t = 0;


    int trial;
    int iter;
    for(trial=0;trial<num_trials;trial++){
        t = 0;

        /* elapsed time = end - start, accumulated as (-start) + end */
        t -= get_time();
        for(iter=0;iter<max_iter;iter++)
            matrix_multiply(A,B,C,mkl_threads);
        t += get_time();

        printf("%6i: %9.6f\n",trial,t);

        if(t < t_min)
            t_min = t;
    }
    printf("t_min: %9.6f\n",t_min);

    destroy_matrix(&A);
    destroy_matrix(&B);
    destroy_matrix(&C);

    return 0;
}
Beispiel #14
0
 MKLDisableThreading(bool condition) : num_threads{mkl_get_max_threads()} {
     if (condition)
         mkl_set_num_threads(1);
 }