Example #1
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocateInterleaved
 * Signature: (J)J
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocateInterleaved
    (JNIEnv *env, jobject obj, jlong capacity) {
  void* mem = numa_alloc_interleaved((size_t) capacity);
  if(mem != NULL) {
    return (jlong) mem;
  }
  throwException(env, obj, 11);
  return 0L;
}
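Since this variant returns the raw address to Java as a jlong, the allocation must later be released explicitly. A minimal sketch of a matching release function follows; the method name and signature here are illustrative assumptions, not taken from the original source:

JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_free
    (JNIEnv *env, jobject obj, jlong address, jlong capacity) {
  /* numa_free requires the same size that numa_alloc_interleaved received */
  if(address != 0L) {
    numa_free((void*) address, (size_t) capacity);
  }
}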
Example #2
JNIEXPORT jobject JNICALL Java_xerial_jnuma_NumaNative_allocInterleaved
(JNIEnv *env, jobject obj, jint capacity) {
    void* mem = numa_alloc_interleaved((size_t) capacity);
    if(mem == NULL) {
        // failed to allocate interleaved memory; return NULL so the
        // pending Java exception propagates once this method returns
        throwException(env, obj, 11);
        return NULL;
    }
    // wrap the native allocation in a direct ByteBuffer for the Java side
    return (*env)->NewDirectByteBuffer(env, mem, (jlong) capacity);
}
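Both JNI wrappers above wrap the same underlying libnuma call. A self-contained sketch of that allocation pattern in plain C (an illustrative program assuming <numa.h> and linking with -lnuma; it is not part of the original bindings):

#include <numa.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    /* numa_alloc_interleaved must only be used on a NUMA-capable system */
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return EXIT_FAILURE;
    }
    size_t bytes = (1 << 20) * sizeof(double);
    /* pages are spread round-robin across all allowed NUMA nodes */
    double *x = (double*) numa_alloc_interleaved(bytes);
    if (x == NULL) {
        fprintf(stderr, "interleaved allocation failed\n");
        return EXIT_FAILURE;
    }
    for (size_t i = 0; i < bytes / sizeof(double); ++i)
        x[i] = 1.0; /* touch every page */
    numa_free(x, bytes); /* size must match the allocation */
    return EXIT_SUCCESS;
}

Example #3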
int main( int argc, char** argv ) {
	struct timespec start, stop; 
	double time;

#ifndef NDEBUG
	std::cout << "-->WARNING: COMPILED *WITH* ASSERTIONS!<--" << std::endl;
#endif
	
	if( argc<=3 ) {
		std::cout << "Usage: " << argv[0] << " <mtx> <scheme> <x> <REP1> <REP2>" << std::endl << std::endl;
		std::cout << "calculates Ax=y and reports average time taken as well as the mean of y." << std::endl;
		std::cout << "with\t\t <mtx> filename of the matrix A in matrix-market or binary triplet format." << std::endl;
		std::cout << "    \t\t <scheme> number of a sparse scheme to use, see below." << std::endl;
		std::cout << "    \t\t <x> 0 for taking x to be the 1-vector, 1 for taking x to be random (fixed seed)." << std::endl;
		std::cout << "    \t\t <REP1> (optional, default is 1) number of repititions of the entire experiment." << std::endl;
		std::cout << "    \t\t <REP2> (optional, default is 1) number of repititions of the in-place SpMV multiplication, per experiment." << std::endl;
		std::cout << std::endl << "Possible schemes:" << std::endl;
		std::cout << " 0: TS (triplet scheme)" << std::endl;
		std::cout << " 1: CRS (also known as CSR)" << std::endl;
		std::cout << " 2: ICRS (Incremental CRS)" << std::endl;
		std::cout << " 3: ZZ-CRS (Zig-zag CRS)" << std::endl;
		std::cout << " 4: ZZ-ICRS (Zig-zag ICRS)" << std::endl;
		std::cout << " 5: SVM (Sparse vector matrix)" << std::endl;
		std::cout << " 6: HTS (Hilbert-ordered triplet scheme)" << std::endl;
		std::cout << " 7: BICRS (Bi-directional Incremental CRS)" << std::endl;
		std::cout << " 8: Hilbert (Hilbert-ordered triplets backed by BICRS)" << std::endl;
		std::cout << " 9: Block Hilbert (Sparse matrix blocking, backed by Hilbert and HBICRS)" << std::endl;
		std::cout << "10: Bisection Hilbert (Sparse matrix blocking by bisection, backed by Hilbert and HBICRS)" << std::endl;
		std::cout << "11: CBICRS (Compressed Bi-directional Incremental CRS)" << std::endl;
		std::cout << "12: Beta Hilbert (known as Block CO-H+ in the paper by Yzelman & Roose, 2012: parallel compressed blocked Hilbert with BICRS)" << std::endl;
		std::cout << "13: Row-distributed Beta Hilbert (known as Row-distributed block CO-H in the paper by Yzelman & Roose, 2012: same as 12, but simpler distribution)" << std::endl;
#ifdef WITH_CSB
		std::cout << "14: Row-distributed CSB (Uses CSB sequentially within the row-distributed scheme of 13)" << std::endl;
#endif
		std::cout << "15: Row-distributed Hilbert (Parallel row-distributed Hilbert scheme, see also 8)" << std::endl;
		std::cout << "16: Row-distributed parallel CRS (using OpenMP, known as OpenMP CRS in the paper by Yzelman & Roose, 2012)" << std::endl;
		std::cout << "17: Row-distributed SpMV using compressed Hilbert indices." << std::endl;
#ifdef WITH_MKL
		std::cout << "18: Intel MKL SpMV based on the CRS data structure." << std::endl;
#endif
		std::cout << "19: Optimised ICRS." << std::endl;
#ifdef WITH_CUDA
		std::cout << "20: CUDA CuSparse HYB format." << std::endl;
#endif
		std::cout << std::endl << "The in-place Ax=y calculation is preceded by a quasi pre-fetch." << std::endl;
		std::cout << "Add a minus sign before the scheme number to enable use of the CCS wrapper (making each CRS-based structure CCS-based instead)" << std::endl;
		std::cout << "Note: binary triplet format is machine-dependent. ";
		std::cout << "Take care when using the same binary files on different machine architectures." << std::endl;
		return EXIT_FAILURE;
	}

	std::string file = std::string( argv[1] );
	int scheme = atoi( argv[2] );
	int ccs    = scheme < 0 ? 1 : 0;
	if( ccs ) scheme = -scheme;
	int x_mode = atoi( argv[3] );
	unsigned long int rep1 = 1;
	unsigned long int rep2 = 1;
	if( argc >= 5 )
		rep1 = static_cast< unsigned long int >( atoi( argv[4] ) );
	if( argc >= 6 )
		rep2 = static_cast< unsigned long int >( atoi( argv[5] ) );

	if( scheme != 16 && scheme != -16 && //pin master thread to a single core
		scheme != 18 && scheme != -18 ) { //but not when OpenMP is used (otherwise serialised computations)
		cpu_set_t mask;
		CPU_ZERO( &mask );
		CPU_SET ( 0, &mask );
		if( pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ) != 0 ) {
			std::cerr << "Error setting main thread affinity!" << std::endl;
			exit( 1 );
		}
	} else {
		omp_set_num_threads( MachineInfo::getInstance().cores() );
	}

#ifdef WITH_MKL
	if( scheme == 18 ) {
		mkl_set_num_threads( MachineInfo::getInstance().cores() );
	}
#endif
	std::cout << argv[0] << " called with matrix input file " << file << ", scheme number ";
	std::cout << scheme << " and x being " << (x_mode?"random":"the 1-vector") << "." << std::endl;
	std::cout << "Number of repititions of in-place zax is " << rep2 << std::endl;
	std::cout << "Number of repititions of the " << rep2 << " in-place zax(es) is " << rep1 << std::endl;

	Matrix< double >* checkm = new TS< double >( file );
	clock_gettime( CLOCK_ID, &start);
	Matrix< double >* matrix = selectMatrix( scheme, ccs, file );
	clock_gettime( CLOCK_ID, &stop);
	time  = (stop.tv_sec-start.tv_sec)*1000;
	time += (stop.tv_nsec-start.tv_nsec)/1000000.0;
	if( matrix == NULL ) {
		std::cerr << "Error during sparse scheme loading, exiting." << std::endl;
		return EXIT_FAILURE;
	}

	std::cout << "Matrix dimensions: " << matrix->m() << " times " << matrix->n() << "." << std::endl;
	std::cout << "Datastructure loading time: " << time << " ms." << std::endl << std::endl;

	srand( FIXED_SEED );
	double* x = NULL;
#ifdef INTERLEAVE_X
	if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 )
		x = (double*) numa_alloc_interleaved( matrix->n() * sizeof( double ) );
	else
#endif
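		// fall back to a 64-byte-aligned local allocation (one cache line, SIMD-friendly)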
		x = (double*) _mm_malloc( matrix->n() * sizeof( double ), 64 );

	//initialise input vector
	for( unsigned long int j=0; j<matrix->n(); j++ ) {
		x[ j ] = x_mode?(rand()/(double)RAND_MAX):1.0;
	}

	//do one trial run, also for verification
	double* c = checkm->mv( x );
	clock_gettime( CLOCK_ID, &start );
	double* z = matrix->mv( x );
	clock_gettime( CLOCK_ID, &stop);
	time = (stop.tv_sec-start.tv_sec)*1000;
	time += (stop.tv_nsec-start.tv_nsec)/1000000.0;
	double checkMSE = 0;
	unsigned long int max_e_index = 0;
	double max_e = fabs( z[0] - c[0] );
	for( unsigned long int j=0; j<matrix->m(); j++ ) {
		double curdiff = fabs( z[j] - c[j] );
		if( curdiff > max_e ) {
			max_e = curdiff;
			max_e_index = j;
		}
		curdiff  *= curdiff;
		curdiff  /= (double)(matrix->m());
		checkMSE += curdiff;
	}
#ifdef OUTPUT_Z
	for( unsigned long int j=0; j<matrix->m(); j++ ) {
		std::cout << z[ j ] << std::endl;
	}
#endif
	std::cout << "out-of-place z=Ax: mean= " << checksum( z, matrix->m() ) << ", ";
	std::cout << "MSE = " << checkMSE << ", ";
	std::cout << "max abs error = " << max_e << " while comparing y[ " << max_e_index << " ] = " <<  z[max_e_index] << " and c[ " << max_e_index << " ] = " <<  c[max_e_index] << ", ";
	std::cout << "time= " << time << " ms." << std::endl;
#ifdef RDBH_NO_COLLECT
	if( scheme == 13 ) {
		std::cout << "WARNING: MSE and max abs error are not correct for the Row-distributed Beta Hilbert scheme; please see the RDBHilbert.hpp file, and look for the RDBH_NO_COLLECT flag." << std::endl;
	}
#else
	if( scheme == 13 ) {
		std::cout << "WARNING: timings are pessimistic for the Row-distributed Beta Hilbert scheme; each spmv a (syncing) collect is executed to write local data to the global output vector as required by this library. To get the correct timings, turn this collect off via the RDBH_NO_COLLECT flag in the RDBHilbert.hpp file. Note that this causes the verification process to fail, since all data is kept in private local output subvectors." << std::endl;
	}
#endif
	double *times = new double[ rep1 ];

	//Run rep1 experiments, each timing rep2 in-place multiplications
	for( unsigned long int run = 0; run < rep1; run++ ) {
		sleep( 1 );
		time = 0.0;
		//"prefetch"
		matrix->zax( x, z );
		matrix->zax( x, z, rep2, CLOCK_ID, &time );
		time /= static_cast<double>( rep2 );
		times[ run ] = time;
	}

	//calculate statistics
	double meantime, mintime, vartime;
	meantime = vartime = 0.0;
	mintime = times[ 0 ];
	for( unsigned long int run = 0; run < rep1; run++ ) {
		if( times[ run ] < mintime ) mintime = times[ run ];
		meantime += times[ run ] / static_cast< double >( rep1 );
	}
	for( unsigned long int run = 0; run < rep1; run++ ) {
		vartime += ( times[ run ] - meantime ) * ( times[ run ] - meantime ) / static_cast< double >( rep1 - 1 );
	}
	vartime = sqrt( vartime );
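	// after the square root, vartime holds the sample standard deviation of the run times (printed as "stddev" below)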

	std::cout << "In-place:" << std::endl;
	std::cout << "Mean  = " << checksum( z, matrix->m() ) << std::endl;
	std::cout << "Time  = " << meantime << " (average), \t" <<  mintime << " (fastest), \t" << vartime << " (stddev) ms. " << std::endl;
	const double avgspeed = static_cast< double >( 2*matrix->nzs() ) / meantime / 1000000.0;
	const double minspeed = static_cast< double >( 2*matrix->nzs() ) / mintime / 1000000.0;
	const double varspeed = fabs( avgspeed - static_cast< double >( 2*matrix->nzs() ) / (meantime - vartime) / 1000000.0 );
	std::cout << "Speed = " << avgspeed << " (average), \t"
					<< minspeed << " (fastest), \t"
					<< varspeed << " (variance) Gflop/s." << std::endl;
	const size_t memuse1 = matrix->bytesUsed() + sizeof( double ) * 2 * matrix->nzs();
	const double avgmem1 = static_cast< double >( 1000*memuse1 ) / meantime / 1073741824.0;
	const double minmem1 = static_cast< double >( 1000*memuse1 ) / mintime / 1073741824.0;
	const double varmem1 = fabs( avgmem1 - static_cast< double >( 1000*memuse1 ) / (meantime-vartime) / 1073741824.0 );
	std::cout << "        " << avgmem1 << " (average), \t"
					<< minmem1 << " (fastest), \t"
					<< varmem1 << " (variance) Gbyte/s (upper bound)." << std::endl;
	const size_t memuse2 = matrix->bytesUsed() + sizeof( double ) * ( matrix->m() + matrix->n() );
	const double avgmem2 = static_cast< double >( 1000*memuse2 ) / meantime / 1073741824.0;
	const double minmem2 = static_cast< double >( 1000*memuse2 ) / mintime / 1073741824.0;
	const double varmem2 = fabs( avgmem2 - static_cast< double >( 1000*memuse2 ) / (meantime-vartime) / 1073741824.0 );
	std::cout << "        " << avgmem2 << " (average), \t"
					<< minmem2 << " (fastest), \t"
					<< varmem2 << " (variance) Gbyte/s (lower bound)." << std::endl;

	delete [] times;
#ifdef INTERLEAVE_X
	if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) {
		numa_free( x, matrix->n() * sizeof( double ) );
	} else
#endif
		_mm_free( x );

	if( scheme == 12 || scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) {
#ifdef _NO_LIBNUMA
		_mm_free( z );
#else
		numa_free( z, matrix->m() * sizeof( double ) );
#endif
	} else {
		_mm_free( z );
	}
	_mm_free( c );
	delete matrix;
	delete checkm;

	return EXIT_SUCCESS;
}
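The benchmark above repeats the same two-sample clock_gettime pattern wherever it times a phase. The millisecond arithmetic it uses, factored into a small helper for reference (CLOCK_ID is the benchmark's own macro; CLOCK_MONOTONIC would be a typical definition):

#include <time.h>

/* wall-clock difference in milliseconds, as computed in the benchmark */
static double elapsed_ms(const struct timespec *start, const struct timespec *stop) {
	return (stop->tv_sec - start->tv_sec) * 1000.0
	     + (stop->tv_nsec - start->tv_nsec) / 1000000.0;
}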
Example #4
int main(int argc, char* argv[]) {
	printf("\n NODE_BIND:%d, NUMA:%d, CPU_BIND:%d, FIRST_TOUCH:%d\n",NODE_BIND, NUMA, CPU_BIND, FIRST_TOUCH);

	int repetitions;    // number of repetitions
	int maxThreads;     // max number of threads
	int it;
	int N;              // array size
	int bitCount = 1;
	int * key;          // array of keys
	long * dataIn;      // input data
	long * dataSTL;     // input STL data
	long * dataRadix;   // input radix data

	repetitions = 1;
#pragma omp parallel
	maxThreads = omp_get_num_threads();

	if(argc == 1){
		printf("prog input_file number_of_elements bit_count number_of_repetitions\n");
		printf("NO INPUT FILE\n");
		return 0;
	}
	if(argc == 2){
		printf("prog input_file number_of_elements bit_count number_of_repetitions\n");
		printf("NO ELEMENT COUNT\n");
		return 0;
	}
	if(argc > 2){
		N = (int) strtol(argv[2], NULL, 10);
	}
	if(argc > 3){
		int tmp = (int) strtol(argv[3], NULL, 10);
		if((tmp > 0) && (tmp <= 16)) // limit bit count
			bitCount = tmp;
	}
	if(argc > 4){
		int tmp = (int) strtol(argv[4], NULL, 10);
		if((tmp > 0) && (tmp <= 10000)) // limit repetitions
			repetitions = tmp;
	}

	int *input;
	size_t N2;
	printf( "Reading data from file.\n" );
	if( readIntArrayFile( argv[1], &input, &N2 ) )
		return 1;
	printf( "Data reading done.\n" );

	if( (N2 < (size_t)N) || (N <= 0) )
		N = N2;

	printf( "\nPARALLEL STL SORT for N=%d, max threads = %d, test repetitions: %d\n", N, maxThreads, repetitions);

	dataIn = new long[N]; 
	dataSTL = new long[N];

#ifdef _WIN32
	dataRadix = new long[N];
	key = new int[N];
#endif
#ifdef linux
	key = new int[N];
#if NUMA==0
	dataRadix = new long[N];
#elif NUMA==1
	dataRadix = (long*) numa_alloc_interleaved(N * sizeof(long));
#elif NUMA==2
	dataRadix = (long*) numa_alloc_onnode(sizeof(long) * N, 1);
#endif
#endif
	VTimer stlTimes(maxThreads);
	VTimer radixTimes(maxThreads);
#if TIME_COUNT==1
	VTimer partTimes(TIMERS_COUNT);
#endif
#if FLUSH_CACHE==1
#ifdef linux
	CacheFlusher cf;
#endif
#endif

	for(long i = 0; i < N; i++)
		dataIn[i] = input[i];
	delete[] input;

// loop from 1 to maxThreads
	for (int t = 1; t <= maxThreads; t++) {
		int i;
#if TIME_COUNT==1
		partTimes.reset();
#endif
#if CALC_REF==1
// parallel STL
		for (it = 0; it < repetitions; it++) {
			setThreadsNo(t, maxThreads);
#pragma omp parallel for private(i)
			for (i = 0; i < N; i++)
				dataSTL[i] = dataIn[i];
#if FLUSH_CACHE==1
#ifdef linux
			cf.flush();
#endif
#endif
			stlTimes.timerStart(t-1);

#ifdef linux
			__gnu_parallel::sort(dataSTL, dataSTL + N);
#endif
#ifdef _WIN32
			std::sort(dataSTL, dataSTL + N);
#endif
			stlTimes.timerEnd(t-1);
		}

#if FLUSH_CACHE==1
#ifdef linux
		cf.flush();
#endif
#endif
#endif

// radix sort V1
		for (it = 0; it < repetitions; it++) {
			setThreadsNo(t, maxThreads);
#pragma omp parallel for private(i) default(shared)

			for (i = 0; i < N; i++){
				dataRadix[i] = dataIn[i];
				key[i]=i;
			}

#if FLUSH_CACHE==1
#ifdef linux
			cf.flush();
#endif
#endif
			omp_set_num_threads(t);
			radixTimes.timerStart(t-1);
#if TIME_COUNT==1
			prsort::pradsort<long, int>(dataRadix, key, N, bitCount, &partTimes);
#else
			prsort::pradsort<long, int>(dataRadix, key, N, bitCount, NULL);
#endif
			radixTimes.timerEnd(t-1);

		}

       
#if CALC_REF==1
		printf("|STL   SORT(th=%2d)  : %1.3fs  |\t", t,
				stlTimes.getTime(t-1));
#endif
#if TIME_COUNT==1
		for (int i = 0; i < TIMERS_COUNT; i++) {
#if CREATE_OUTPUT==1
			printf("%d %d %d %d %d %d %d %f\n", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH, bitCount, t, i, partTimes.getTime(i));
#else
			printf("part%d :%f ", i, partTimes.getTime(i));
#endif
		}
#endif
#if CREATE_OUTPUT==1
		printf("%d %d %d %d %d %d total %1.3f", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH, bitCount, t, radixTimes.getTime(t-1));
#else
		printf("|RADIX SORT (th=%2d)  : %1.3fs  |\t", t,
				radixTimes.getTime(t-1));
#endif

// Note: the result check below compares only the output of the last repetition

#if CALC_REF==1
		checkResults(dataSTL, dataRadix, N);
#else
		printf("\n");
#endif

#if CHECK_KEY==1
		if(checkKey(dataIn, dataRadix, key, N))
			printf("Keys are good\n");
#endif
	}

#ifdef linux
	delete[] key;
#if NUMA>0
	numa_free(dataRadix, sizeof(long) * N);
#else
	delete[] dataRadix;
#endif
#endif
#ifdef _WIN32
	delete[] dataRadix;
#endif

	delete[] dataIn;
	delete[] dataSTL;
	
	return 0;
}
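The NUMA compile-time switch above selects between three placement policies for dataRadix. A condensed sketch of that choice as a single hypothetical helper (assuming libnuma; node 1 mirrors the hard-coded node of the NUMA==2 branch):

#include <numa.h>
#include <cstddef>

long* allocRadixBuffer(int policy, std::size_t n) {
	switch (policy) {
	case 1: // NUMA==1: spread pages round-robin over all NUMA nodes
		return (long*) numa_alloc_interleaved(n * sizeof(long));
	case 2: // NUMA==2: place the whole buffer on NUMA node 1
		return (long*) numa_alloc_onnode(n * sizeof(long), 1);
	default: // NUMA==0: plain new[], placement follows first-touch
		return new long[n];
	}
}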