/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocateInterleaved
 * Signature: (J)J
 *
 * Requests `capacity` bytes from numa_alloc_interleaved() and hands the
 * resulting address back to Java as a jlong.
 *
 * When the allocation fails, throwException() is invoked with code 11 and
 * 0 is returned.
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocateInterleaved
  (JNIEnv *env, jobject obj, jlong capacity) {
    void* block = numa_alloc_interleaved((size_t) capacity);

    /* Failure path first: report it to the Java side and bail out. */
    if (block == NULL) {
        throwException(env, obj, 11);
        return 0L;
    }

    return (jlong) block;
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocInterleaved
 *
 * Allocates `capacity` bytes with numa_alloc_interleaved() and wraps the
 * block in a direct ByteBuffer created through NewDirectByteBuffer().
 *
 * Returns the new buffer object on success. When the NUMA allocation fails,
 * throwException() is invoked with code 11 and NULL is returned. When the
 * buffer object cannot be created, the NUMA block is released again and NULL
 * is returned.
 */
JNIEXPORT jobject JNICALL Java_xerial_jnuma_NumaNative_allocInterleaved
  (JNIEnv *env, jobject obj, jint capacity) {
    jobject b;
    size_t size = (size_t) capacity;
    void* mem = numa_alloc_interleaved(size);

    if (mem == NULL) {
        /* failed to allocate interleaved memory */
        throwException(env, obj, 11);
        /* Every path of a non-void function must return a value. */
        return NULL;
    }

    b = (*env)->NewDirectByteBuffer(env, mem, (jlong) capacity);
    if (b == NULL) {
        /* Nothing references the block any more; do not leak it. */
        numa_free(mem, size);
    }
    return b;
}
int main( int argc, char** argv ) { struct timespec start, stop; double time; #ifndef NDEBUG std::cout << "-->WARNING: COMPILED *WITH* ASSERTIONS!<--" << std::endl; #endif if( argc<=3 ) { std::cout << "Usage: " << argv[0] << " <mtx> <scheme> <x> <REP1> <REP2>" << std::endl << std::endl; std::cout << "calculates Ax=y and reports average time taken as well as the mean of y." << std::endl; std::cout << "with\t\t <mtx> filename of the matrix A in matrix-market or binary triplet format." << std::endl; std::cout << " \t\t <scheme> number of a sparse scheme to use, see below." << std::endl; std::cout << " \t\t <x> 0 for taking x to be the 1-vector, 1 for taking x to be random (fixed seed)." << std::endl; std::cout << " \t\t <REP1> (optional, default is 1) number of repititions of the entire experiment." << std::endl; std::cout << " \t\t <REP2> (optional, default is 1) number of repititions of the in-place SpMV multiplication, per experiment." << std::endl; std::cout << std::endl << "Possible schemes:" << std::endl; std::cout << " 0: TS (triplet scheme)" << std::endl; std::cout << " 1: CRS (also known as CSR)" << std::endl; std::cout << " 2: ICRS (Incremental CRS)" << std::endl; std::cout << " 3: ZZ-CRS (Zig-zag CRS)" << std::endl; std::cout << " 4: ZZ-ICRS (Zig-zag ICRS)" << std::endl; std::cout << " 5: SVM (Sparse vector matrix)" << std::endl; std::cout << " 6: HTS (Hilbert-ordered triplet scheme)" << std::endl; std::cout << " 7: BICRS (Bi-directional Incremental CRS)" << std::endl; std::cout << " 8: Hilbert (Hilbert-ordered triplets backed by BICRS)" << std::endl; std::cout << " 9: Block Hilbert (Sparse matrix blocking, backed by Hilbert and HBICRS)" << std::endl; std::cout << "10: Bisection Hilbert (Sparse matrix blocking by bisection, backed by Hilbert and HBICRS)" << std::endl; std::cout << "11: CBICRS (Compressed Bi-directional Incremental CRS)" << std::endl; std::cout << "12: Beta Hilbert (known as Block CO-H+ in the paper by Yzelman & Roose, 2012: parallel 
compressed blocked Hilbert with BICRS)" << std::endl; std::cout << "13: Row-distributed Beta Hilbert (known as Row-distributed block CO-H in the paper by Yzelman & Roose, 2012: same as 12, but simpler distribution)" << std::endl; #ifdef WITH_CSB std::cout << "14: Row-distributed CSB (Uses CSB sequentially within the row-distributed scheme of 13)" << std::endl; #endif std::cout << "15: Row-distributed Hilbert (Parallel row-distributed Hilbert scheme, see also 8)" << std::endl; std::cout << "16: Row-distributed parallel CRS (using OpenMP, known as OpenMP CRS in the paper by Yzelman & Roose, 2012)" << std::endl; std::cout << "17: Row-distributed SpMV using compressed Hilbert indices." << std::endl; #ifdef WITH_MKL std::cout << "18: Intel MKL SpMV based on the CRS data structure." << std::endl; #endif std::cout << "19: Optimised ICRS." << std::endl; #ifdef WITH_CUDA std::cout << "20: CUDA CuSparse HYB format." << std::endl; #endif std::cout << std::endl << "The in-place Ax=y calculation is preceded by a quasi pre-fetch." << std::endl; std::cout << "Add a minus sign before the scheme number to enable use of the CCS wrapper (making each CRS-based structure CCS-based instead)" << std::endl; std::cout << "Note: binary triplet format is machine-dependent. "; std::cout << "Take care when using the same binary files on different machine architectures." << std::endl; return EXIT_FAILURE; } std::string file = std::string( argv[1] ); int scheme = atoi( argv[2] ); int ccs = scheme < 0 ? 
1 : 0; if( ccs ) scheme = -scheme; int x_mode = atoi( argv[3] ); unsigned long int rep1 = 1; unsigned long int rep2 = 1; if( argc >= 5 ) rep1 = static_cast< unsigned long int >( atoi( argv[4] ) ); if( argc >= 6 ) rep2 = static_cast< unsigned long int >( atoi( argv[5] ) ); if( scheme != 16 && scheme != -16 && //pin master thread to a single core scheme != 18 && scheme != -18 ) { //but not when OpenMP is used (otherwise serialised computations) cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET ( 0, &mask ); if( pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ) != 0 ) { std::cerr << "Error setting main thread affinity!" << std::endl; exit( 1 ); } } else { omp_set_num_threads( MachineInfo::getInstance().cores() ); } #ifdef WITH_MKL if( scheme == 18 ) { mkl_set_num_threads( MachineInfo::getInstance().cores() ); } #endif std::cout << argv[0] << " called with matrix input file " << file << ", scheme number "; std::cout << scheme << " and x being " << (x_mode?"random":"the 1-vector") << "." << std::endl; std::cout << "Number of repititions of in-place zax is " << rep2 << std::endl; std::cout << "Number of repititions of the " << rep2 << " in-place zax(es) is " << rep1 << std::endl; Matrix< double >* checkm = new TS< double >( file ); clock_gettime( CLOCK_ID, &start); Matrix< double >* matrix = selectMatrix( scheme, ccs, file ); clock_gettime( CLOCK_ID, &stop); time = (stop.tv_sec-start.tv_sec)*1000; time += (stop.tv_nsec-start.tv_nsec)/1000000.0; if( matrix == NULL ) { std::cerr << "Error during sparse scheme loading, exiting." << std::endl; return EXIT_FAILURE; } std::cout << "Matrix dimensions: " << matrix->m() << " times " << matrix->n() << "." << std::endl; std::cout << "Datastructure loading time: " << time << " ms." 
<< std::endl << std::endl; srand( FIXED_SEED ); double* x = NULL; #ifdef INTERLEAVE_X if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) x = (double*) numa_alloc_interleaved( matrix->n() * sizeof( double ) ); else #endif x = (double*) _mm_malloc( matrix->n() * sizeof( double ), 64 ); //initialise input vector for( unsigned long int j=0; j<matrix->n(); j++ ) { x[ j ] = x_mode?(rand()/(double)RAND_MAX):1.0; } //do one trial run, also for verification double* c = checkm->mv( x ); clock_gettime( CLOCK_ID, &start ); double* z = matrix->mv( x ); clock_gettime( CLOCK_ID, &stop); time = (stop.tv_sec-start.tv_sec)*1000; time += (stop.tv_nsec-start.tv_nsec)/1000000.0; double checkMSE = 0; unsigned long int max_e_index = 0; double max_e = fabs( z[0] - c[0] ); for( unsigned long int j=0; j<matrix->m(); j++ ) { double curdiff = fabs( z[j] - c[j] ); if( curdiff > max_e ) { max_e = curdiff; max_e_index = j; } curdiff *= curdiff; curdiff /= (double)(matrix->m()); checkMSE += curdiff; } #ifdef OUTPUT_Z for( unsigned long int j=0; j<matrix->m(); j++ ) { std::cout << z[ j ] << std::endl; } #endif std::cout << "out-of-place z=Ax: mean= " << checksum( z, matrix->m() ) << ", "; std::cout << "MSE = " << checkMSE << ", "; std::cout << "max abs error = " << max_e << " while comparing y[ " << max_e_index << " ] = " << z[max_e_index] << " and c[ " << max_e_index << " ] = " << c[max_e_index] << ", "; std::cout << "time= " << time << " ms." << std::endl; #ifdef RDBH_NO_COLLECT if( scheme == 13 ) { std::cout << "WARNING: MSE and max abs error are not correct for the Row-distributed Beta Hilbert scheme; please see the RDBHilbert.hpp file, and look for the RDBH_NO_COLLECT flag." << std::endl; } #else if( scheme == 13 ) { std::cout << "WARNING: timings are pessimistic for the Row-distributed Beta Hilbert scheme; each spmv a (syncing) collect is executed to write local data to the global output vector as required by this library. 
To get the correct timings, turn this collect off via the RDBH_NO_COLLECT flag in the RDBHilbert.hpp file. Note that this causes the verification process to fail, since all data is kept in private local output subvectors." << std::endl; } #endif double *times = new double[ rep1 ]; //Run rep*rep instances for( unsigned long int run = 0; run < rep1; run++ ) { sleep( 1 ); time = 0.0; //"prefetch" matrix->zax( x, z ); matrix->zax( x, z, rep2, CLOCK_ID, &time ); time /= static_cast<double>( rep2 ); times[ run ] = time; } //calculate statistics double meantime, mintime, vartime; meantime = vartime = 0.0; mintime = times[ 0 ]; for( unsigned long int run = 0; run < rep1; run++ ) { if( times[ run ] < mintime ) mintime = times[ run ]; meantime += times[ run ] / static_cast< double >( rep1 ); } for( unsigned long int run = 0; run < rep1; run++ ) { vartime += ( times[ run ] - meantime ) * ( times[ run ] - meantime ) / static_cast< double >( rep1 - 1 ); } vartime = sqrt( vartime ); std::cout << "In-place:" << std::endl; std::cout << "Mean = " << checksum( z, matrix->m() ) << std::endl; std::cout << "Time = " << meantime << " (average), \t" << mintime << " (fastest), \t" << vartime << " (stddev) ms. " << std::endl; const double avgspeed = static_cast< double >( 2*matrix->nzs() ) / meantime / 1000000.0; const double minspeed = static_cast< double >( 2*matrix->nzs() ) / mintime / 1000000.0; const double varspeed = fabs( avgspeed - static_cast< double >( 2*matrix->nzs() ) / (meantime - vartime) / 1000000.0 ); std::cout << "Speed = " << avgspeed << " (average), \t" << minspeed << " (fastest), \t" << varspeed << " (variance) Gflop/s." 
<< std::endl; const size_t memuse1 = matrix->bytesUsed() + sizeof( double ) * 2 * matrix->nzs(); const double avgmem1 = static_cast< double >( 1000*memuse1 ) / meantime / 1073741824.0; const double minmem1 = static_cast< double >( 1000*memuse1 ) / mintime / 1073741824.0; const double varmem1 = fabs( avgmem1 - static_cast< double >( 1000*memuse1 ) / (meantime-vartime) / 1073741824.0 ); std::cout << " " << avgmem1 << " (average), \t" << minmem1 << " (fastest), \t" << varmem1 << " (variance) Gbyte/s (upper bound)." << std::endl; const size_t memuse2 = matrix->bytesUsed() + sizeof( double ) * ( matrix->m() + matrix->n() ); const double avgmem2 = static_cast< double >( 1000*memuse2 ) / meantime / 1073741824.0; const double minmem2 = static_cast< double >( 1000*memuse2 ) / mintime / 1073741824.0; const double varmem2 = fabs( avgmem2 - static_cast< double >( 1000*memuse2 ) / (meantime-vartime) / 1073741824.0 ); std::cout << " " << avgmem2 << " (average), \t" << minmem2 << " (fastest), \t" << varmem2 << " (variance) Gbyte/s (lower bound)." << std::endl; delete [] times; #ifdef INTERLEAVE_X if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) { numa_free( x, matrix->n() * sizeof( double ) ); } else #endif _mm_free( x ); if( scheme == 12 || scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) { #ifdef _NO_LIBNUMA _mm_free( z ); #else numa_free( z, matrix->m() * sizeof( double ) ); #endif } else { _mm_free( z ); } _mm_free( c ); delete matrix; delete checkm; return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { printf("\n NODE_BIND:%d, NUMA:%d, CPU_BIND:%d, FIRST_TOUCH:%d\n",NODE_BIND, NUMA, CPU_BIND, FIRST_TOUCH); int repetitions, // number of repetition maxThreads, // max number of threads it, N; // array size; int bitCount = 1; int * key; // array of keys long * dataIn; // input data long * dataSTL; // input stl data long * dataRadix; // input radix data repetitions = 1; #pragma omp parallel maxThreads = omp_get_num_threads(); if(argc ==1 ){ printf("prog input_file number_of_elements bit_count number_of_repetitions\n"); printf("NO INPUT FILE"); return 0; } if(argc == 2){ printf("prog input_file number_of_elements bit_count number_of_repetitions\n"); printf("NO ELEMENT COUNT\n"); return 0; } if(argc >2 ){ N = (int) strtol(argv[2], NULL, 10); } if(argc >3){ int tmp; tmp = (int) strtol(argv[3], NULL, 10); if ((tmp > 0) && (tmp<=16 )) // limit bit count bitCount = tmp; } if(argc >4){ int tmp; tmp = (int) strtol(argv[4], NULL, 10); if ((tmp > 0) && (tmp<=10000 )) // limit repetitions repetitions = tmp; } int *input; size_t N2; printf( "Reading data from file.\n" ); if( readIntArrayFile( argv[1], &input, &N2 ) ) return 1; printf( "Data reading done.\n" ); if( (N2<(size_t)N) || (N<=0) ) N = N2; printf( "\nPARALLEL STL SORT for N=%d, max threads = %d, test repetitions: %d\n", N, maxThreads, repetitions); dataIn = new long[N]; dataSTL = new long[N]; #ifdef _WIN32 dataRadix = new long[N]; key = new int[N]; #endif #ifdef linux key = new int[N]; #if NUMA==0 dataRadix = new long[N]; #elif NUMA==1 dataRadix = (long*) numa_alloc_interleaved(N * sizeof(long)); #elif NUMA==2 dataRadix = (long*)numa_alloc_onnode(sizeof(long)*N,1); #endif #endif VTimer stlTimes(maxThreads); VTimer radixTimes(maxThreads); #if TIME_COUNT==1 VTimer partTimes(TIMERS_COUNT); #endif #if FLUSH_CACHE==1 #ifdef linux CacheFlusher cf; #endif #endif for(long i=0;i<N;i++) dataIn[i]=input[i]; delete[] input; // loop from 1 to maxThreads for (int t = 1; t <= maxThreads; t++) { int i; 
#if TIME_COUNT==1 partTimes.reset(); #endif #if CALC_REF==1 // parallel STL for (it = 0; it < repetitions; it++) { setThreadsNo(t, maxThreads); #pragma omp parallel for private(i) for (i = 0; i < N; i++) dataSTL[i] = dataIn[i]; #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif stlTimes.timerStart(t-1); #ifdef linux __gnu_parallel::sort(dataSTL, dataSTL + N); #endif #ifdef _WIN32 std::sort(dataSTL, dataSTL + N); #endif stlTimes.timerEnd(t-1); } #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif #endif // radix sort V1 for (it = 0; it < repetitions; it++) { setThreadsNo(t, maxThreads); #pragma omp parallel for private(i) default(shared) for (i = 0; i < N; i++){ dataRadix[i] = dataIn[i]; key[i]=i; } #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif omp_set_num_threads(t); radixTimes.timerStart(t-1); #if TIME_COUNT==1 prsort::pradsort<long,int>(dataRadix,key, N, bitCount,&partTimes); #else prsort::pradsort<long,int>(dataRadix,key, N,bitCount,NULL); #endif radixTimes.timerEnd(t-1); } #if CALC_REF==1 printf("|STL SORT(th=%2d) : %1.3fs |\t", t, stlTimes.getTime(t-1)); #endif #if TIME_COUNT==1 for (int i = 0; i < TIMERS_COUNT; i++) { #if CREATE_OUTPUT==1 printf("%d %d %d %d %d %d %d %f\n", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH,bitCount , t, i ,partTimes.getTime(i)); #else printf("part%d :%f ", i, partTimes.getTime(i)); #endif } #endif #if CREATE_OUTPUT ==1 printf("%d %d %d %d %d %d calosc %1.3f", NUMA,NODE_BIND,CPU_BIND,FIRST_TOUCH,bitCount, t ,radixTimes.getTime(t-1)); #else printf("|RADIX SORT (th=%2d) : %1.3fs |\t", t, radixTimes.getTime(t-1)); #endif // Attention: checking result only from the last function usage #if CALC_REF==1 checkResults(dataSTL, dataRadix, N); #else printf("\n"); #endif #if CHECK_KEY==1 if(checkKey(dataIn,dataRadix,key,N))printf("Keys are good\n"); #endif } #ifdef linux delete[] key; #if NUMA>0 numa_free(dataRadix, sizeof(long) * N); #else delete[] dataRadix; #endif #endif #ifdef _WIN32 delete[] dataRadix; #endif delete[] 
dataIn; delete[] dataSTL; #if TIME_COUNT==1 #endif return 0; }