/* Form the Gram matrix U = adjoint(X) * X.

   U : destination, must satisfy U.p == X.p.
   X : input eigenvector matrix.
   S : scratch matrix with room for at least U.p columns (S.alloc_p);
       it holds this process's contribution before the reduction.

   Only the upper triangle is produced by the rank-k update (herk); the
   lower triangle is then filled with the conjugate of the upper one so
   that the complete Hermitian matrix can be summed into U over mpb_comm. */
void evectmatrix_XtX(sqmatrix U, evectmatrix X, sqmatrix S)
{
     int row, col;

     CHECK(X.p == U.p && U.p <= S.alloc_p, "matrices not conformant");

     /* start from zero, then let herk write the upper triangle only */
     memset(S.data, 0, sizeof(scalar) * (U.p * U.p));
     blasglue_herk('U', 'C', X.p, X.n, 1.0, X.data, X.p, 0.0, S.data, U.p);
     evectmatrix_flops += X.N * X.c * X.p * (X.p - 1);

     /* mirror: every element below the diagonal becomes the conjugate
        of its transposed counterpart above the diagonal */
     for (row = 1; row < U.p; ++row) {
          for (col = 0; col < row; ++col) {
               ASSIGN_CONJ(S.data[row * U.p + col], S.data[col * U.p + row]);
          }
     }

     mpi_allreduce(S.data, U.data, U.p * U.p * SCALAR_NUMVALS,
                   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}
/* Compute only the diagonal of adjoint(X) * X, as real numbers.

   X            : input eigenvector matrix.
   diag         : receives the X.p summed diagonal entries.
   scratch_diag : work array of the same length; holds the local
                  contribution and must not alias diag. */
void evectmatrix_XtX_diag_real(evectmatrix X, real *diag, real *scratch_diag)
{
     const int ndiag = X.p;

     /* local part */
     matrix_XtX_diag_real(X.data, X.n, X.p, scratch_diag);
     evectmatrix_flops += X.N * X.c * X.p * (2*X.p);

     /* sum the local parts over mpb_comm */
     mpi_allreduce(scratch_diag, diag, ndiag,
                   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}
/* Compute only the diagonal of adjoint(X) * Y.

   X, Y         : input eigenvector matrices (X.n rows, X.p columns are
                  the dimensions handed to the kernel).
   diag         : receives the X.p summed diagonal entries.
   scratch_diag : work array of the same size as diag; holds the local
                  contribution before the reduction. */
void evectmatrix_XtY_diag(evectmatrix X, evectmatrix Y, scalar *diag,
                          scalar *scratch_diag)
{
     /* number of real values to reduce: one scalar per column */
     const int nvals = X.p * SCALAR_NUMVALS;

     matrix_XtY_diag(X.data, Y.data, X.n, X.p, scratch_diag);
     evectmatrix_flops += X.N * X.c * X.p * 2;

     mpi_allreduce(scratch_diag, diag, nvals,
                   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}
/* Initialise the file-level timer state (_zero, _skew, _initialized).

   After a barrier on MPI_COMM_WORLD each process samples its clock; the
   samples are reduced with MPI_MIN into _zero, and _skew records how far
   this process's sample lies from that value.
   NOTE(review): mpi_allreduce is a project wrapper; it is presumably a
   thin layer over MPI_Allreduce -- confirm. */
void init_timer()
{
     double local_now;

     mpi_barrier(MPI_COMM_WORLD);
     local_now = mpi_wtime();

     mpi_allreduce(&local_now, &_zero, 1, MPI_DOUBLE, MPI_MIN,
                   MPI_COMM_WORLD);

     _skew = local_now - _zero;
     _initialized = 1;
}
/* Return trace(adjoint(X) * Y).

   The trace is evaluated as one conjugated dot product over all
   X.n * X.p local entries of X and Y; the per-process results are then
   summed over mpb_comm.  X and Y must have identical n and p. */
scalar evectmatrix_traceXtY(evectmatrix X, evectmatrix Y)
{
     scalar local_trace, global_trace;

     CHECK(X.p == Y.p && X.n == Y.n, "matrices not conformant");

     local_trace = blasglue_dotc(X.n * X.p, X.data, 1, Y.data, 1);
     evectmatrix_flops += X.N * X.c * X.p * (2*X.p) + X.p;

     mpi_allreduce(&local_trace, &global_trace, SCALAR_NUMVALS,
                   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);

     return global_trace;
}
/* Compute adjoint(Xs) * Ys for column slices of X and Y.

   Xs is the p columns of X beginning at column ix, Ys the p columns of Y
   beginning at column iy.  The p x p result is summed over mpb_comm and
   stored in U (which must have U.p == p).  S is a scratch matrix with
   room for at least p columns; it holds the local product. */
void evectmatrix_XtY_slice(sqmatrix U, evectmatrix X, evectmatrix Y,
                           int ix, int iy, int p, sqmatrix S)
{
     int nelem;

     CHECK(ix >= 0 && iy >= 0
           && ix + p <= X.p && iy + p <= Y.p
           && X.n == Y.n
           && p == U.p && p <= S.alloc_p,
           "invalid arguments to XtY_slice");

     nelem = U.p * U.p;

     memset(S.data, 0, sizeof(scalar) * nelem);

     /* the leading dimensions stay X.p / Y.p: only the starting column
        moves when a slice is selected */
     blasglue_gemm('C', 'N', p, p, X.n,
                   1.0, X.data + ix, X.p, Y.data + iy, Y.p,
                   0.0, S.data, U.p);
     evectmatrix_flops += X.N * X.c * p * (2*p);

     mpi_allreduce(S.data, U.data, nelem * SCALAR_NUMVALS,
                   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}
/* Compute adjoint(X) * Y and store it as a Y.p x Y.p submatrix of U.

   U       : destination matrix, U.p >= Y.p.
   Uoffset : element offset into U.data at which the submatrix begins;
             successive submatrix rows are U.p elements apart.
   X, Y    : inputs with identical n and p.
   S       : scratch matrix (at least Y.p by Y.p) holding the local
             product.

   Rows of S are contiguous with stride Y.p while rows of the target
   have stride U.p, so the sum over mpb_comm is done one row at a time. */
void evectmatrixXtY_sub(sqmatrix U, int Uoffset, evectmatrix X, evectmatrix Y,
                        sqmatrix S)
{
     int row;
     int src_off, dst_off;

     CHECK(X.p == Y.p && X.n == Y.n && U.p >= Y.p, "matrices not conformant");
     CHECK(Uoffset + (Y.p-1)*U.p + Y.p <= U.p*U.p,
           "submatrix exceeds matrix bounds");
     CHECK(Y.p <= S.alloc_p, "scratch matrix too small");

     memset(S.data, 0, sizeof(scalar) * (Y.p * Y.p));
     blasglue_gemm('C', 'N', X.p, X.p, X.n,
                   1.0, X.data, X.p, Y.data, Y.p, 0.0, S.data, Y.p);
     evectmatrix_flops += X.N * X.c * X.p * (2*X.p);

     src_off = 0;
     dst_off = Uoffset;
     for (row = 0; row < Y.p; ++row) {
          mpi_allreduce(S.data + src_off, U.data + dst_off,
                        Y.p * SCALAR_NUMVALS,
                        real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
          src_off += Y.p;
          dst_off += U.p;
     }
}
int main(int argc, char **argv) { //set the mpi settings int threadprovided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &threadprovided); if(threadprovided != MPI_THREAD_MULTIPLE) { printf("MPI multiple thread isn't provided!\n"); fflush(stdout); mpi_exit(1); } int current_rank = mpi_get_rank(); int nr_ranks = mpi_get_size(); param.nr_ranks = nr_ranks; char hostname[1024]; int hostname_len; MPI_Get_processor_name(hostname, &hostname_len); printf("processor name: %s, number of processed: %d, rank: %d\n", hostname, nr_ranks, current_rank); fflush(stdout); // int global_l; char input_file_name[1024]; char model_file_name[1024]; const char *error_msg; parse_command_line(argc, argv, input_file_name, model_file_name); //set the number of threads for the shared-memory system int nr_threads = param.thread_count; int max_thread_count = omp_get_max_threads(); if(nr_threads > max_thread_count) { printf("[rank %d], please enter the correct number of threads: 1~%d\n", current_rank, max_thread_count); mpi_exit(1); } omp_set_num_threads(nr_threads); //set the cpu affnity /*int ithread, err, cpu; cpu_set_t cpu_mask; #pragma omp parallel private(ithread, cpu_mask, err, cpu) { ithread = omp_get_thread_num(); CPU_ZERO(&cpu_mask);//set mask to zero CPU_SET(ithread, &cpu_mask);//set mask with ithread err = sched_setaffinity((pid_t)0, sizeof(cpu_mask), &cpu_mask); cpu = sched_getcpu(); printf("thread_id %d on CPU %d\n", ithread, cpu); }*/ //now, read the problem from the input file read_problem(input_file_name); error_msg = rksvm_check_parameter(&prob,¶m); if(error_msg) { fprintf(stderr,"ERROR: %s\n",error_msg); mpi_exit(1); } //distributed code global_l = prob.l; mpi_allreduce(&global_l, 1, MPI_INT, MPI_SUM);//MPI_INT :int;MPI_SUM:sum prob.global_l = global_l; printf("#local instances = %d, #global instances = %d\n", prob.l, prob.global_l); fflush(stdout); if(current_rank==0){ puts("Start to train!"); } model = rksvm_train(&prob,¶m); if(rksvm_save_model(model_file_name,model)) 
{ fprintf(stderr,"[rank %d] can't save model to file %s\n",mpi_get_rank(), model_file_name); mpi_exit(1); } rksvm_free_and_destroy_model(&model); free(prob.y); free(prob.x); free(prob.query); free(x_space); free(prob.length_of_each_rksvm_node); free(line); MPI_Finalize(); return 0; }
double l2r_rank_fun::fun(double *w)// w is with the size of global_l { int i,j,k; double f = 0.0; double reg = 0.0; int l=prob->l; selectiontree *T; Qv(w,z); //generate gz via MPI_Allgatherv MPI_Allgatherv((void*)z, l, MPI_DOUBLE, (void*)gz, local_l, start_ptr, MPI_DOUBLE, MPI_COMM_WORLD); #pragma omp parallel for default(shared) private(i,j,k,T) for (i=0;i<nr_subset;i++) { for (j=0;j<count[i];j++) { pi[i][j].id= perm[j+start[i]]; pi[i][j].value = z[perm[j+start[i]]]; } qsort(pi[i], count[i], sizeof(id_and_value), compare_id_and_value); T=new selectiontree(nr_class[i]); k=0; for (j=0;j<count[i];j++) { while (k<count[i]&&(1-pi[i][j].value+pi[i][k].value>0)) { T->insert_node(int_y[pi[i][k].id],pi[i][k].value); k++; } T->count_smaller(int_y[pi[i][j].id],&l_minus[pi[i][j].id], &gamma_minus[pi[i][j].id]); } delete T; k=count[i]-1; T = new selectiontree(nr_class[i]); for (j=count[i]-1;j>=0;j--) { while (k>=0&&(1+pi[i][j].value-pi[i][k].value>0)) { T->insert_node(int_y[pi[i][k].id],pi[i][k].value); k--; } T->count_larger(int_y[pi[i][j].id],&l_plus[pi[i][j].id], &gamma_plus[pi[i][j].id]); } delete T; } //#pragma omp parallel for default(shared) private(i) reduction(+:f) schedule(dynamic) for(i=0;i<global_l;i++) { f += w[i]*gz[i]; } #pragma omp parallel for default(shared) private(i) for (i=0;i<l;i++) { ATe[i] = l_minus[i] - l_plus[i]; ATAQb[i] = (l_plus[i]+l_minus[i])*gz[i+start_ptr[current_rank]]-gamma_plus[i]-gamma_minus[i]; } //#pragma omp parallel for default(shared) //private(i) //reduction(+:reg) schedule(runtime) for (int i=0;i<l;i++) { //#pragma omp atomic reg += C*(gz[i+start_ptr[current_rank]]*(ATAQb[i] - 2 * ATe[i]) + l_minus[i]); } mpi_allreduce(®, 1, MPI_DOUBLE, MPI_SUM); f /= 2.0; f += reg; si->obj=f; return(f); }
/* Compute the integral of f(r, {fields}) over the cell.

   f      : function applied (via gh_apply) at every grid point to the
            argument list (position, field_0 value, field_1 value, ...);
            its result is converted to a cnumber and accumulated.
   fields : list of field smobs; every item must conform to the first one.
            When the list is empty the grid dimensions are taken from
            mdata instead.

   Returns the accumulated sum scaled by Vol / (n1 * n2 * n3) and then
   summed with mpi_allreduce (MPI_SUM over MPI_COMM_WORLD).

   NOTE(review): in the real-scalar + MPI configuration i2 and k2 are
   #defined as macros inside this function and no #undef is visible
   here -- confirm they are undefined elsewhere. */
cnumber integrate_fieldL(function f, SCM_list fields)
{
     int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim;
#ifdef HAVE_MPI
     int local_n2, local_y_start, local_n3;
#endif
     real s1, s2, s3, c1, c2, c3;
     int ifield;
     field_smob **pf;
     cnumber integral = {0,0};

     /* Convert each list item to a field_smob pointer, checking that
        every field conforms to the first one. */
     CHK_MALLOC(pf, field_smob *, fields.num_items);
     for (ifield = 0; ifield < fields.num_items; ++ifield) {
          pf[ifield] = assert_field_smob(fields.items[ifield]);
          CHECK(fields_conform(pf[0], pf[ifield]),
                "fields for integrate-fields must conform");
     }

     /* Grid dimensions: from the first field if there is one, otherwise
        from mdata. */
     if (fields.num_items > 0) {
          n1 = pf[0]->nx; n2 = pf[0]->ny; n3 = pf[0]->nz;
          n_other = pf[0]->other_dims;
          n_last = pf[0]->last_dim_size
               / (sizeof(scalar_complex)/sizeof(scalar));
          last_dim = pf[0]->last_dim;
     }
     else {
          n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz;
          n_other = mdata->other_dims;
          n_last = mdata->last_dim_size
               / (sizeof(scalar_complex)/sizeof(scalar));
          last_dim = mdata->last_dim;
     }
     /* dimensionality of the grid: 1, 2 or 3 */
     rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3;

     /* s*: grid spacing along each lattice direction;
        c*: offset subtracted from each coordinate (half the lattice size,
        or 0 along a direction with a single grid point). */
     s1 = geometry_lattice.size.x / n1;
     s2 = geometry_lattice.size.y / n2;
     s3 = geometry_lattice.size.z / n3;
     c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5;
     c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5;
     c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5;

     /* Here we have different loops over the coordinates, depending
        upon whether we are using complex or real and serial or
        parallel transforms.  Each loop must define, in its body,
        variables (i2,j2,k2) describing the coordinate of the current
        point, and "index" describing the corresponding index in
        the curfield array.

        This was all stolen from maxwell_eps.c...it would be better
        if we didn't have to cut and paste, sigh. */

     /* Each of the four branches below opens exactly one loop-body brace;
        the shared body that follows closes it. */
#ifdef SCALAR_COMPLEX

# ifndef HAVE_MPI

     for (i = 0; i < n1; ++i)
          for (j = 0; j < n2; ++j)
               for (k = 0; k < n3; ++k)
     {
          int i2 = i, j2 = j, k2 = k;
          int index = ((i * n2 + j) * n3 + k);

# else /* HAVE_MPI */

     if (fields.num_items > 0) {
          local_n2 = pf[0]->local_ny;
          local_y_start = pf[0]->local_y_start;
     }
     else {
          local_n2 = mdata->local_ny;
          local_y_start = mdata->local_y_start;
     }

     /* first two dimensions are transposed in MPI output: */
     for (j = 0; j < local_n2; ++j)
          for (i = 0; i < n1; ++i)
               for (k = 0; k < n3; ++k)
     {
          int i2 = i, j2 = j + local_y_start, k2 = k;
          int index = ((j * n1 + i) * n3 + k);

# endif /* HAVE_MPI */

#else /* not SCALAR_COMPLEX */

# ifndef HAVE_MPI

     for (i = 0; i < n_other; ++i)
          for (j = 0; j < n_last; ++j)
     {
          int index = i * n_last + j;
          int i2, j2, k2;
          /* recover (i2,j2,k2) from the flattened (i, j) pair */
          switch (rank) {
              case 2: i2 = i; j2 = j; k2 = 0; break;
              case 3: i2 = i / n2; j2 = i % n2; k2 = j; break;
              default: i2 = j; j2 = k2 = 0; break;
          }

# else /* HAVE_MPI */

     if (fields.num_items > 0) {
          local_n2 = pf[0]->local_ny;
          local_y_start = pf[0]->local_y_start;
     }
     else {
          local_n2 = mdata->local_ny;
          local_y_start = mdata->local_y_start;
     }

     /* For a real->complex transform, the last dimension is cut in half.
        For a 2d transform, this is taken into account in local_ny already,
        but for a 3d transform we must compute the new n3: */
     if (n3 > 1) {
          if (fields.num_items > 0)
               local_n3 = pf[0]->last_dim_size / 2;
          else
               local_n3 = mdata->last_dim_size / 2;
     }
     else
          local_n3 = 1;

     /* first two dimensions are transposed in MPI output: */
     for (j = 0; j < local_n2; ++j)
          for (i = 0; i < n1; ++i)
               for (k = 0; k < local_n3; ++k)
     {
          /* in this branch i2 and k2 are macro aliases of the loop
             variables rather than local variables */
# define i2 i
          int j2 = j + local_y_start;
# define k2 k
          int index = ((j * n1 + i) * local_n3 + k);

# endif /* HAVE_MPI */

#endif /* not SCALAR_COMPLEX */

          {
               list arg_list = SCM_EOL;
               cnumber integrand;
               vector3 p;

               /* position of the current grid point */
               p.x = i2 * s1 - c1; p.y = j2 * s2 - c2; p.z = k2 * s3 - c3;

               /* Walk the fields backwards and cons each value onto the
                  front, so the finished list is in the original order. */
               for (ifield = fields.num_items - 1; ifield >= 0; --ifield) {
                    SCM item = SCM_EOL;
                    switch (pf[ifield]->type) {
                        case RSCALAR_FIELD_SMOB:
                             item = ctl_convert_number_to_scm(pf[ifield]->f.rs[index]);
                             break;
                        case CSCALAR_FIELD_SMOB:
                             item = cnumber2scm(cscalar2cnumber(
                                  pf[ifield]->f.cs[index]));
                             break;
                        case CVECTOR_FIELD_SMOB:
                             item = cvector32scm(cscalar32cvector3(
                                  pf[ifield]->f.cv+3*index));
                             break;
                    }
                    arg_list = gh_cons(item, arg_list);
               }
               /* the position is the first argument passed to f */
               arg_list = gh_cons(vector32scm(p), arg_list);
               integrand = ctl_convert_cnumber_to_c(gh_apply(f, arg_list));
               integral.re += integrand.re;
               integral.im += integrand.im;

#ifndef SCALAR_COMPLEX
               /* Real-scalar case: unless the index along the last
                  dimension is 0 or exactly half of last_dim, f is also
                  evaluated at the mirrored coordinate (n - i per
                  direction), with vector fields conjugated.
                  NOTE(review): presumably this accounts for the points
                  not stored by the real->complex layout -- confirm.
                  Complex scalar fields are passed unconjugated here. */
               {
                    int last_index;
# ifdef HAVE_MPI
                    if (n3 == 1)
                         last_index = j + local_y_start;
                    else
                         last_index = k;
# else
                    last_index = j;
# endif

                    if (last_index != 0 && 2*last_index != last_dim) {
                         int i2c, j2c, k2c;
                         /* mirrored coordinates; 0 maps to itself */
                         i2c = i2 ? (n1 - i2) : 0;
                         j2c = j2 ? (n2 - j2) : 0;
                         k2c = k2 ? (n3 - k2) : 0;
                         p.x = i2c * s1 - c1;
                         p.y = j2c * s2 - c2;
                         p.z = k2c * s3 - c3;
                         arg_list = SCM_EOL;
                         for (ifield = fields.num_items - 1;
                              ifield >= 0; --ifield) {
                              SCM item = SCM_UNDEFINED;
                              switch (pf[ifield]->type) {
                                  case RSCALAR_FIELD_SMOB:
                                       item = ctl_convert_number_to_scm(
                                            pf[ifield]->f.rs[index]);
                                       break;
                                  case CSCALAR_FIELD_SMOB:
                                       item = cnumber2scm(cscalar2cnumber(
                                            pf[ifield]->f.cs[index]));
                                       break;
                                  case CVECTOR_FIELD_SMOB:
                                       item = cvector32scm(
                                            cvector3_conj(cscalar32cvector3(
                                                 pf[ifield]->f.cv+3*index)));
                                       break;
                              }
                              arg_list = gh_cons(item, arg_list);
                         }
                         arg_list = gh_cons(vector32scm(p), arg_list);
                         integrand =
                              ctl_convert_cnumber_to_c(gh_apply(f, arg_list));
                         integral.re += integrand.re;
                         integral.im += integrand.im;
                    }
               }
#endif
          }
     }

     free(pf);

     /* scale the accumulated sum by Vol / (number of grid points) */
     integral.re *= Vol / (n1 * n2 * n3);
     integral.im *= Vol / (n1 * n2 * n3);
     {
          cnumber integral_sum;
          /* reduce both components (re, im) as two doubles */
          mpi_allreduce(&integral, &integral_sum, 2, number,
                        MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
          return integral_sum;
     }
}