Beispiel #1
0
/*
 * Runs one complete solver pass on the host: parses the command line, loads
 * the mesh, precomputes the geometric quantities and basis functions, sets
 * the initial conditions, integrates in time with RK4 and writes the result.
 *
 * argc, argv: forwarded unchanged to get_input().
 *
 * Returns 0 on success and 1 when the input cannot be parsed, the limiter is
 * requested for an order other than p = 1, the mesh file cannot be opened, or
 * the mesh contains no elements.
 */
int run_dgcuda(int argc, char *argv[]) {
    int local_num_elem = 0, local_num_sides = 0;
    int i, n, local_n_p, total_timesteps, local_n_quad, local_n_quad1d;
    int verbose, convergence, video, eval_error, benchmark;

    double endtime, tol, min_r;
    const double *radii;

    // host mesh arrays; start as NULL so the early-exit path can free them
    // even if read_mesh() did not fill every one of them
    double *V1x = NULL, *V1y = NULL, *V2x = NULL;
    double *V2y = NULL, *V3x = NULL, *V3y = NULL;
    double *sides_x1 = NULL, *sides_x2 = NULL;
    double *sides_y1 = NULL, *sides_y2 = NULL;

    double *r1_local, *r2_local, *w_local;
    double *s_r, *oned_w_local;

    int *left_elem = NULL, *right_elem = NULL;
    int *elem_s1 = NULL, *elem_s2 = NULL, *elem_s3 = NULL;
    int *left_side_number = NULL, *right_side_number = NULL;

    FILE *mesh_file;
    char *mesh_filename;

    clock_t start = 0;

    // get input; endtime keeps the sentinel -1 unless get_input overrides it
    endtime = -1;
    if (get_input(argc, argv, &n, &total_timesteps, &endtime,
                              &verbose, &video, &convergence, &tol,
                              &benchmark, &eval_error,
                              &mesh_filename)) {
        return 1;
    }

    // number of basis functions for a degree-n approximation on a triangle
    local_n_p = (n + 1) * (n + 2) / 2;

    // sanity check on limiter: report the failure to the caller instead of
    // terminating the whole process with a success status
    if (limiter && n != 1) {
        printf("Error: limiter only enabled for p = 1\n");
        return 1;
    }

    // open the mesh to get local_num_elem for allocations
    mesh_file = fopen(mesh_filename, "r");
    if (!mesh_file) {
        printf("\nERROR: mesh file not found.\n");
        return 1;
    }

    // read in the mesh and make all the mappings
    read_mesh(mesh_file, &local_num_sides, &local_num_elem,
                         &V1x, &V1y, &V2x, &V2y, &V3x, &V3y,
                         &left_side_number, &right_side_number,
                         &sides_x1, &sides_y1,
                         &sides_x2, &sides_y2,
                         &elem_s1, &elem_s2, &elem_s3,
                         &left_elem, &right_elem);

    // close the file
    fclose(mesh_file);

    // an empty mesh would make the minimum-radius scan below read element 0
    // of an empty array, so stop here before any solver state is set up
    if (local_num_elem < 1) {
        printf("\nERROR: mesh contains no elements.\n");
        free(elem_s1);
        free(elem_s2);
        free(elem_s3);
        free(sides_x1);
        free(sides_x2);
        free(sides_y1);
        free(sides_y2);
        free(left_elem);
        free(right_elem);
        free(left_side_number);
        free(right_side_number);
        free(V1x);
        free(V1y);
        free(V2x);
        free(V2y);
        free(V3x);
        free(V3y);
        return 1;
    }

    // hand the mesh to the host-side solver state (the d_* arrays)
    init_cpu(local_num_elem, local_num_sides, local_n_p,
             V1x, V1y, V2x, V2y, V3x, V3y,
             left_side_number, right_side_number,
             sides_x1, sides_y1,
             sides_x2, sides_y2,
             elem_s1, elem_s2, elem_s3,
             left_elem, right_elem,
             convergence, eval_error);

    // get the correct quadrature rules for this scheme
    set_quadrature(n, &r1_local, &r2_local, &w_local,
                   &s_r, &oned_w_local, &local_n_quad, &local_n_quad1d);

    // set constant data
    // NOTE(review): local_N is not declared in this function; presumably a
    // file-scope variable -- confirm it is in sync with n.
    set_N(local_N);
    set_n_p(local_n_p);
    set_num_elem(local_num_elem);
    set_num_sides(local_num_sides);
    set_n_quad(local_n_quad);
    set_n_quad1d(local_n_quad1d);

    // find the min inscribed circle; d_J is used as scratch for the radii
    // here and is overwritten by preval_jacobian() further down. It is read
    // in place because it is already host memory.
    preval_inscribed_circles(d_J, d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y);
    radii = d_J;
    min_r = radii[0];
    for (i = 0; i < local_num_elem; i++) {
        if (radii[i] < min_r) {
            min_r = radii[i];
        }
        // report problem: a zero radius is printed together with the
        // element's three vertices (element 0 included)
        if (radii[i] == 0) {
            printf("%i\n", i);
            printf("%.015lf, %.015lf, %.015lf, %.015lf, %.015lf, %.015lf\n",
                                                     V1x[i], V1y[i],
                                                     V2x[i], V2y[i],
                                                     V3x[i], V3y[i]);
        }
    }

    // pre computations
    preval_jacobian(d_J, d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y);

    preval_side_length(d_s_length, d_s_V1x, d_s_V1y, d_s_V2x, d_s_V2y);

    preval_normals(d_Nx, d_Ny,
                   d_s_V1x, d_s_V1y, d_s_V2x, d_s_V2y,
                   d_V1x, d_V1y,
                   d_V2x, d_V2y,
                   d_V3x, d_V3y,
                   d_left_side_number);

    preval_normals_direction(d_Nx, d_Ny,
                             d_V1x, d_V1y,
                             d_V2x, d_V2y,
                             d_V3x, d_V3y,
                             d_left_elem, d_left_side_number);

    preval_partials(d_V1x, d_V1y,
                    d_V2x, d_V2y,
                    d_V3x, d_V3y,
                    d_xr,  d_yr,
                    d_xs,  d_ys);

    // evaluate the basis functions at the quadrature points
    preval_basis(r1_local, r2_local, s_r, w_local, oned_w_local, local_n_quad, local_n_quad1d, local_n_p);

    // the mesh connectivity and quadrature tables are not used past this point
    free(elem_s1);
    free(elem_s2);
    free(elem_s3);
    free(sides_x1);
    free(sides_x2);
    free(sides_y1);
    free(sides_y2);
    free(left_elem);
    free(right_elem);
    free(left_side_number);
    free(right_side_number);
    free(r1_local);
    free(r2_local);
    free(w_local);
    free(s_r);
    free(oned_w_local);

    // initial conditions
    init_conditions(d_c, d_J, d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y);

    printf(" ? %i degree polynomial interpolation (local_n_p = %i)\n", n, local_n_p);
    printf(" ? %i precomputed basis points\n", local_n_quad * local_n_p);
    printf(" ? %i elements\n", local_num_elem);
    printf(" ? %i sides\n", local_num_sides);
    printf(" ? min radius = %.015lf\n", min_r);

    // outside convergence mode, report whichever stopping criterion applies:
    // the timestep count when endtime is still the -1 sentinel, else endtime
    if (endtime == -1 && convergence != 1) {
        printf(" ? total_timesteps = %i\n", total_timesteps);
    } else if (endtime != -1 && convergence != 1) {
        printf(" ? endtime = %lf\n", endtime);
    }

    if (benchmark) {
        start = clock();
    }

    // the value returned by the integrator is not used by this driver
    (void) time_integrate_rk4(local_num_elem, local_num_sides,
                              n, local_n_p,
                              endtime, total_timesteps, min_r,
                              verbose, convergence, video, tol);

    if (benchmark) {
        clock_t end = clock();
        double elapsed = ((double)(end - start)) / CLOCKS_PER_SEC;
        printf("Runtime: %lf seconds\n", elapsed);
    }

    // evaluate and write U to file
    write_U(local_num_elem, total_timesteps, total_timesteps);

    // free the solver state
    free(d_s_V1x);
    free(d_s_V2x);
    free(d_s_V1y);
    free(d_s_V2y);

    free(d_s_length);
    free(d_lambda);
    free(d_k1);
    free(d_k2);
    free(d_k3);
    free(d_k4);
    free(d_rhs_volume);
    free(d_rhs_surface_left);
    free(d_rhs_surface_right);
    free(d_elem_s1);
    free(d_elem_s2);
    free(d_elem_s3);
    free(d_xr);
    free(d_yr);
    free(d_xs);
    free(d_ys);

    free(d_left_side_number);
    free(d_right_side_number);

    free(d_Nx);
    free(d_Ny);

    free(d_right_elem);
    free(d_left_elem);
    free(d_c);
    free(d_J);

    free(d_Uv1);
    free(d_Uv2);
    free(d_Uv3);
    free(d_V1x);
    free(d_V1y);
    free(d_V2x);
    free(d_V2y);
    free(d_V3x);
    free(d_V3y);

    // free the vertex arrays returned by read_mesh()
    free(V1x);
    free(V1y);
    free(V2x);
    free(V2y);
    free(V3x);
    free(V3y);

    return 0;
}
Beispiel #2
0
/*
 * Writes one per-vertex scalar field as a text "View" file: a header line
 * carrying view_name, one "ST (...) {...};" record per element holding the
 * three vertex positions followed by the three vertex values, and a closing
 * "};".
 *
 * Returns 0 on success, 1 if the file cannot be opened or closed cleanly.
 */
static int write_scalar_view(const char *path, const char *view_name, int num_elem,
                             const double *V1x, const double *V1y,
                             const double *V2x, const double *V2y,
                             const double *V3x, const double *V3y,
                             const double *U1, const double *U2, const double *U3) {
    FILE *out_file;
    int i;

    out_file = fopen(path, "w");
    if (!out_file) {
        fprintf(stderr, "ERROR: could not open %s for writing.\n", path);
        return 1;
    }

    fprintf(out_file, "View \"%s\" {\n", view_name);
    for (i = 0; i < num_elem; i++) {
        fprintf(out_file, "ST (%lf,%lf,0,%lf,%lf,0,%lf,%lf,0) {%lf,%lf,%lf};\n",
                               V1x[i], V1y[i], V2x[i], V2y[i], V3x[i], V3y[i],
                               U1[i], U2[i], U3[i]);
    }
    fprintf(out_file,"};");

    // fclose flushes the stream, so a failure here means lost output
    if (fclose(out_file) != 0) {
        fprintf(stderr, "ERROR: could not finish writing %s.\n", path);
        return 1;
    }
    return 0;
}

/*
 * Writes one per-vertex 2D vector field as a text "View" file. Same layout
 * as write_scalar_view(), but each record is "VT (...) {...};" and carries an
 * (x, y, 0) vector for each of the three vertices.
 *
 * Returns 0 on success, 1 if the file cannot be opened or closed cleanly.
 */
static int write_vector_view(const char *path, const char *view_name, int num_elem,
                             const double *V1x, const double *V1y,
                             const double *V2x, const double *V2y,
                             const double *V3x, const double *V3y,
                             const double *Ux1, const double *Ux2, const double *Ux3,
                             const double *Uy1, const double *Uy2, const double *Uy3) {
    FILE *out_file;
    int i;

    out_file = fopen(path, "w");
    if (!out_file) {
        fprintf(stderr, "ERROR: could not open %s for writing.\n", path);
        return 1;
    }

    fprintf(out_file, "View \"%s\" {\n", view_name);
    for (i = 0; i < num_elem; i++) {
        fprintf(out_file, "VT (%lf,%lf,0,%lf,%lf,0,%lf,%lf,0) {%lf,%lf,0,%lf,%lf,0,%lf,%lf,0};\n",
                               V1x[i], V1y[i], V2x[i], V2y[i], V3x[i], V3y[i],
                               Ux1[i], Uy1[i], Ux2[i], Uy2[i], Ux3[i], Uy3[i]);
    }
    fprintf(out_file,"};");

    if (fclose(out_file) != 0) {
        fprintf(stderr, "ERROR: could not finish writing %s.\n", path);
        return 1;
    }
    return 0;
}

/*
 * Program entry point: parses the command line, loads the mesh, precomputes
 * geometry and basis data, integrates in time with RK4 and writes the
 * density, velocity, energy, pressure and error fields under output/.
 *
 * Returns 0 on success and 1 if the input or mesh is unusable, memory runs
 * out, or any output file cannot be written.
 */
int main(int argc, char *argv[]) {
    int rc = 1;
    int num_elem = 0, num_sides = 0;
    int i, n, n_p, timesteps, n_quad, n_quad1d;
    size_t elem_count, side_slots;

    double endtime;
    double min_r;
    const double *radii;

    // every heap pointer starts as NULL so the shared cleanup at the bottom
    // can free() them unconditionally on any exit path
    double *V1x = NULL, *V1y = NULL, *V2x = NULL;
    double *V2y = NULL, *V3x = NULL, *V3y = NULL;
    double *sides_x1 = NULL, *sides_x2 = NULL;
    double *sides_y1 = NULL, *sides_y2 = NULL;

    double *r1_local = NULL, *r2_local = NULL, *w_local = NULL;
    double *s_r = NULL, *oned_w_local = NULL;

    int *left_elem = NULL, *right_elem = NULL;
    int *elem_s1 = NULL, *elem_s2 = NULL, *elem_s3 = NULL;
    int *left_side_number = NULL, *right_side_number = NULL;

    FILE *mesh_file;

    char line[100];
    char *mesh_filename;
    char *out_filename;   // filled in by get_input() but not used below

    // TODO: this should be cleaner, obviously
    const char *rho_out_filename = "output/uniform_rho.out";
    const char *u_out_filename   = "output/uniform_u.out";
    const char *E_out_filename   = "output/uniform_E.out";

    double *Uu1 = NULL, *Uu2 = NULL, *Uu3 = NULL;
    double *Uv1 = NULL, *Uv2 = NULL, *Uv3 = NULL;

    // get input
    endtime = -1;
    if (get_input(argc, argv, &n, &timesteps, &endtime, &mesh_filename, &out_filename)) {
        return 1;
    }

    // number of basis functions for a degree-n approximation on a triangle
    n_p = (n + 1) * (n + 2) / 2;

    // open the mesh to get num_elem for allocations
    mesh_file = fopen(mesh_filename, "r");
    if (!mesh_file) {
        printf("\nERROR: mesh file not found.\n");
        return 1;
    }

    // the first line of the mesh holds the element count; everything below
    // is sized from it, so refuse to continue without a positive value
    if (!fgets(line, sizeof line, mesh_file)
            || sscanf(line, "%i", &num_elem) != 1
            || num_elem < 1) {
        printf("\nERROR: could not read a positive element count from the mesh file.\n");
        fclose(mesh_file);
        return 1;
    }

    elem_count = (size_t) num_elem;
    side_slots = 3 * elem_count;   // computed in size_t to avoid int overflow

    // allocate vertex points
    V1x = (double *) malloc(elem_count * sizeof(double));
    V1y = (double *) malloc(elem_count * sizeof(double));
    V2x = (double *) malloc(elem_count * sizeof(double));
    V2y = (double *) malloc(elem_count * sizeof(double));
    V3x = (double *) malloc(elem_count * sizeof(double));
    V3y = (double *) malloc(elem_count * sizeof(double));

    elem_s1 = (int *) malloc(elem_count * sizeof(int));
    elem_s2 = (int *) malloc(elem_count * sizeof(int));
    elem_s3 = (int *) malloc(elem_count * sizeof(int));

    // TODO: these are too big; should be a way to figure out how many we actually need
    left_side_number  = (int *) malloc(side_slots * sizeof(int));
    right_side_number = (int *) malloc(side_slots * sizeof(int));

    sides_x1   = (double *) malloc(side_slots * sizeof(double));
    sides_x2   = (double *) malloc(side_slots * sizeof(double));
    sides_y1   = (double *) malloc(side_slots * sizeof(double));
    sides_y2   = (double *) malloc(side_slots * sizeof(double));
    left_elem  = (int *) malloc(side_slots * sizeof(int));
    right_elem = (int *) malloc(side_slots * sizeof(int));

    if (!V1x || !V1y || !V2x || !V2y || !V3x || !V3y
            || !elem_s1 || !elem_s2 || !elem_s3
            || !left_side_number || !right_side_number
            || !sides_x1 || !sides_x2 || !sides_y1 || !sides_y2
            || !left_elem || !right_elem) {
        printf("\nERROR: out of memory while allocating the mesh.\n");
        fclose(mesh_file);
        goto free_host;
    }

    // every side starts with -1 as its right element before read_mesh runs
    for (i = 0; i < 3*num_elem; i++) {
        right_elem[i] = -1;
    }

    // read in the mesh and make all the mappings
    read_mesh(mesh_file, &num_sides, num_elem,
                         V1x, V1y, V2x, V2y, V3x, V3y,
                         left_side_number, right_side_number,
                         sides_x1, sides_y1,
                         sides_x2, sides_y2,
                         elem_s1, elem_s2, elem_s3,
                         left_elem, right_elem);

    // close the file
    fclose(mesh_file);

    // hand the mesh to the solver state (the d_* arrays)
    init_gpu(num_elem, num_sides, n_p,
             V1x, V1y, V2x, V2y, V3x, V3y,
             left_side_number, right_side_number,
             sides_x1, sides_y1,
             sides_x2, sides_y2,
             elem_s1, elem_s2, elem_s3,
             left_elem, right_elem);

    // find the min inscribed circle; d_J is used as scratch for the radii
    // here and is overwritten by preval_jacobian() just below. It is read in
    // place, so no temporary copy has to be allocated.
    preval_inscribed_circles(d_J, d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y, num_elem);
    radii = d_J;
    min_r = radii[0];
    for (i = 1; i < num_elem; i++) {
        if (radii[i] < min_r) {
            min_r = radii[i];
        }
    }

    // pre computations
    preval_jacobian(d_J, d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y, num_elem);

    preval_side_length(d_s_length, d_s_V1x, d_s_V1y, d_s_V2x, d_s_V2y,
                                                      num_sides);

    preval_normals(d_Nx, d_Ny,
                   d_s_V1x, d_s_V1y, d_s_V2x, d_s_V2y,
                   d_V1x, d_V1y,
                   d_V2x, d_V2y,
                   d_V3x, d_V3y,
                   d_left_side_number, num_sides);

    preval_normals_direction(d_Nx, d_Ny,
                             d_V1x, d_V1y,
                             d_V2x, d_V2y,
                             d_V3x, d_V3y,
                             d_left_elem, d_left_side_number, num_sides);

    preval_partials(d_V1x, d_V1y,
                    d_V2x, d_V2y,
                    d_V3x, d_V3y,
                    d_xr,  d_yr,
                    d_xs,  d_ys, num_elem);

    // get the correct quadrature rules for this scheme
    set_quadrature(n, &r1_local, &r2_local, &w_local,
                   &s_r, &oned_w_local, &n_quad, &n_quad1d);

    // evaluate the basis functions at the quadrature points
    preval_basis(r1_local, r2_local, s_r, w_local, oned_w_local, n_quad, n_quad1d, n_p);

    // initial conditions
    init_conditions(d_c, d_J, d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y,
                    n_quad, n_p, num_elem);

    printf("Computing...\n");
    printf(" ? %i degree polynomial interpolation (n_p = %i)\n", n, n_p);
    printf(" ? %i precomputed basis points\n", n_quad * n_p);
    printf(" ? %i elements\n", num_elem);
    printf(" ? %i sides\n", num_sides);
    printf(" ? min radius = %lf\n", min_r);
    printf(" ? endtime = %lf\n", endtime);

    time_integrate_rk4(n_quad, n_quad1d, n_p, n, num_elem, num_sides, endtime, min_r);

    // host buffers for the values evaluated at the three vertices
    Uu1 = (double *) malloc(elem_count * sizeof(double));
    Uu2 = (double *) malloc(elem_count * sizeof(double));
    Uu3 = (double *) malloc(elem_count * sizeof(double));

    Uv1 = (double *) malloc(elem_count * sizeof(double));
    Uv2 = (double *) malloc(elem_count * sizeof(double));
    Uv3 = (double *) malloc(elem_count * sizeof(double));

    if (!Uu1 || !Uu2 || !Uu3 || !Uv1 || !Uv2 || !Uv3) {
        printf("\nERROR: out of memory while allocating the output buffers.\n");
        goto free_solver;
    }

    // from here on the run itself succeeded; rc only flips back to 1 when an
    // output file cannot be written, and the remaining files are still tried
    rc = 0;

    // evaluate rho and write to file
    eval_u(d_c, d_Uv1, d_Uv2, d_Uv3, num_elem, n_p, 0);
    memcpy(Uv1, d_Uv1, elem_count * sizeof(double));
    memcpy(Uv2, d_Uv2, elem_count * sizeof(double));
    memcpy(Uv3, d_Uv3, elem_count * sizeof(double));
    if (write_scalar_view(rho_out_filename, "Density ", num_elem,
                          V1x, V1y, V2x, V2y, V3x, V3y,
                          Uv1, Uv2, Uv3)) {
        rc = 1;
    }

    // evaluate the u and v velocity components and write them as one vector view
    eval_u_velocity(d_c, d_Uv1, d_Uv2, d_Uv3, num_elem, n_p, 1);
    memcpy(Uu1, d_Uv1, elem_count * sizeof(double));
    memcpy(Uu2, d_Uv2, elem_count * sizeof(double));
    memcpy(Uu3, d_Uv3, elem_count * sizeof(double));
    eval_u_velocity(d_c, d_Uv1, d_Uv2, d_Uv3, num_elem, n_p, 2);
    memcpy(Uv1, d_Uv1, elem_count * sizeof(double));
    memcpy(Uv2, d_Uv2, elem_count * sizeof(double));
    memcpy(Uv3, d_Uv3, elem_count * sizeof(double));
    if (write_vector_view(u_out_filename, "u ", num_elem,
                          V1x, V1y, V2x, V2y, V3x, V3y,
                          Uu1, Uu2, Uu3,
                          Uv1, Uv2, Uv3)) {
        rc = 1;
    }

    // evaluate E and write to file
    eval_u(d_c, d_Uv1, d_Uv2, d_Uv3, num_elem, n_p, 3);
    memcpy(Uv1, d_Uv1, elem_count * sizeof(double));
    memcpy(Uv2, d_Uv2, elem_count * sizeof(double));
    memcpy(Uv3, d_Uv3, elem_count * sizeof(double));
    if (write_scalar_view(E_out_filename, "E ", num_elem,
                          V1x, V1y, V2x, V2y, V3x, V3y,
                          Uv1, Uv2, Uv3)) {
        rc = 1;
    }

    // plot pressure
    // NOTE(review): the view title "E " is kept exactly as before; it looks
    // like a copy-paste from the energy block -- confirm before renaming.
    eval_p(d_c, d_Uv1, d_Uv2, d_Uv3, num_elem, n_p, 3);
    memcpy(Uv1, d_Uv1, elem_count * sizeof(double));
    memcpy(Uv2, d_Uv2, elem_count * sizeof(double));
    memcpy(Uv3, d_Uv3, elem_count * sizeof(double));
    if (write_scalar_view("output/p.out", "E ", num_elem,
                          V1x, V1y, V2x, V2y, V3x, V3y,
                          Uv1, Uv2, Uv3)) {
        rc = 1;
    }

    // measure the error and write it to file
    measure_error(d_c, d_Uv1, d_Uv2, d_Uv3,
                  d_V1x, d_V1y, d_V2x, d_V2y, d_V3x, d_V3y,
                  num_elem, n_p);

    memcpy(Uv1, d_Uv1, elem_count * sizeof(double));
    memcpy(Uv2, d_Uv2, elem_count * sizeof(double));
    memcpy(Uv3, d_Uv3, elem_count * sizeof(double));
    if (write_scalar_view("output/p_error.out", "p ", num_elem,
                          V1x, V1y, V2x, V2y, V3x, V3y,
                          Uv1, Uv2, Uv3)) {
        rc = 1;
    }

free_solver:
    // release the solver state created by init_gpu()
    free_gpu();

free_host:
    free(Uu1);
    free(Uu2);
    free(Uu3);
    free(Uv1);
    free(Uv2);
    free(Uv3);

    free(V1x);
    free(V1y);
    free(V2x);
    free(V2y);
    free(V3x);
    free(V3y);

    free(elem_s1);
    free(elem_s2);
    free(elem_s3);

    free(sides_x1);
    free(sides_x2);
    free(sides_y1);
    free(sides_y2);

    free(left_elem);
    free(right_elem);
    free(left_side_number);
    free(right_side_number);

    free(r1_local);
    free(r2_local);
    free(w_local);
    free(s_r);
    free(oned_w_local);

    return rc;
}