int main() {
    unsigned int i = 0;
	int retVal;
    
    // Allocate NUM_THREADS threads
    hthread_t * tid = (hthread_t *) malloc(sizeof(hthread_t) * NUM_THREADS);
    hthread_attr_t * attr = (hthread_attr_t *) malloc(sizeof(hthread_attr_t) * NUM_AVAILABLE_HETERO_CPUS);

    assert(tid);
    assert(attr);
    
    // Set up attributes for a hardware thread
    for (i = 0; i < NUM_AVAILABLE_HETERO_CPUS; i++)
    { 
        hthread_attr_init(&attr[i]);
        hthread_attr_setdetachstate( &attr[i], HTHREAD_CREATE_JOINABLE);
    }
	
    unsigned int failed = 0;
	
    // Create hardware threads first
    for (i = 0; i < NUM_AVAILABLE_HETERO_CPUS; i++) {
        // Create thread -- Assuming that thread manager will give us
        // a TID = 2 every time since we are creating & joining 1 thread 
        // at a time.
        if (microblaze_create( &tid[i], &attr[i], foo_thread_FUNC_ID, (void *) 2, i) ) {
            failed = 1;
            PRINT_ERROR(THREAD_HARDWARE_CREATE_FAILED);
        }
        if (hthread_join( tid[i], (void *) &retVal ) ) {
            failed = 1;
            PRINT_ERROR(THREAD_HARDWARE_JOIN_FAILED);
        }
        // Make sure the return value is equal to base_array[i]
        if (base_array[i] != ((unsigned int) retVal - HT_CMD_HWTI_COMMAND)) {
            failed = 1;
            PRINT_ERROR(THREAD_HARDWARE_INCORRECT_RETURN);
        }

    }
    
    // Create all threads as software threads
    for (i = 0; i < NUM_THREADS; i++) {
        // Create threads
	    if (hthread_create( &tid[i], NULL, foo_thread, (void *) 2 )) {
            failed = 1;
            PRINT_ERROR(THREAD_SOFTWARE_CREATE_FAILED);
        }
    }
    // Now join on all software threads we just created
    for (i = 0; i < NUM_THREADS; i++) {
        // Join on thread
	    if (hthread_join(tid[i], (void *) &retVal )) {
            failed = 1;
            PRINT_ERROR(THREAD_SOFTWARE_JOIN_FAILED);
        }
    }
    // Create NUM_THREADS threads
	// ----> Create hardware threads first
    for (i = 0; i < NUM_AVAILABLE_HETERO_CPUS; i++) {
        // Create threads
        if (microblaze_create( &tid[i], &attr[i], foo2_thread_FUNC_ID, (void *) i, i) ) {
            failed = 1;
            PRINT_ERROR(THREAD_HARDWARE_CREATE_FAILED);
        }
    }

    // ----> The remaining are software threads
    for (i = NUM_AVAILABLE_HETERO_CPUS; i < NUM_THREADS; i++) {
        // Create threads
	    if (hthread_create( &tid[i], NULL, foo2_thread, (void *) i )) {
            failed = 1;
            PRINT_ERROR(THREAD_SOFTWARE_CREATE_FAILED);
        }
    }

    // Try to create more here --SHOULD FAIL!!!
    for (i = 0; i < NUM_THREADS; i++) {
        // If it does not fail
	    if (hthread_create( &tid[i], NULL, foo2_thread, (void *) i ) == SUCCESS ) {
            failed = 1;
            PRINT_ERROR(THREAD_SOFTWARE_ERROR_FAILED);
        }
    }

    // Clean up- Join on the threads.
    for (i = 0; i < NUM_THREADS; i++) {
        // If it fails
	    if (hthread_join(tid[i], (void *) &retVal )) {
            failed = 1;
            PRINT_ERROR(FINAL_JOIN_ERROR);
        }
    }

    // Test dynamic_create_smart
    

#ifdef SPLIT_BRAM
    // Test microblaze_create_DMA and dyanmic_create_smart_DMA
        
#endif
    
    if (failed) {
        PRINT_ERROR(TEST_FAILED);
    }
    else
        PRINT_ERROR(TEST_PASSED);

    free(tid);
    free(attr);

	return TEST_PASSED;
}
Ejemplo n.º 2
0
int main () {
    
    int i, j;
    hthread_t threads[NUM_THREADS];
    hthread_attr_t attr[NUM_THREADS];
    for (i = 0; i < NUM_THREADS; i++) {
        attr[i] = create_attr();
    }
    
    // Create timer variables
    hthread_time_t start, stop,running_total,running_calc_total,average_calc_total;
    hthread_time_t thread_create_start, thread_create_end, running_create_total = 0.0;

    //----------- initialize main matrices---------------
    int **matrixA = make_matrix (A_ROW_SIZE, A_COL_SIZE);
    int **matrixB = make_matrix (A_ROW_SIZE, B_COL_SIZE);
    int **matrixC = make_matrix (A_ROW_SIZE, B_COL_SIZE);
    for (i = 0; i < A_ROW_SIZE; i++) {
        for (j = 0; j <A_COL_SIZE; j++) {
            matrixA[i][j] = i + j;
        }
    }    
    for (i = 0; i < B_ROW_SIZE; i++) {
        for (j = 0; j <B_COL_SIZE; j++) {
            matrixB[i][j] = i + j;
        }
    }
    //---------------------- Done----------------------

    // Reserve space for thread package but ONLY
    // NUM_THREADS at a time
    data * package = (data *) malloc(sizeof(data) * NUM_THREADS);
    if (package == NULL) {
        printf("MALLOC ERROR: Unable to malloc thread package\n");
        while(1);
    }
    printf("*******************************************************\n");
    printf("  Multiplying A (%d x %d) x B (%d x %d)\n", A_ROW_SIZE, A_COL_SIZE, B_ROW_SIZE,B_COL_SIZE);
    printf("*******************************************************\n");

    int counter = 0, trials = 0, dest_row = 0, dest_col = 0;
    int running_create_counter, running_calc_counter;
    //for (counter = NUM_THREADS; counter > 0; counter-=2) {
    for (counter = NUM_THREADS; counter > 0; counter/=2) {
	    printf ("Number of threads %d\n", counter);
        for (trials = 0; trials < NUM_TRIALS; trials++) {
            
            // Clear the C matrix
            for (i = 0; i < A_ROW_SIZE; i++) {
                for (j = 0; j <B_COL_SIZE; j++) {
                    matrixC[i][j] = 0;
                }
            }
            // Reset running totals
            running_calc_total = 0.0;
            running_create_total = 0.0;
            running_create_counter = 0;
            running_calc_counter = 0;

            // Begin Timing
            start = hthread_time_get();
            
            // for every element in the result matrix
            int new_counter = counter;
            int A_ROW_SIZE_factor = 0;
            A_ROW_SIZE_factor = ((A_ROW_SIZE % counter != 0) ? 1 : 0);

            // A_ROW_SIZE_factor = (# of Rows / # of threads) + (1 if not a multiple of # of threads, else 0)
            A_ROW_SIZE_factor = ((A_ROW_SIZE / counter)+A_ROW_SIZE_factor);

            // This is the Outer loop for computing entire matrix.
            // dest_row is used as an offset into the matrix as we (try to)
            // create x number of threads through each pass of this for loop
            for (dest_row = 0; dest_row < A_ROW_SIZE_factor; dest_row++) {

                // if # of threads >= # of Rows or last iteration
                // Note: if statement below is simplified
                if (dest_row == (A_ROW_SIZE_factor-1)) {
                    // Then we create x threads, where x = # of Rows (remaining)
                    new_counter = (A_ROW_SIZE - (dest_row*counter));
                    // Now create x thread packages, instructing them what
                    // row they will work on
                    for ( i = 0; i < new_counter; i++) 
                        package[i].dest_row = (dest_row*counter) + i;
                }
                // if # of Rows > # of threads
                else {
                    // Then we create x threads at a time, where x = # of threads
                    new_counter = counter;
                    // Also, create x thread packages giving them row they will work on
                    for ( i = 0; i < new_counter; i++) 
                        package[i].dest_row = (dest_row*counter) + i;
                }
               
                // For each row a thread works on, we need to calculate the # of passes
                // it must make to compute the solution for a single cell. That is, each
                // thread can only hold at most MAX_SIZE'd elements for the row it is given.
                // If the original matrix is 2*MAX_SIZE, then that thread must receive from
                // host 0->MAX_SIZE-1 elements of the row, and then MAX_SIZE->(2*MAX_SIZE)-1 to
                // compute the first cell. To reduce the number of times we transfer these elements,
                // a thread computes each cell using only 0->MAX_SIZE-1 elements, then it would make a
                // second pass using MAX_SIZE->2*MAX_SIZE-1 elements. Default values for passes and 
                // array_size are set in order not to introduce more if statements.
                int passes = 1, array_size = A_COL_SIZE;
                
                // If we can fit all columns (matrix A) or all rows (matrix B) in one MAX_SIZE'd element array or
                // if # of Columns (A) or if # of Rows (B) is > MAX_SIZE
                passes = ((A_COL_SIZE % MAX_SIZE != 0) ? ((A_COL_SIZE / MAX_SIZE) + 1) : (A_COL_SIZE / MAX_SIZE));

                // The do-while loop is in charge of looping over the # of passes needed to be done
                int offset_counter = 0;
                do {
                    // Need to calculate array size for each pass
                    array_size = ( (A_COL_SIZE - (offset_counter*MAX_SIZE)) >= MAX_SIZE) ? 
                        MAX_SIZE : ( A_COL_SIZE - (offset_counter*MAX_SIZE));
                    //printf("Array Size for pass %d = %d\n", offset_counter, array_size);
                    
                    // "For each column of this row"/"for each cell in this row"
                    for (dest_col = 0; dest_col < A_COL_SIZE; dest_col++) {
                        // Copy the contents from original matrices to thread's package 
                        for (i = 0; i < new_counter; i++) {
                            package[i].dest_col = dest_col;
                            package[i].array_length = array_size;
                            //printf("Computing (%d, %d)\n", package[i].dest_row, package[i].dest_col);
                            // Now grab the row and the col from A & B matrix.
                            for( j = 0; j < array_size; j++) {
                                package[i].row[j] = matrixA[package[i].dest_row][(offset_counter*MAX_SIZE) + j];
                                package[i].col[j] = matrixB[j+(offset_counter*MAX_SIZE)][dest_col];
                            }
                        }
                        // -------------------------------THREAD CREATE--------------------------------------//
                        thread_create_start = hthread_time_get();         
                        for (i = 0; i < new_counter; i++) {
                            // if not the first time you are DMAing on this pass, only the column matrix has changed
                            if ( dest_col > 0 ) {
#ifdef  SPLIT_BRAM
                                microblaze_create_DMA( &threads[i], 
                                                        &attr[i], 
                                                        worker_thread_FUNC_ID, 
                                                        (void *) (&package[i].col), // The new column
                                                        array_size*sizeof(int),     // The # of elements
                                                        MAX_SIZE*sizeof(int),       // The offset into the original thread package
                                                        i);                         // the thread #
#else
                                microblaze_create( &threads[i], 
                                                        &attr[i], 
                                                        worker_thread_FUNC_ID, 
                                                        (void *) (&package[i]),     // The new column
                                                        i);                         // the thread #
#endif
                            }
                            // if this is the first time you are DMA'ing the row, or dest_col has passed MAX_SIZE * i, DMA everything    
                            // if this is a new pass, or the first time DMA'ing
                            else {
#ifdef  SPLIT_BRAM
                                microblaze_create_DMA( &threads[i], 
                                                        &attr[i], 
                                                        worker_thread_FUNC_ID,
                                                        (void *) &package[i],       // new column and row
                                                        sizeof(data),               // entire data package
                                                        0,                          // offset = 0 (beginning of free space)
                                                        i);
#else 
                                microblaze_create( &threads[i], 
                                                        &attr[i], 
                                                        worker_thread_FUNC_ID,
                                                        (void *) &package[i],       // new column and row
                                                        i); 
#endif
                            }
                            //microblaze_create(&threads[i], &attr, (void *) worker_thread_FUNC_ID,(void *) &package[i],i);
                            //hthread_create(&threads[i], &attr, (void *) worker_thread,(void *) &package[i]; 
                        }
                        
                        // temporary storage for average calc time
                        hthread_time_t calc_time = 0.0; 

                        // Join on those threads grabbing only the solution
                        for (i = 0; i < new_counter; i++) {
#ifdef  SPLIT_BRAM
                            hthread_join_DMA(threads[i],
                                    NULL,
                                    &package[i].solution,   // Place the 2 integers starting at solution
                                    sizeof(int)*3,          // size = 3 integers
                                    sizeof(data) - 12);      // offset = grab the last 12 bytes (1 int, 1 long long)
#else
                            hthread_join(threads[i], NULL);
#endif
                            // Update solution
                            matrixC[package[i].dest_row][package[i].dest_col] += package[i].solution;

                            // Grab times from all threads - clock cycles
                            calc_time += (hthread_time_t) package[i].time;
                            //printf("package[%d].solution = %d\n", i, package[i].solution);
                        }
                        thread_create_end = hthread_time_get();

                        // Update running total for average calculate time
                        average_calc_total = calc_time / (new_counter * 1.0); 
                        running_calc_total += average_calc_total;

                        // Update running total for thread creation overhead
                        running_create_total += ( ((thread_create_end - thread_create_start) / (new_counter * 1.0)) - average_calc_total);
                        running_create_counter++;
                        // ----------------------------------------------------------------------------------//
                    } // for dest_col
                    offset_counter++;
                }while (offset_counter < passes);
            } // for dest_row
            stop = hthread_time_get(); 
            running_total += stop - start;
        } // trials loop end

#ifdef  VERIFY
        // Verify the solution
        printf("Verifying solution.....");
        for (i = 0; i < A_ROW_SIZE; i++) {
            for (j = 0; j < B_COL_SIZE; j++) {
                int temp = 0;
                for (k = 0; k < A_COL_SIZE; k++) {
                    temp += (matrixA[i][k] * matrixB[k][j] );
                }
                if (matrixC[i][j] != temp) {
                    printf("ERROR: incorrect solution for C[%d][%d]\n", i,j);
                    while(1);
                }
            }
        }
        printf("Passed\n");
#endif
        running_total /= (NUM_TRIALS * 1.0);
        printf("Total Average Time         = %.6f seconds\n", hthread_time_sec(running_total));
        printf("Total Calculation Time     = %.6f msec\t%.6f sec\n", hthread_time_msec(running_calc_total), hthread_time_sec(running_calc_total));
        printf("Average Calculation Time   = %.6f msec\n", hthread_time_msec(running_calc_total/(running_create_counter * 1.0)));
        printf("Average Creation/Join Time = %.6f msec\n", hthread_time_msec(running_create_total/(running_create_counter * 1.0)));
        running_total = 0;
        //if (counter == 2) counter++;
    } // NUM_THREADS loop
    // Print Matrices
    //show_matrix(matrixA, A_ROW_SIZE, A_COL_SIZE);
    //show_matrix(matrixB, B_ROW_SIZE, B_COL_SIZE);
    //show_matrix(matrixC, A_ROW_SIZE, B_COL_SIZE);
    
    free(matrixA);
    free(matrixB);
    free(matrixC);
    printf("END\n");
    return 0;
}