Exemple #1
0
int main(int argc, const char * argv[])
{
    
    // prefix_sums test:
    int A[] = {1, 2, 3, 4, 5, 6, 7};
    struct Results s = prefix_sums(A, 7);
    
    printf("Array = {1, 2,... 7} \n");
    printf("Prefix sums: ");
    for (int i = 0; i <= s.L; i++)
    {
        printf("%i, ", (s.C[i]));
    }
    
    // Passing cars test
    int B[] = {0, 1, 0, 1, 1};
    printf("\nCars: %i ", PassingCars(B, 5));
    
    // Genomic Range Query test
    int P[] = {2, 5, 0};
    int Q[] = {4, 5, 6};
    char *S = "CAGCCTA";  // ACGT
    
    printf("\nGenomic Range Query:\n ");
    s = GenomicRangeQuery(S, P, Q, 3);
    for (int i = 0; i < s.L; i++)
    {
        printf("%i, ", (s.C[i]));
    }
               
    
}
Exemple #2
0
/*
 * Benchmark driver for prefix_sums().
 *
 * Usage: bench <csv file> <input size> <num threads> [<num threads> ...]
 *
 * One random input array of <input size> elements is generated, then
 * prefix_sums() is timed once for every thread count given on the command
 * line. A thread count below 1 falls back to the value omp_get_max_threads()
 * returned at start-up. Each run is reported on stdout and appended as a
 * "name,threads,len,seconds" row to the CSV file.
 *
 * Returns 0 on success and -1 on any failure. Once the CSV file is open,
 * every exit path goes through the cleanup label so that the file is closed
 * and the input array released.
 */
int main(int argc, char **argv) {

    const int maxThreads = omp_get_max_threads();

    if (argc < 4) {
        fprintf(stderr, "Usage: bench <csv file> <input size> <num threads> [<num threads> ...]\n");
        return -1;
    }

    FILE *const csvFile = csv_open(argv[1]);
    if (csvFile == NULL) {
        /* Nothing has been acquired yet, so a plain return is enough. */
        return -1;
    }

    int status = -1;    /* pessimistic default; set to 0 only on full success */
    TYPE *nrs = NULL;   /* free(NULL) is a no-op, so cleanup is always safe */

    const int len = safe_strtol(argv[2]);
    if (len < 1) {
        fprintf(stderr, "Input size must be positive\n");
        goto cleanup;
    }

    nrs = random_array(len, time(NULL));
    if (nrs == NULL) {
        goto cleanup;
    }

    for (int i = 3; i < argc; i++) {
        int threads = safe_strtol(argv[i]);
        if (threads < 1) {
            threads = maxThreads;
        }

        omp_set_num_threads(threads);
        printf("%s. omp_get_max_threads() == %d\n", algorithm_name, threads);

        /* Bench the parallel implementation. */

        double start = omp_get_wtime();
        if (prefix_sums(nrs, len, NULL) != 0) {
            goto cleanup;
        }
        double par_time = omp_get_wtime() - start;

        printf("elements: %d; par time: %f\n\n",
                len, par_time);

        fprintf(csvFile, "%s,%d,%d,%f\n", algorithm_name, threads, len, par_time);
    }

    status = 0;

cleanup:
    free(nrs);
    csv_close(csvFile);

    return status;
}
Exemple #3
0
/*
 * Build the adjacency representation of G from the edge tuples in SDGdata.
 *
 * The vertex and edge counts are taken from the external names N and M.
 * The work is done in three passes, all inside one parallel region when
 * compiled with OpenMP:
 *
 *   1. For every tuple i, degree[startVertex[i]] is incremented under a
 *      per-vertex lock and the value it had before the increment is stored in
 *      pos[i] (the tuple's rank among the tuples sharing its start vertex).
 *   2. prefix_sums(degree, numEdges, pSums, n) is called by every thread.
 *      NOTE(review): presumably this fills numEdges[0..n] with per-vertex
 *      offsets -- confirm against the prefix_sums definition.
 *   3. Every tuple's end vertex is written to endV[numEdges[u] + pos[i]].
 *
 * On return G->n, G->m, G->numEdges, G->endV and G->weight are set; the three
 * arrays are heap-allocated here and handed over to G. The startVertex,
 * endVertex and weight arrays of SDGdata are freed before returning.
 *
 * NOTE(review): input weights are currently NOT copied. Every stored weight
 * is 1 and each input weight is printed to stderr instead (see the TODO in
 * the last loop).
 *
 * Allocation failures abort through assert().
 *
 * Returns the elapsed time reported by get_seconds() for the whole call.
 */
double computeGraph(graph* G, graphSDG* SDGdata) {

    VERT_T* endV;
    LONG_T *degree, *numEdges, *pos, *pSums;
    WEIGHT_T* w;
    double elapsed_time;

#ifdef _OPENMP
    omp_lock_t *vLock;
    LONG_T chunkSize;
#endif

    elapsed_time = get_seconds();

#ifdef _OPENMP
    omp_set_num_threads(NUM_THREADS);
#endif

#ifdef _OPENMP
#pragma omp parallel
#endif    
{
    LONG_T i, j, u, n, m, tid, nthreads;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif
    
#ifdef _OPENMP    
    nthreads = omp_get_num_threads();
    tid = omp_get_thread_num();
#else
    tid = 0;
    nthreads = 1;
#endif

    n = N;
    m = M;
    
    /* Thread 0 allocates the shared work arrays; the others pick them up
       after the barrier below. */
    if (tid == 0) {
#ifdef _OPENMP
        vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
        assert(vLock != NULL);
        chunkSize = n/nthreads;
        /* schedule(static, chunk) requires a positive chunk size; n/nthreads
           is 0 whenever there are fewer vertices than threads. */
        if (chunkSize < 1) {
            chunkSize = 1;
        }
#endif
        pos = (LONG_T *) malloc(m*sizeof(LONG_T));
        assert(pos != NULL);
        degree = (LONG_T *) calloc(n, sizeof(LONG_T));
        assert(degree != NULL);
    }
  
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds();
    }
#endif
    
#ifdef _OPENMP    
#pragma omp barrier
    
    #pragma omp for schedule(static, chunkSize)
    for (i=0; i<n; i++) {
        omp_init_lock(&vLock[i]);
    }

    #pragma omp barrier
  
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Lock initialization time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
 
    #pragma omp for
#endif
    /* Pass 1: per-vertex tuple count, plus each tuple's rank within its
       start vertex. The lock makes the read-and-increment of degree[u]
       atomic with respect to other threads. */
    for (i=0; i<m; i++) {
        u = SDGdata->startVertex[i];
#ifdef _OPENMP        
        omp_set_lock(&vLock[u]);
#endif
        pos[i] = degree[u]++;
#ifdef _OPENMP
        omp_unset_lock(&vLock[u]);
#endif
    } 
   
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Degree computation time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
   
#ifdef _OPENMP
#pragma omp barrier

#pragma omp for schedule(static, chunkSize)
    for (i=0; i<n; i++) {
        omp_destroy_lock(&vLock[i]);
    }

    /* The worksharing loop above ends with an implicit barrier, so every
       lock has been destroyed before the array is released. */
    if (tid == 0) 
        free(vLock);
#endif
    
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Lock destruction time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
   
    if (tid == 0) {
        numEdges = (LONG_T *) malloc((n+1)*sizeof(LONG_T));
        assert(numEdges != NULL);
        pSums = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
        assert(pSums != NULL);
   }

#ifdef _OPENMP
#pragma omp barrier
#endif

    /* Pass 2: called by every thread of the team. */
    prefix_sums(degree, numEdges, pSums, n); 
    
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Prefix sums time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
 
#ifdef _OPENMP
#pragma omp barrier
#endif

    if (tid == 0) {
        free(degree);
        free(pSums);
        w = (WEIGHT_T *) malloc(m*sizeof(WEIGHT_T));
        assert(w != NULL);
        endV = (VERT_T *) malloc(m* sizeof(VERT_T));
        assert(endV != NULL);
    }

#ifdef _OPENMP
    #pragma omp barrier

    #pragma omp for
#endif
    /* Pass 3: scatter each tuple into its slot of the per-vertex edge list. */
    for (i=0; i<m; i++) {
        u = SDGdata->startVertex[i];
        j = numEdges[u] + pos[i];
        endV[j] = SDGdata->endVertex[i];
        //TODO: 
		//w[j] = SDGdata->weight[i]; 
		/* NOTE(review): looks like leftover debug output; "%d" is only
		   correct if WEIGHT_T is int -- confirm before relying on it. */
		fprintf(stderr, "%d\n", SDGdata->weight[i]);
		w[j] = 1; 
    }
    
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Edge data structure construction time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
 
    /* Publish the result; numEdges, endV and w now belong to G. */
    if (tid == 0) {
        free(pos);
        G->n = n;
        G->m = m;
        G->numEdges = numEdges;
        G->endV = endV;
        G->weight = w;
    }
}
    /* Verification */
#if 0 
    fprintf(stderr, "SDG data:\n");
    for (int i=0; i<SDGdata->m; i++) {
        fprintf(stderr, "[%ld %ld %ld] ", SDGdata->startVertex[i], 
                SDGdata->endVertex[i], SDGdata->weight[i]);
    }
 
    fprintf(stderr, "\n");

    for (int i=0; i<G->n + 1; i++) {
        fprintf(stderr, "[%ld] ", G->numEdges[i]);
    }
    
    fprintf(stderr, "\nGraph:\n");
    for (int i=0; i<G->n; i++) {
        for (int j=G->numEdges[i]; j<G->numEdges[i+1]; j++) {
            fprintf(stderr, "[%ld %ld %ld] ", i, G->endV[j], G->weight[j]);
        }
    }
#endif 
    
    /* The tuple arrays are no longer needed once G has been built. */
    free(SDGdata->startVertex);
    free(SDGdata->endVertex);
    free(SDGdata->weight);
    
    elapsed_time = get_seconds() - elapsed_time; 
    
    return elapsed_time;
}
/*
 * Accumulate vertex betweenness-centrality contributions into BC[], running
 * one level-synchronous BFS per source vertex with the whole thread team
 * cooperating on each BFS.
 *
 *   G        graph; G->n, G->m, G->numEdges[] and G->endV[] are read.
 *   BC       array indexed by vertex; values are only added to (BC[w] +=),
 *            so the caller is responsible for its initial contents.
 *   numSrcs  number of traversals to perform. Candidate sources are visited
 *            in index order (or in the shuffled order Srcs[] when RANDSRCS is
 *            set); vertices without outgoing edges are skipped and the loop
 *            stops once numSrcs traversals have been done or all n vertices
 *            have been considered.
 *
 * Each traversal has a forward part that discovers vertices level by level
 * (each thread collects newly found vertices in a private stack myS, which is
 * merged into the shared stack S after every level) and a backward part that
 * walks the levels in reverse and accumulates dependencies.
 *
 * The number of BFS levels is limited to MAX_NUM_PHASES (500); exceeding it
 * prints an error and terminates the process with exit(-1).
 *
 * NOTE(review): allocation results are not checked in this function.
 */
void vertex_betweenness_centrality_parBFS(graph_t* G, double* BC, long numSrcs) {

    attr_id_t *S;      /* stack of vertices in the order of non-decreasing 
                          distance from s. Also used to implicitly 
                          represent the BFS queue */
    plist_t* P;        /* predecessors of a vertex v on shortest paths from s */
    double* sig;       /* No. of shortest paths */
    attr_id_t* d;      /* BFS distance from the current source; -1 while
                          the vertex is undiscovered */
    double* del;       /* dependency of vertices */
    attr_id_t *in_degree, *numEdges, *pSums;
    attr_id_t* pListMem;    
#if RANDSRCS
    attr_id_t* Srcs; 
#endif
    attr_id_t *start, *end;   /* S[start[p] .. end[p]-1] holds BFS level p */
    long MAX_NUM_PHASES;
    attr_id_t *psCount;       /* per-thread offsets used when merging myS into S */

#ifdef _OPENMP    
    omp_lock_t* vLock;
    long chunkSize;
#endif
#ifdef DIAGNOSTIC
    double elapsed_time;
#endif
    int seed = 2387;

#ifdef _OPENMP    
#pragma omp parallel firstprivate(G)
    {
#endif

        attr_id_t *myS, *myS_t;
        attr_id_t myS_size;
        long i, j, k, p, count, myCount;
        long v, w, vert;
        long k0, k1;
        long numV, num_traversals, n, m, phase_num;
        long start_iter, end_iter;
        long tid, nthreads;
        int* stream;
#ifdef DIAGNOSTIC
        double elapsed_time_part;
#endif

#ifdef _OPENMP
        int myLock;
        tid = omp_get_thread_num();
        nthreads = omp_get_num_threads();
#else
        tid = 0;
        nthreads = 1;
#endif

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time = get_seconds();
            elapsed_time_part = get_seconds();
        }
#endif

        /* numV: no. of vertices to run BFS from = numSrcs */
        numV = numSrcs;
        n = G->n;
        m = G->m;

        /* Permute vertices */
        if (tid == 0) {
#if RANDSRCS
            Srcs = (attr_id_t *) malloc(n*sizeof(attr_id_t));
#endif
#ifdef _OPENMP
            vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
            /* Set once, by one thread, before the barrier below: the value is
               shared, and schedule(static, chunk) needs a positive chunk even
               when there are fewer vertices than threads. */
            chunkSize = n/nthreads;
            if (chunkSize < 1) {
                chunkSize = 1;
            }
#endif
        }

#ifdef _OPENMP   
#pragma omp barrier
#pragma omp for
        for (i=0; i<n; i++) {
            omp_init_lock(&vLock[i]);
        }
#endif

        /* Initialize RNG stream */ 
        stream = init_sprng(0, tid, nthreads, seed, SPRNG_DEFAULT);

#if RANDSRCS
#ifdef _OPENMP
#pragma omp for
#endif
        for (i=0; i<n; i++) {
            Srcs[i] = i;
        }

        /* Random pairwise swaps. A swap is simply skipped when either of the
           two vertex locks cannot be taken immediately. */
#ifdef _OPENMP
#pragma omp for
#endif
        for (i=0; i<n; i++) {
            j = n * sprng(stream);
            if (i != j) {
#ifdef _OPENMP
                int l1 = omp_test_lock(&vLock[i]);
                if (l1) {
                    int l2 = omp_test_lock(&vLock[j]);
                    if (l2) {
#endif
                        k = Srcs[i];
                        Srcs[i] = Srcs[j];
                        Srcs[j] = k;
#ifdef _OPENMP  
                        omp_unset_lock(&vLock[j]);
                    }
                    omp_unset_lock(&vLock[i]);
                }
#endif        
            }
        } 
#endif

#ifdef _OPENMP    
#pragma omp barrier
#endif

        if (tid == 0) {
            MAX_NUM_PHASES = 500;
        }

#ifdef _OPENMP
#pragma omp barrier    
#endif

        /* Initialize predecessor lists */

        /* The size of the predecessor list of each vertex is bounded by 
           its in-degree. So we first compute the in-degree of every
           vertex */ 

        if (tid == 0) {
            P   = (plist_t  *) calloc(n, sizeof(plist_t));
            in_degree = (attr_id_t *) calloc(n+1, sizeof(attr_id_t));
            numEdges = (attr_id_t *) malloc((n+1)*sizeof(attr_id_t));
            pSums = (attr_id_t *) malloc(nthreads*sizeof(attr_id_t));
        }

#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
        for (i=0; i<m; i++) {
            v = G->endV[i];
#ifdef _OPENMP
            omp_set_lock(&vLock[v]);
#endif
            in_degree[v]++;
#ifdef _OPENMP
            omp_unset_lock(&vLock[v]);
#endif
        }

        /* Called by every thread. NOTE(review): presumably turns in_degree[]
           into offsets in numEdges[] -- confirm against prefix_sums. */
        prefix_sums(in_degree, numEdges, pSums, n);

        if (tid == 0) {
            pListMem = (attr_id_t *) malloc(m*sizeof(attr_id_t));
        }

        /* Carve one slice of pListMem per vertex for its predecessor list. */
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
        for (i=0; i<n; i++) {
            P[i].list = pListMem + numEdges[i];
            P[i].degree = in_degree[i];
            P[i].count = 0;
        }

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds() -elapsed_time_part;
            fprintf(stderr, "In-degree computation time: %lf seconds\n", 
                    elapsed_time_part);
            elapsed_time_part = get_seconds();
        }
#endif

        /* Allocate shared memory */ 
        if (tid == 0) {
            free(in_degree);
            free(numEdges);
            free(pSums);

            S   = (attr_id_t *) malloc(n*sizeof(attr_id_t));
            sig = (double *) malloc(n*sizeof(double));
            d   = (attr_id_t *) malloc(n*sizeof(attr_id_t));
            del = (double *) calloc(n, sizeof(double));

            start = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t));
            end = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t));
            psCount = (attr_id_t *) malloc((nthreads+1)*sizeof(attr_id_t));
        }

        /* local memory for each thread */  
        myS_size = (2*n)/nthreads;
        /* Keep at least one slot: the resize below doubles the capacity, and
           doubling zero would never make room for the first vertex. */
        if (myS_size < 1) {
            myS_size = 1;
        }
        myS = (attr_id_t *) malloc(myS_size*sizeof(attr_id_t));
        num_traversals = 0;
        myCount = 0;

#ifdef _OPENMP    
#pragma omp barrier
#endif

#ifdef _OPENMP    
#pragma omp for
#endif
        for (i=0; i<n; i++) {
            d[i] = -1;
        }

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds() - elapsed_time_part;
            fprintf(stderr, "BC initialization time: %lf seconds\n", 
                    elapsed_time_part);
            elapsed_time_part = get_seconds();
        }
#endif

        /* One traversal per iteration. Every thread executes this loop and
           takes the same continue/break decisions, so the barriers inside
           stay matched. */
        for (p=0; p<n; p++) {
#if RANDSRCS
            i = Srcs[p];
#else
            i = p;
#endif
            /* Vertices without outgoing edges are not used as sources. */
            if (G->numEdges[i+1] - G->numEdges[i] == 0) {
                continue;
            } else {
                num_traversals++;
            }

            if (num_traversals == numV + 1) {
                break;
            }

            if (tid == 0) {
                sig[i] = 1;
                d[i] = 0;
                S[0] = i;
                start[0] = 0;
                end[0] = 1;
            }

            count = 1;
            phase_num = 0;

#ifdef _OPENMP       
#pragma omp barrier
#endif

            /* Forward part: expand one BFS level per iteration until a level
               turns out empty. */
            while (end[phase_num] - start[phase_num] > 0) {

                myCount = 0;
                start_iter = start[phase_num];
                end_iter = end[phase_num];
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for schedule(dynamic) nowait
#endif
                for (vert = start_iter; vert < end_iter; vert++) {
                    v = S[vert];
                    for (j=G->numEdges[v]; j<G->numEdges[v+1]; j++) {

                        w = G->endV[j];
                        if (v != w) {

#ifdef _OPENMP                            
                            myLock = omp_test_lock(&vLock[w]);
                            if (myLock) { 
#endif             
                                /* w found for the first time? */ 
                                if (d[w] == -1) {
                                    if (myS_size == myCount) {
                                        /* Resize myS */
                                        myS_t = (attr_id_t *)
                                            malloc(2*myS_size*sizeof(attr_id_t));
                                        memcpy(myS_t, myS, 
                                                myS_size*sizeof(attr_id_t));
                                        free(myS);
                                        myS = myS_t;
                                        myS_size = 2*myS_size;
                                    }
                                    myS[myCount++] = w;
                                    d[w] = d[v] + 1;
                                    sig[w] = sig[v];
                                    P[w].list[P[w].count++] = v;
                                } else if (d[w] == d[v] + 1) {
                                    sig[w] += sig[v];
                                    P[w].list[P[w].count++] = v;
                                }
#ifdef _OPENMP  

                                omp_unset_lock(&vLock[w]);
                            } else {
                                /* Another thread holds w's lock. d[w] is
                                   inspected without the lock here; the update
                                   itself is done after waiting for it. */
                                if ((d[w] == -1) || (d[w] == d[v]+ 1)) {
                                    omp_set_lock(&vLock[w]);
                                    sig[w] += sig[v];
                                    P[w].list[P[w].count++] = v;
                                    omp_unset_lock(&vLock[w]);
                                }
                            }
#endif

                        }
                    }
                }
                /* Merge all local stacks for next iteration */
                phase_num++; 
                if (tid == 0) {
                    if (phase_num >= MAX_NUM_PHASES) {
                        fprintf(stderr, "Error: Max num phases set to %ld\n",
                                MAX_NUM_PHASES);
                        fprintf(stderr, "Diameter of input network greater than"
                                " this value. Increase MAX_NUM_PHASES"
                                " in vertex_betweenness_centrality_parBFS()\n");
                        exit(-1);
                    }
                }

                /* Publish how many vertices this thread discovered. */
                psCount[tid+1] = myCount;

#ifdef _OPENMP
#pragma omp barrier
#endif

                /* Thread 0 turns the per-thread counts into running offsets
                   into S, starting where the previous level ended. */
                if (tid == 0) {
                    start[phase_num] = end[phase_num-1];
                    psCount[0] = start[phase_num];
                    for(k=1; k<=nthreads; k++) {
                        psCount[k] = psCount[k-1] + psCount[k];
                    }
                    end[phase_num] = psCount[nthreads];
                }



#ifdef _OPENMP
#pragma omp barrier
#endif

                /* Copy this thread's discoveries into its slice of S. */
                k0 = psCount[tid]; 
                k1 = psCount[tid+1];
                for (k = k0; k < k1; k++) {
                    S[k] = myS[k-k0];
                } 

                count = end[phase_num];
            }

            /* Step back from the empty level that ended the loop. */
            phase_num--;

            /* Backward part: walk the levels from the deepest one down to
               level 1 and push dependencies to predecessors. */
            while (phase_num > 0) {
                start_iter = start[phase_num];
                end_iter = end[phase_num];
#ifdef _OPENMP        
#pragma omp for schedule(static) nowait
#endif
                for (j=start_iter; j<end_iter; j++) {
                    w = S[j];
                    for (k = 0; k<P[w].count; k++) {
                        v = P[w].list[k];
#ifdef _OPENMP
                        omp_set_lock(&vLock[v]);
#endif
                        del[v] = del[v] + sig[v]*(1+del[w])/sig[w];
#ifdef _OPENMP
                        omp_unset_lock(&vLock[v]);
#endif
                    }
                    BC[w] += del[w];
                }

                phase_num--;

#ifdef _OPENMP
#pragma omp barrier
#endif            
            }


            /* Reset the per-vertex state of every vertex this traversal
               reached, ready for the next source. */
#ifdef _OPENMP
#pragma omp for schedule(static, chunkSize) nowait
#endif
            for (j=0; j<count; j++) {
                w = S[j];
                d[w] = -1;
                del[w] = 0;
                P[w].count = 0;
            }


#ifdef _OPENMP
#pragma omp barrier
#endif

        }

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds() - elapsed_time_part;
            fprintf(stderr, "BC computation time: %lf seconds\n", 
                    elapsed_time_part);
        }
#endif


#ifdef _OPENMP
#pragma omp barrier
#endif

#ifdef _OPENMP
#pragma omp for
        for (i=0; i<n; i++) {
            omp_destroy_lock(&vLock[i]);
        }
#endif

        free(myS);

        /* Shared arrays were allocated by thread 0 and are released by it. */
        if (tid == 0) { 
            free(S);
            free(pListMem);
            free(P);
            free(sig);
            free(d);
            free(del);
#ifdef _OPENMP
            free(vLock);
#endif
            free(start);
            free(end);
            free(psCount);

#ifdef DIAGNOSTIC
            elapsed_time = get_seconds() - elapsed_time;
            fprintf(stderr, "Time taken: %lf seconds\n", elapsed_time);
#endif

#if RANDSRCS
            free(Srcs);
#endif
        }

        free_sprng(stream);
#ifdef _OPENMP
    }    
#endif

}
/*
 * Accumulate vertex betweenness-centrality contributions into BC[], with the
 * loop over source vertices shared out among the threads: each thread runs
 * complete BFS traversals on its own using private work arrays.
 *
 *   G        graph; G->n, G->m, G->numEdges[] and G->endV[] are read.
 *   BC       array indexed by vertex; values are only added to (BC[w] +=)
 *            under a per-vertex lock, so the caller is responsible for its
 *            initial contents.
 *   numSrcs  number of candidate sources. Candidates are the indices
 *            0..numSrcs-1 (or the first numSrcs entries of the shuffled
 *            Srcs[] when RANDSRCS is set); candidates without outgoing edges
 *            are skipped and not replaced.
 *
 * The number of BFS levels of a single traversal is limited to
 * MAX_NUM_PHASES (50), the capacity of the start[]/end[] arrays; exceeding it
 * prints an error and terminates the process with exit(-1).
 *
 * NOTE(review): allocation results are not checked in this function.
 */
void vertex_betweenness_centrality_simple(graph_t* G, double* BC, long numSrcs) {

    attr_id_t *in_degree, *numEdges, *pSums;
#if RANDSRCS
    attr_id_t* Srcs; 
#endif
    long num_traversals = 0;
#ifdef _OPENMP    
    omp_lock_t* vLock;
#endif
#ifdef DIAGNOSTIC
    double elapsed_time;
#endif
    int seed = 2387;

    /* The outer loop is parallelized in this case. Each thread does a BFS 
       and the vertex BC values are incremented atomically */   
#ifdef _OPENMP
#pragma omp parallel firstprivate(G)
    {
#endif
        attr_id_t *S;      /* stack of vertices in the order of non-decreasing 
                              distance from s. Also used to implicitly 
                              represent the BFS queue */
        plist_t* P;          /* predecessors of a vertex v on shortest paths 
                                from s */
        attr_id_t* pListMem;    
        double* sig;       /* No. of shortest paths */
        attr_id_t* d;      /* BFS distance from the current source; -1 while
                              the vertex is undiscovered */
        double* del;       /* dependency of vertices */
        attr_id_t *start, *end;   /* S[start[p] .. end[p]-1] holds BFS level p */
        long MAX_NUM_PHASES;

        long i, j, k, p, count;
        long v, w, vert;
        long numV, n, m, phase_num;
        long tid, nthreads;
        int* stream;
#ifdef DIAGNOSTIC
        double elapsed_time_part;
#endif

#ifdef _OPENMP
        int myLock;
        tid = omp_get_thread_num();
        nthreads = omp_get_num_threads();
#else
        tid = 0;
        nthreads = 1;
#endif

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time = get_seconds();
            elapsed_time_part = get_seconds();
        }
#endif

        /* numV: no. of vertices to run BFS from = numSrcs */
        numV = numSrcs;
        n = G->n;
        m = G->m;

        /* Permute vertices */
        if (tid == 0) {
#if RANDSRCS
            Srcs = (attr_id_t *) malloc(n*sizeof(attr_id_t));
#endif
#ifdef _OPENMP
            vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
#endif
        }

#ifdef _OPENMP   
#pragma omp barrier
#pragma omp for
        for (i=0; i<n; i++) {
            omp_init_lock(&vLock[i]);
        }
#endif

        /* Initialize RNG stream */ 
        stream = init_sprng(0, tid, nthreads, seed, SPRNG_DEFAULT);

#if RANDSRCS
#ifdef _OPENMP
#pragma omp for
#endif
        for (i=0; i<n; i++) {
            Srcs[i] = i;
        }

        /* Random pairwise swaps. A swap is simply skipped when either of the
           two vertex locks cannot be taken immediately. */
#ifdef _OPENMP
#pragma omp for
#endif
        for (i=0; i<n; i++) {
            j = n * sprng(stream);
            if (i != j) {
#ifdef _OPENMP
                int l1 = omp_test_lock(&vLock[i]);
                if (l1) {
                    int l2 = omp_test_lock(&vLock[j]);
                    if (l2) {
#endif
                        k = Srcs[i];
                        Srcs[i] = Srcs[j];
                        Srcs[j] = k;
#ifdef _OPENMP  
                        omp_unset_lock(&vLock[j]);
                    }
                    omp_unset_lock(&vLock[i]);
                }
#endif        
            }
        } 
#endif

#ifdef _OPENMP    
#pragma omp barrier
#endif

        MAX_NUM_PHASES = 50;

        /* Initialize predecessor lists */

        /* The size of the predecessor list of each vertex is bounded by 
           its in-degree. So we first compute the in-degree of every
           vertex */ 

        if (tid == 0) {
            in_degree = (attr_id_t *) calloc(n+1, sizeof(attr_id_t));
            numEdges = (attr_id_t *) malloc((n+1)*sizeof(attr_id_t));
            pSums = (attr_id_t *) malloc(nthreads*sizeof(attr_id_t));
        }


#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
        for (i=0; i<m; i++) {
            v = G->endV[i];
#ifdef _OPENMP
            omp_set_lock(&vLock[v]);
#endif
            in_degree[v]++;
#ifdef _OPENMP
            omp_unset_lock(&vLock[v]);
#endif
        }

        /* Called by every thread. NOTE(review): presumably turns in_degree[]
           into offsets in numEdges[] -- confirm against prefix_sums. */
        prefix_sums(in_degree, numEdges, pSums, n);

        /* Predecessor lists are private: every thread builds its own copy
           from the shared in_degree[] / numEdges[] arrays. */
        P  = (plist_t  *) calloc(n, sizeof(plist_t));
        pListMem = (attr_id_t *) malloc(m*sizeof(attr_id_t));

        for (i=0; i<n; i++) {
            P[i].list = pListMem + numEdges[i];
            P[i].degree = in_degree[i];
            P[i].count = 0;
        }

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds() -elapsed_time_part;
            fprintf(stderr, "In-degree computation time: %lf seconds\n", 
                    elapsed_time_part);
            elapsed_time_part = get_seconds();
        }
#endif

        /* All threads must be done reading in_degree[] / numEdges[] before
           thread 0 releases them. */
#ifdef _OPENMP
#pragma omp barrier
#endif

        /* Allocate shared memory */ 
        if (tid == 0) {
            free(in_degree);
            free(numEdges);
            free(pSums);
        }

        /* Per-thread BFS work arrays. */
        S   = (attr_id_t *) malloc(n*sizeof(attr_id_t));
        sig = (double *) malloc(n*sizeof(double));
        d   = (attr_id_t *) malloc(n*sizeof(attr_id_t));
        del = (double *) calloc(n, sizeof(double));

        start = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t));
        end = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t));

#ifdef _OPENMP   
#pragma omp barrier
#endif

        for (i=0; i<n; i++) {
            d[i] = -1;
        }

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds() - elapsed_time_part;
            fprintf(stderr, "BC initialization time: %lf seconds\n", 
                    elapsed_time_part);
            elapsed_time_part = get_seconds();
        }
#endif

        /* Source vertices are divided among the threads; each iteration is a
           complete, independent traversal. */
#ifdef _OPENMP
#pragma omp for reduction(+:num_traversals)
#endif
        for (p=0; p<numV; p++) {
#if RANDSRCS
            i = Srcs[p];
#else
            i = p;
#endif
            /* Vertices without outgoing edges are not used as sources. */
            if (G->numEdges[i+1] - G->numEdges[i] == 0) {
                continue;
            } else {
                num_traversals++;
            }

            sig[i] = 1;
            d[i] = 0;
            S[0] = i;
            start[0] = 0;
            end[0] = 1;

            count = 1;
            phase_num = 0;

            /* Forward part: expand one BFS level per iteration until a level
               turns out empty. */
            while (end[phase_num] - start[phase_num] > 0) {

                for (vert = start[phase_num]; vert < end[phase_num]; vert++) {
                    v = S[vert];
                    for (j=G->numEdges[v]; j<G->numEdges[v+1]; j++) {
                        w = G->endV[j];
                        if (v != w) {
                            /* w found for the first time? */ 
                            if (d[w] == -1) {
                                S[count++] = w;
                                d[w] = d[v] + 1;
                                sig[w] = sig[v];
                                P[w].list[P[w].count++] = v;
                            } else if (d[w] == d[v] + 1) {
                                sig[w] += sig[v];
                                P[w].list[P[w].count++] = v;
                            }
                        }
                    }
                }

                phase_num++; 

                /* start[] and end[] only have MAX_NUM_PHASES entries; stop
                   before writing past them on a graph with too many levels. */
                if (phase_num >= MAX_NUM_PHASES) {
                    fprintf(stderr, "Error: Max num phases set to %ld\n",
                            MAX_NUM_PHASES);
                    fprintf(stderr, "Diameter of input network greater than"
                            " this value. Increase MAX_NUM_PHASES"
                            " in vertex_betweenness_centrality_simple()\n");
                    exit(-1);
                }

                start[phase_num] = end[phase_num-1];
                end[phase_num] = count;
            }

            /* Step back from the empty level that ended the loop. */
            phase_num--;

            /* Backward part: walk the levels from the deepest one down to
               level 1 and push dependencies to predecessors. del[] is
               private, so only the BC update needs the lock. */
            while (phase_num > 0) {
                for (j=start[phase_num]; j<end[phase_num]; j++) {
                    w = S[j];
                    for (k = 0; k<P[w].count; k++) {
                        v = P[w].list[k];
                        del[v] = del[v] + sig[v]*(1+del[w])/sig[w];
                    }
#ifdef _OPENMP
                    omp_set_lock(&vLock[w]);
                    BC[w] += del[w];
                    omp_unset_lock(&vLock[w]);
#else
                    BC[w] += del[w];
#endif
                }

                phase_num--;
            }

            /* Reset the per-vertex state of every vertex this traversal
               reached, ready for the next source. */
            for (j=0; j<count; j++) {
                w = S[j];
                d[w] = -1;
                del[w] = 0;
                P[w].count = 0;
            }

        }

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds() - elapsed_time_part;
            fprintf(stderr, "BC computation time: %lf seconds\n", 
                    elapsed_time_part);
        }
#endif


#ifdef _OPENMP
#pragma omp barrier
#endif

#ifdef _OPENMP
#pragma omp for
        for (i=0; i<n; i++) {
            omp_destroy_lock(&vLock[i]);
        }
#endif

        /* Release this thread's private work arrays. */
        free(S);
        free(pListMem);
        free(P);
        free(sig);
        free(d);
        free(del);
        free(start);
        free(end);

        if (tid == 0) {

#ifdef _OPENMP
            free(vLock);
#endif

#if RANDSRCS
            free(Srcs);
#endif

#ifdef DIAGNOSTIC
            elapsed_time = get_seconds() - elapsed_time;
            fprintf(stderr, "Total time taken: %lf seconds\n", elapsed_time);
#endif

        }

        free_sprng(stream);

#ifdef _OPENMP
#pragma omp barrier
    }
#endif

}    
Exemple #6
0
/* =============================================================================
 * computeGraph
 *
 * Per-thread worker that fills in the graph *GPtr from the edge tuples held in
 * *SDGdataPtr. Every participating thread is expected to call this function
 * with the same argument; the threads cooperate through thread_barrier_wait()
 * and the TM_* transactional macros.
 *
 * argPtr: points to a computeGraph_arg_t whose GPtr and SDGdataPtr members are
 *         read here.
 *
 * Steps performed, in order:
 *   1. Determine the vertex count from the largest startVertex value.
 *   2. Thread 0 copies edge counts/weights into the graph and allocates the
 *      out-degree arrays.
 *   3. Count, per vertex, the out-edges whose endVertex differs from the
 *      previous tuple's endVertex (outDegree), then call prefix_sums to fill
 *      outVertexIndex.
 *   4. Fill outVertexList / paralEdgeIndex.
 *   5. Thread 0 frees the tuple start/end arrays; then, for every edge i->v
 *      where v has no edge back to i, i is recorded as an "implied" edge of v.
 *   6. Build inVertexIndex / inVertexList from the implied edges and release
 *      the temporaries.
 *
 * NOTE(review): the scans in steps 3 and 4 look like they assume the tuples
 * are sorted by startVertex -- confirm against the tuple generator.
 * =============================================================================
 */
void
computeGraph (void* argPtr)
{
    TM_THREAD_ENTER();

    graph*    GPtr       = ((computeGraph_arg_t*)argPtr)->GPtr;
    graphSDG* SDGdataPtr = ((computeGraph_arg_t*)argPtr)->SDGdataPtr;

    long myId = thread_getId();
    long numThread = thread_getNumThread();

    ULONGINT_T j;
    ULONGINT_T maxNumVertices = 0;
    ULONGINT_T numEdgesPlaced = SDGdataPtr->numEdgesPlaced;

    /*
     * First determine the number of vertices by scanning the tuple
     * startVertex list
     */

    long i;
    long i_start;
    long i_stop;
    createPartition(0, numEdgesPlaced, myId, numThread, &i_start, &i_stop);

    /* Largest start vertex within this thread's slice of the edge tuples */
    for (i = i_start; i < i_stop; i++) {
        if (SDGdataPtr->startVertex[i] > maxNumVertices) {
            maxNumVertices = SDGdataPtr->startVertex[i];
        }
    }

    /*
     * Fold the local maximum into the shared one.
     * NOTE(review): every thread adds 1 here, so with several threads the
     * final value can exceed (largest vertex id + 1) -- confirm that the
     * surplus vertices are intended.
     */
    TM_BEGIN();
    long tmp_maxNumVertices = (long)TM_SHARED_READ_L(global_maxNumVertices);
    long new_maxNumVertices = MAX(tmp_maxNumVertices, maxNumVertices) + 1;
    TM_SHARED_WRITE_L(global_maxNumVertices, new_maxNumVertices);
    TM_END();

    thread_barrier_wait();

    maxNumVertices = global_maxNumVertices;

    if (myId == 0) {

        GPtr->numVertices = maxNumVertices;
        GPtr->numEdges    = numEdgesPlaced;
        GPtr->intWeight   = SDGdataPtr->intWeight;
        GPtr->strWeight   = SDGdataPtr->strWeight;

        /*
         * Scan the weights from the back; the last negative entry determines
         * how many edges carry string weights, the rest carry integer weights.
         */
        for (i = 0; i < numEdgesPlaced; i++) {
            if (GPtr->intWeight[numEdgesPlaced-i-1] < 0) {
                GPtr->numStrEdges = -(GPtr->intWeight[numEdgesPlaced-i-1]) + 1;
                GPtr->numIntEdges = numEdgesPlaced - GPtr->numStrEdges;
                break;
            }
        }

        GPtr->outDegree =
            (LONGINT_T*)P_MALLOC((GPtr->numVertices) * sizeof(LONGINT_T));
        assert(GPtr->outDegree);

        GPtr->outVertexIndex =
            (ULONGINT_T*)P_MALLOC((GPtr->numVertices) * sizeof(ULONGINT_T));
        assert(GPtr->outVertexIndex);
    }

    thread_barrier_wait();

    /* From here on i_start/i_stop delimit this thread's range of vertices */
    createPartition(0, GPtr->numVertices, myId, numThread, &i_start, &i_stop);

    for (i = i_start; i < i_stop; i++) {
        GPtr->outDegree[i] = 0;
        GPtr->outVertexIndex[i] = 0;
    }

    /* Number of out-edges counted by this thread */
    ULONGINT_T outVertexListSize = 0;

    thread_barrier_wait();

    /* i0: index of the tuple where the scan for the current vertex starts;
     * -1UL means "not located yet" */
    ULONGINT_T i0 = -1UL;

    for (i = i_start; i < i_stop; i++) {

        ULONGINT_T k = i;
        /*
         * Nothing counted yet and not vertex 0: walk k downwards from i until
         * some tuple with startVertex == k is found; that tuple is the
         * starting point of the scan.
         */
        if ((outVertexListSize == 0) && (k != 0)) {
            while (i0 == -1UL) {
                for (j = 0; j < numEdgesPlaced; j++) {
                    if (k == SDGdataPtr->startVertex[j]) {
                        i0 = j;
                        break;
                    }

                }
                k--;
            }
        }

        if ((outVertexListSize == 0) && (k == 0)) {
            i0 = 0;
        }

        /*
         * Advance j to the first tuple whose start vertex is not i. If the
         * tuple just before it belongs to i, tuples [i0, j) are i's edges:
         * count the first one plus every one whose endVertex differs from its
         * predecessor's. The last vertex is handled separately below.
         */
        for (j = i0; j < numEdgesPlaced; j++) {
            if (i == GPtr->numVertices-1) {
                break;
            }
            if ((i != SDGdataPtr->startVertex[j])) {
                if ((j > 0) && (i == SDGdataPtr->startVertex[j-1])) {
                    if (j-i0 >= 1) {
                        outVertexListSize++;
                        GPtr->outDegree[i]++;
                        ULONGINT_T t;
                        for (t = i0+1; t < j; t++) {
                            if (SDGdataPtr->endVertex[t] !=
                                SDGdataPtr->endVertex[t-1])
                            {
                                outVertexListSize++;
                                GPtr->outDegree[i] = GPtr->outDegree[i]+1;
                            }
                        }
                    }
                }
                i0 = j;
                break;
            }
        }

        /* Last vertex: its edges run from i0 to the end of the tuple list */
        if (i == GPtr->numVertices-1) {
            /* Both operands are unsigned, so this comparison always holds */
            if (numEdgesPlaced-i0 >= 0) {
                outVertexListSize++;
                GPtr->outDegree[i]++;
                ULONGINT_T t;
                for (t = i0+1; t < numEdgesPlaced; t++) {
                    if (SDGdataPtr->endVertex[t] != SDGdataPtr->endVertex[t-1]) {
                        outVertexListSize++;
                        GPtr->outDegree[i]++;
                    }
                }
            }
        }

    } /* for i */

    thread_barrier_wait();

    /* Presumably turns the per-vertex degrees into starting offsets in
     * outVertexIndex -- prefix_sums is defined elsewhere; verify */
    prefix_sums(GPtr->outVertexIndex, GPtr->outDegree, GPtr->numVertices);

    thread_barrier_wait();

    /* Add this thread's edge count to the shared total */
    TM_BEGIN();
    TM_SHARED_WRITE_L(
        global_outVertexListSize,
        ((long)TM_SHARED_READ_L(global_outVertexListSize) + outVertexListSize)
    );
    TM_END();

    thread_barrier_wait();

    outVertexListSize = global_outVertexListSize;

    if (myId == 0) {
        GPtr->numDirectedEdges = outVertexListSize;
        GPtr->outVertexList =
            (ULONGINT_T*)P_MALLOC(outVertexListSize * sizeof(ULONGINT_T));
        assert(GPtr->outVertexList);
        GPtr->paralEdgeIndex =
            (ULONGINT_T*)P_MALLOC(outVertexListSize * sizeof(ULONGINT_T));
        assert(GPtr->paralEdgeIndex);
        GPtr->outVertexList[0] = SDGdataPtr->endVertex[0];
    }

    thread_barrier_wait();

    /*
     * Evaluate outVertexList
     *
     * Same traversal as the counting pass above, but now the end vertex and
     * the tuple index of every counted edge are stored at the offsets given by
     * outVertexIndex.
     */

    i0 = -1UL;

    for (i = i_start; i < i_stop; i++) {

        ULONGINT_T k = i;
        while ((i0 == -1UL) && (k != 0)) {
            for (j = 0; j < numEdgesPlaced; j++) {
                if (k == SDGdataPtr->startVertex[j]) {
                    i0 = j;
                    break;
                }
            }
            k--;
        }

        if ((i0 == -1UL) && (k == 0)) {
            i0 = 0;
        }

        for (j = i0; j < numEdgesPlaced; j++) {
            if (i == GPtr->numVertices-1) {
                break;
            }
            if (i != SDGdataPtr->startVertex[j]) {
                if ((j > 0) && (i == SDGdataPtr->startVertex[j-1])) {
                    if (j-i0 >= 1) {
                        long ii = GPtr->outVertexIndex[i];
                        ULONGINT_T r = 0; /* edges stored so far for vertex i */
                        GPtr->paralEdgeIndex[ii] = i0;
                        GPtr->outVertexList[ii] = SDGdataPtr->endVertex[i0];
                        r++;
                        ULONGINT_T t;
                        for (t = i0+1; t < j; t++) {
                            if (SDGdataPtr->endVertex[t] !=
                                SDGdataPtr->endVertex[t-1])
                            {
                                GPtr->paralEdgeIndex[ii+r] = t;
                                GPtr->outVertexList[ii+r] = SDGdataPtr->endVertex[t];
                                r++;
                            }
                        }

                    }
                }
                i0 = j;
                break;
            }
        } /* for j */

        if (i == GPtr->numVertices-1) {
            ULONGINT_T r = 0;
            /* Both operands are unsigned, so this comparison always holds */
            if (numEdgesPlaced-i0 >= 0) {
                long ii = GPtr->outVertexIndex[i];
                GPtr->paralEdgeIndex[ii+r] = i0;
                GPtr->outVertexList[ii+r] = SDGdataPtr->endVertex[i0];
                r++;
                ULONGINT_T t;
                for (t = i0+1; t < numEdgesPlaced; t++) {
                    if (SDGdataPtr->endVertex[t] != SDGdataPtr->endVertex[t-1]) {
                        GPtr->paralEdgeIndex[ii+r] = t;
                        GPtr->outVertexList[ii+r] = SDGdataPtr->endVertex[t];
                        r++;
                    }
                }
            }
        }

    } /* for i */

    thread_barrier_wait();

    if (myId == 0) {
        /* The tuple endpoints are no longer read after this point */
        P_FREE(SDGdataPtr->startVertex);
        P_FREE(SDGdataPtr->endVertex);
        GPtr->inDegree =
            (LONGINT_T*)P_MALLOC(GPtr->numVertices * sizeof(LONGINT_T));
        assert(GPtr->inDegree);
        GPtr->inVertexIndex =
            (ULONGINT_T*)P_MALLOC(GPtr->numVertices * sizeof(ULONGINT_T));
        assert(GPtr->inVertexIndex);
    }

    thread_barrier_wait();

    for (i = i_start; i < i_stop; i++) {
        GPtr->inDegree[i] = 0;
        GPtr->inVertexIndex[i] = 0;
    }

    /* A temp. array to store the implied edges: MAX_CLUSTER_SIZE slots per
     * vertex. Allocated by thread 0 and published through
     * global_impliedEdgeList. */
    ULONGINT_T* impliedEdgeList;
    if (myId == 0) {
        impliedEdgeList = (ULONGINT_T*)P_MALLOC(GPtr->numVertices
                                                * MAX_CLUSTER_SIZE
                                                * sizeof(ULONGINT_T));
        /* Checked like every other allocation in this function */
        assert(impliedEdgeList);
        global_impliedEdgeList = impliedEdgeList;
    }

    thread_barrier_wait();

    impliedEdgeList = global_impliedEdgeList;

    createPartition(0,
                    (GPtr->numVertices * MAX_CLUSTER_SIZE),
                    myId,
                    numThread,
                    &i_start,
                    &i_stop);

    for (i = i_start; i < i_stop; i++) {
        impliedEdgeList[i] = 0;
    }

    /*
     * An auxiliary array to store implied edges, in case we overshoot
     * MAX_CLUSTER_SIZE
     */

    ULONGINT_T** auxArr;
    if (myId == 0) {
        auxArr = (ULONGINT_T**)P_MALLOC(GPtr->numVertices * sizeof(ULONGINT_T*));
        assert(auxArr);
        global_auxArr = auxArr;
    }

    thread_barrier_wait();

    auxArr = global_auxArr;

    /* Back to a per-vertex partition */
    createPartition(0, GPtr->numVertices, myId, numThread, &i_start, &i_stop);

    for (i = i_start; i < i_stop; i++) {
        /* Inspect adjacency list of vertex i */
        for (j = GPtr->outVertexIndex[i];
             j < (GPtr->outVertexIndex[i] + GPtr->outDegree[i]);
             j++)
        {
            ULONGINT_T v = GPtr->outVertexList[j];
            ULONGINT_T k;
            /* Does v already have an edge back to i? */
            for (k = GPtr->outVertexIndex[v];
                 k < (GPtr->outVertexIndex[v] + GPtr->outDegree[v]);
                 k++)
            {
                if (GPtr->outVertexList[k] == i) {
                    break;
                }
            }
            /* k ran off the end of v's list: no reverse edge exists */
            if (k == GPtr->outVertexIndex[v]+GPtr->outDegree[v]) {
                TM_BEGIN();
                /* Add i to the impliedEdgeList of v */
                long inDegree = (long)TM_SHARED_READ_L(GPtr->inDegree[v]);
                TM_SHARED_WRITE_L(GPtr->inDegree[v], (inDegree + 1));
                if (inDegree < MAX_CLUSTER_SIZE) {
                    TM_SHARED_WRITE_L(impliedEdgeList[v*MAX_CLUSTER_SIZE+inDegree],
                                    i);
                } else {
                    /* Use auxiliary array to store the implied edge */
                    /* Create an array if it's not present already */
                    /*
                     * NOTE(review): once inDegree reaches 2*MAX_CLUSTER_SIZE
                     * a fresh array replaces auxArr[v]; the previous array
                     * and its entries appear to be lost -- confirm.
                     * NOTE(review): auxArr[v] is read below without a TM
                     * read macro although it is written with
                     * TM_SHARED_WRITE_P -- confirm this is safe for the TM
                     * system in use.
                     */
                    ULONGINT_T* a = NULL;
                    if ((inDegree % MAX_CLUSTER_SIZE) == 0) {
                        a = (ULONGINT_T*)TM_MALLOC(MAX_CLUSTER_SIZE
                                                   * sizeof(ULONGINT_T));
                        assert(a);
                        TM_SHARED_WRITE_P(auxArr[v], a);
                    } else {
                        a = auxArr[v];
                    }
                    TM_SHARED_WRITE_L(a[inDegree % MAX_CLUSTER_SIZE], i);
                }
                TM_END();
            }
        }
    } /* for i */

    thread_barrier_wait();

    prefix_sums(GPtr->inVertexIndex, GPtr->inDegree, GPtr->numVertices);

    if (myId == 0) {
        /* Total implied edges = offset of the last vertex + its in-degree */
        GPtr->numUndirectedEdges = GPtr->inVertexIndex[GPtr->numVertices-1]
                                   + GPtr->inDegree[GPtr->numVertices-1];
        GPtr->inVertexList =
            (ULONGINT_T *)P_MALLOC(GPtr->numUndirectedEdges * sizeof(ULONGINT_T));
        /* Checked like every other allocation in this function */
        assert(GPtr->inVertexList);
    }

    thread_barrier_wait();

    /*
     * Create the inVertex List
     */

    for (i = i_start; i < i_stop; i++) {
        for (j = GPtr->inVertexIndex[i];
             j < (GPtr->inVertexIndex[i] + GPtr->inDegree[i]);
             j++)
        {
            /* The first MAX_CLUSTER_SIZE entries live in impliedEdgeList,
             * later ones in the per-vertex auxiliary array */
            if ((j - GPtr->inVertexIndex[i]) < MAX_CLUSTER_SIZE) {
                GPtr->inVertexList[j] =
                    impliedEdgeList[i*MAX_CLUSTER_SIZE+j-GPtr->inVertexIndex[i]];
            } else {
                GPtr->inVertexList[j] =
                    auxArr[i][(j-GPtr->inVertexIndex[i]) % MAX_CLUSTER_SIZE];
            }
        }
    }

    thread_barrier_wait();

    if (myId == 0) {
        P_FREE(impliedEdgeList);
    }

    /* Only vertices that overflowed MAX_CLUSTER_SIZE own an auxiliary array */
    for (i = i_start; i < i_stop; i++) {
        if (GPtr->inDegree[i] > MAX_CLUSTER_SIZE) {
            P_FREE(auxArr[i]);
        }
    }

    thread_barrier_wait();

    if (myId == 0) {
        P_FREE(auxArr);
    }

    TM_THREAD_EXIT();
}
/*
 * betweennessCentrality
 *
 * Accumulates betweenness-centrality scores into BC using a level-synchronous
 * BFS from a set of source vertices followed by a Brandes-style backward
 * dependency accumulation. Runs inside an OpenMP parallel region when
 * _OPENMP is defined and single-threaded otherwise.
 *
 * G:      graph; the fields n, m, numEdges, endV and weight are read.
 * BC:     one entry per vertex; scores are added to the existing contents
 *         (BC[w] += ...). Presumably zeroed by the caller -- TODO confirm.
 * filter: when 1, edges whose weight has its three low bits clear are skipped.
 *
 * Sources are taken from a random permutation of the vertex ids; vertices
 * without outgoing edges are skipped and at most 2^K4approx sources are used.
 *
 * Returns the time measured by thread 0 with get_seconds(), from just after
 * the vertex permutation until its buffers have been released.
 * Side effect: writes every BC value to stdout before returning.
 */
double betweennessCentrality(graph* G, DOUBLE_T* BC, int filter) {

    VERT_T *S;         /* stack of vertices in the order of non-decreasing 
                          distance from s. Also used to implicitly 
                          represent the BFS queue */
    plist* P;          /* predecessors of a vertex v on shortest paths from s */
    DOUBLE_T* sig;     /* No. of shortest paths */
    LONG_T* d;         /* Length of the shortest path between every pair */
    DOUBLE_T* del;     /* dependency of vertices */
    LONG_T *in_degree, *numEdges, *pSums;
    LONG_T *pListMem;    
    LONG_T* Srcs; 
    LONG_T *start, *end;   /* S[start[p]..end[p]) holds the vertices of BFS phase p */
    LONG_T MAX_NUM_PHASES; /* current capacity of start[] and end[] */
    LONG_T *psCount;
#ifdef _OPENMP    
    omp_lock_t* vLock;     /* one lock per vertex */
#endif
    int seed = 2387;
    double elapsed_time;

#ifdef _OPENMP    
#pragma omp parallel
{
#endif

    /* Everything declared from here on is private to each thread */
    VERT_T *myS, *myS_t;
    LONG_T myS_size;
    LONG_T i, j, k, p, count, myCount;
    LONG_T v, w, vert;
    LONG_T numV, num_traversals, n, m, phase_num;
    LONG_T tid, nthreads;
    int* stream;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif

#ifdef _OPENMP
    int myLock;
    LONG_T resetChunk;
    tid = omp_get_thread_num();
    nthreads = omp_get_num_threads();
#else
    tid = 0;
    nthreads = 1;
#endif

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds();
    }
#endif

    /* numV: no. of vertices to run BFS from = 2^K4approx */
    numV = 1<<K4approx;
    n = G->n;
    m = G->m;

#ifdef _OPENMP
    /* Chunk size for the per-source reset loop. A schedule clause needs a
       positive chunk, and n/nthreads is 0 when n < nthreads. */
    resetChunk = n/nthreads;
    if (resetChunk < 1) {
        resetChunk = 1;
    }
#endif

    /* Permute vertices */
    if (tid == 0) {
        Srcs = (LONG_T *) malloc(n*sizeof(LONG_T));
#ifdef _OPENMP
        vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
#endif
    }

#ifdef _OPENMP   
#pragma omp barrier
#pragma omp for
    for (i=0; i<n; i++) {
        omp_init_lock(&vLock[i]);
    }
#endif

    /* Initialize RNG stream */ 
    stream = init_sprng(0, tid, nthreads, seed, SPRNG_DEFAULT);

#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        Srcs[i] = i;
    }

    /* Swap Srcs[i] with a randomly chosen Srcs[j]. In the OpenMP build the
       swap only happens when both vertex locks can be taken without
       waiting; otherwise it is skipped. */
#ifdef _OPENMP
#pragma omp for
#endif    
    for (i=0; i<n; i++) {
        j = n*sprng(stream);
        if (i != j) {
#ifdef _OPENMP
            int l1 = omp_test_lock(&vLock[i]);
            if (l1) {
                int l2 = omp_test_lock(&vLock[j]);
                if (l2) {
#endif
                    k = Srcs[i];
                    Srcs[i] = Srcs[j];
                    Srcs[j] = k;
#ifdef _OPENMP
                    omp_unset_lock(&vLock[j]);
                }
                omp_unset_lock(&vLock[i]);
            }
#endif
        }
    }

#ifdef _OPENMP    
#pragma omp barrier
#endif

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "Vertex ID permutation time: %lf seconds\n", elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    /* Start timing code from here */
    if (tid == 0) {
        elapsed_time = get_seconds();
#ifdef VERIFYK4
        MAX_NUM_PHASES = 2*sqrt(n);
#else
        MAX_NUM_PHASES = 50;
#endif
    }

#ifdef _OPENMP
#pragma omp barrier    
#endif

    /* Initialize predecessor lists */
    
    /* The size of the predecessor list of each vertex is bounded by 
       its in-degree. So we first compute the in-degree of every
       vertex */ 

    if (tid == 0) {
        P   = (plist  *) calloc(n, sizeof(plist));
        in_degree = (LONG_T *) calloc(n+1, sizeof(LONG_T));
        numEdges = (LONG_T *) malloc((n+1)*sizeof(LONG_T));
        pSums = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
    }

#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
    for (i=0; i<m; i++) {
        v = G->endV[i];
#ifdef _OPENMP
        omp_set_lock(&vLock[v]);
#endif
        in_degree[v]++;
#ifdef _OPENMP
        omp_unset_lock(&vLock[v]);
#endif
    }

    /* Presumably fills numEdges with running offsets of in_degree, using
       pSums as per-thread scratch -- prefix_sums is defined elsewhere */
    prefix_sums(in_degree, numEdges, pSums, n);
    
    if (tid == 0) {
        pListMem = (LONG_T *) malloc(m*sizeof(LONG_T));
    }

    /* Each predecessor list is a slice of the single pListMem buffer */
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        P[i].list = pListMem + numEdges[i];
        P[i].degree = in_degree[i];
        P[i].count = 0;
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "In-degree computation time: %lf seconds\n", elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    /* Allocate shared memory */ 
    if (tid == 0) {
        free(in_degree);
        free(numEdges);
        free(pSums);
        
        S   = (VERT_T *) malloc(n*sizeof(VERT_T));
        sig = (DOUBLE_T *) malloc(n*sizeof(DOUBLE_T));
        d   = (LONG_T *) malloc(n*sizeof(LONG_T));
        del = (DOUBLE_T *) calloc(n, sizeof(DOUBLE_T));
        
        start = (LONG_T *) malloc(MAX_NUM_PHASES*sizeof(LONG_T));
        end = (LONG_T *) malloc(MAX_NUM_PHASES*sizeof(LONG_T));
        psCount = (LONG_T *) malloc((nthreads+1)*sizeof(LONG_T));
    }

    /* local memory for each thread */  
    myS_size = (2*n)/nthreads;
    if (myS_size < 1) {
        /* The resize below doubles the capacity, which never grows from 0 */
        myS_size = 1;
    }
    myS = (VERT_T *) malloc(myS_size*sizeof(VERT_T));
    num_traversals = 0;
    myCount = 0;

#ifdef _OPENMP    
#pragma omp barrier
#endif

    /* d[v] == -1 marks v as not yet visited */
#ifdef _OPENMP    
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        d[i] = -1;
    }
 
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "BC initialization time: %lf seconds\n", elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif
   
    /* One traversal per source vertex; every thread runs this loop and the
       threads meet at the barriers inside it */
    for (p=0; p<n; p++) {

        i = Srcs[p];
        /* Skip sources without outgoing edges */
        if (G->numEdges[i+1] - G->numEdges[i] == 0) {
            continue;
        } else {
            num_traversals++;
        }

        /* Stop once numV sources have been processed */
        if (num_traversals == numV + 1) {
            break;
        }
        
        if (tid == 0) {
            sig[i] = 1;
            d[i] = 0;
            S[0] = i;
            start[0] = 0;
            end[0] = 1;
        }
        
        count = 1;
        phase_num = 0;

#ifdef _OPENMP       
#pragma omp barrier
#endif
        
        /* Forward sweep: expand one BFS level per iteration until a level
           comes back empty */
        while (end[phase_num] - start[phase_num] > 0) {
            
            myCount = 0;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for schedule(dynamic)
#endif
            for (vert = start[phase_num]; vert < end[phase_num]; vert++) {
                v = S[vert];
                for (j=G->numEdges[v]; j<G->numEdges[v+1]; j++) {

                     if ((G->weight[j] & 7) == 0 && filter==1) continue; 

                        w = G->endV[j];
                        if (v != w) {

#ifdef _OPENMP                            
                            myLock = omp_test_lock(&vLock[w]);
                            if (myLock) { 
#endif             
                                /* w found for the first time? */ 
                                if (d[w] == -1) {
                                    if (myS_size == myCount) {
                                        /* Resize myS */
                                        myS_t = (VERT_T *)
                                            malloc(2*myS_size*sizeof(VERT_T));
                                        memcpy(myS_t, myS, myS_size*sizeof(VERT_T));
                                        free(myS);
                                        myS = myS_t;
                                        myS_size = 2*myS_size;
                                    }
                                    myS[myCount++] = w;
                                    d[w] = d[v] + 1;
                                    sig[w] = sig[v];
                                    P[w].list[P[w].count++] = v;
                                } else if (d[w] == d[v] + 1) {
                                    /* Another shortest path into w */
                                    sig[w] += sig[v];
                                    P[w].list[P[w].count++] = v;
                                }
#ifdef _OPENMP  
                            
                            omp_unset_lock(&vLock[w]);
                            } else {
                                /* Lock was busy: another thread is updating
                                   w. If w is unvisited or on the next level,
                                   wait for the lock and add this path too. */
                                if ((d[w] == -1) || (d[w] == d[v]+ 1)) {
                                    omp_set_lock(&vLock[w]);
                                    sig[w] += sig[v];
                                    P[w].list[P[w].count++] = v;
                                    omp_unset_lock(&vLock[w]);
                                }
                            }
#endif
                            
                        }
                }
             }
            /* Merge all local stacks for next iteration */
            phase_num++; 

            psCount[tid+1] = myCount;

#ifdef _OPENMP
#pragma omp barrier
#endif


            if (tid == 0) {
                /* start[] and end[] were sized for MAX_NUM_PHASES levels;
                   grow them when the BFS is deeper. Only thread 0 runs
                   between these two barriers, so replacing the shared
                   pointers here is safe. */
                if (phase_num >= MAX_NUM_PHASES) {
                    LONG_T newCap = 2*MAX_NUM_PHASES;
                    LONG_T *newStart, *newEnd;
                    if (newCap <= phase_num) {
                        newCap = phase_num + 1;
                    }
                    newStart = (LONG_T *) realloc(start, newCap*sizeof(LONG_T));
                    if (newStart == NULL) {
                        fprintf(stderr, "betweennessCentrality: out of memory\n");
                        exit(EXIT_FAILURE);
                    }
                    start = newStart;
                    newEnd = (LONG_T *) realloc(end, newCap*sizeof(LONG_T));
                    if (newEnd == NULL) {
                        fprintf(stderr, "betweennessCentrality: out of memory\n");
                        exit(EXIT_FAILURE);
                    }
                    end = newEnd;
                    MAX_NUM_PHASES = newCap;
                }

                /* Turn the per-thread counts into offsets into S */
                start[phase_num] = end[phase_num-1];
                psCount[0] = start[phase_num];
                for(k=1; k<=nthreads; k++) {
                    psCount[k] = psCount[k-1] + psCount[k];
                }
                end[phase_num] = psCount[nthreads];
            }
            
#ifdef _OPENMP           
#pragma omp barrier
#endif

            /* Copy this thread's newly found vertices into its slot of S */
            for (k = psCount[tid]; k < psCount[tid+1]; k++) {
                S[k] = myS[k-psCount[tid]];
            } 
            
#ifdef _OPENMP            
#pragma omp barrier
#endif
            count = end[phase_num];
        }
     
        /* The last phase was empty; step back to the deepest non-empty one */
        phase_num--;

#ifdef _OPENMP        
#pragma omp barrier
#endif

        /* Backward sweep: push dependencies from the deepest level towards
           the source; level 0 (the source itself) is not processed */
        while (phase_num > 0) {
#ifdef _OPENMP        
#pragma omp for
#endif
            for (j=start[phase_num]; j<end[phase_num]; j++) {
                w = S[j];
                for (k = 0; k<P[w].count; k++) {
                    v = P[w].list[k];
#ifdef _OPENMP
                    omp_set_lock(&vLock[v]);
#endif
                    del[v] = del[v] + sig[v]*(1+del[w])/sig[w];
#ifdef _OPENMP
                    omp_unset_lock(&vLock[v]);
#endif
                }
                BC[w] += del[w];
            }

            phase_num--;
            
#ifdef _OPENMP
#pragma omp barrier
#endif            
        }

        
        /* Reset the state of every vertex reached from this source */
#ifdef _OPENMP
#pragma omp for schedule(static, resetChunk)
#endif
        for (j=0; j<count; j++) {
            w = S[j];
            d[w] = -1;
            del[w] = 0;
            P[w].count = 0;
        }


#ifdef _OPENMP
#pragma omp barrier
#endif

    }
 
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "BC computation time: %lf seconds\n", elapsed_time_part);
    }
#endif

#ifdef _OPENMP
#pragma omp for
    for (i=0; i<n; i++) {
        omp_destroy_lock(&vLock[i]);
    }
#endif

    free(myS);
    
    /* Shared buffers are released once, by thread 0 */
    if (tid == 0) { 
        free(S);
        free(pListMem);
        free(P);
        free(sig);
        free(d);
        free(del);
#ifdef _OPENMP
        free(vLock);
#endif
        free(start);
        free(end);
        free(psCount);
        elapsed_time = get_seconds() - elapsed_time;
        free(Srcs);
    }

    free_sprng(stream);
#ifdef _OPENMP
}    
#endif
    /* Verification */
#ifdef VERIFYK4
    double BCval;
    if (SCALE % 2 == 0) {
        BCval = 0.5*pow(2, 3*SCALE/2)-pow(2, SCALE)+1.0;
    } else {
        BCval = 0.75*pow(2, (3*SCALE-1)/2)-pow(2, SCALE)+1.0;
    }
    int failed = 0;
    for (int i=0; i<G->n; i++) {
        if (round(BC[i] - BCval) != 0) {
            failed = 1;
            break;
        }
    }
    if (failed) {
        fprintf(stderr, "Kernel 4 failed validation!\n");
    } else {
        fprintf(stderr, "Kernel 4 validation successful!\n");
    }
#endif

    for (int i = 0; i < G->n; i++) printf ("BC: %d %f\n",i, BC[i]);
    return elapsed_time;
}