/*
 * Producer loop; never returns.
 *
 * Each pass acquires lock CS, recomputes the shared response `rsp` from the
 * current request type `pet`, logs the result, releases lock CD and then
 * sleeps for one second. Request types 0, 1 and 2 use the bases 0, 10 and 20:
 * rsp becomes (rsp % 10) + base + 1. Any other request type leaves rsp as is.
 */
void productor(){
    for (;;) {
        int base = -1;   /* -1 means "request type not handled" */

        omp_set_lock(&CS);

        if (pet == 0) {
            base = 0;
        } else if (pet == 1) {
            base = 10;
        } else if (pet == 2) {
            base = 20;
        }
        if (base >= 0) {
            rsp = (rsp % 10) + base + 1;
        }

        int id = omp_get_thread_num();
        printf("----Productor %d con petición: %d y respuesta: %d\n",id,pet,rsp);

        // omp_unset_lock(&CC); // for the case without a distributor
        omp_unset_lock(&CD);
        sleep(1);
    }
}
/*
 * Increments a shared counter n times from a parallel loop, guarding every
 * increment with an OpenMP lock, then prints the final counter value and the
 * elapsed wall-clock time (measured with omp_get_wtime, including lock
 * destruction).
 *
 * n: number of increments to perform.
 */
void foo_locks(long long int n)
{
    long long int a = 0;
    long long int i;
    omp_lock_t my_lock;

    // init lock
    omp_init_lock(&my_lock);

    double time = omp_get_wtime();

    #pragma omp parallel for schedule(static) shared(a)
    for (i = 0; i < n; i++)
    {
        omp_set_lock(&my_lock);
        a += 1;
        omp_unset_lock(&my_lock);
    }

    omp_destroy_lock(&my_lock);

    time = omp_get_wtime() - time;

    /* `a` is a long long, so it must be printed with %lld; passing it to %d
     * is a format/argument mismatch (undefined behaviour). */
    printf("Final value = %lld \n ", a);
    printf("Locks: Total time = %f seconds \n ", time);
} // end foo_locks
/*
 * Takes the next chunk out of the iterations still pending for thread_id and
 * updates remaining_iters[thread_id]. Performs no synchronisation itself: the
 * caller must already hold whatever protection it uses.
 *
 * Stores the pending count observed before the update in *pending_before and
 * returns the chunk size, ceil(pending * K) capped at pending.
 */
static int take_chunk_unprotected(int thread_id, double K, int* pending_before)
{
    const int pending = remaining_iters[thread_id];
    int grabbed = (int) ceil((double)pending * K);

    if (grabbed > pending)
        grabbed = pending;

    remaining_iters[thread_id] = pending - grabbed;
    *pending_before = pending;
    return grabbed;
}

/*
 * Hands out the next chunk of iterations for thread_id.
 *
 * The read-modify-write of remaining_iters[thread_id] is protected by the
 * per-thread lock remaining_iters_lock[thread_id] when USE_LOCKS is not FALSE,
 * and by the named critical section `chunk` otherwise.
 *
 * thread_id:  index into remaining_iters / remaining_iters_lock / hi
 * K:          fraction of the pending iterations to take
 * start_iter: receives hi[thread_id] minus the pending count before this call
 * chunk:      receives the number of iterations taken
 */
void get_chunks(int thread_id, double K, int* start_iter, int* chunk)
{
    int pending_before, grabbed;

    if (USE_LOCKS != FALSE)
    {
        omp_set_lock(&(remaining_iters_lock[thread_id]));
        grabbed = take_chunk_unprotected(thread_id, K, &pending_before);
        omp_unset_lock(&(remaining_iters_lock[thread_id]));
    }
    else
    {
        #pragma omp critical (chunk)
        {
            grabbed = take_chunk_unprotected(thread_id, K, &pending_before);
        }
    }

    *start_iter = hi[thread_id] - pending_before;
    *chunk = grabbed;
}
/*
 * Inserts the node obtained from ArrNode_getNode(an) into pList, keeping the
 * list ordered by ascending `data`. The whole operation runs under listLock.
 *
 * The new node goes in front of the first existing node whose data is not
 * smaller than its own, or at the tail if there is none. Only the links that
 * have a neighbour are written; the node's other link fields are left exactly
 * as ArrNode_getNode returned them.
 *
 * pList: list to insert into
 * x:     not used by this function
 * an:    source from which the node to insert is fetched
 */
void IntList_Insert(pIntList pList, int x, pArrNode an)
{
    pIntListNode before, after, node;

    omp_set_lock(&listLock);

    node = ArrNode_getNode(an);

    /* Walk to the insertion point: `before` precedes it, `after` follows it. */
    before = NULL;
    after = pList->head;
    while (after != NULL && after->data < node->data)
    {
        before = after;
        after = after->next;
    }

    /* Link on the predecessor side (or become the new head). */
    if (before == NULL)
    {
        pList->head = node;
    }
    else
    {
        before->next = node;
        node->prev = before;
    }

    /* Link on the successor side, if there is a successor. */
    if (after != NULL)
    {
        node->next = after;
        after->prev = node;
    }

    omp_unset_lock(&listLock);
}
/*
 * computeGraph: fills graph G from the edge tuples in SDGdata and returns the
 * elapsed time measured with get_seconds().
 *
 * Work done inside the (optionally OpenMP) parallel region, in order:
 *   1. Thread 0 allocates pos (m entries) and degree (n zeroed entries) and,
 *      with _OPENMP, one omp_lock_t per vertex; the locks are then initialised.
 *   2. For each edge i with start vertex u, pos[i] is set to the current
 *      degree[u] and degree[u] is incremented, under vLock[u] with _OPENMP.
 *   3. The locks are destroyed and freed; numEdges (n+1 entries) and pSums
 *      (nthreads entries) are allocated and prefix_sums(degree, numEdges,
 *      pSums, n) is called (presumably a prefix sum over degree - confirm).
 *   4. Each edge is stored at index numEdges[u] + pos[i] of endV; the stored
 *      weight is the constant 1 rather than SDGdata->weight[i] (see the TODO).
 *   5. Thread 0 frees pos and publishes n, m, numEdges, endV and w into G.
 * After the region, SDGdata->startVertex, endVertex and weight are freed.
 *
 * NOTE(review): chunkSize = n/nthreads is 0 when n < nthreads, yet it is used
 *   as the chunk size of schedule(static, chunkSize) - confirm n >= nthreads.
 * NOTE(review): the mallocs of numEdges, pSums, w and endV are not checked,
 *   unlike vLock, pos and degree which are asserted.
 * NOTE(review): line breaks appear to have been lost in this block, so the
 *   `//TODO:` comment runs into the statements after it; it is unclear whether
 *   the fprintf of SDGdata->weight[i] is meant to be active - confirm.
 */
double computeGraph(graph* G, graphSDG* SDGdata) { VERT_T* endV; LONG_T *degree, *numEdges, *pos, *pSums; WEIGHT_T* w; double elapsed_time; #ifdef _OPENMP omp_lock_t *vLock; LONG_T chunkSize; #endif elapsed_time = get_seconds(); #ifdef _OPENMP omp_set_num_threads(NUM_THREADS); #endif #ifdef _OPENMP #pragma omp parallel #endif { LONG_T i, j, u, n, m, tid, nthreads; #ifdef DIAGNOSTIC double elapsed_time_part; #endif #ifdef _OPENMP nthreads = omp_get_num_threads(); tid = omp_get_thread_num(); #else tid = 0; nthreads = 1; #endif n = N; m = M; if (tid == 0) { #ifdef _OPENMP vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t)); assert(vLock != NULL); chunkSize = n/nthreads; #endif pos = (LONG_T *) malloc(m*sizeof(LONG_T)); assert(pos != NULL); degree = (LONG_T *) calloc(n, sizeof(LONG_T)); assert(degree != NULL); } #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds(); } #endif #ifdef _OPENMP #pragma omp barrier #pragma omp for schedule(static, chunkSize) for (i=0; i<n; i++) { omp_init_lock(&vLock[i]); } #pragma omp barrier #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "Lock initialization time: %lf seconds\n", elapsed_time_part); elapsed_time_part = get_seconds(); } #endif #pragma omp for #endif for (i=0; i<m; i++) { u = SDGdata->startVertex[i]; #ifdef _OPENMP omp_set_lock(&vLock[u]); #endif pos[i] = degree[u]++; #ifdef _OPENMP omp_unset_lock(&vLock[u]); #endif } #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "Degree computation time: %lf seconds\n", elapsed_time_part); elapsed_time_part = get_seconds(); } #endif #ifdef _OPENMP #pragma omp barrier #pragma omp for schedule(static, chunkSize) for (i=0; i<n; i++) { omp_destroy_lock(&vLock[i]); } if (tid == 0) free(vLock); #endif #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "Lock destruction time: %lf seconds\n", elapsed_time_part); 
elapsed_time_part = get_seconds(); } #endif if (tid == 0) { numEdges = (LONG_T *) malloc((n+1)*sizeof(LONG_T)); pSums = (LONG_T *) malloc(nthreads*sizeof(LONG_T)); } #ifdef _OPENMP #pragma omp barrier #endif prefix_sums(degree, numEdges, pSums, n); #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "Prefix sums time: %lf seconds\n", elapsed_time_part); elapsed_time_part = get_seconds(); } #endif #ifdef _OPENMP #pragma omp barrier #endif if (tid == 0) { free(degree); free(pSums); w = (WEIGHT_T *) malloc(m*sizeof(WEIGHT_T)); endV = (VERT_T *) malloc(m* sizeof(VERT_T)); } #ifdef _OPENMP #pragma omp barrier #pragma omp for #endif for (i=0; i<m; i++) { u = SDGdata->startVertex[i]; j = numEdges[u] + pos[i]; endV[j] = SDGdata->endVertex[i]; //TODO: //w[j] = SDGdata->weight[i]; fprintf(stderr, "%d\n", SDGdata->weight[i]); w[j] = 1; } #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "Edge data structure construction time: %lf seconds\n", elapsed_time_part); elapsed_time_part = get_seconds(); } #endif if (tid == 0) { free(pos); G->n = n; G->m = m; G->numEdges = numEdges; G->endV = endV; G->weight = w; } #ifdef _OPENMP #endif } /* Verification */ #if 0 fprintf(stderr, "SDG data:\n"); for (int i=0; i<SDGdata->m; i++) { fprintf(stderr, "[%ld %ld %ld] ", SDGdata->startVertex[i], SDGdata->endVertex[i], SDGdata->weight[i]); } fprintf(stderr, "\n"); for (int i=0; i<G->n + 1; i++) { fprintf(stderr, "[%ld] ", G->numEdges[i]); } fprintf(stderr, "\nGraph:\n"); for (int i=0; i<G->n; i++) { for (int j=G->numEdges[i]; j<G->numEdges[i+1]; j++) { fprintf(stderr, "[%ld %ld %ld] ", i, G->endV[j], G->weight[j]); } } #endif free(SDGdata->startVertex); free(SDGdata->endVertex); free(SDGdata->weight); elapsed_time = get_seconds() - elapsed_time; return elapsed_time; }
/*
 * kdFoF: assigns a group id (p[].iGroup) to every active particle of kd by
 * linking particles whose squared separation is below fEps*fEps, and returns
 * kd->nGroup-1.
 *
 * All iGroup fields are first reset to 0. Each not-yet-grouped particle seeds
 * a new group and is pushed on a circular FIFO (Fifo, nFifo entries); every
 * particle popped from the FIFO triggers a walk over the kd-tree cells
 * (INTERSECT / LOWER / SETNEXT macros) that adds the matching particles to the
 * same group and to the FIFO.
 *
 * With _OPENMP the outer loop is an `omp for`: a seeded group gets the id
 * pn+1, p[].iTouched records the id of the thread that claimed a particle, and
 * a thread abandons its current group (goto RestartSnake) when it meets a
 * particle already touched by a thread with a smaller id. Updates to a
 * particle are made under locks[_hashLock(kd, index)]. In that build nGroup is
 * set to nActive+1; the count accumulated in pj just before is not used.
 *
 * NOTE(review): the labels ContainedCell and GetNextCell are not jumped to by
 *   any statement visible here - presumably targets inside INTERSECT; confirm.
 * NOTE(review): kd->nHash = nActive/100 is 0 when nActive < 100; check what
 *   _hashLock does in that case.
 * NOTE(review): the locks are initialised with omp_init_lock but only free()d;
 *   no omp_destroy_lock call is visible.
 * NOTE(review): nFifo = nActive/omp_get_num_threads(); no check that a single
 *   group fits in the FIFO before iTail wraps is visible - confirm.
 * NOTE(review): line breaks appear to have been lost in this block, so `//`
 *   comments run into the code that follows them.
 */
int kdFoF(KD kd,float fEps) { PARTICLE *p; KDN *c; int pi,pj,pn,cp; int iGroup; int *Fifo,iHead,iTail,nFifo; float fEps2; float dx,dy,dz,x,y,z,lx,ly,lz,sx,sy,sz,fDist2; #ifdef _OPENMP int idSelf; omp_lock_t *locks; for (pn=0;pn<kd->nActive;++pn) kd->p[pn].iTouched = -1; /* We really want to make an independent lock for each particle. However, each lock * seems to use a buttload of memory (something like 312 bytes per lock). Therefore, * to ensure that we don't use too much memory, only use 1 lock per 100 particles. * This should still provide very low lock contention while not using oodles of * memory at the same time, since it is extremely rare that two threads will be looking * two particles that map to the same lock at the same time.*/ kd->nHash = (int)(kd->nActive/100); locks = (omp_lock_t *)malloc(kd->nHash*sizeof(omp_lock_t)); assert(locks != NULL); for (pn=0;pn<kd->nHash;++pn) omp_init_lock(&locks[pn]); #endif p = kd->p; c = kd->kdNodes; lx = kd->fPeriod[0]; ly = kd->fPeriod[1]; lz = kd->fPeriod[2]; fEps2 = fEps*fEps; for (pn=0;pn<kd->nActive;++pn) p[pn].iGroup = 0; #pragma omp parallel default(none) shared(kd,locks,p,c,lx,ly,lz,fEps2) \ private(pi,pj,pn,cp,iGroup,Fifo,iHead,iTail,dx,dy,dz,x,y,z,sx,sy,sz,fDist2,idSelf,nFifo) { #ifdef _OPENMP nFifo = kd->nActive/omp_get_num_threads(); idSelf = omp_get_thread_num(); #else nFifo = kd->nActive; #endif Fifo = (int *)malloc(nFifo*sizeof(int)); assert(Fifo != NULL); iHead = 0; iTail = 0; iGroup = 0; #pragma omp for schedule(runtime) for (pn=0;pn<kd->nActive;++pn) { if (p[pn].iGroup) continue; /* ** Mark it and add to the do-fifo. 
*/ #ifdef _OPENMP omp_set_lock(&locks[_hashLock(kd,pn)]); if (p[pn].iTouched >= 0 && p[pn].iTouched < idSelf ) { assert(p[pn].iGroup > 0); omp_unset_lock(&locks[_hashLock(kd,pn)]); continue; } p[pn].iTouched = idSelf; iGroup = pn+1; p[pn].iGroup = iGroup; omp_unset_lock(&locks[_hashLock(kd,pn)]); #else ++iGroup; p[pn].iGroup = iGroup; #endif Fifo[iTail++] = pn; if (iTail == nFifo) iTail = 0; while (iHead != iTail) { pi = Fifo[iHead++]; if (iHead == nFifo) iHead = 0; /* ** Now do an fEps-Ball Gather! */ x = p[pi].r[0]; y = p[pi].r[1]; z = p[pi].r[2]; cp = ROOT; while (1) { INTERSECT(c,cp,fEps2,lx,ly,lz,x,y,z,sx,sy,sz); /* ** We have an intersection to test. */ if (c[cp].iDim >= 0) { cp = LOWER(cp); continue; } else { for (pj=c[cp].pLower;pj<=c[cp].pUpper;++pj) { #ifdef _OPENMP if (p[pj].iGroup == iGroup) { /* We have already looked at this particle */ //assert(p[pj].iTouched == idSelf); particle is not locked. continue; } if (p[pj].iTouched >= 0 && p[pj].iTouched < idSelf) { /* Somebody more important than us is already looking at this * particle. However, we do not yet know if this particle belongs * in our group, so just skip it to save time but don't restart the * entire group. */ // assert(p[pj].iGroup > 0); particle is not locked continue; } #else if (p[pj].iGroup) continue; #endif dx = sx - p[pj].r[0]; dy = sy - p[pj].r[1]; dz = sz - p[pj].r[2]; fDist2 = dx*dx + dy*dy + dz*dz; if (fDist2 < fEps2) { /* ** Mark it and add to the do-fifo. */ #ifdef _OPENMP omp_set_lock(&locks[_hashLock(kd,pj)]); if (p[pj].iTouched >= 0 && p[pj].iTouched < idSelf) { /* Now we know this particle should be in our group. If somebody more * important than us touched it, about the entire group. */ assert(p[pj].iGroup > 0); omp_unset_lock(&locks[_hashLock(kd,pj)]); iHead = iTail; /*printf("Thread %d: Aborting group %d. 
p[%d].iOrder p.iGroup=%d p.iTouched=%d (Per-Particle2)\n", idSelf, iGroup, pj, p[pj].iOrder, p[pj].iGroup, p[pj].iTouched);*/ goto RestartSnake; } p[pj].iTouched = idSelf; p[pj].iGroup = iGroup; omp_unset_lock(&locks[_hashLock(kd,pj)]); #else p[pj].iGroup = iGroup; #endif Fifo[iTail++] = pj; if (iTail == nFifo) iTail = 0; } } SETNEXT(cp); if (cp == ROOT) break; continue; } ContainedCell: for (pj=c[cp].pLower;pj<=c[cp].pUpper;++pj) { #ifdef _OPENMP if (p[pj].iGroup == iGroup) continue; if (p[pj].iTouched >= 0 && p[pj].iTouched < idSelf) { /* Somebody more important that us is already looking at this * group. Abort this entire group! */ //assert(p[pj].iGroup > 0); particle is not locked iHead = iTail; /*printf("Thread %d: Aborting group %d. p[%d].iOrder=%d p.iGroup=%d p.iTouched=%d (Per-Cell1)\n", idSelf, iGroup, pj, p[pj].iOrder, p[pj].iGroup, p[pj].iTouched);*/ goto RestartSnake; } #else if (p[pj].iGroup) continue; #endif /* ** Mark it and add to the do-fifo. */ #ifdef _OPENMP omp_set_lock(&locks[_hashLock(kd,pj)]); if (p[pj].iTouched >= 0 && p[pj].iTouched < idSelf) { /* Check again in case somebody touched it before the lock. */ assert(p[pj].iGroup > 0); omp_unset_lock(&locks[_hashLock(kd,pj)]); iHead = iTail; /*printf("Thread %d: Aborting group %d. p[%d].iGroup=%d p[%d].iTouched=%d (Per-Cell2)\n", idSelf, iGroup, pj, p[pj].iGroup, pj, p[pj].iTouched);*/ goto RestartSnake; } p[pj].iTouched = idSelf; p[pj].iGroup = iGroup; omp_unset_lock(&locks[_hashLock(kd,pj)]); #else p[pj].iGroup = iGroup; #endif Fifo[iTail++] = pj; if (iTail == nFifo) iTail = 0; } GetNextCell: SETNEXT(cp); if (cp == ROOT) break; } } /* End while(iHead != iTail) */ #ifdef _OPENMP RestartSnake: #endif assert(iHead == iTail); } free(Fifo); } /* End of the OpenMP PARALLEL section */ #ifdef _OPENMP /* Now we have count how many groups there are. This is straightforward, * since the number of groups is the number of particles whose groupID equals * their particleID+1. 
*/ pj = 0; for (pn=0;pn<kd->nActive;++pn) if (p[pn].iGroup == pn+1) ++pj; kd->nGroup = (kd->nActive)+1; free(locks); #else kd->nGroup = iGroup+1; #endif return(kd->nGroup-1); }
/*
 * vertex_betweenness_centrality_parBFS: accumulates betweenness-centrality
 * contributions into BC[] by running a level-by-level parallel BFS from up to
 * numSrcs source vertices of G (vertices without outgoing edges are skipped).
 *
 * Everything runs inside a single parallel region when _OPENMP is defined;
 * thread 0 allocates and frees the shared arrays, with barriers in between.
 *   - Setup: one lock per vertex; with RANDSRCS a shuffled source order Srcs;
 *     in-degrees of all vertices, passed to prefix_sums() to size the
 *     predecessor lists P[] carved out of pListMem.
 *   - Forward phase per source: S holds visited vertices in visit order and
 *     start[]/end[] delimit each BFS level. Each thread collects newly found
 *     vertices in its private stack myS (doubled when full); the stacks are
 *     merged into S through the running sums in psCount. d[] holds distances,
 *     sig[] path counts, and P[w] the predecessors of w.
 *   - Backward phase: levels are walked in reverse, adding
 *     sig[v]*(1+del[w])/sig[w] to del[v] under vLock[v] and del[w] to BC[w].
 *   - d, del and P[].count are reset for the vertices visited by this source.
 *
 * G:       input graph (n, m, numEdges, endV are read)
 * BC:      per-vertex output, accumulated with +=
 * numSrcs: number of traversals to perform
 *
 * NOTE(review): MAX_NUM_PHASES is fixed at 500; the run exits with -1 when a
 *   traversal needs more levels.
 * NOTE(review): none of the malloc/calloc results are checked.
 * NOTE(review): in the branch taken when omp_test_lock fails, d[w] is tested
 *   before the lock is acquired and a vertex with d[w] == -1 is not given a
 *   distance or pushed on myS there - confirm this is intended.
 * NOTE(review): chunkSize = n/nthreads is 0 when n < nthreads, yet it is used
 *   as the chunk size of schedule(static, chunkSize).
 */
void vertex_betweenness_centrality_parBFS(graph_t* G, double* BC, long numSrcs) { attr_id_t *S; /* stack of vertices in the order of non-decreasing distance from s. Also used to implicitly represent the BFS queue */ plist_t* P; /* predecessors of a vertex v on shortest paths from s */ double* sig; /* No. of shortest paths */ attr_id_t* d; /* Length of the shortest path between every pair */ double* del; /* dependency of vertices */ attr_id_t *in_degree, *numEdges, *pSums; attr_id_t* pListMem; #if RANDSRCS attr_id_t* Srcs; #endif attr_id_t *start, *end; long MAX_NUM_PHASES; attr_id_t *psCount; #ifdef _OPENMP omp_lock_t* vLock; long chunkSize; #endif #ifdef DIAGNOSTIC double elapsed_time; #endif int seed = 2387; #ifdef _OPENMP #pragma omp parallel firstprivate(G) { #endif attr_id_t *myS, *myS_t; attr_id_t myS_size; long i, j, k, p, count, myCount; long v, w, vert; long k0, k1; long numV, num_traversals, n, m, phase_num; long start_iter, end_iter; long tid, nthreads; int* stream; #ifdef DIAGNOSTIC double elapsed_time_part; #endif #ifdef _OPENMP int myLock; tid = omp_get_thread_num(); nthreads = omp_get_num_threads(); #else tid = 0; nthreads = 1; #endif #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time = get_seconds(); elapsed_time_part = get_seconds(); } #endif /* numV: no. 
of vertices to run BFS from = numSrcs */ numV = numSrcs; n = G->n; m = G->m; /* Permute vertices */ if (tid == 0) { #if RANDSRCS Srcs = (attr_id_t *) malloc(n*sizeof(attr_id_t)); #endif #ifdef _OPENMP vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t)); #endif } #ifdef _OPENMP #pragma omp barrier #pragma omp for for (i=0; i<n; i++) { omp_init_lock(&vLock[i]); } #endif /* Initialize RNG stream */ stream = init_sprng(0, tid, nthreads, seed, SPRNG_DEFAULT); #if RANDSRCS #ifdef _OPENMP #pragma omp for #endif for (i=0; i<n; i++) { Srcs[i] = i; } #ifdef _OPENMP #pragma omp for #endif for (i=0; i<n; i++) { j = n * sprng(stream); if (i != j) { #ifdef _OPENMP int l1 = omp_test_lock(&vLock[i]); if (l1) { int l2 = omp_test_lock(&vLock[j]); if (l2) { #endif k = Srcs[i]; Srcs[i] = Srcs[j]; Srcs[j] = k; #ifdef _OPENMP omp_unset_lock(&vLock[j]); } omp_unset_lock(&vLock[i]); } #endif } } #endif #ifdef _OPENMP #pragma omp barrier #endif if (tid == 0) { MAX_NUM_PHASES = 500; } #ifdef _OPENMP #pragma omp barrier #endif /* Initialize predecessor lists */ /* The size of the predecessor list of each vertex is bounded by its in-degree. 
So we first compute the in-degree of every vertex */ if (tid == 0) { P = (plist_t *) calloc(n, sizeof(plist_t)); in_degree = (attr_id_t *) calloc(n+1, sizeof(attr_id_t)); numEdges = (attr_id_t *) malloc((n+1)*sizeof(attr_id_t)); pSums = (attr_id_t *) malloc(nthreads*sizeof(attr_id_t)); } #ifdef _OPENMP #pragma omp barrier #pragma omp for #endif for (i=0; i<m; i++) { v = G->endV[i]; #ifdef _OPENMP omp_set_lock(&vLock[v]); #endif in_degree[v]++; #ifdef _OPENMP omp_unset_lock(&vLock[v]); #endif } prefix_sums(in_degree, numEdges, pSums, n); if (tid == 0) { pListMem = (attr_id_t *) malloc(m*sizeof(attr_id_t)); } #ifdef _OPENMP #pragma omp barrier #pragma omp for #endif for (i=0; i<n; i++) { P[i].list = pListMem + numEdges[i]; P[i].degree = in_degree[i]; P[i].count = 0; } #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() -elapsed_time_part; fprintf(stderr, "In-degree computation time: %lf seconds\n", elapsed_time_part); elapsed_time_part = get_seconds(); } #endif /* Allocate shared memory */ if (tid == 0) { free(in_degree); free(numEdges); free(pSums); S = (attr_id_t *) malloc(n*sizeof(attr_id_t)); sig = (double *) malloc(n*sizeof(double)); d = (attr_id_t *) malloc(n*sizeof(attr_id_t)); del = (double *) calloc(n, sizeof(double)); start = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t)); end = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t)); psCount = (attr_id_t *) malloc((nthreads+1)*sizeof(attr_id_t)); } /* local memory for each thread */ myS_size = (2*n)/nthreads; myS = (attr_id_t *) malloc(myS_size*sizeof(attr_id_t)); num_traversals = 0; myCount = 0; #ifdef _OPENMP #pragma omp barrier #endif #ifdef _OPENMP #pragma omp for #endif for (i=0; i<n; i++) { d[i] = -1; } #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "BC initialization time: %lf seconds\n", elapsed_time_part); elapsed_time_part = get_seconds(); } #endif for (p=0; p<n; p++) { #if RANDSRCS i = Srcs[p]; #else i = p; #endif 
if (G->numEdges[i+1] - G->numEdges[i] == 0) { continue; } else { num_traversals++; } if (num_traversals == numV + 1) { break; } if (tid == 0) { sig[i] = 1; d[i] = 0; S[0] = i; start[0] = 0; end[0] = 1; } count = 1; phase_num = 0; #ifdef _OPENMP #pragma omp barrier #endif while (end[phase_num] - start[phase_num] > 0) { myCount = 0; start_iter = start[phase_num]; end_iter = end[phase_num]; #ifdef _OPENMP #pragma omp barrier #pragma omp for schedule(dynamic) nowait #endif for (vert = start_iter; vert < end_iter; vert++) { v = S[vert]; for (j=G->numEdges[v]; j<G->numEdges[v+1]; j++) { w = G->endV[j]; if (v != w) { #ifdef _OPENMP myLock = omp_test_lock(&vLock[w]); if (myLock) { #endif /* w found for the first time? */ if (d[w] == -1) { if (myS_size == myCount) { /* Resize myS */ myS_t = (attr_id_t *) malloc(2*myS_size*sizeof(attr_id_t)); memcpy(myS_t, myS, myS_size*sizeof(attr_id_t)); free(myS); myS = myS_t; myS_size = 2*myS_size; } myS[myCount++] = w; d[w] = d[v] + 1; sig[w] = sig[v]; P[w].list[P[w].count++] = v; } else if (d[w] == d[v] + 1) { sig[w] += sig[v]; P[w].list[P[w].count++] = v; } #ifdef _OPENMP omp_unset_lock(&vLock[w]); } else { if ((d[w] == -1) || (d[w] == d[v]+ 1)) { omp_set_lock(&vLock[w]); sig[w] += sig[v]; P[w].list[P[w].count++] = v; omp_unset_lock(&vLock[w]); } } #endif } } } /* Merge all local stacks for next iteration */ phase_num++; if (tid == 0) { if (phase_num >= MAX_NUM_PHASES) { fprintf(stderr, "Error: Max num phases set to %ld\n", MAX_NUM_PHASES); fprintf(stderr, "Diameter of input network greater than" " this value. 
Increase MAX_NUM_PHASES" " in vertex_betweenness_centrality_parBFS()\n"); exit(-1); } } psCount[tid+1] = myCount; #ifdef _OPENMP #pragma omp barrier #endif if (tid == 0) { start[phase_num] = end[phase_num-1]; psCount[0] = start[phase_num]; for(k=1; k<=nthreads; k++) { psCount[k] = psCount[k-1] + psCount[k]; } end[phase_num] = psCount[nthreads]; } #ifdef _OPENMP #pragma omp barrier #endif k0 = psCount[tid]; k1 = psCount[tid+1]; for (k = k0; k < k1; k++) { S[k] = myS[k-k0]; } count = end[phase_num]; } phase_num--; while (phase_num > 0) { start_iter = start[phase_num]; end_iter = end[phase_num]; #ifdef _OPENMP #pragma omp for schedule(static) nowait #endif for (j=start_iter; j<end_iter; j++) { w = S[j]; for (k = 0; k<P[w].count; k++) { v = P[w].list[k]; #ifdef _OPENMP omp_set_lock(&vLock[v]); #endif del[v] = del[v] + sig[v]*(1+del[w])/sig[w]; #ifdef _OPENMP omp_unset_lock(&vLock[v]); #endif } BC[w] += del[w]; } phase_num--; #ifdef _OPENMP #pragma omp barrier #endif } #ifdef _OPENMP chunkSize = n/nthreads; #pragma omp for schedule(static, chunkSize) nowait #endif for (j=0; j<count; j++) { w = S[j]; d[w] = -1; del[w] = 0; P[w].count = 0; } #ifdef _OPENMP #pragma omp barrier #endif } #ifdef DIAGNOSTIC if (tid == 0) { elapsed_time_part = get_seconds() - elapsed_time_part; fprintf(stderr, "BC computation time: %lf seconds\n", elapsed_time_part); } #endif #ifdef _OPENMP #pragma omp barrier #endif #ifdef _OPENMP #pragma omp for for (i=0; i<n; i++) { omp_destroy_lock(&vLock[i]); } #endif free(myS); if (tid == 0) { free(S); free(pListMem); free(P); free(sig); free(d); free(del); #ifdef _OPENMP free(vLock); #endif free(start); free(end); free(psCount); #ifdef DIAGNOSTIC elapsed_time = get_seconds() - elapsed_time; fprintf(stderr, "Time taken: %lf\n seconds", elapsed_time); #endif #if RANDSRCS free(Srcs); #endif } free_sprng(stream); #ifdef _OPENMP } #endif }
/* Aborts the program when a runtime-library check does not hold. */
static void
check_or_abort (int ok)
{
  if (!ok)
    abort ();
}

/*
 * Exercises a series of OpenMP runtime library routines (simple and nestable
 * locks, dynamic/nested settings, thread counts, omp_in_parallel and the
 * timing routines) and aborts on the first unexpected result.
 * Returns 0 when every check passes.
 */
int
main (void)
{
  double t_begin, t_end, tick;
  int l;
  omp_lock_t lck;
  omp_nest_lock_t nlck;

  t_begin = omp_get_wtime ();

  /* Simple lock: omp_test_lock must fail while the lock is held and succeed
     once it has been released.  */
  omp_init_lock (&lck);
  omp_set_lock (&lck);
  check_or_abort (!omp_test_lock (&lck));
  omp_unset_lock (&lck);
  check_or_abort (omp_test_lock (&lck));
  check_or_abort (!omp_test_lock (&lck));
  omp_unset_lock (&lck);
  omp_destroy_lock (&lck);

  /* Nestable lock: omp_test_nest_lock reports the new nesting count.  */
  omp_init_nest_lock (&nlck);
  check_or_abort (omp_test_nest_lock (&nlck) == 1);
  omp_set_nest_lock (&nlck);
  check_or_abort (omp_test_nest_lock (&nlck) == 3);
  omp_unset_nest_lock (&nlck);
  omp_unset_nest_lock (&nlck);
  check_or_abort (omp_test_nest_lock (&nlck) == 2);
  omp_unset_nest_lock (&nlck);
  omp_unset_nest_lock (&nlck);
  omp_destroy_nest_lock (&nlck);

  /* Dynamic-adjustment and nesting flags must read back as written.  */
  omp_set_dynamic (1);
  check_or_abort (omp_get_dynamic ());
  omp_set_dynamic (0);
  check_or_abort (!omp_get_dynamic ());
  omp_set_nested (1);
  check_or_abort (omp_get_nested ());
  omp_set_nested (0);
  check_or_abort (!omp_get_nested ());

  /* Outside a parallel region the team has one thread, numbered 0, while the
     maximum follows the requested thread count.  */
  omp_set_num_threads (5);
  check_or_abort (omp_get_num_threads () == 1);
  check_or_abort (omp_get_max_threads () == 5);
  check_or_abort (omp_get_thread_num () == 0);
  omp_set_num_threads (3);
  check_or_abort (omp_get_num_threads () == 1);
  check_or_abort (omp_get_max_threads () == 3);
  check_or_abort (omp_get_thread_num () == 0);

  /* Inside a parallel region every thread flags any inconsistency it sees;
     the flags are OR-reduced into l.  */
  l = 0;
#pragma omp parallel reduction (|:l)
  {
    const int team_size = omp_get_num_threads ();
    const int self = omp_get_thread_num ();

    l = (team_size != 3);
    l |= (self < 0);
    l |= (self >= 3);
#pragma omp master
    l |= (self != 0);
  }
  check_or_abort (!l);

  check_or_abort (!(omp_get_num_procs () <= 0));
  check_or_abort (!omp_in_parallel ());

#pragma omp parallel reduction (|:l)
  l = ! omp_in_parallel ();
#pragma omp parallel reduction (|:l) if (1)
  l = ! omp_in_parallel ();
  check_or_abort (!l);

  /* Wall-clock time must not run backwards.  */
  t_end = omp_get_wtime ();
  check_or_abort (!(t_begin > t_end));

  /* Negative precision is definitely wrong,
     bigger than 1s clock resolution is also strange.  */
  tick = omp_get_wtick ();
  check_or_abort (!(tick <= 0 || tick > 1));

  return 0;
}
static void sort1 (int *array, int count) { omp_lock_t lock; struct int_pair_stack global_stack; int busy = 1; int num_threads; omp_init_lock (&lock); init_int_pair_stack (&global_stack); #pragma omp parallel firstprivate (array, count) { int lo = 0, hi = 0, mid, next_lo, next_hi; bool idle = true; struct int_pair_stack local_stack; init_int_pair_stack (&local_stack); if (omp_get_thread_num () == 0) { num_threads = omp_get_num_threads (); hi = count - 1; idle = false; } for (;;) { if (hi - lo < THRESHOLD) { insertsort (array, lo, hi); lo = hi; } if (lo >= hi) { if (size_int_pair_stack (&local_stack) == 0) { again: omp_set_lock (&lock); if (size_int_pair_stack (&global_stack) == 0) { if (!idle) busy--; if (busy == 0) { omp_unset_lock (&lock); break; } omp_unset_lock (&lock); idle = true; while (size_int_pair_stack (&global_stack) == 0 && busy) busy_wait (); goto again; } if (idle) busy++; pop_int_pair_stack (&global_stack, &lo, &hi); omp_unset_lock (&lock); idle = false; } else pop_int_pair_stack (&local_stack, &lo, &hi); } mid = partition (array, lo, hi); if (mid - lo < hi - mid) { next_lo = mid; next_hi = hi; hi = mid - 1; } else { next_lo = lo; next_hi = mid - 1; lo = mid; } if (next_hi - next_lo < THRESHOLD) insertsort (array, next_lo, next_hi); else { if (size_int_pair_stack (&global_stack) < num_threads - 1) { int size; omp_set_lock (&lock); size = size_int_pair_stack (&global_stack); if (size < num_threads - 1 && size < STACK_SIZE) push_int_pair_stack (&global_stack, next_lo, next_hi); else push_int_pair_stack (&local_stack, next_lo, next_hi); omp_unset_lock (&lock); } else push_int_pair_stack (&local_stack, next_lo, next_hi); } } } omp_destroy_lock (&lock); }
/*
 * RefCount kernel driver: measures the rate at which threads can update a pair
 * of shared counters under mutual exclusion.
 *
 * Command line: <# threads> <# counter pair updates> <# private stream size>.
 *
 * Flow:
 *   1. Parse and range-check the three arguments; set the thread count.
 *   2. Allocate counter_space (one page plus one double, halving the size on
 *      allocation failure) and point pcounter1/pcounter2 at its two ends;
 *      COUNTER1/COUNTER2 (defined elsewhere, presumably dereferencing those
 *      pointers - confirm) start at 1.0 and 0.0.
 *   3. In the parallel region each thread allocates private vectors a, b, c,
 *      performs one warm-up counter update plus private_stream(), then shares
 *      the timed `omp for` loop of updates. Updates are a rotation by 1 radian
 *      under counter_lock (DEPENDENT), atomic increments (ATOMIC), or
 *      increments under counter_lock (default).
 *   4. Each thread checks its private stream result; after the region the
 *      counters are compared with refcounter1/refcounter2 within epsilon and
 *      the rate is printed.
 *
 * The program always ends with exit(EXIT_SUCCESS) after the validation
 * message; argument errors exit with EXIT_FAILURE (or return 1 for usage).
 *
 * NOTE(review): "iterations must be >= 1 : %d" prints `iterations`, which is
 *   declared long - the conversion should be %ld.
 * NOTE(review): the non-ATOMIC banner prints "...updates using" followed by
 *   " using locks", i.e. the word "using" twice.
 * NOTE(review): a failed validation still falls through to
 *   exit(EXIT_SUCCESS) - confirm this is intended.
 */
int main(int argc, char ** argv) { long iterations; /* total number of reference pair counter updates */ long stream_size; /* length of stream triad creating private work */ int page_fit; /* indicates that counters fit on different pages */ size_t store_size; /* amount of space reserved for counters */ double *pcounter1, *pcounter2; /* pointers to counters */ double cosa, sina; /* cosine and sine of rotation angle */ double *counter_space; /* pointer to space reserved for counters */ double refcounter1, refcounter2; /* reference values for counters */ double epsilon=1.e-7; /* required accuracy */ omp_lock_t counter_lock; /* lock that guards access to counters */ double refcount_time; /* timing parameter */ int nthread_input; /* thread parameters */ int nthread; /********************************************************************* ** process and test input parameters *********************************************************************/ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("OpenMP exclusive access test RefCount, shared counters\n"); if (argc != 4){ printf("Usage: %s <# threads> <# counter pair updates> <# private stream size>\n", *argv); return(1); } nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } iterations = atol(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); exit(EXIT_FAILURE); } stream_size = atol(*++argv); if (stream_size < 0) { printf("ERROR: private stream size %ld must be non-negative\n", stream_size); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); /* initialize shared counters; we put them on different pages, if possible. 
If the page size equals the whole memory, this will fail, and we reduce the space required */ page_fit = 1; store_size = (size_t) getpagesize(); #ifdef VERBOSE printf("Page size = %d\n", getpagesize()); #endif counter_space = (double *) malloc(store_size+sizeof(double)); while (!counter_space && store_size>2*sizeof(double)) { page_fit=0; store_size/=2; counter_space = (double *) malloc(store_size+sizeof(double)); } if (!counter_space) { printf("ERROR: could not allocate space for counters\n"); exit(EXIT_FAILURE); } #ifdef VERBOSE if (!page_fit) printf("Counters do not fit on different pages\n"); else printf("Counters fit on different pages\n"); #endif pcounter1 = counter_space; pcounter2 = counter_space + store_size/sizeof(double); COUNTER1 = 1.0; COUNTER2 = 0.0; cosa = cos(1.0); sina = sin(1.0); /* initialize the lock on which we will be pounding */ omp_init_lock(&counter_lock); #pragma omp parallel { long iter, j; /* dummies */ double tmp1; /* local copy of previous value of COUNTER1 */ double *a, *b, *c;/* private vectors */ int num_error=0;/* errors in private stream execution */ double aj, bj, cj; long space; space = 3*sizeof(double)*stream_size; a = (double *) malloc(space); if (!a) { printf("ERROR: Could not allocate %ld words for private streams\n", space); exit(EXIT_FAILURE); } b = a + stream_size; c = b + stream_size; for (j=0; j<stream_size; j++) { a[j] = A0; b[j] = B0; c[j] = C0; } #pragma omp master { nthread = omp_get_num_threads(); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %d\n",nthread_input); printf("Number of counter pair updates = %ld\n", iterations); printf("Length of private stream = %ld\n", stream_size); #ifdef DEPENDENT printf("Dependent counter pair update\n"); #else printf("Independent counter pair updates using"); #ifdef ATOMIC printf(" atomic operations\n"); #else 
printf(" using locks\n"); #endif #endif } } bail_out(num_error); /* do one warmup iteration outside main loop to avoid overhead */ #ifdef DEPENDENT omp_set_lock(&counter_lock); tmp1 = COUNTER1; COUNTER1 = cosa*tmp1 - sina*COUNTER2; COUNTER2 = sina*tmp1 + cosa*COUNTER2; omp_unset_lock(&counter_lock); #else #ifndef ATOMIC omp_set_lock(&counter_lock); #else #pragma omp atomic #endif COUNTER1++; #ifdef ATOMIC #pragma omp atomic #endif COUNTER2++; #ifndef ATOMIC omp_unset_lock(&counter_lock); #endif #endif /* give each thread some (overlappable) work to do */ private_stream(a, b, c, stream_size); #pragma omp master { refcount_time = wtime(); } #pragma omp for /* start with iteration nthread to take into account pre-loop iter */ for (iter=nthread; iter<=iterations; iter++) { #ifdef DEPENDENT omp_set_lock(&counter_lock); tmp1 = COUNTER1; COUNTER1 = cosa*tmp1 - sina*COUNTER2; COUNTER2 = sina*tmp1 + cosa*COUNTER2; omp_unset_lock(&counter_lock); #else #ifndef ATOMIC omp_set_lock(&counter_lock); #else #pragma omp atomic #endif COUNTER1++; #ifdef ATOMIC #pragma omp atomic #endif COUNTER2++; #ifndef ATOMIC omp_unset_lock(&counter_lock); #endif #endif /* give each thread some (overlappable) work to do */ private_stream(a, b, c, stream_size); } #pragma omp master { refcount_time = wtime() - refcount_time; } /* check whether the private work has been done correctly */ aj = A0; bj = B0; cj = C0; #pragma omp for for (iter=0; iter<=iterations; iter++) { aj += bj + SCALAR*cj; } for (j=0; j<stream_size; j++) { num_error += MAX(ABS(a[j]-aj)>epsilon,num_error); } if (num_error>0) { printf("ERROR: Thread %d encountered errors in private work\n", omp_get_thread_num()); } bail_out(num_error); } /* end of OpenMP parallel region */ #ifdef DEPENDENT refcounter1 = cos(iterations+1); refcounter2 = sin(iterations+1); #else refcounter1 = (double)(iterations+2); refcounter2 = (double)(iterations+1); #endif if ((ABS(COUNTER1-refcounter1)>epsilon) || (ABS(COUNTER2-refcounter2)>epsilon)) { 
printf("ERROR: Incorrect or inconsistent counter values %13.10lf %13.10lf; ", COUNTER1, COUNTER2); printf("should be %13.10lf, %13.10lf\n", refcounter1, refcounter2); } else { #ifdef VERBOSE printf("Solution validates; Correct counter values %13.10lf %13.10lf\n", COUNTER1, COUNTER2); #else printf("Solution validates\n"); #endif printf("Rate (MCPUPs/s): %lf time (s): %lf\n", iterations/refcount_time*1.e-6, refcount_time); } exit(EXIT_SUCCESS); }
/* Releases the OpenMP lock m_lock by calling omp_unset_lock on it. */
void unlock() { omp_unset_lock( &m_lock ); }
/*
 * FLA_Gemm_nn_omp_var15: blocked, task-parallel C := alpha * A * B + C
 * (no transposes). Returns FLA_SUCCESS.
 *
 * A and C are traversed top to bottom in row panels of b_m rows (A1, C1).
 * For each panel, A1 is traversed left to right and B top to bottom in steps
 * of b_k (A11, B1). Every (A11, B1) pair becomes one task under the Intel
 * `taskq` pragmas; the task
 *   - creates a zeroed temporary C1_local conformal to C1,
 *   - computes C1_local = alpha * A11 * B1 with FLA_Gemm_external,
 *   - adds C1_local into C1 while holding fla_omp_lock[lock_i],
 *   - frees C1_local.
 * lock_i is FLA_Obj_length( CT ) / b_m.
 *
 * alpha: scalar multiplier
 * A, B:  input operands
 * C:     operand that is updated
 * cntl:  control structure supplying the block size
 *
 * NOTE(review): i, j and lock_ldim are only referenced from commented-out
 *   code and are otherwise unused.
 * NOTE(review): the in-code "FIX THIS" remark suggests the index formula does
 *   not account for a final panel smaller than the block size - confirm that
 *   lock_i stays a valid index into fla_omp_lock.
 * NOTE(review): line breaks appear to have been lost in this block, so `//`
 *   comments run into the code that follows them.
 */
FLA_Error FLA_Gemm_nn_omp_var15( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj CT, C0, CB, C1, C2; FLA_Obj AL, AR, A10, A11, A12; FLA_Obj BT, B0, BB, B1, B2; FLA_Obj C1_local; int i, j, lock_ldim, lock_i; int b_m, b_k; FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); FLA_Part_2x1( C, &CT, &CB, 0, FLA_TOP ); #pragma intel omp parallel taskq { while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ) { b_m = FLA_Determine_blocksize( A, AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b_m, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( CT, &C0, /* ** */ /* ** */ &C1, CB, &C2, b_m, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* C1 = alpha * A1 * B + C1; */ FLA_Part_1x2( A1, &AL, &AR, 0, FLA_LEFT ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ) { b_k = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); // Get the index of the current partition. // FIX THIS: need + b_m - 1 or something like this //j = FLA_Obj_length( CT ) / b_m; //i = FLA_Obj_width( AL ) / b_k; //lock_ldim = FLA_get_num_threads_in_m_dim(omp_get_num_threads()); lock_i = FLA_Obj_length( CT ) / b_m; FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A10, /**/ &A11, &A12, b_k, FLA_RIGHT ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b_k, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* C1 = alpha * A11 * B1 + C1; */ //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, //// alpha, A11, B1, FLA_ONE, C1 ); #pragma intel omp task captureprivate( lock_i, A11, B1, C1 ), private( C1_local ) { FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C1, &C1_local ); FLA_Obj_set_to_zero( C1_local ); /* C1_local = alpha * A1 * B11 + C1_local; */ FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A11, B1, FLA_ONE, C1_local ); // Acquire lock[i] (the lock for C1). 
omp_set_lock( &fla_omp_lock[lock_i] ); /* C1 += C1_local */ FLA_Axpy_external( FLA_ONE, C1_local, C1 ); //FLA_Axpy_sync_pipeline2( j*lock_ldim, FLA_ONE, C1_local, C1 ); //FLA_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 ); //REF_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 ); // Release lock[i] (the lock for C1). omp_unset_lock( &fla_omp_lock[lock_i] ); FLA_Obj_free( &C1_local ); } /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A10, A11, /**/ A12, FLA_LEFT ); FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &CT, C0, C1, /* ** */ /* ** */ &CB, C2, FLA_TOP ); } } return FLA_SUCCESS; }
void Roundworld::process() { if ( !pause ) { killHalf(); expireFood(); autoinsertFood(); expireCritters(); autoexchangeCritters(); autosaveCritters(); autoinsertCritters(); // adjust gravity vectors of all entities' rigid bodies unsigned int j, b; Food* f; CritterB* bod; btRigidBody* bo; for( j=0; j < entities.size(); j++) { if ( entities[j]->type == FOOD ) { // f = food[j]; Food* f = static_cast<Food*>( entities[j] ); for( b=0; b < f->body.bodyparts.size(); b++) { bo = f->body.bodyparts[b]->body; bo->setGravity( -(bo->getCenterOfMassPosition().normalized()*10) ); } } } for( j=0; j < critters.size(); j++) { bod = critters[j]; for( b=0; b < bod->body.bodyparts.size(); b++) { bo = bod->body.bodyparts[b]->body; bo->setGravity( -(bo->getCenterOfMassPosition().normalized()*10) ); } } if ( *critter_raycastvision == 0 ) { renderVision(); grabVision(); } // do a bullet step m_dynamicsWorld->stepSimulation(0.016667f, 0, 0.016667f); // m_dynamicsWorld->stepSimulation(Timer::Instance()->bullet_ms / 1000.f); int lmax = (int)critters.size(); CritterB *c; float freeEnergyc = 0.0f; // FIXME USE FROM WORLDB omp_set_num_threads( *threads ); #pragma omp parallel for ordered shared(freeEnergyc, lmax) private(c) // ordered for( int i=0; i < lmax; i++) { c = critters[i]; omp_set_lock(&my_lock1); checkCollisions( c ); omp_unset_lock(&my_lock1); // process c->process(); // record critter used energy freeEnergyc += c->energyUsed; // process Output Neurons eat(c); // procreation if procreation energy trigger is hit omp_set_lock(&my_lock1); procreate(c); omp_unset_lock(&my_lock1); } freeEnergy += freeEnergyc; getGeneralStats(); } }
void Slic::_GenerateSuperpixels() { const int MAX_ITER = 10; for(int I=0;I<MAX_ITER;++I) { clock_t t1 = clock(); std::cout<<"This is the "<<I<<"th circulation:"<<std::endl; omp_init_lock(&lock); #pragma omp parallel for for(int n=0;n<_N;++n) { for(int m=0;m<_M;++m) { // Init int nXOff = m*_regionSize; int nYOff = n*_regionSize; int nXSize = m==(_M-1)? _width - m*_regionSize : _regionSize; int nYSize = n==(_N-1)? _height - n*_regionSize : _regionSize; uchar *bufferSrc = new uchar[nXSize*nYSize*_dataSize*_bandCount]; int *bufferDst = new int[nXSize*nYSize]; omp_set_lock(&lock); _poSrcDS->RasterIO(GF_Read,nXOff,nYOff,nXSize,nYSize,bufferSrc,nXSize,nYSize,_dataType,_bandCount,0,0,0,0); _poDstDS->GetRasterBand(1)->RasterIO(GF_Read,nXOff,nYOff,nXSize,nYSize,bufferDst,nXSize,nYSize,GDT_Int32,0,0); omp_unset_lock(&lock); std::vector< int > candidateCenterID; for(int i=-1;i<2;++i) { for(int j=-1;j<2;++j) { if ((n+i)>=0 && (n+i)<_N && (m+j)>=0 && (m+j)<_M) candidateCenterID.push_back( (n+i) * _M + (m+j) ) ; } } // GetFeatureInfo FeatureVector featureVec(_bandCount + 2); for(int i=0,index=0;i<nYSize;++i) { for(int j=0;j<nXSize;++j,++index) { uchar* p = bufferSrc; for(int k=0;k<_bandCount;++k,p += nXSize*nYSize*_dataSize) { featureVec[k]= SRCVAL(p,_dataType,index)/_regularizer; } featureVec[_bandCount] = static_cast<double>(nXOff+j)/_regionSize; //x featureVec[_bandCount+1] = static_cast<double>(nYOff+i)/_regionSize; //y bufferDst[i*nXSize+j] = _GetNearestCenter(candidateCenterID,featureVec); } } omp_set_lock(&lock); _poDstDS->GetRasterBand(1)->RasterIO(GF_Write,nXOff,nYOff,nXSize,nYSize,bufferDst,nXSize,nYSize,GDT_Int32,0,0); omp_unset_lock(&lock); delete []bufferSrc; delete []bufferDst; } } omp_destroy_lock(&lock); _ComputeNewCenterVector(); std::cout<<"This circle cost: "<<static_cast<double>(clock()-t1)/CLOCKS_PER_SEC<<"s"<<std::endl; } }
/// Unsets the OpenMP lock `lock_` wrapped by this Mutex.
void Mutex::unlock()
{
	omp_unset_lock( &lock_ );
}
void PhotonMap::throughputByDensityEstimation(vec3f &color, Path &eyeMergePath, std::vector<LightPoint> &surfaceVertices, std::vector<LightPoint> &volumeVertices) { class Query{ PhotonMap *photonMap; vec3f contrib; vec3f position; vec3f hitNormal; float radius; int photonsNum; Ray outRay; float GaussianKernel(float mahalanobisDist) const{ double exponent = exp((double)-mahalanobisDist/2); //photonMap->fout << " Gaussian exp = " << exponent << std::endl; return exponent / (2*M_PI); } float Kernel(float distSqr, float radiusSqr) const{ float s = MAX(0, 1 - distSqr / radiusSqr); return 3 * s * s / M_PI; } public: Query(PhotonMap *map, float r, int n) : photonMap(map), radius(r), photonsNum(n) {} bool volumeMedia; void SetContrib(const vec3f &color) { contrib = color; } void SetPosition(const vec3f &pos) { position = pos; } void SetOutRay(const Ray &ray) { outRay = ray; } void SetNormal(const vec3f &n) { hitNormal = n; } vec3f GetContrib() const { return contrib; } vec3f GetPosition() const { return position; } void Process(const LightPoint &lightPoint){ if(volumeMedia && lightPoint.photonType != Ray::INVOL) return ; if(!volumeMedia && lightPoint.photonType != Ray::OUTVOL) return ; if(!lightPoint.pathThePointIn || lightPoint.indexInThePath < 0) return; Path &lightPath = *lightPoint.pathThePointIn; int index = lightPoint.indexInThePath; /* if (volumeMedia && lightPath[index].insideObject != outRay.insideObject) { printf("aye\n"); return; } if (!volumeMedia && lightPath[index].contactObject != outRay.contactObject) { printf("aye\n"); return; } */ vec3f photonThroughput(1,1,1); for(int i = 0; i < index; i++){ photonThroughput *= lightPath[i].color / lightPath[i].directionProb / lightPath[i].originProb; photonThroughput *= lightPath[i].getCosineTerm(); float dist = (lightPath[i].origin-lightPath[i+1].origin).length(); photonThroughput *= lightPath[i].getRadianceDecay(dist); } photonThroughput /= lightPath[index].originProb; // runs here, photon's f/p is done. 
Ray photonRay = lightPath[index]; photonRay.direction = lightPath[index-1].direction; vec3f color = photonThroughput * photonRay.getBSDF(outRay); float distSqr = powf((outRay.origin-lightPath[index].origin).length(), 2); if(intensity(color) < 1e-6f) return ; float kernel = Kernel(distSqr, radius*radius); float normalization = volumeMedia ? kernel/(photonsNum*radius*radius*radius) : kernel/(photonsNum*radius*radius); //float normalization = volumeMedia==false ? 1.0 / (photonsNum*PI*radius*radius) : 1.0 / (photonsNum*PI*4.0/3*radius*radius*radius); contrib += color * normalization; } double sumWeight; int photonsCount; void weightScale(){ contrib /= ( sumWeight / photonsCount ); } }; Query query(this, mRadius, mPhotonsNum); vec3f Tr(1,1,1), SurfaceColor(0,0,0), VolumeColor(0,0,0); int mergeIndex = 1; for(int i = 1; i < eyeMergePath.size(); i++){ float dist = MAX((eyeMergePath[i-1].origin-eyeMergePath[i].origin).length(), EPSILON); if(eyeMergePath[i-1].insideObject && eyeMergePath[i-1].insideObject->isVolumetric()){ if(eyeMergePath[i-1].insideObject->isHomogeneous()) { // ray marching volume radiance Ray volThroughRay = eyeMergePath[i-1]; SceneVPMObject *volume = static_cast<SceneVPMObject*>(volThroughRay.insideObject); float stepSize = volume->stepSize; int N = dist / stepSize; if(N == 0) N++; float step = dist / N; float offset = step * RandGenerator::genFloat(); float t = offset; Tr *= volume->getRadianceDecay(volThroughRay, offset); for(int j = 0; j < N; j++, t+=step){ query.SetContrib(vec3f(0,0,0)); query.SetPosition(volThroughRay.origin + volThroughRay.direction*t); Ray outRay = volThroughRay; outRay.direction = -volThroughRay.direction; outRay.origin = volThroughRay.origin + volThroughRay.direction*t; outRay.contactObject = NULL; query.SetOutRay(outRay); query.volumeMedia = true; volumeHashGrid.Process(volumeVertices, query); Tr *= volume->getRadianceDecay(outRay, step); vec3f volColor = query.GetContrib(); VolumeColor += volColor * Tr * step; } } else{ // ray 
marching volume radiance Ray volThroughRay = eyeMergePath[i-1]; HeterogeneousVolume *volume = static_cast<HeterogeneousVolume*>(volThroughRay.insideObject); float stepSize = volume->getStepSize(); int N = dist / stepSize; if(N == 0) N++; float step = dist / N; float offset = step * RandGenerator::genFloat(); float t = offset; Tr *= volume->getRadianceDecay(volThroughRay, offset); for(int j = 0; j < N; j++, t+=step){ query.SetContrib(vec3f(0,0,0)); query.SetPosition(volThroughRay.origin + volThroughRay.direction*t); Ray outRay = volThroughRay; outRay.direction = -volThroughRay.direction; outRay.origin = volThroughRay.origin + volThroughRay.direction*t; outRay.contactObject = NULL; query.SetOutRay(outRay); query.volumeMedia = true; volumeHashGrid.Process(volumeVertices, query); Tr *= volume->getRadianceDecay(outRay, step); vec3f volColor = query.GetContrib(); VolumeColor += volColor * Tr * step; } } } else { if (eyeMergePath[i - 1].insideObject) Tr *= eyeMergePath[i - 1].getRadianceDecay(dist); } if(eyeMergePath[i].contactObject && eyeMergePath[i].contactObject->emissive()){ // eye path hit light, surface color equals to light radiance SurfaceColor = eyeMergePath[i].color; mergeIndex = i; break; } if(eyeMergePath[i].contactObject && eyeMergePath[i].directionSampleType == Ray::RANDOM){ // non-specular photon density estimation if(eyeMergePath[i].contactObject->isVolumetric()) continue; query.SetContrib(vec3f(0,0,0)); query.SetPosition(eyeMergePath[i].origin); Ray outRay = eyeMergePath[i]; outRay.direction = -eyeMergePath[i-1].direction; query.SetOutRay(outRay); query.volumeMedia = false; Ray fromRay = eyeMergePath[i-1]; omp_set_lock(&surfaceHashGridLock); surfaceHashGrid.Process(surfaceVertices, query); omp_unset_lock(&surfaceHashGridLock); SurfaceColor = query.GetContrib(); mergeIndex = i; break; } } color = Tr * SurfaceColor + VolumeColor; if (rayMarching) { for(int i = 0; i < 1/*eyeMergePath.size()-1*/; i++){ color *= eyeMergePath[i].getCosineTerm() * 
eyeMergePath[i].color / eyeMergePath[i].directionProb / eyeMergePath[i].originProb; } } else { for(int i = 0; i < mergeIndex; i++){ color *= eyeMergePath[i].getCosineTerm() * eyeMergePath[i].color / eyeMergePath[i].directionProb / eyeMergePath[i].originProb; if (i + 1 < mergeIndex) { float dist = (eyeMergePath[i].origin - eyeMergePath[i+1].origin).length(); color *= eyeMergePath[i].getRadianceDecay(dist); } } } }
/*
 * Randomised, multi-threaded search for a triangulation of the cube described
 * by `data`.
 *
 * Every OpenMP thread repeatedly starts from triangulation_start_facet(data)
 * and grows a triangulation by picking a random boundary triangle, listing
 * the conform tetrahedra on it, dropping those that intersect the current
 * triangulation and adding one of the remaining ones.  Which one is added
 * depends on (thread number % 6): max volume, min volume, random, or a random
 * mix of these.  An attempt ends when the boundary is empty (success) or when
 * no tetrahedron can be added (restart).
 *
 * The function returns the first complete triangulation that is published;
 * it does not return until some thread has found one.
 *
 * Changes compared with the previous version:
 *   - the `triangulation_found` flag is read under `result_lock`.  It used to
 *     be polled without any synchronisation, so a thread was not guaranteed
 *     to ever observe the update;
 *   - if several threads finish at the same time only the first one
 *     publishes its triangulation, the others free theirs (previously the
 *     earlier result was overwritten and leaked).
 *
 * NOTE(review): rand() is called concurrently from all threads; the C
 * standard does not require it to be thread-safe - confirm for the target
 * libc.
 */
triangulation triangulate_cube_random(data_list * data)
{
  int dim = data_list_dim(data);
  cube_points cube = gen_cube_points(dim);

  triangulation result;
  omp_lock_t result_lock;           //Guards result and triangulation_found

  facet_acute_data parameters;      //Parameters for facet_conform
  triangulation tmp_triang;         //Triangulation being expanded by the current thread
  ptetra tet_list;                  //Candidate tetrahedra, used in the parallel section
  unsigned short tet_list_len;      //Number of valid entries in tet_list
  int triangulation_found = 0;      //Set once a thread has published a triangulation
  int rand_bound, i;
  unsigned short tet_max, tet_min, tet_rand, tet_add;
  size_t max_volume;

  omp_init_lock(&result_lock);

  #pragma omp parallel default(none) \
    private(parameters, tmp_triang, tet_list, tet_list_len, rand_bound, i, max_volume, tet_max, tet_min, tet_rand, tet_add) \
    shared(result, result_lock, cube, data, dim, triangulation_found)
  {
    //Per-thread initialisation
    parameters.cube = &cube;
    parameters.boundary_func = &triangle_boundary_cube;
    parameters.data = data;
    parameters.store_acute_ind = 1;
    parameters.acute_ind = malloc(sizeof(vert_index) * cube.len);
    tet_list = malloc(sizeof(tetra) * cube.len);
    max_volume = 0;

    for (;;) {
      int stop;

      //Read the shared flag under the lock so the update is visible.
      omp_set_lock(&result_lock);
      stop = triangulation_found;
      omp_unset_lock(&result_lock);
      if (stop)
        break;

      //Fresh attempt, starting from a single boundary triangle
      tmp_triang = triangulation_init(dim);
      tet_list_len = 0;
      tmp_triang.bound_len = 1;
      tmp_triang.bound_tri = triangulation_start_facet(data);

      //While we have triangles on the boundary
      while (tmp_triang.bound_len > 0) {
        /*
         * Add a tetrahedron on a random boundary triangle: generate all
         * conform tetrahedra on that triangle, remove those intersecting the
         * current triangulation, then add one of the survivors.
         */
        rand_bound = rand() % tmp_triang.bound_len;

        //Calculate the conform tetrahedrons above and below
        if (!facet_conform(tmp_triang.bound_tri + rand_bound, &parameters))
          break; //Boundary triangle without a conform facet: restart

        tet_list_len = parameters.acute_ind_len;

        //Form the explicit list: the boundary triangle plus one cube point each
        for (i = 0; i < tet_list_len; i++) {
          copyArr3(tet_list[i].vertices[0], tmp_triang.bound_tri[rand_bound].vertices[0]);
          copyArr3(tet_list[i].vertices[1], tmp_triang.bound_tri[rand_bound].vertices[1]);
          copyArr3(tet_list[i].vertices[2], tmp_triang.bound_tri[rand_bound].vertices[2]);
          copyArr3(tet_list[i].vertices[3], cube.points[parameters.acute_ind[i]]);
        }

        //Remove all the tetrahedrons that intersect with the current triangulation
        filter_tet_list_disjoint_triangulation(tet_list, &tet_list_len, &tmp_triang);

        if (tet_list_len == 0)
          break; //No conform tetrahedron fits on this boundary: restart

        //Indices of the max-volume, min-volume and a random candidate
        tet_list_min_max_volume(tet_list, tet_list_len, &tet_max, &tet_min);
        tet_rand = rand() % tet_list_len;

        //Selection strategy depends on the thread number
        switch (omp_get_thread_num() % 6) {
          case 0: //Max volume
            tet_add = tet_max;
            break;
          case 1: //Min volume
            tet_add = tet_min;
            break;
          case 2: //Random
            tet_add = tet_rand;
            break;
          case 3: //Min or max, evenly
            tet_add = (rand() % 2)? tet_min : tet_max;
            break;
          case 4: //Max in 4 of 5 cases, otherwise random
            tet_add = (rand() % 5)? tet_max : tet_rand;
            break;
          case 5: //Min in 4 of 5 cases, otherwise random
            tet_add = (rand() % 5)? tet_min : tet_rand;
            break;
          default:
            tet_add = 0;
        }

        /*
         * Add the chosen tetrahedron. This removes all the boundary triangles
         * that are covered by it.
         */
        add_tet_triangulation(tet_list + tet_add, &tmp_triang);
      }

      //Report a new per-thread volume record
      if (triangulation_volume(&tmp_triang) > max_volume) {
        max_volume = triangulation_volume(&tmp_triang);
        printf("Record for thread %d using method %d amount: %zu\n",
               omp_get_thread_num(), omp_get_thread_num() % 6, max_volume);
        triangulation_print(&tmp_triang);
      }

      if (tmp_triang.bound_len == 0) {
        int published = 0;

        printf("FOUND A TRIANGULATION!!!\n");
        triangulation_print(&tmp_triang);

        //Only the first finisher hands its triangulation to the caller
        omp_set_lock(&result_lock);
        if (!triangulation_found) {
          result = tmp_triang;
          triangulation_found = 1;
          published = 1;
        }
        omp_unset_lock(&result_lock);

        if (!published)
          triangulation_free(&tmp_triang);
      } else {
        triangulation_free(&tmp_triang);
      }
    }

    free(parameters.acute_ind);
    free(tet_list);
  }

  free(cube.points);
  omp_destroy_lock(&result_lock);
  return result;
}
int main(int argc, char **argv){ FILE *file = fopen("file1","r"); FILE *out = NULL; char str_buf[1024][50]; unsigned str_buf_in = 0; unsigned str_buf_out = 0; char str[50]; int read_finish = 0; int num_read = 0, num_write = 0; char **input_filenames = NULL; int input_len; //num of input files FILE **input_files = NULL; int i,j; double elapsed_time; int mapping_done = 0;//done when all mapper thread done struct timeval tvalBefore, tvalAfter; ////locks/// int rank, size, len; char name[MPI_MAX_PROCESSOR_NAME]; omp_set_num_threads(4); MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Get_processor_name(name, &len); MPI_Status status; omp_init_lock(&worklock); omp_init_lock(&inclock); omp_init_lock(&readlock); omp_init_lock(&readerlock); omp_init_lock(&mapperlock); if(argc < 5){ printf("Usage ./mapreduce -in [input files].... -out [output file]\n"); return 0; }else{ if(strcmp("-in",argv[1])){ printf("Usage ./mapreduce -in [input files].... 
-out [output file]\n"); return 0; } for(i=2;i<argc;i++){ //start from first input file if(!strcmp("-out",argv[i])){ break; } } input_len = i - 2; input_filenames = (char**)malloc(sizeof(char*)*input_len); for(j=0;j<input_len;j++) input_filenames[j] = (char*)malloc(sizeof(char)*50); for(i=2,j=0;j<input_len;i++,j++){ strcpy(input_filenames[j],argv[i]); } input_files = read_in(input_filenames,input_len,0); if(strcmp("-out",argv[2+input_len])){ printf("output file missing, using default name 'out'\n"); out = fopen("out","w"); }else{ out = fopen(argv[3+input_len],"w"); } } omp_set_num_threads(8); fifoQ *queue_to_map = initQ(1000000, "queue_to_map"); fifoQ *queue_to_reduce = initQ(1000000, "queue_to_map"); fifoQ **queues_to_map = (fifoQ**)malloc(sizeof(fifoQ*)*5); queues_to_map[0] = initQ(1000000, "queue_to_map0"); queues_to_map[1] = initQ(1000000, "queue_to_map1"); queues_to_map[2] = initQ(1000000, "queue_to_map2"); queues_to_map[3] = initQ(1000000, "queue_to_map3"); queues_to_map[4] = initQ(1000000, "queue_to_map4"); fifoQ **queues_to_reduce = (fifoQ**)malloc(sizeof(fifoQ*)*5); queues_to_reduce[0] = initQ(1000000, "queue_to_reduce0"); queues_to_reduce[1] = initQ(1000000, "queue_to_reduce1"); queues_to_reduce[2] = initQ(1000000, "queue_to_reduce2"); queues_to_reduce[3] = initQ(1000000, "queue_to_reduce3"); queues_to_reduce[4] = initQ(1000000, "queue_to_reduce4"); fifoQ **queues_reduced = (fifoQ**)malloc(sizeof(fifoQ*)*5); fifoQ *final_queue = initQ(1000000, "final Q"); int sendsize = input_len/size + (input_len % size - rank > 0 ? 1 : 0); //num of files send to a node if(rank==0){ //distribute files int i,j; char ***files_tosend = (char***)malloc(sizeof(char**)*input_len); int lsendsize; FILE **node_files; for(i=0;i<size;i++){ lsendsize = input_len/size + (input_len % size - i > 0 ? 
1 : 0); //num of files send to a node printf("send size of core %d is %d\n",i,lsendsize); files_tosend[i] = (char**)malloc(sizeof(char*)*lsendsize); for(j=0;j<lsendsize;j++){ files_tosend[i][j] = (char*)malloc(sizeof(char)*50); } } for(i=0;i<input_len;i++){ int belongs_to = i % size; int pos = i/size; strcpy(files_tosend[belongs_to][pos],input_filenames[i]); printf("distributing file %s to files_tosend %d,%d, value %s\n",input_filenames[i],belongs_to,pos,files_tosend[belongs_to][pos]); } if(size>1){ for(i=1;i<size;i++){ lsendsize = input_len/size + (input_len % size - i > 0 ? 1 : 0); for(j=0;j<lsendsize;j++){ printf("sending %s to cpu %d\n",files_tosend[i][j],i); MPI_Send(files_tosend[i][j],50,MPI_BYTE,i,1,MPI_COMM_WORLD); printf("send done\n"); } } } node_files = (FILE**)malloc(sizeof(FILE*)*sendsize); for(i=0;i<sendsize;i++){ node_files[i] = fopen(files_tosend[rank][i],"r"); } gettimeofday (&tvalBefore, NULL); #pragma omp parallel sections { #pragma omp section //reader thread0 { int i; int odd_even = 0; //printf("reader 0 is core #%d\n",rank); for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[0], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread0 done\n"); } #pragma omp section //reader thread1 { int i; int odd_even = 0; //printf("reader 1 is core #%d\n",rank); for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[1], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); 
//printf("reader thread1 done\n"); } #pragma omp section //reader thread2 { int i; int odd_even = 0; //printf("reader 2 is core #%d\n",rank); for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[2], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread2 done\n"); } #pragma omp section //reader thread3 { // printf("reader 3 is core #%d\n",rank); int i; int odd_even = 0; for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[3], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread3 done %d\n",rank); } #pragma omp section //mapper thread 0 { int i; fifoQ *innerQ = initQ(50000,"innerQ 0"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[0])){ printf(""); if(!is_empty(queues_to_map[0])){ work work = getWork(queues_to_map[0]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread0 done %d\n",rank); gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d in map 0\n",elapsed_time,rank); } #pragma omp section //mapper thread 1 { int i; fifoQ *innerQ = initQ(50000,"innerQ 1"); 
while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[1])){ printf(""); if(!is_empty(queues_to_map[1])){ work work = getWork(queues_to_map[1]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread1 done %d\n",rank); gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d in map 1\n",elapsed_time,rank); } #pragma omp section //mapper thread 2 { int i; fifoQ *innerQ = initQ(50000,"innerQ 2"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[2])){ printf(""); if(!is_empty(queues_to_map[2])){ work work = getWork(queues_to_map[2]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread2 done %d\n",rank); gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d in map 2\n",elapsed_time,rank); } #pragma omp section //mapper thread 3 { int i; fifoQ *innerQ = initQ(50000,"innerQ 2"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[3])){ printf(""); if(!is_empty(queues_to_map[3])){ work work = getWork(queues_to_map[3]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread3 done %d\n",rank); gettimeofday 
(&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d in map 3\n",elapsed_time,rank); } #pragma omp section //reducer thread 0 { int i; gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d\n",elapsed_time,rank); while(mapping_done<NUM_READ_THREADS){ printf(""); } gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d\n",elapsed_time,rank); queues_reduced[0] = reducer(queues_to_reduce[0]); //printf("reducer thread 0 done\n"); gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d\n",elapsed_time,rank); } #pragma omp section //reducer thread 1 { int i; while(mapping_done<NUM_READ_THREADS){printf("");} queues_reduced[1] = reducer(queues_to_reduce[1]); //printf("reducer thread 1 done\n"); } #pragma omp section //reducer thread 2 { int i; while(mapping_done<NUM_READ_THREADS){printf("");} queues_reduced[2] = reducer(queues_to_reduce[2]); //printf("reducer thread 2 done\n"); } #pragma omp section //reducer thread 3 { int i; while(mapping_done<NUM_READ_THREADS){printf("");} queues_reduced[3] = reducer(queues_to_reduce[3]); //printf("reducer thread 3 done\n"); } } gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d\n",elapsed_time,rank); } else{ int i; FILE** node_files = (FILE**)malloc(sizeof(FILE*)*sendsize); for(i=0;i<sendsize;i++){ char 
*bufstr = (char*)malloc(sizeof(char)*50); MPI_Recv(bufstr,50,MPI_BYTE, 0,1, MPI_COMM_WORLD, &status); //printf("%s received\n",bufstr); node_files[i] = fopen(bufstr,"r"); } #pragma omp parallel sections shared(input_files) private(str) { //printf("using %d threads in core %d\n",omp_get_num_threads(),rank); #pragma omp section //reader thread0 { int i; int odd_even = 0; // printf("reader 0 is core #%d\n",rank); for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[0], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread0 done\n"); } #pragma omp section //reader thread1 { int i; int odd_even = 0; // printf("reader 1 is core #%d\n",rank); for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[1], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread1 done\n"); } #pragma omp section //reader thread2 { int i; int odd_even = 0; //printf("reader 2 is core #%d\n",rank); for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[2], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread2 done\n"); } #pragma omp section //reader thread3 { //printf("reader 3 is core 
#%d\n",rank); int i; int odd_even = 0; for(i=0;i<sendsize;i++){ while(!feof(node_files[i])){ /////////check if full/////////// omp_set_lock(&readerlock); if(!feof(node_files[i])){ strcpy(str,""); fscanf(node_files[i],"%s",str); } else{ omp_unset_lock(&readerlock); break; } omp_unset_lock(&readerlock); if(strcmp(str,"")) putWork(queues_to_map[3], constr_work(str)); } } omp_set_lock(&inclock); read_finish++; omp_unset_lock(&inclock); //printf("reader thread3 done %d\n",rank); } #pragma omp section //mapper thread 0 { int i; fifoQ *innerQ = initQ(50000,"innerQ 0"); //printf("map1\n"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[0])){ printf(""); if(!is_empty(queues_to_map[0])){ work work = getWork(queues_to_map[0]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread0 done %d\n",rank); } #pragma omp section //mapper thread 1 { int i; fifoQ *innerQ = initQ(50000,"innerQ 1"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[1])){ printf(""); if(!is_empty(queues_to_map[1])){ work work = getWork(queues_to_map[1]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread1 done %d\n",rank); } #pragma omp section //mapper thread 2 { int i; fifoQ *innerQ = initQ(50000,"innerQ 2"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[2])){ printf(""); if(!is_empty(queues_to_map[2])){ work work = getWork(queues_to_map[2]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } 
omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread2 done %d\n",rank); } #pragma omp section //mapper thread 3 { int i; fifoQ *innerQ = initQ(50000,"innerQ 2"); while(read_finish<NUM_READ_THREADS || !is_empty(queues_to_map[3])){ printf(""); if(!is_empty(queues_to_map[3])){ work work = getWork(queues_to_map[3]); //mapper(queues_to_reduce[hash(work.str)], work); mapper(innerQ, work); } } for(i=0;i<=innerQ->in;i++){ work work = getWork(innerQ); putWork(queues_to_reduce[hash(work.str)],work); } omp_set_lock(&inclock); mapping_done++; omp_unset_lock(&inclock); //printf("mapper thread3 done %d\n",rank); } #pragma omp section //reducer thread 0 { int i; while(mapping_done<NUM_READ_THREADS){ printf(""); } queues_reduced[0] = reducer(queues_to_reduce[0]); //printf("reducer thread 0 done\n"); } #pragma omp section //reducer thread 1 { int i; while(mapping_done<NUM_READ_THREADS){printf("");} queues_reduced[1] = reducer(queues_to_reduce[1]); //printf("reducer thread 1 done\n"); } #pragma omp section //reducer thread 2 { int i; while(mapping_done<NUM_READ_THREADS){printf("");} queues_reduced[2] = reducer(queues_to_reduce[2]); //printf("reducer thread 2 done\n"); } #pragma omp section //reducer thread 3 { int i; while(mapping_done<NUM_READ_THREADS){printf("");} queues_reduced[3] = reducer(queues_to_reduce[3]); //printf("reducer thread 3 done\n"); } } } MPI_Barrier(MPI_COMM_WORLD); gettimeofday (&tvalAfter, NULL); elapsed_time = (float)(tvalAfter.tv_sec - tvalBefore.tv_sec)+((float)(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000); if(rank==0) printf("elapsed time = %.2f sec,rank %d\n",elapsed_time,rank); if(rank==0){ //final reducuction int i,j,revbuf;int mainct; for(i=0;i<NUM_READ_THREADS;i++){ combine_queue(final_queue,queues_reduced[i]); } //printf("main node has %d to final reduce\n",calcnum(queues_reduced,NUM_READ_THREADS)); for(i=1;i<size;i++){ MPI_Recv(&revbuf,1,MPI_INT,i,1,MPI_COMM_WORLD,&status); //printf("need to receive %d strings 
from node %d\n",revbuf,i); char *strbuf = (char*)malloc(sizeof(char)*50); char ctbuf = 0; for(j=0;j<revbuf;j++){ MPI_Recv(strbuf,50,MPI_BYTE,i,1,MPI_COMM_WORLD,&status); MPI_Recv(&ctbuf,50,MPI_INT,i,1,MPI_COMM_WORLD,&status); work work; strcpy(work.str,strbuf); work.count = ctbuf; //printf("received <%s,%d> from node %d\n",work.str,work.count,i); putWork(final_queue,work); } } fifoQ *output = reducer(final_queue); printQ_to_file(&output,1,out); }else{ int i,total_num; total_num = calcnum(queues_reduced,NUM_READ_THREADS); MPI_Send(&total_num,1,MPI_INT,0,1,MPI_COMM_WORLD); for(i=0;i<NUM_READ_THREADS;i++){ combine_queue(final_queue,queues_reduced[i]); } for(i=0;i<total_num;i++){ MPI_Send(&final_queue->works[i].str,50,MPI_BYTE,0,1,MPI_COMM_WORLD); MPI_Send(&final_queue->works[i].count,1,MPI_INT,0,1,MPI_COMM_WORLD); } } for(i=0;i<input_len;i++){ fclose(input_files[i]); } fclose(out); /*printQ(queues_to_map[0]); printQ(queues_to_map[1]); printQ(queues_to_map[2]); printQ(queues_to_map[3]);*/ /*printQ(queues_reduced[0]); printQ(queues_reduced[1]); printQ(queues_reduced[2]); printQ(queues_reduced[3]);*/ omp_destroy_lock(&inclock); omp_destroy_lock(&worklock); omp_destroy_lock(&readlock); omp_destroy_lock(&readerlock); omp_destroy_lock(&mapperlock); MPI_Finalize(); return 0; }
/*
 * Approximate betweenness centrality.
 *
 * Runs a level-synchronous BFS from up to 2^K4approx source vertices, taken
 * in the order of a randomly permuted vertex list and skipping vertices with
 * no outgoing edges, then back-propagates dependencies along the recorded
 * predecessor lists and adds them into BC.
 *
 *   G      - graph; the code reads G->n, G->m, G->numEdges, G->endV and
 *            G->weight.
 *   BC     - per-vertex scores. Values are accumulated with +=; this
 *            function never clears the array.
 *   filter - when 1, edges whose (weight & 7) == 0 are ignored in the BFS.
 *
 * Returns the time measured by get_seconds() on thread 0 between the end of
 * the vertex permutation and the end of the parallel region.
 * Side effect: prints every BC value to stdout before returning.
 *
 * All threads of the parallel region cooperate on every traversal; the
 * barriers separate "thread 0 writes shared state" from "everyone reads it",
 * so their order must not be changed.
 */
double betweennessCentrality(graph* G, DOUBLE_T* BC, int filter) {

    VERT_T *S;         /* stack of vertices in the order of non-decreasing
                          distance from s. Also used to implicitly
                          represent the BFS queue */
    plist* P;          /* predecessors of a vertex v on shortest paths from s */
    DOUBLE_T* sig;     /* No. of shortest paths */
    LONG_T* d;         /* Length of the shortest path between every pair */
    DOUBLE_T* del;     /* dependency of vertices */
    LONG_T *in_degree, *numEdges, *pSums;
    LONG_T *pListMem;
    LONG_T* Srcs;
    LONG_T *start, *end;
    LONG_T MAX_NUM_PHASES;
    LONG_T *psCount;
#ifdef _OPENMP
    omp_lock_t* vLock;
#endif
    int seed = 2387;
    double elapsed_time;

#ifdef _OPENMP
#pragma omp parallel
{
#endif

    VERT_T *myS, *myS_t;
    LONG_T myS_size;
    LONG_T i, j, k, p, count, myCount;
    LONG_T v, w, vert;
    LONG_T numV, num_traversals, n, m, phase_num;
    LONG_T tid, nthreads;
    int* stream;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif

#ifdef _OPENMP
    int myLock;
    /* Private on purpose: a shared chunk size written by every thread is a
       data race, and the value must be identical on all threads anyway. */
    LONG_T chunkSize;
    tid = omp_get_thread_num();
    nthreads = omp_get_num_threads();
#else
    tid = 0;
    nthreads = 1;
#endif

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds();
    }
#endif

    /* numV: no. of vertices to run BFS from = 2^K4approx */
    numV = 1<<K4approx;
    n = G->n;
    m = G->m;

    /* Permute vertices */
    if (tid == 0) {
        Srcs = (LONG_T *) malloc(n*sizeof(LONG_T));
#ifdef _OPENMP
        vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
#endif
    }

#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
    for (i=0; i<n; i++) {
        omp_init_lock(&vLock[i]);
    }
#endif

    /* Initialize RNG stream */
    stream = init_sprng(0, tid, nthreads, seed, SPRNG_DEFAULT);

#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        Srcs[i] = i;
    }

    /* Random swaps. With OpenMP a swap is simply skipped when either of the
       two per-vertex locks is busy, so no thread ever blocks here. */
#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        j = n*sprng(stream);
        if (i != j) {
#ifdef _OPENMP
            int l1 = omp_test_lock(&vLock[i]);
            if (l1) {
                int l2 = omp_test_lock(&vLock[j]);
                if (l2) {
#endif
                    k = Srcs[i];
                    Srcs[i] = Srcs[j];
                    Srcs[j] = k;
#ifdef _OPENMP
                    omp_unset_lock(&vLock[j]);
                }
                omp_unset_lock(&vLock[i]);
            }
#endif
        }
    }

#ifdef _OPENMP
#pragma omp barrier
#endif

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "Vertex ID permutation time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    /* Start timing code from here */
    if (tid == 0) {
        elapsed_time = get_seconds();
        /* Every non-empty BFS level holds at least one vertex that was not
           seen before, so there are at most n such levels plus the empty
           level that terminates the loop: phase indices 0..n. The previous
           fixed sizes (50, or 2*sqrt(n) under VERIFYK4) were written past
           on deeper graphs. n+1 is never smaller than either old value
           for n >= 49 and is always sufficient. */
        MAX_NUM_PHASES = n + 1;
    }

#ifdef _OPENMP
#pragma omp barrier
#endif

    /* Initialize predecessor lists */

    /* The size of the predecessor list of each vertex is bounded by
       its in-degree. So we first compute the in-degree of every
       vertex */

    if (tid == 0) {
        P   = (plist  *) calloc(n, sizeof(plist));
        in_degree = (LONG_T *) calloc(n+1, sizeof(LONG_T));
        numEdges = (LONG_T *) malloc((n+1)*sizeof(LONG_T));
        pSums = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
    }

#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
    for (i=0; i<m; i++) {
        v = G->endV[i];
#ifdef _OPENMP
        omp_set_lock(&vLock[v]);
#endif
        in_degree[v]++;
#ifdef _OPENMP
        omp_unset_lock(&vLock[v]);
#endif
    }

    prefix_sums(in_degree, numEdges, pSums, n);

    if (tid == 0) {
        pListMem = (LONG_T *) malloc(m*sizeof(LONG_T));
    }

    /* Carve one slice of pListMem per vertex, sized by its in-degree. */
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        P[i].list = pListMem + numEdges[i];
        P[i].degree = in_degree[i];
        P[i].count = 0;
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "In-degree computation time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    /* Allocate shared memory */
    if (tid == 0) {
        free(in_degree);
        free(numEdges);
        free(pSums);

        S   = (VERT_T *) malloc(n*sizeof(VERT_T));
        sig = (DOUBLE_T *) malloc(n*sizeof(DOUBLE_T));
        d   = (LONG_T *) malloc(n*sizeof(LONG_T));
        del = (DOUBLE_T *) calloc(n, sizeof(DOUBLE_T));

        start = (LONG_T *) malloc(MAX_NUM_PHASES*sizeof(LONG_T));
        end = (LONG_T *) malloc(MAX_NUM_PHASES*sizeof(LONG_T));
        psCount = (LONG_T *) malloc((nthreads+1)*sizeof(LONG_T));
    }

    /* local memory for each thread */
    myS_size = (2*n)/nthreads;
    if (myS_size < 1) {
        /* The stack grows by doubling; a capacity of 0 would stay 0 and the
           first push would write out of bounds. */
        myS_size = 1;
    }
    myS = (VERT_T *) malloc(myS_size*sizeof(VERT_T));
    num_traversals = 0;
    myCount = 0;

#ifdef _OPENMP
#pragma omp barrier
#endif

#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        d[i] = -1;
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "BC initialization time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    for (p=0; p<n; p++) {

        i = Srcs[p];
        //printf ("%d \n", i);
        // i = p;

        /* Sources without outgoing edges do not count as a traversal. */
        if (G->numEdges[i+1] - G->numEdges[i] == 0) {
            continue;
        } else {
            num_traversals++;
        }

        if (num_traversals == numV + 1) {
            break;
        }

        if (tid == 0) {
            sig[i] = 1;
            d[i] = 0;
            S[0] = i;
            start[0] = 0;
            end[0] = 1;
        }

        count = 1;
        phase_num = 0;

#ifdef _OPENMP
#pragma omp barrier
#endif

        /* Forward sweep: one iteration per BFS level; S[start..end) of the
           current phase is the frontier. */
        while (end[phase_num] - start[phase_num] > 0) {

            myCount = 0;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for schedule(dynamic)
#endif
            for (vert = start[phase_num]; vert < end[phase_num]; vert++) {
                v = S[vert];
                for (j=G->numEdges[v]; j<G->numEdges[v+1]; j++) {

                    if ((G->weight[j] & 7) == 0 && filter==1) continue;

                    w = G->endV[j];
                    if (v != w) {

#ifdef _OPENMP
                        myLock = omp_test_lock(&vLock[w]);
                        if (myLock) {
#endif
                            /* w found for the first time? */
                            if (d[w] == -1) {
                                if (myS_size == myCount) {
                                    /* Resize myS */
                                    myS_t = (VERT_T *)
                                        malloc(2*myS_size*sizeof(VERT_T));
                                    memcpy(myS_t, myS, myS_size*sizeof(VERT_T));
                                    free(myS);
                                    myS = myS_t;
                                    myS_size = 2*myS_size;
                                }
                                myS[myCount++] = w;
                                d[w] = d[v] + 1;
                                sig[w] = sig[v];
                                P[w].list[P[w].count++] = v;
                            } else if (d[w] == d[v] + 1) {
                                sig[w] += sig[v];
                                P[w].list[P[w].count++] = v;
                            }

#ifdef _OPENMP
                            omp_unset_lock(&vLock[w]);
                        } else {
                            /* Lock was busy: another thread is updating w.
                               NOTE(review): d[w] is read here without
                               holding vLock[w]. */
                            if ((d[w] == -1) || (d[w] == d[v]+ 1)) {
                                omp_set_lock(&vLock[w]);
                                sig[w] += sig[v];
                                P[w].list[P[w].count++] = v;
                                omp_unset_lock(&vLock[w]);
                            }
                        }
#endif

                    }
                }
            }

            /* Merge all local stacks for next iteration */
            phase_num++;

            psCount[tid+1] = myCount;

#ifdef _OPENMP
#pragma omp barrier
#endif

            /* Thread 0 turns the per-thread counts into offsets into S. */
            if (tid == 0) {
                start[phase_num] = end[phase_num-1];
                psCount[0] = start[phase_num];
                for(k=1; k<=nthreads; k++) {
                    psCount[k] = psCount[k-1] + psCount[k];
                }
                end[phase_num] = psCount[nthreads];
            }

#ifdef _OPENMP
#pragma omp barrier
#endif

            for (k = psCount[tid]; k < psCount[tid+1]; k++) {
                S[k] = myS[k-psCount[tid]];
            }

#ifdef _OPENMP
#pragma omp barrier
#endif
            count = end[phase_num];
        }

        phase_num--;

#ifdef _OPENMP
#pragma omp barrier
#endif
        //printf ("%d\n", phase_num);

        /* Backward sweep: deepest level first; phase 0 (the source itself)
           is not visited. */
        while (phase_num > 0) {
#ifdef _OPENMP
#pragma omp for
#endif
            for (j=start[phase_num]; j<end[phase_num]; j++) {
                w = S[j];
                for (k = 0; k<P[w].count; k++) {
                    v = P[w].list[k];
#ifdef _OPENMP
                    omp_set_lock(&vLock[v]);
#endif
                    del[v] = del[v] + sig[v]*(1+del[w])/sig[w];
#ifdef _OPENMP
                    omp_unset_lock(&vLock[v]);
#endif
                }
                BC[w] += del[w];
            }

            phase_num--;

#ifdef _OPENMP
#pragma omp barrier
#endif
        }

        /* Reset only the vertices touched by this traversal. */
#ifdef _OPENMP
        chunkSize = n/nthreads;
        if (chunkSize < 1) {
            /* schedule() requires a positive chunk size */
            chunkSize = 1;
        }
#pragma omp for schedule(static, chunkSize)
#endif
        for (j=0; j<count; j++) {
            w = S[j];
            //fprintf (stderr, "w: %d\n", w);
            d[w] = -1;
            del[w] = 0;
            P[w].count = 0;
        }

#ifdef _OPENMP
#pragma omp barrier
#endif

    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "BC computation time: %lf seconds\n",
                elapsed_time_part);
    }
#endif

#ifdef _OPENMP
#pragma omp for
    for (i=0; i<n; i++) {
        omp_destroy_lock(&vLock[i]);
    }
#endif

    free(myS);

    if (tid == 0) {
        free(S);
        free(pListMem);
        free(P);
        free(sig);
        free(d);
        free(del);
#ifdef _OPENMP
        free(vLock);
#endif
        free(start);
        free(end);
        free(psCount);
        elapsed_time = get_seconds() - elapsed_time;
        free(Srcs);
    }

    free_sprng(stream);
#ifdef _OPENMP
}
#endif

    /* Verification */
#ifdef VERIFYK4
    double BCval;
    if (SCALE % 2 == 0) {
        BCval = 0.5*pow(2, 3*SCALE/2)-pow(2, SCALE)+1.0;
    } else {
        BCval = 0.75*pow(2, (3*SCALE-1)/2)-pow(2, SCALE)+1.0;
    }
    int failed = 0;
    for (int i=0; i<G->n; i++) {
        if (round(BC[i] - BCval) != 0) {
            failed = 1;
            break;
        }
    }
    if (failed) {
        fprintf(stderr, "Kernel 4 failed validation!\n");
    } else {
        fprintf(stderr, "Kernel 4 validation successful!\n");
    }
#endif

    for (int i = 0; i < G->n; i++)
        printf ("BC: %d %f\n",i, BC[i]);

    return elapsed_time;
}
/* Releases the OpenMP lock stored at index preNodeID of nodeLocks. */
static void unLockNode(IDnum preNodeID)
{
	omp_unset_lock(&nodeLocks[preNodeID]);
}
/*
 * Vertex betweenness centrality, coarse-grained variant.
 *
 * The loop over source vertices is what gets parallelized: every thread
 * owns a complete private set of BFS work arrays (S, P, sig, d, del, start,
 * end) and runs whole traversals on its own. The only shared output is BC,
 * which is updated under the per-vertex lock vLock[w].
 *
 *   G       - graph; the code reads G->n, G->m, G->numEdges and G->endV.
 *   BC      - per-vertex scores. Values are accumulated with +=; this
 *             function never clears the array.
 *   numSrcs - number of source vertices to traverse from. Values larger
 *             than G->n are clamped to G->n.
 *
 * With RANDSRCS the sources are taken from a randomly permuted vertex list,
 * otherwise vertices 0..numSrcs-1 are used. Sources without outgoing edges
 * are skipped.
 */
void vertex_betweenness_centrality_simple(graph_t* G, double* BC, long numSrcs) {

    attr_id_t *in_degree, *numEdges, *pSums;
#if RANDSRCS
    attr_id_t* Srcs;
#endif
    long num_traversals = 0;
#ifdef _OPENMP
    omp_lock_t* vLock;
#endif
#ifdef DIAGNOSTIC
    double elapsed_time;
#endif
    int seed = 2387;

    /* The outer loop is parallelized in this case. Each thread does a BFS
       and the vertex BC values are incremented under a per-vertex lock */
#ifdef _OPENMP
#pragma omp parallel firstprivate(G)
{
#endif
    attr_id_t *S;      /* stack of vertices in the order of non-decreasing
                          distance from s. Also used to implicitly
                          represent the BFS queue */
    plist_t* P;        /* predecessors of a vertex v on shortest paths from s */
    attr_id_t* pListMem;
    double* sig;       /* No. of shortest paths */
    attr_id_t* d;      /* Length of the shortest path between every pair */
    double* del;       /* dependency of vertices */
    attr_id_t *start, *end;
    long MAX_NUM_PHASES;

    long i, j, k, p, count;
    long v, w, vert;
    long numV, n, m, phase_num;
    long tid, nthreads;
    int* stream;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif

#ifdef _OPENMP
    tid = omp_get_thread_num();
    nthreads = omp_get_num_threads();
#else
    tid = 0;
    nthreads = 1;
#endif

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time = get_seconds();
        elapsed_time_part = get_seconds();
    }
#endif

    /* numV: no. of vertices to run BFS from = numSrcs */
    numV = numSrcs;
    n = G->n;
    m = G->m;
    if (numV > n) {
        /* Source indices above n-1 would read past G->numEdges / Srcs. */
        numV = n;
    }

    /* Permute vertices */
    if (tid == 0) {
#if RANDSRCS
        Srcs = (attr_id_t *) malloc(n*sizeof(attr_id_t));
#endif
#ifdef _OPENMP
        vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
#endif
    }

#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
    for (i=0; i<n; i++) {
        omp_init_lock(&vLock[i]);
    }
#endif

    /* Initialize RNG stream */
    stream = init_sprng(0, tid, nthreads, seed, SPRNG_DEFAULT);

#if RANDSRCS
#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        Srcs[i] = i;
    }

    /* Random swaps. With OpenMP a swap is simply skipped when either of the
       two per-vertex locks is busy, so no thread ever blocks here. */
#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        j = n * sprng(stream);
        if (i != j) {
#ifdef _OPENMP
            int l1 = omp_test_lock(&vLock[i]);
            if (l1) {
                int l2 = omp_test_lock(&vLock[j]);
                if (l2) {
#endif
                    k = Srcs[i];
                    Srcs[i] = Srcs[j];
                    Srcs[j] = k;
#ifdef _OPENMP
                    omp_unset_lock(&vLock[j]);
                }
                omp_unset_lock(&vLock[i]);
            }
#endif
        }
    }
#endif

#ifdef _OPENMP
#pragma omp barrier
#endif

    /* Every non-empty BFS level holds at least one vertex that was not seen
       before, so there are at most n such levels plus the empty level that
       terminates the loop: phase indices 0..n. The previous fixed size of
       50 was written past on graphs with deeper BFS trees. */
    MAX_NUM_PHASES = n + 1;

    /* Initialize predecessor lists */

    /* The size of the predecessor list of each vertex is bounded by
       its in-degree. So we first compute the in-degree of every
       vertex */

    if (tid == 0) {
        in_degree = (attr_id_t *) calloc(n+1, sizeof(attr_id_t));
        numEdges = (attr_id_t *) malloc((n+1)*sizeof(attr_id_t));
        pSums = (attr_id_t *) malloc(nthreads*sizeof(attr_id_t));
    }

#ifdef _OPENMP
#pragma omp barrier
#pragma omp for
#endif
    for (i=0; i<m; i++) {
        v = G->endV[i];
#ifdef _OPENMP
        omp_set_lock(&vLock[v]);
#endif
        in_degree[v]++;
#ifdef _OPENMP
        omp_unset_lock(&vLock[v]);
#endif
    }

    prefix_sums(in_degree, numEdges, pSums, n);

    /* Each thread builds its own predecessor lists from the shared
       in-degree data. */
    P  = (plist_t  *) calloc(n, sizeof(plist_t));
    pListMem = (attr_id_t *) malloc(m*sizeof(attr_id_t));

    for (i=0; i<n; i++) {
        P[i].list = pListMem + numEdges[i];
        P[i].degree = in_degree[i];
        P[i].count = 0;
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() -elapsed_time_part;
        fprintf(stderr, "In-degree computation time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    /* Nobody may still be reading in_degree/numEdges when they are freed. */
#ifdef _OPENMP
#pragma omp barrier
#endif

    /* Release the shared scratch arrays, then allocate per-thread memory */
    if (tid == 0) {
        free(in_degree);
        free(numEdges);
        free(pSums);
    }

    S   = (attr_id_t *) malloc(n*sizeof(attr_id_t));
    sig = (double *) malloc(n*sizeof(double));
    d   = (attr_id_t *) malloc(n*sizeof(attr_id_t));
    del = (double *) calloc(n, sizeof(double));

    start = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t));
    end = (attr_id_t *) malloc(MAX_NUM_PHASES*sizeof(attr_id_t));

#ifdef _OPENMP
#pragma omp barrier
#endif

    for (i=0; i<n; i++) {
        d[i] = -1;
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "BC initialization time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

#ifdef _OPENMP
#pragma omp for reduction(+:num_traversals)
#endif
    for (p=0; p<numV; p++) {
#if RANDSRCS
        i = Srcs[p];
#else
        i = p;
#endif
        /* Sources without outgoing edges do not count as a traversal. */
        if (G->numEdges[i+1] - G->numEdges[i] == 0) {
            continue;
        } else {
            num_traversals++;
        }

        sig[i] = 1;
        d[i] = 0;
        S[0] = i;
        start[0] = 0;
        end[0] = 1;

        count = 1;
        phase_num = 0;

        /* Forward sweep: one iteration per BFS level; S[start..end) of the
           current phase is the frontier. */
        while (end[phase_num] - start[phase_num] > 0) {
            for (vert = start[phase_num]; vert < end[phase_num]; vert++) {
                v = S[vert];
                for (j=G->numEdges[v]; j<G->numEdges[v+1]; j++) {
                    w = G->endV[j];
                    if (v != w) {
                        /* w found for the first time? */
                        if (d[w] == -1) {
                            S[count++] = w;
                            d[w] = d[v] + 1;
                            sig[w] = sig[v];
                            P[w].list[P[w].count++] = v;
                        } else if (d[w] == d[v] + 1) {
                            sig[w] += sig[v];
                            P[w].list[P[w].count++] = v;
                        }
                    }
                }
            }

            phase_num++;
            start[phase_num] = end[phase_num-1];
            end[phase_num] = count;
        }

        phase_num--;

        /* Backward sweep: deepest level first; phase 0 (the source itself)
           is not visited. */
        while (phase_num > 0) {
            for (j=start[phase_num]; j<end[phase_num]; j++) {
                w = S[j];
                for (k = 0; k<P[w].count; k++) {
                    v = P[w].list[k];
                    del[v] = del[v] + sig[v]*(1+del[w])/sig[w];
                }
#ifdef _OPENMP
                omp_set_lock(&vLock[w]);
                BC[w] += del[w];
                omp_unset_lock(&vLock[w]);
#else
                BC[w] += del[w];
#endif
            }

            phase_num--;
        }

        /* Reset only the vertices touched by this traversal. */
        for (j=0; j<count; j++) {
            w = S[j];
            d[w] = -1;
            del[w] = 0;
            P[w].count = 0;
        }
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "BC computation time: %lf seconds\n",
                elapsed_time_part);
    }
#endif

#ifdef _OPENMP
#pragma omp barrier
#endif

#ifdef _OPENMP
#pragma omp for
    for (i=0; i<n; i++) {
        omp_destroy_lock(&vLock[i]);
    }
#endif

    free(S);
    free(pListMem);
    free(P);
    free(sig);
    free(d);
    free(del);
    free(start);
    free(end);

    if (tid == 0) {
#ifdef _OPENMP
        free(vLock);
#endif
#if RANDSRCS
        free(Srcs);
#endif
#ifdef DIAGNOSTIC
        elapsed_time = get_seconds() - elapsed_time;
        fprintf(stderr, "Total time taken: %lf seconds\n", elapsed_time);
#endif
    }

    free_sprng(stream);

#ifdef _OPENMP
#pragma omp barrier
}
#endif
}
/* * Computes clusters' centroids. */ static void compute_centroids(void) { int i, j; /* Loop indexes. */ int population; /* Centroid population. */ start = timer_get(); memcpy(lcentroids, CENTROID(rank*(ncentroids/nprocs)), lncentroids[rank]*dimension*sizeof(float)); memset(&has_changed[rank*NUM_THREADS], 0, NUM_THREADS*sizeof(int)); memset(centroids, 0, (ncentroids + DELTA*nprocs)*dimension*sizeof(float)); memset(ppopulation, 0, (ncentroids + nprocs*DELTA)*sizeof(int)); /* Compute partial centroids. */ #pragma omp parallel for schedule(static) default(shared) private(i, j) for (i = 0; i < lnpoints; i++) { j = map[i]%NUM_THREADS; omp_set_lock(&lock[j]); vector_add(CENTROID(map[i]), POINT(i)); ppopulation[map[i]]++; omp_unset_lock(&lock[j]); } end = timer_get(); total += timer_diff(start, end); sync_pcentroids(); sync_ppopulation(); start = timer_get(); /* Compute centroids. */ #pragma omp parallel for schedule(static) default(shared) private(i, j, population) for (j = 0; j < lncentroids[rank]; j++) { population = 0; for (i = 0; i < nprocs; i++) { if (*POPULATION(i, j) == 0) continue; population += *POPULATION(i, j); if (i == rank) continue; vector_add(PCENTROID(rank, j), PCENTROID(i, j)); } if (population > 1) vector_mult(PCENTROID(rank, j), 1.0/population); /* Cluster mean has changed. */ if (!vector_equal(PCENTROID(rank, j), LCENTROID(j))) { has_changed[rank*NUM_THREADS + omp_get_thread_num()] = 1; vector_assign(LCENTROID(j), PCENTROID(rank, j)); } } end = timer_get(); total += timer_diff(start, end); sync_centroids(); sync_status(); }
void OpenMPCounter::reset() { omp_set_lock(&_lock); _counter = 0; omp_unset_lock(&_lock); }
void YsOpenMPMutex::Unlock(void) { omp_unset_lock(&lock); }
// Releases the OpenMP lock `lock` by forwarding to omp_unset_lock().
void Unlock()
{
    omp_unset_lock(&(lock));
}
// Walks all light paths in parallel and appends one IptPathState per
// qualifying vertex to partialSubPathList.
//
// cmdLock        OpenMP lock held around every push_back on the shared
//                partialSubPathList.
// lightPathList  one Path* per light path. When the scene is not using the
//                GPU, entry p is replaced by a freshly allocated and sampled
//                Path; the allocation is not released in this function.
//                NOTE(review): on the GPU branch the entry is dereferenced
//                as-is, so it is presumably filled by the caller - confirm.
// isFirstIter    selects the uniform origin probability (1/totArea or
//                1/totVol) regardless of useUniformSur / useUniformVol.
//
// After the loop lightPhotonNum and partialPhotonNum are both set to the
// size of partialSubPathList.
void IptTracer::genLightPaths(omp_lock_t& cmdLock , vector<Path*>& lightPathList , bool isFirstIter)
{
#pragma omp parallel for
	for(int p=0; p<lightPathNum; p++)
	{
		if (!renderer->scene.usingGPU())
		{
			Ray lightRay = genEmissiveSurfaceSample(true , false);
			lightPathList[p] = new Path;
			samplePath(*lightPathList[p] , lightRay);
		}

		Path& lightPath = *lightPathList[p];

		// A path needs at least the emitter vertex and one more vertex.
		if (lightPath.size() <= 1)
			continue;

		IptPathState lightState;
		lightState.originRay = &lightPath[0];

		Real cosAtLight = lightPath[0].getCosineTerm();

		// Initial throughput at the emitter vertex.
		lightState.throughput = lightPath[0].color * cosAtLight /
			(lightPath[0].originProb * lightPath[0].directionProb *
			lightPath[1].originProb);
		lightState.indirContrib = vec3f(0.0);
		lightState.mergedPath = 1;

		// Disabled debug output:
		/*
		fprintf(fp , "====================\n");
		vec3f decay = lightPath[0].getRadianceDecay((lightPath[0].origin - lightPath[1].origin).length());
		fprintf(fp , "l = 0 , thr = (%.8f,%.8f,%.8f) , color = (%.8f,%.8f,%.8f)\ncosine = %.8f , dirPdf = %.8f , oPdf = %.8f\ndecay=(%.8f,%.8f,%.8f)\n" ,
			lightState.throughput.x , lightState.throughput.y , lightState.throughput.z ,
			lightPath[0].color[0] , lightPath[0].color[1] , lightPath[0].color[2] ,
			lightPath[0].getCosineTerm() , lightPath[0].directionProb , lightPath[1].originProb ,
			decay.x , decay.y , decay.z);
		*/

		// NOTE(review): never read in this function.
		int nonSpecPathLength = 0;

		for(unsigned i = 1; i < lightPath.size(); i++)
		//for (unsigned i = 1; i < 2; i++)
		{
			// Attenuate over the segment leading to vertex i; the length is
			// clamped from below to 1e-5.
			Real dist = std::max((lightPath[i].origin - lightPath[i - 1].origin).length() , 1e-5f);
			vec3f decayFactor = lightPath[i - 1].getRadianceDecay(dist);
			lightState.throughput *= decayFactor;

			// Stop at vertices lying on an emissive object.
			if(lightPath[i].contactObject && lightPath[i].contactObject->emissive())
				break;

			lightState.pos = lightPath[i].origin;
			lightState.lastRay = &lightPath[i - 1];
			lightState.ray = &lightPath[i];
			lightState.pathLen = i;

			// Record the vertex only if its direction was sampled as
			// Ray::RANDOM, it is inside or on some object, and it did not
			// stay at the previous vertex' position.
			if(lightPath[i].directionSampleType == Ray::RANDOM &&
				(lightPath[i].insideObject != NULL || lightPath[i].contactObject != NULL) &&
				(lightPath[i].origin != lightPath[i - 1].origin))
			{
				//if (lightPath[i].insideObject && !lightPath[i].contactObject)
				//	fprintf(fp , "path length = %d, dirContrib = (%.8f,%.8f,%.8f)\n" ,
				//		i , lightState.dirContrib[0] , lightState.dirContrib[1] , lightState.dirContrib[2]);

				// partialSubPathList is shared by all threads.
				omp_set_lock(&cmdLock);
				partialSubPathList.push_back(lightState);
				omp_unset_lock(&cmdLock);
			}

			// Nothing to scatter towards after the last vertex, and a
			// direction shorter than 0.5 also ends the walk.
			if (i == lightPath.size() - 1)
				break;
			if (lightPath[i].direction.length() < 0.5f)
				break;

			vec3f scatterFactor = (lightPath[i].color * lightPath[i].getCosineTerm() /
				(lightPath[i + 1].originProb * lightPath[i].directionProb));

			lightState.throughput *= scatterFactor;

			// Disabled debug output:
			/*
			vec3f decay = lightPath[i].getRadianceDecay((lightPath[i].origin - lightPath[i + 1].origin).length());
			fprintf(fp , "l = %d , thr = (%.8f,%.8f,%.8f) , color = (%.8f,%.8f,%.8f)\ncosine = %.8f , dirPdf = %.8f , oPdf = %.8f\ndecay=(%.8f,%.8f,%.8f)\n" ,
				i , lightState.throughput.x , lightState.throughput.y , lightState.throughput.z ,
				lightPath[i].color[0] , lightPath[i].color[1] , lightPath[i].color[2] ,
				lightPath[i].getCosineTerm() , lightPath[i].directionProb , lightPath[i + 1].originProb ,
				decay.x , decay.y , decay.z);
			*/

			// Optional weighting between connectFactor and mergeFactor.
			if (lightPath[i].directionSampleType == Ray::RANDOM && useWeight)
			{
				Real pdf = lightPath[i].directionProb;
				if (pdf < 1e-7f)
					break;

				Real weightFactor;

				Real volMergeScale = 1;
				// NOTE(review): originProb and dirProb stay uninitialized
				// when neither of the two branches below is taken, yet
				// their addresses are passed to mergeFactor() afterwards.
				Real originProb;
				Real dirProb;
				if (lightPath[i].contactObject)
				{
					if (isFirstIter || useUniformSur)
						originProb = 1.f / totArea;
					else
						originProb = lightPath[i].contactObject->getOriginProb(lightPath[i].contactObjectTriangleID);
					if (useUniformDir)
						dirProb = INV_2_PI;
					else
						dirProb = lightPath[i].getCosineTerm() / M_PI;
				}

				//if (lightPath[i].insideObject && lightPath[i].contactObject)
				//	printf("!!!\n");

				// Vertex inside a volumetric object without surface contact.
				if (lightPath[i].insideObject && !lightPath[i].contactObject && lightPath[i].insideObject->isVolumetric())
				{
					volMergeScale = 4.f / 3.f * mergeRadius;
					if (isFirstIter || useUniformVol)
						originProb = 1.f / totVol;
					else
						originProb = lightPath[i].insideObject->getOriginProb(lightPath[i].origin);
					dirProb = 0.25f / M_PI;
				}

				weightFactor = connectFactor(pdf) /
					(connectFactor(pdf) + mergeFactor(&volMergeScale , &originProb , &dirProb , &lightPathNum));

				// Only reported; the weight is still applied below.
				if (_isnan(weightFactor) || abs(pdf) < 1e-6f)
				{
					fprintf(err , "sample light path error, %.8f , %.8f\n" , connectFactor(pdf) ,
						mergeFactor(&volMergeScale , &originProb , &dirProb , &lightPathNum));
				}

				/*
				if (abs(volMergeScale - 1.f) < 1e-6)
					printf("surface %.8f\n" , weightFactor);
				else
					printf("volume %.8f %.8f\n" , weightFactor);
				*/

				//if (lightPath[i].contactObject && lightPath[i].contactObject->objectIndex == 7)
				lightState.throughput *= weightFactor;
			}
		}
	}

	lightPhotonNum = partialPhotonNum = partialSubPathList.size();
}
/// Releases the OpenMP lock `lock_` by forwarding to omp_unset_lock().
inline void unsetLock()
{
    omp_unset_lock(&(lock_));
}
// Walks all intermediate paths in parallel, appends one IptPathState per
// qualifying vertex to partialSubPathList and remembers, per path p, the
// indices of its entries in partPathMergeIndex[p].
//
// cmdLock        OpenMP lock held around every update of the shared
//                partialSubPathList (and the matching index push).
// interPathList  one Path* per intermediate path. When the scene is not
//                using the GPU, entry p is replaced by a freshly allocated
//                and sampled Path; the allocation is not released here.
//                NOTE(review): on the GPU branch the entry is dereferenced
//                as-is, so it is presumably filled by the caller - confirm.
//
// After the loop partialPhotonNum is set to the size of partialSubPathList.
void IptTracer::genIntermediatePaths(omp_lock_t& cmdLock , vector<Path*>& interPathList)
{
#pragma omp parallel for
	for(int p=0; p<interPathNum; p++)
	{
		if (!renderer->scene.usingGPU())
		{
			Ray interRay = genIntermediateSamples(renderer->scene);
			interPathList[p] = new Path;
			samplePath(*interPathList[p] , interRay);
		}

		Path& interPath = *interPathList[p];

		//fprintf(fp , "=================\n");
		// Cleared even for paths that are skipped below.
		partPathMergeIndex[p].clear();

		// A path needs at least two vertices.
		if (interPath.size() <= 1)
			continue;

		IptPathState interState;
		interState.originRay = &interPath[0];

		// Initial throughput at the first vertex.
		interState.throughput = interPath[0].color * interPath[0].getCosineTerm() /
			(interPath[0].originProb * interPath[0].directionProb * interPath[1].originProb);
		interState.indirContrib = vec3f(0.f);
		interState.mergedPath = 0;

		//if (intensity(interState.throughput) > 30.f)
		//	continue;

		// Disabled debug output:
		/*
		fprintf(fp , "====================\n");
		vec3f decay = interPath[0].getRadianceDecay((interPath[0].origin - interPath[1].origin).length());
		fprintf(fp , "l = 0 , thr = (%.8f,%.8f,%.8f) , color = (%.8f,%.8f,%.8f)\ncosine = %.8f , dirPdf = %.8f , oPdf = %.8f\ndecay=(%.8f,%.8f,%.8f)\n" ,
			interState.throughput.x , interState.throughput.y , interState.throughput.z ,
			interPath[0].color[0] , interPath[0].color[1] , interPath[0].color[2] ,
			interPath[0].getCosineTerm() , interPath[0].directionProb , interPath[1].originProb ,
			decay.x , decay.y , decay.z);
		*/

		for(unsigned i = 1; i < interPath.size(); i++)
		//for (unsigned i = 1; i < 2; i++)
		{
			// Attenuate over the segment leading to vertex i; the length is
			// clamped from below to 1e-5.
			Real dist = std::max((interPath[i].origin - interPath[i - 1].origin).length() , 1e-5f);
			interState.throughput *= interPath[i - 1].getRadianceDecay(dist);

			// Stop at vertices lying on an emissive object.
			if(interPath[i].contactObject && interPath[i].contactObject->emissive())
				break;

			interState.pos = interPath[i].origin;
			interState.lastRay = &interPath[i - 1];
			interState.ray = &interPath[i];
			interState.pathLen = i;

			// Record the vertex unless its direction sample is
			// Ray::DEFINITE; it must be inside or on some object and must
			// not sit at the previous vertex' position.
			if(interPath[i].directionSampleType != Ray::DEFINITE &&
				(interPath[i].insideObject != NULL || interPath[i].contactObject != NULL) &&
				(interPath[i].origin != interPath[i - 1].origin))
				//(interPath[i].insideObject && !interPath[i].contactObject)) // only volume
			{
				//fprintf(fp , "path length = %d, dirContrib = (%.8f,%.8f,%.8f)\n" ,
				//	i , interState.dirContrib[0] , interState.dirContrib[1] , interState.dirContrib[2]);

				// The index is taken inside the same critical section so it
				// refers to the element that was just appended.
				omp_set_lock(&cmdLock);
				partialSubPathList.push_back(interState);
				partPathMergeIndex[p].push_back(partialSubPathList.size() - 1);
				omp_unset_lock(&cmdLock);
			}

			// Nothing to scatter towards after the last vertex, and a
			// direction shorter than 0.5 also ends the walk.
			if (i == interPath.size() - 1)
				break;
			if (interPath[i].direction.length() < 0.5f)
				break;

			vec3f scatterFactor = (interPath[i].color * interPath[i].getCosineTerm() /
				(interPath[i + 1].originProb * interPath[i].directionProb));

			interState.throughput *= scatterFactor;

			// Disabled debug output:
			/*
			vec3f decay = interPath[i].getRadianceDecay((interPath[i].origin - interPath[i + 1].origin).length());
			fprintf(fp , "l = %d , thr = (%.8f,%.8f,%.8f) , color = (%.8f,%.8f,%.8f)\ncosine = %.8f , dirPdf = %.8f , oPdf = %.8f\ndecay=(%.8f,%.8f,%.8f)\n" ,
				i , interState.throughput.x , interState.throughput.y , interState.throughput.z ,
				interPath[i].color[0] , interPath[i].color[1] , interPath[i].color[2] ,
				interPath[i].getCosineTerm() , interPath[i].directionProb , interPath[i + 1].originProb ,
				decay.x , decay.y , decay.z);
			*/

			// Optional weighting between connectFactor and mergeFactor.
			if (interPath[i].directionSampleType != Ray::DEFINITE && useWeight)
			{
				Real pdf = interPath[i].directionProb;
				if (pdf < 1e-7f)
					break;

				Real weightFactor;

				Real volMergeScale = 1.f;
				// NOTE(review): originProb and dirProb stay uninitialized
				// when neither of the two branches below is taken, yet
				// their addresses are passed to mergeFactor() afterwards.
				Real originProb;
				Real dirProb;
				if (interPath[i].contactObject)
				{
					if (useUniformSur)
						originProb = 1.f / totArea;
					else
						originProb = interPath[i].contactObject->getOriginProb(interPath[i].contactObjectTriangleID);
					if (useUniformDir)
						dirProb = INV_2_PI;
					else
						dirProb = interPath[i].getCosineTerm() / M_PI;
				}

				//if (interPath[i].insideObject && interPath[i].contactObject)
				//	printf("!!!\n");

				// Vertex inside a volumetric object without surface contact.
				if (interPath[i].insideObject && !interPath[i].contactObject && interPath[i].insideObject->isVolumetric())
				{
					volMergeScale = 4.f / 3.f * mergeRadius;
					if (useUniformVol)
						originProb = 1.f / totVol;
					else
						originProb = interPath[i].insideObject->getOriginProb(interPath[i].origin);
					dirProb = 0.25f / M_PI;
				}

				weightFactor = connectFactor(pdf) /
					(connectFactor(pdf) + mergeFactor(&volMergeScale , &originProb , &dirProb , &partialPathNum));

				// Only reported; the weight is still applied below.
				if (_isnan(weightFactor) || abs(pdf) < 1e-6f)
				{
					fprintf(err , "sample inter path error, %.8f , %.8f\n" , connectFactor(pdf) ,
						mergeFactor(&volMergeScale , &originProb , &dirProb , &partialPathNum));
				}

				//if (interPath[i].contactObject && interPath[i].contactObject->objectIndex == 7)
				interState.throughput *= weightFactor;
			}
		}
	}

	partialPhotonNum = partialSubPathList.size();
}
void Shape::splitshapes(vector<Shape*> &shapes, ViewProgress *progress) { int n_tr = (int)triangles.size(); if (progress) progress->start(_("Split Shapes"), n_tr); int progress_steps = max(1,(int)(n_tr/100)); vector<bool> done(n_tr); bool cont = true; // make list of adjacent triangles for each triangle vector< vector<uint> > adj(n_tr); if (progress) progress->set_label(_("Split: Sorting Triangles ...")); #ifdef _OPENMP omp_lock_t progress_lock; omp_init_lock(&progress_lock); #pragma omp parallel for schedule(dynamic) #endif for (int i = 0; i < n_tr; i++) { if (progress && i%progress_steps==0) { #ifdef _OPENMP omp_set_lock(&progress_lock); #endif cont = progress->update(i); #ifdef _OPENMP omp_unset_lock(&progress_lock); #endif } vector<uint> trv; for (int j = 0; j < n_tr; j++) { if (i!=j) { bool add = false; if (j<i) // maybe(!) we have it already for (uint k = 0; k<adj[j].size(); k++) { if ((int)adj[j][k] == i) { add = true; break; } } add |= (triangles[i].isConnectedTo(triangles[j], 0.01)); if (add) trv.push_back(j); } } adj[i] = trv; if (!cont) i=n_tr; } if (progress) progress->set_label(_("Split: Building shapes ...")); // triangle indices of shapes vector< vector<uint> > shape_tri; for (int i = 0; i < n_tr; i++) done[i] = false; for (int i = 0; i < n_tr; i++) { if (progress && i%progress_steps==0) cont = progress->update(i); if (!done[i]){ cerr << _("Shape ") << shapes.size()+1 << endl; vector<uint> current; addtoshape(i, adj, current, done); Shape *shape = new Shape(); shapes.push_back(shape); shapes.back()->triangles.resize(current.size()); for (uint i = 0; i < current.size(); i++) shapes.back()->triangles[i] = triangles[current[i]]; shapes.back()->CalcBBox(); } if (!cont) i=n_tr; } if (progress) progress->stop("_(Done)"); }
// Renders the image seen by `camera` in `spp` iterations and returns one
// color per pixel (width * height entries).
//
// Every iteration
//   1. samples mPhotonsNum light paths in parallel and collects their
//      non-DEFINITE vertices, split by photon type (OUTVOL / INVOL),
//   2. builds the surface and volume hash grids over those vertices,
//   3. samples one eye path per pixel and estimates its color with
//      throughputByDensityEstimation(),
//   4. folds the result into the running average and hands the current
//      image to showCurrentResult().
//
// NOTE(review): the three OpenMP locks are initialized on every call and are
// not destroyed in this function - confirm they are released elsewhere.
vector<vec3f> PhotonMap::renderPixels(const Camera& camera){
	uint width = camera.width, height = camera.height;
	std::vector<vec3f> pixelColors(width * height, vec3f(0,0,0));

	omp_init_lock(&surfaceHashGridLock);
	omp_init_lock(&volumeHashGridLock);
	omp_init_lock(&debugPrintLock);

	//std::vector<int> pixelMaps(pixelColors.size(), 0);

	preprocessEmissionSampler();

	mRadius = mBaseRadius;

	clock_t startTime = clock();

	for(uint s = 0; s < spp; s++){
		std::cout << "iteration : " << s << std::endl;

		std::vector<vec3f> oneIterColors(pixelColors.size(), vec3f(0,0,0));
#ifdef PPM
		//if (renderer->scene.getTotalVolume() > 1e-6f)
		if (true)
		{
			rayMarching = true;
			mRadius = MAX(mBaseRadius * powf(powf(s+1 , mAlpha-1) , 1.f / 3.f) , EPSILON);
		}
		else
		{
			rayMarching = false;
			mRadius = MAX(mBaseRadius * sqrt(powf(s+1, mAlpha-1)), EPSILON);
		}
#endif
		std::vector<Path*> pixelLightPaths(mPhotonsNum, NULL);
		std::vector<LightPoint> surfaceLightVertices(0);
		std::vector<LightPoint> volumeLightVertices(0);

		surfaceHashGrid.Reserve(pixelColors.size());
		volumeHashGrid.Reserve(pixelColors.size());

		// step1: sample light paths and build range search struct independently for surface and volume
#pragma omp parallel for
		for(int p = 0; p < mPhotonsNum; p++){
			Ray lightRay = genEmissiveSurfaceSample(true , false);
			pixelLightPaths[p] = new Path;
			Path &lightPath = *pixelLightPaths[p];
			samplePath(lightPath, lightRay);
			for(int i = 1; i < lightPath.size(); i++){
				// light is not reflective
				if(lightPath[i].contactObject && lightPath[i].contactObject->emissive())
					break;
				// only store particles non-specular
				if(lightPath[i].directionSampleType == Ray::DEFINITE)
					continue;
				LightPoint lightPoint;
				lightPoint.position = lightPath[i].origin;
				lightPoint.indexInThePath = i;
				// The vertex keeps a pointer into its path, so the paths
				// have to stay alive until step 2 is finished.
				lightPoint.pathThePointIn = &lightPath;
				lightPoint.photonType = lightPath[i].photonType;
				// Both vertex lists are shared by all threads.
				if(lightPoint.photonType == Ray::OUTVOL){
					omp_set_lock(&surfaceHashGridLock);
					surfaceLightVertices.push_back(lightPoint);
					omp_unset_lock(&surfaceHashGridLock);
				}
				if(lightPoint.photonType == Ray::INVOL){
					omp_set_lock(&volumeHashGridLock);
					volumeLightVertices.push_back(lightPoint);
					omp_unset_lock(&volumeHashGridLock);
				}
			}
		}
		std::cout<< "vol vertices= " << volumeLightVertices.size() << " sur vertices= " << surfaceLightVertices.size() << std::endl;

		surfaceHashGrid.Build(surfaceLightVertices, mRadius);
		volumeHashGrid.Build(volumeLightVertices, mRadius);

		std::cout<< "finish building hashgrid" << std::endl;

		// step2: calculate pixel colors by progressive photon mapping
#pragma omp parallel for
		for(int p = 0; p < pixelColors.size(); p++){
			Path eyePath;
			if (rayMarching)
				sampleMergePath(eyePath, camera.generateRay(p), 0);
			else
				samplePath(eyePath, camera.generateRay(p));

			//fprintf(fp , "===================\n");
			//for (int i = 0; i < eyePath.size(); i++)
			//{
			//	fprintf(fp , "l=%d, bsdf=(%.8f,%.8f,%.8f), originPdf=%.8f, dirPdf=%.8f\n" , i , eyePath[i].color.x ,
			//		eyePath[i].color.y , eyePath[i].color.z , eyePath[i].originProb , eyePath[i].directionProb);
			//}

			/*if(eyePath[1].contactObj && eyePath[1].contactObj->anisotropic()){
				pixelMaps[p] = 1;
			}*/
			throughputByDensityEstimation(oneIterColors[p], eyePath, surfaceLightVertices, volumeLightVertices);
		}
		/*std::ofstream fout(engine->renderer->name + engine->scene.name+"pixelMap.txt");
		for(int p = 0; p < pixelMaps.size(); p++)
			fout << pixelMaps[p] << ' ' ;
		fout << std::endl;
		fout.close();*/
		std::cout << "calculation done" << std::endl;

		// Running average over the iterations done so far.
		for(uint i = 0; i < pixelColors.size(); i++){
			pixelColors[i] *= s / float(s+1);
			pixelColors[i] += camera.eliminateVignetting(oneIterColors[i], i) / (s + 1);
		}

		// Release this iteration's light paths. There are mPhotonsNum of
		// them, which need not equal the pixel count: freeing them inside
		// the pixel loop read past the end of the vector (or leaked the
		// tail) whenever the two numbers differed.
		for(size_t i = 0; i < pixelLightPaths.size(); i++){
			delete pixelLightPaths[i];
			pixelLightPaths[i] = NULL;
		}

		unsigned nowTime = (float)(clock() - startTime) / 1000;
		//if (nowTime > recordTime)
		if (s % outputIter == 0)
		{
			showCurrentResult(pixelColors , &nowTime , &s);
			//showCurrentResult(pixelColors , &lastTime , &s);
			//recordTime += timeInterval;
		}
		else
			showCurrentResult(pixelColors);
	}
	return pixelColors;
}