-
Notifications
You must be signed in to change notification settings - Fork 3
/
affinity.c
283 lines (218 loc) · 6.63 KB
/
affinity.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#include <stdio.h>
#include <math.h>
// Problem size: a and b are N x N matrices, c and jmax hold N entries each.
#define N 729
// Number of times main() repeats each loop inside its timed section.
#define REPS 100
#include <omp.h>
// Smallest chunk findThreadToStealFrom() hands out while at least this many
// iterations remain in the chosen block; a value of 0 switches the check off.
#define MINIMUM_CHUNK_SIZE 16
// Boolean constants. NOTE(review): nothing in the visible code uses them.
#define true 1
#define false 0
// a: zeroed by init1(), accumulated into by loop1chunk(), summed by valid1().
// b: filled by init1() and again by init2(); read by both loop kernels.
// c: zeroed by init2(), accumulated into by loop2chunk(), summed by valid2().
double a[N][N], b[N][N], c[N];
// jmax[i] is the bound of the middle loop of loop2chunk() for row i (set by init2()).
int jmax[N];
// Data initialisation for loop 1 and loop 2 respectively.
void init1(void);
void init2(void);
// Runs one parallel pass over the loop selected by the argument (1 or 2).
void runloop(int);
// Loop kernels: process the half-open range of rows [lo, hi).
void loop1chunk(int, int);
void loop2chunk(int, int);
// Print a checksum of the results of loop 1 and loop 2 respectively.
void valid1(void);
void valid2(void);
// Chooses the block to take work from, reserves a chunk of it and reports the
// chunk through the last two pointer arguments; returns -1 when no work is left.
int findThreadToStealFrom(int*, int*, int, int, float, int*, int*);
// Debug helper: prints "index: value" pairs of an int array on one line.
void printArray(int*, int);
/*
 * Benchmark driver: for each of the two loops, initialise the data, time
 * REPS calls to runloop(), print the validation checksum and then the
 * elapsed wall-clock time.
 */
int main(int argc, char *argv[]) {
    /* Command-line arguments are not used. */
    (void) argc;
    (void) argv;

    /* ---- loop 1 ---- */
    init1();
    const double loop1_begin = omp_get_wtime();
    for (int rep = 0; rep < REPS; ++rep) {
        runloop(1);
    }
    const double loop1_end = omp_get_wtime();
    valid1();
    printf("Total time for %d REPS of loop 1 = %f\n", REPS, (float)(loop1_end - loop1_begin));

    /* ---- loop 2 ---- */
    init2();
    const double loop2_begin = omp_get_wtime();
    for (int rep = 0; rep < REPS; ++rep) {
        runloop(2);
    }
    const double loop2_end = omp_get_wtime();
    valid2();
    printf("Total time for %d REPS of loop 2 = %f\n", REPS, (float)(loop2_end - loop2_begin));

    return 0;
}
/*
 * Prepare the data for loop 1: clear every element of a and set
 * b[i][j] to 3.142 * (i + j).
 */
void init1(void){
    for (int row = 0; row < N; ++row) {
        double *a_row = a[row];
        double *b_row = b[row];
        for (int col = 0; col < N; ++col) {
            a_row[col] = 0.0;
            b_row[col] = 3.142 * (row + col);
        }
    }
}
/*
 * Prepare the data for loop 2.
 *
 * jmax[i] becomes N when i is a multiple of 3*(i/30)+1 and 1 otherwise,
 * c is cleared, and b[i][j] is set to (i*j + 1) / N^2.
 */
void init2(void){
    for (int row = 0; row < N; ++row) {
        const int period = 3 * (row / 30) + 1;
        jmax[row] = (row % period == 0) ? N : 1;
        c[row] = 0.0;
    }

    for (int row = 0; row < N; ++row) {
        double *b_row = b[row];
        for (int col = 0; col < N; ++col) {
            b_row[col] = (double) (row * col + 1) / (double) (N*N);
        }
    }
}
/**
 * Run one complete pass of loop `loopid` (1 -> loop1chunk, 2 -> loop2chunk)
 * with a work-stealing schedule built on a critical section (the accompanying
 * report discusses alternatives).
 *
 * The N iterations are split into one contiguous block per thread. A thread
 * does not simply run its own block: it repeatedly "steals" a chunk from
 * itself until its block is exhausted, and only then steals chunks from other
 * threads. Which block to take from, and how many iterations, is decided by
 * findThreadToStealFrom().
 *
 * @param loopid  1 or 2; any other value consumes the iteration space without
 *                doing any work.
 */
void runloop(int loopid)
{
    // Upper bound on the team size of the parallel region below; the region
    // does not exist yet, so omp_get_num_threads() cannot be used here.
    int thread_count = omp_get_max_threads();
    int n_over_p = (int) ceil((double) N / (double) thread_count); // block size per thread
    float one_over_p = 1.0 / thread_count;                         // fraction taken per steal

    // [lower_bounds[t], upper_bounds[t]) is the part of thread t's block that
    // has not been handed out yet.
    int lower_bounds[thread_count];
    int upper_bounds[thread_count];

    // Fill in every block serially, before the team is created. Doing this
    // inside the parallel region (each thread writing its own entry) would let
    // a fast thread scan entries that are still uninitialised, let a slow
    // thread reset a block that others already stole from, and leave entries
    // unwritten if the runtime provides fewer threads than thread_count.
    for (int t = 0; t < thread_count; t++)
    {
        int low = t * n_over_p;
        int high = low + n_over_p;
        lower_bounds[t] = low > N ? N : low;    // clamp: N mod p != 0, or p > N
        upper_bounds[t] = high > N ? N : high;
    }

    #pragma omp parallel default(none) \
        shared(thread_count, loopid, lower_bounds, upper_bounds, one_over_p)
    {
        int thread_id = omp_get_thread_num();

        // The first trip through the loop runs the kernel on the empty range
        // [0, 0), so the real work starts only after the first call to
        // findThreadToStealFrom(). This avoids processing the final chunk
        // twice without needing an extra test after the switch.
        int current_low = 0,
            current_high = 0,
            stealing_from = 0;

        while (stealing_from != -1)
        {
            switch (loopid)
            {
                case 1: loop1chunk(current_low, current_high); break;
                case 2: loop2chunk(current_low, current_high); break;
                default: break;
            }

            // Reserve the next chunk. The bounds arrays are shared, so the
            // selection and the update must happen atomically.
            #pragma omp critical
            {
                stealing_from = findThreadToStealFrom(lower_bounds, upper_bounds, thread_count,
                                                      thread_id, one_over_p,
                                                      &current_low, &current_high);
            }
        }
    }
}
/**
 * Reserve the next chunk of iterations for `thread`.
 *
 * The calling thread's own block is used while it still has iterations left.
 * Otherwise the block with the most remaining iterations is chosen (ties go
 * to the lowest thread ID). The chunk is ceil(one_over_p * remaining)
 * iterations, raised to MINIMUM_CHUNK_SIZE when the block still holds that
 * many, or to everything that is left when it holds fewer.
 *
 * The chunk is removed from the chosen block by advancing its lower bound,
 * and is reported through the two pointer arguments (C has no out params).
 *
 * @param lower_bounds  per-thread first unassigned iteration (updated)
 * @param upper_bounds  per-thread end of block (exclusive)
 * @param size          number of entries in the two arrays
 * @param thread        ID of the calling thread
 * @param one_over_p    fraction of the remaining iterations to take
 * @param current_low   receives the first iteration of the chunk
 * @param current_high  receives one past the last iteration of the chunk
 * @return the ID of the thread whose block was used, or -1 if no block has
 *         any iterations left (the pointer arguments are then untouched).
 */
int findThreadToStealFrom(int* lower_bounds, int* upper_bounds, int size, int thread, float one_over_p, int* current_low, int *current_high)
{
    int victim = thread;
    int left = upper_bounds[thread] - lower_bounds[thread];

    if (left <= 0)
    {
        // Own block is exhausted: look for the fullest block instead.
        victim = -1;
        left = 0;
        for (int t = 0; t < size; ++t)
        {
            const int pending = upper_bounds[t] - lower_bounds[t];
            if (pending > left)
            {
                victim = t;
                left = pending;
            }
        }

        // Nobody has anything left.
        if (victim < 0) return -1;
    }

    int take = (int) ceil(one_over_p * left);

    // Enforce the minimum chunk size (skipped entirely when it is 0).
    if (MINIMUM_CHUNK_SIZE && take < MINIMUM_CHUNK_SIZE)
    {
        take = (left < MINIMUM_CHUNK_SIZE) ? left : MINIMUM_CHUNK_SIZE;
    }

    // Hand out [first, first + take) and shrink the victim's block.
    const int first = lower_bounds[victim];
    lower_bounds[victim] = first + take;
    *current_low = first;
    *current_high = first + take;
    return victim;
}
/*
 * Debug helper: print the first `size` entries of `array` on a single line
 * as "index: value, " pairs, followed by a newline.
 */
void printArray(int* array, int size)
{
    int idx = 0;
    while (idx < size)
    {
        printf("%d: %d, ", idx, array[idx]);
        ++idx;
    }
    printf("\n");
}
/*
 * Loop 1 kernel for rows lo <= i < hi: for every column j above the
 * diagonal (j > i), walking from the last column downwards, add
 * cos(b[i][j]) to a[i][j].
 */
void loop1chunk(int lo, int hi) {
    for (int row = lo; row < hi; ++row) {
        double *dst = a[row];
        const double *src = b[row];
        int col = N - 1;
        while (col > row) {
            dst[col] += cos(src[col]);
            --col;
        }
    }
}
/*
 * Loop 2 kernel for rows lo <= i < hi: for each j below jmax[i] and each
 * k below j, add (k+1) * log(b[i][j]) / N^2 to c[i].
 */
void loop2chunk(int lo, int hi) {
    const double rN2 = 1.0 / (double) (N*N);

    for (int row = lo; row < hi; ++row) {
        const int width = jmax[row];   /* not modified inside the loop */
        for (int col = 0; col < width; ++col) {
            for (int term = 0; term < col; ++term) {
                c[row] += (term+1) * log (b[row][col]) * rN2;
            }
        }
    }
}
/*
 * Validation for loop 1: add up every element of a in row-major order
 * and print the total.
 */
void valid1(void) {
    double total = 0.0;

    for (int row = 0; row < N; ++row) {
        const double *cells = a[row];
        for (int col = 0; col < N; ++col) {
            total += cells[col];
        }
    }

    printf("Loop 1 check: Sum of a is %lf\n", total);
}
/*
 * Validation for loop 2: add up the elements of c in index order and
 * print the total.
 */
void valid2(void) {
    double total = 0.0;
    int idx = 0;

    while (idx < N) {
        total += c[idx];
        ++idx;
    }

    printf("Loop 2 check: Sum of c is %f\n", total);
}