/*
 * Test driver for the shifted-copy loop taken from bzip2.
 *
 * Fills b[] with its own indices, copies b[0..255] into a[1..256],
 * verifies the copy (aborting on any mismatch) and finally passes a[]
 * to main1().
 */
int main (void)
{
  float a[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float b[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  int idx;

  check_vect ();

  /* from bzip2: */
  for (idx = 0; idx < N; idx++)
    {
      b[idx] = idx;
    }

  a[0] = 0;
  for (idx = 1; idx <= 256; idx++)
    {
      a[idx] = b[idx - 1];
    }

  /* check results: every a[k] must hold the index of its left neighbour.  */
  for (idx = 1; idx <= 256; idx++)
    if (a[idx] != idx - 1)
      abort ();

  if (a[0] != 0)
    abort ();

  main1 (a);

  return 0;
}
/*
 * Stores 3*k into member b of a union, then computes member a from b
 * element by element and checks the outcome.  Both members share the
 * same storage.  Aborts on a mismatch, returns 0 otherwise.
 */
int main1 ()
{
  union
  {
    char a[N] __attribute__ ((__aligned__(16)));
    char b[N] __attribute__ ((__aligned__(16)));
  } u;
  int k;

  /* Initialization.  */
  for (k = 0; k < N; k++)
    u.b[k] = 3 * k;

  /* Can't vectorize - dependence analysis fails because u.a and u.b
     may overlap.  */
  for (k = 0; k < N; k++)
    u.a[k] = u.b[k] + 1;

  /* check results:  */
  for (k = 0; k < N; k++)
    if (u.a[k] != 3 * k + 1)
      abort ();

  return 0;
}
/*
 * Multiplies b[k+1] by c[k+1] into a[k] for k in [0, n/2), accessing all
 * three arrays through pointers, then checks the first N/2 products.
 * Aborts on a mismatch, returns 0 otherwise.
 */
int main1 (int n)
{
  float a[N] __attribute__ ((__aligned__(16)));
  float b[N] __attribute__ ((__aligned__(16))) =
    {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57};
  float c[N] __attribute__ ((__aligned__(16))) =
    {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19};
  float *pa = a;
  float *pb = b;
  float *pc = c;
  int k;

  for (k = 0; k < n/2; k++)
    pa[k] = pb[k + 1] * pc[k + 1];

  /* check results:  */
  for (k = 0; k < N/2; k++)
    if (pa[k] != (pb[k + 1] * pc[k + 1]))
      abort ();

  return 0;
}
/*
 * Stores k into member b of a union of two unsigned char arrays, then
 * computes member a from b element by element and checks the outcome.
 * Both members share the same storage.  Aborts on a mismatch, returns 0
 * otherwise.
 */
int main1 ()
{
  union
  {
    unsigned char a[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
    unsigned char b[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  } u;
  int k;

  /* Initialization.  */
  for (k = 0; k < N; k++)
    u.b[k] = k;

  /* Dependence analysis fails because u.a and u.b may overlap.
     Use runtime aliasing test with versioning.  */
  for (k = 0; k < N; k++)
    u.a[k] = u.b[k] + 1;

  /* check results:  */
  for (k = 0; k < N; k++)
    if (u.a[k] != k + 1)
      abort ();

  return 0;
}
/*
 * Two outer loops, each with an inner reduction over every fourth
 * element of C[]:
 *   1. A[k] = A[k+20] + sum, checked against the untouched copy D[].
 *   2. B[k+3] = B[k] + sum for k in [0, 4), checked against E[].
 * Aborts on a mismatch, returns 0 otherwise.
 */
int main1 ()
{
  float A[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float B[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float C[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float D[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float E[4] = {0,1,2,480};
  float acc;
  int outer, inner;

  /* All four arrays start out holding their own indices.  */
  for (outer = 0; outer < N; outer++)
    {
      A[outer] = outer;
      B[outer] = outer;
      C[outer] = outer;
      D[outer] = outer;
    }

  /* Outer-loop 1: Vectorizable with respect to dependence distance.  */
  for (outer = 0; outer < N-20; outer++)
    {
      acc = 0;
      for (inner = 0; inner < N; inner += 4)
        acc += C[inner];

      A[outer] = A[outer + 20] + acc;
    }

  /* check results:  */
  for (outer = 0; outer < N-20; outer++)
    {
      acc = 0;
      for (inner = 0; inner < N; inner += 4)
        acc += C[inner];

      if (A[outer] != D[outer + 20] + acc)
        abort ();
    }

  /* Outer-loop 2: Not vectorizable because of dependence distance.  */
  for (outer = 0; outer < 4; outer++)
    {
      acc = 0;
      for (inner = 0; inner < N; inner += 4)
        acc += C[inner];

      B[outer + 3] = B[outer] + acc;
    }

  /* check results:  */
  for (outer = 0; outer < 4; outer++)
    if (B[outer] != E[outer])
      abort ();

  return 0;
}
/*
 * Test driver: calls main1() twice, once with b itself and once with
 * the pointer &b[1] (one element past the aligned start of b).
 */
int main (void)
{
  float b[N+1] __attribute__ ((__aligned__(16))) =
    {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60};
  float c[N] __attribute__ ((__aligned__(16))) =
    {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19};

  check_vect ();

  main1 (b, c);
  main1 (&b[1], c);

  return 0;
}
/*
 * Test driver: calls main1() twice with N and &b[1]; the third argument
 * is c on the first call and &c[1] on the second.
 */
int main (void)
{
  float a[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float b[N+1] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))) =
    {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60};
  float c[N+1] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))) =
    {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};

  check_vect ();

  main1 (N, &b[1], c);
  main1 (N, &b[1], &c[1]);

  return 0;
}
/** Writes 16 bytes to 'op' by shuffling the bytes at 'match' with a
  * per-offset index table, using two 8-byte 'vtbl2_u8' table lookups.
  *
  * Row 'offset' of 'masks' (16 bytes per row) supplies the shuffle indices.
  * Row 0 is not a shuffle mask: masks[offset] is the amount by which 'match'
  * is advanced afterwards.
  */
inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, /* offset = 0, not used as mask, but for shift amount instead */
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* offset = 1 */
        0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
        0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
        0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
        0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
    };

    /// Start of the 16-byte index row that belongs to this offset.
    const UInt8 * const mask_row = masks + 16 * offset;

    /// Low half. The source is loaded again for the high half, after this store.
    unalignedStore(op,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(mask_row)));

    /// High half.
    unalignedStore(op + 8,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(mask_row + 8)));

    match += masks[offset];
}
inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) { #ifdef __SSSE3__ static constexpr UInt8 __attribute__((__aligned__(16))) masks[] = { 0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, }; _mm_storeu_si128(reinterpret_cast<__m128i *>(op), _mm_shuffle_epi8( _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)), _mm_load_si128(reinterpret_cast<const __m128i *>(masks) + offset))); match += masks[offset]; #else copyOverlap16(op, match, offset); #endif }
/** We use 'xmm' (128bit SSE) registers here to shuffle 16 bytes. * * It is possible to use 'mm' (64bit MMX) registers to shuffle just 8 bytes as we need. * * There is corresponding version of 'pshufb' instruction that operates on 'mm' registers, * (it operates on MMX registers although it is available in SSSE3) * and compiler library has the corresponding intrinsic: '_mm_shuffle_pi8'. * * It can be done like this: * * unalignedStore(op, _mm_shuffle_pi8( * unalignedLoad<__m64>(match), * unalignedLoad<__m64>(masks + 8 * offset))); * * This is perfectly correct and this code have the same or even better performance. * * But if we write code this way, it will lead to * extremely weird and extremely non obvious * effects in completely unrelated parts of code. * * Because using MMX registers alters the mode of operation of x87 FPU, * and then operations with FPU become broken. * * Example 1. * Compile this code without optimizations: * #include <vector> #include <unordered_set> #include <iostream> #include <tmmintrin.h> int main(int, char **) { [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{}); std::vector<int> vec; std::unordered_set<int> set(vec.begin(), vec.end()); std::cerr << set.size() << "\n"; return 0; } $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out terminate called after throwing an instance of 'std::bad_alloc' what(): std::bad_alloc Also reproduced with clang. But only with libstdc++, not with libc++. * Example 2. 
#include <math.h> #include <iostream> #include <tmmintrin.h> int main(int, char **) { double max_fill = 1; std::cerr << (long double)max_fill << "\n"; [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{}); std::cerr << (long double)max_fill << "\n"; return 0; } $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out 1 -nan * Explanation: * * https://stackoverflow.com/questions/33692969/assembler-mmx-errors * https://software.intel.com/en-us/node/524274 * * Actually it's possible to use 'emms' instruction after decompression routine. * But it's more easy to just use 'xmm' registers and avoid using 'mm' registers. */ inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) { #ifdef __SSSE3__ static constexpr UInt8 __attribute__((__aligned__(8))) masks[] = { 0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */ 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 0, 1, 2, 3, 4, 5, 0, 1, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow read 16 bytes starting at previous row */ }; _mm_storeu_si128(reinterpret_cast<__m128i *>(op), _mm_shuffle_epi8( _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)), _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset)))); match += masks[offset]; #else copyOverlap8(op, match, offset); #endif }
/*
 * Collect the service registration parameters into a packed, 4-byte
 * aligned argument block and hand its address to CALL number 55.
 * The `queue' parameter is stored in the block's receive_queue field.
 */
void
sifrpc_register_service(struct sifrpc_server_system *queue,
    struct sifrpc_server *server, sifrpc_id_t rpc_id,
    void *(*service_func)(sifrpc_callno_t, void *, size_t), void *service_arg,
    void *(*cancel_func)(sifrpc_callno_t, void *, size_t), void *cancel_arg)
{
	struct {
		void *server;
		sifrpc_id_t rpc_id;
		sifrpc_rpcfunc_t service_func;
		void *service_arg;
		sifrpc_rpcfunc_t cancel_func;
		void *cancel_arg;
		void *receive_queue;
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		.server = server,
		.rpc_id = rpc_id,
		.service_func = service_func,
		.service_arg = service_arg,
		.cancel_func = cancel_func,
		.cancel_arg = cancel_arg,
		.receive_queue = queue,
	};

	CALL(void, 55, &bios_args);
}
/*
 * Collect the RPC call parameters into a packed, 4-byte aligned argument
 * block and hand its address to CALL number 52.  Returns whatever that
 * call yields.
 */
int
sifrpc_call(struct sifrpc_client *_cookie, sifrpc_callno_t call_no,
    u_int32_t rpc_mode, void *sendbuf, size_t sendbuf_sz, void *recvbuf,
    size_t recvbuf_sz, void (*end_func)(void *), void *end_arg)
{
	struct {
		struct sifrpc_client *_cookie;	/* bound client cookie */
		sifrpc_callno_t call_no;	/* passed to service function arg. */
		u_int32_t rpc_mode;
		void *sendbuf;
		size_t sendbuf_sz;
		void *recvbuf;
		size_t recvbuf_sz;
		sifrpc_endfunc_t end_func;
		void *end_arg;
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		._cookie = _cookie,
		.call_no = call_no,
		.rpc_mode = rpc_mode,
		.sendbuf = sendbuf,
		.sendbuf_sz = sendbuf_sz,
		.recvbuf = recvbuf,
		.recvbuf_sz = recvbuf_sz,
		.end_func = end_func,
		.end_arg = end_arg,
	};

	return CALL(int, 52, &bios_args);
}
static int helper_rfc4106_decrypt(struct aead_request *req) { __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); unsigned int i; if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ /* equal to 16 or 20 bytes */ /* IV below built */ for (i = 0; i < 4; i++) *(iv+i) = ctx->nonce[i]; for (i = 0; i < 8; i++) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, aes_ctx); }
// For now, start and len are a number of 512-byte blocks int sd_read(struct sd_card *card, int start, int len, void *dest) { int bl_addr; struct dma_cb ctrl __attribute__ ((__aligned__(32))); if(card->type == 0) start *= 512; dmb(); *BLKSIZECNT = BLKSIZE(512) | BLKCNT(len); sd_send_command(CMD_READ_MULTIPLE_BLOCK, TM_BLKCNT_EN | TM_AUTO_CMD_12 | TM_DAT_CARD_TO_HOST | TM_MULTI_BLOCK | CMD_RSPNS_48 | CMD_ISDATA, start); ctrl.ti = DMA_TI_INTEN | DMA_TI_WAIT_RESP | DMA_TI_DEST_INC | DMA_TI_DEST_WIDTH | DMA_TI_SRC_DREQ | DMA_TI_PERMAP_EMMC; ctrl.source_ad = IO_TO_BUS(DATA); ctrl.dest_ad = virt_to_phy(dest); ctrl.txfr_len = 512 * len; ctrl.stride = 0; ctrl.nextconbk = 0; return dma_initiate(DMA_CHAN_EMMC, &ctrl); }
/*
 * Collect the command packet and data buffer parameters into a packed,
 * 4-byte aligned argument block and hand its address to CALL number 34.
 * Returns the sifdma_id_t produced by that call.
 */
sifdma_id_t
sifcmd_queue(sifcmd_sw_t sw, vaddr_t cmd_pkt_addr, size_t cmd_pkt_sz,
    vaddr_t src_addr, vaddr_t dst_addr, vsize_t buf_sz)
{
	struct {
		sifcmd_sw_t sw;
		vaddr_t cmd_pkt_addr;	/* command buffer */
		size_t cmd_pkt_sz;
		vaddr_t src_addr;	/* data buffer */
		vaddr_t dst_addr;
		vsize_t buf_sz;
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		.sw = sw,
		.cmd_pkt_addr = cmd_pkt_addr,
		.cmd_pkt_sz = cmd_pkt_sz,
		.src_addr = src_addr,
		.dst_addr = dst_addr,
		.buf_sz = buf_sz,
	};

	return CALL(sifdma_id_t, 34, &bios_args);
}
/*
 * Pass `sw' together with the holder's func/arg pair to CALL number 36
 * in a packed, 4-byte aligned argument block.
 */
void
sifcmd_establish(sifcmd_sw_t sw, struct sifcmd_callback_holder *holder)
{
	struct {
		sifcmd_sw_t sw;
		sifcmd_callback_t func;
		void *arg;
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		.sw = sw,
		.func = holder->func,
		.arg = holder->arg,
	};

	CALL(void, 36, &bios_args);
}

/*
 * Pass `sw', widened into a 32-bit word, to CALL number 37.
 */
void
sifcmd_disestablish(sifcmd_sw_t sw)
{
	u_int32_t bios_arg;

	bios_arg = sw;
	CALL(void, 37, &bios_arg);
}
/*
 * Times a single PHS() call on the fixed password "password" with a
 * 16-byte random salt and a 32-byte output, then prints the elapsed CPU
 * time in seconds and its reciprocal.
 */
int main(){
    size_t saltlen = 16;
    size_t outlen = 32;
    unsigned int t_cost = (unsigned int)pow(2,13);
    unsigned int m_cost = (unsigned int)pow(2,15);
    char *passwd = "password";
    uint8_t res[outlen] __attribute__((__aligned__(__alignof__(uint32_t))));
    int word;

    /* Seed the generator and discard its first value. */
    srand(time(NULL));
    rand();

    /* The salt is filled one 32-bit word at a time. */
    uint32_t salt[saltlen >> 2];
    for (word = 0; word < (saltlen >> 2); word++) {
        salt[word] = rand();
    }

    /* ticks = clock() after the call minus clock() before the call */
    clock_t ticks = -clock();
    PHS((void *)res, outlen, (void *)passwd, strlen(passwd),
        (void *)salt, saltlen, t_cost, m_cost);
    ticks += clock();

    float seconds = (float)ticks/CLOCKS_PER_SEC;
    printf("%.3f secs,%.3f passwords\n", seconds, (float)(1/seconds));

    return 0;
}
/*
 * Collect the receive-buffer parameters into a packed, 4-byte aligned
 * argument block and hand its address to CALL number 50.  Returns
 * whatever that call yields.
 */
int
sifrpc_receive_buffer(struct sifrpc_receive *_cookie, void *src_iop,
    void *dst_ee, size_t sz, u_int32_t rpc_mode, void (*end_func)(void *),
    void *end_arg)
{
	struct {
		void *_cookie;
		void *src_iop;
		void *dst_ee;
		size_t sz;
		u_int32_t rpc_mode;
		sifrpc_endfunc_t end_func;
		void *end_arg;
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		._cookie = _cookie,
		.src_iop = src_iop,
		.dst_ee = dst_ee,
		.sz = sz,
		.rpc_mode = rpc_mode,
		.end_func = end_func,
		.end_arg = end_arg,
	};

	return CALL(int, 50, &bios_args);
}
float ADMDolbyContext::DolbyShift_convolutionAlignSSE(float *oldie, float *coef) { float *src1=oldie; // Aligned also float *src2=coef; // that one is always aligned to a 16 bytes boundary int mod16=(1+NZEROS)>>2; int left=(1+NZEROS)&3; static float __attribute__ ((__aligned__ (16))) sum16[4]; float sum = 0; __asm__( "xorps %%xmm2,%%xmm2 \n" // carry "1: \n" "movaps (%0),%%xmm0 \n" // src1 "movaps (%1),%%xmm1 \n" // src2 "mulps %%xmm1,%%xmm0 \n" // src1*src2 "addps %%xmm0,%%xmm2 \n" // sum+=src1*src2 "add $16,%0 \n" "add $16,%1 \n" "sub $1,%3 \n" "jnz 1b \n" "movaps %%xmm2,(%2) \n" : : "r" (src1),"r" (src2),"r"(sum16),"r"(mod16) ); for (int i = 0; i <left; i++) sum += (*src1++)*(*src2++); for(int i=0;i<4;i++) sum+=sum16[i]; return sum; }
/*
 * Concurrent O_DIRECT read benchmark.
 *
 * For 1, 2, 4, 8 and 16 processes, and 10 tries each, forks that many
 * children. Child k opens "./random8M_<k>", reads up to 8 MB in 4096-byte
 * blocks into a 4096-byte aligned buffer and prints the time difference
 * computed by timespec_subtract(). The parent waits for all children and
 * prints their wait status.
 *
 * Changes relative to the previous version:
 *  - the definition was missing main's closing brace;
 *  - stdout is flushed before fork() so buffered text is not emitted
 *    again by every child;
 *  - fork() failure is reported instead of being silently ignored;
 *  - the child prints its own pid (child_pid is always 0 in the child);
 *  - read() returning 0 (end of file) ends the loop instead of spinning
 *    forever on a file shorter than 8 MB;
 *  - the printf conversions match their arguments;
 *  - a child that finished successfully exits with EXIT_SUCCESS, and the
 *    open() failure message names open().
 */
int main(int argc, char* argv[]) {
    const int mb = 1024 * 1024;
    pid_t child_pid, wpid;
    int status = 0;

    for (int num_of_processes = 1; num_of_processes <= 16; num_of_processes *= 2) {
        for (int num_of_tries = 0; num_of_tries < 10; num_of_tries++) {
            printf("========START: num_of_processes : %d try: %d========\n", num_of_processes, num_of_tries);

            int curr_child_process = -1;
            for (int i = 0; i < num_of_processes; i++) {
                curr_child_process++;

                /* Do not let the child inherit unflushed output. */
                fflush(stdout);

                child_pid = fork();
                if (child_pid == -1) {
                    perror("Error: fork()");
                    continue;
                }
                if (child_pid != 0) {
                    continue;   /* parent: go on to start the next child */
                }

                /* ---- child ---- */
                int fd;
                int total_bytes_read = 0;
                int bytes_read = 0;
                /* O_DIRECT is used below; the buffer is aligned to its own size. */
                static char block[4096] __attribute__ ((__aligned__ (4096)));
                const int block_size = 4096;
                const int size = 8 * mb;
                struct timespec start, end, time_diff;
                char filename[80];

                snprintf(filename, sizeof (filename), "./random8M_%d", curr_child_process);
                printf("start reading child pid: %d, filename: %s\n", (int)getpid(), filename);

                if ((fd = open(filename, O_RDONLY | O_DIRECT)) == -1) {
                    perror("Error: open()");
                    exit(1);
                }
                if (lseek(fd, 0, SEEK_SET) == -1) {
                    perror("Error: lseek()");
                    exit(1);
                }

                clock_gettime(CLOCK_MONOTONIC, &start);
                while (total_bytes_read < size) {
                    if ((bytes_read = read(fd, block, block_size)) == -1) {
                        perror("Error: read()");
                        exit(1);
                    }
                    if (bytes_read == 0) {
                        break;  /* end of file before `size` bytes were read */
                    }
                    total_bytes_read += bytes_read;
                }
                clock_gettime(CLOCK_MONOTONIC, &end);

                timespec_subtract(&start, &end, &time_diff);
                printf("INSTANT diff: %ld sec %ld ns, start time: %ld sec %ld ns, end time: %ld sec %ld ns\n",
                       (long)time_diff.tv_sec, (long)time_diff.tv_nsec,
                       (long)start.tv_sec, (long)start.tv_nsec,
                       (long)end.tv_sec, (long)end.tv_nsec);

                close(fd);
                exit(EXIT_SUCCESS);
            }

            /* Reap every child of this try before starting the next one. */
            while ((wpid = wait(&status)) > 0) {
                printf("end reading child_pid: %d, status: %d\n", (int)wpid, status);
            }
            printf("========END: num_of_processes : %d try: %d========\n\n", num_of_processes, num_of_tries);
        }
    }

    return 0;
}
/*
 * Collect the bind parameters into a packed, 4-byte aligned argument
 * block and hand its address to CALL number 51.  Returns whatever that
 * call yields.
 */
int
sifrpc_bind(struct sifrpc_client *_cookie, sifrpc_id_t rpc_id,
    u_int32_t rpc_mode, void (*end_func)(void *), void *end_arg)
{
	struct {
		void *_cookie;		/* filled by this call */
		sifrpc_id_t rpc_id;	/* specify server RPC id */
		u_int32_t rpc_mode;
		sifrpc_endfunc_t end_func;
		void *end_arg;
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		._cookie = _cookie,
		.rpc_id = rpc_id,
		.rpc_mode = rpc_mode,
		.end_func = end_func,
		.end_arg = end_arg,
	};

	return CALL(int, 51, &bios_args);
}
int main() { warmUp(); uint64_t start,end; static char buffer[FOUR_KB] __attribute__ ((__aligned__ (FOUR_KB))); string prefix = "/mnt/nfs/import/"; string files[] = {"file1", "file2", "file3", "file4", "file5", "file6", "file7", "file8", "file9"}; double results[lessIter], sum; for (int k=0;k<9;k++) { for(int i=0;i<lessIter;++i) { sum = 0; for (int j=0;j<lessInner;j++) { int fd = open((prefix + files[k]).c_str(), O_RDONLY | O_DIRECT); if (fd <= 0) { cout << "open failed\n"; } int n; getStartTick(start); int tot = 0; while ( true ) { n=read(fd, &buffer, FOUR_KB); if (n<0) cout << "Read error\n"; tot += n; if (tot >= READLIMIT) break; } getEndTick(end); // cout << "total:" << tot << " k: " << k << "\n"; close (fd); sum += end - start; } sum /= lessInner; results[i] = sum; } string fileName = files[k] + "SequentialRemoteCycles.txt"; string fileTimeName = files[k] + "SequentialRemoteTime.txt"; writeToFile(results, fileName); getTimeFromTicks(results, lessIter); writeToFile(results, fileTimeName); pair<double, double> meanAndVariance = getMeanAndVariance(results, lessIter); cout << "File: " << files[k] << "\n"; cout << "File read mean= " << (meanAndVariance.first * FOUR_KB / READLIMIT) << "\n"; cout << "File read variance= " << (meanAndVariance.second * FOUR_KB / READLIMIT) << "\n"; ofstream myfile; myfile.open ( (files[k] + "SequentialRemoteResults.txt").c_str()); myfile << "File: " << files[k] << "\n"; myfile << "File read mean= " << (meanAndVariance.first * FOUR_KB / READLIMIT) << "\n"; myfile << "File read variance= " << (meanAndVariance.second * FOUR_KB / READLIMIT) << "\n"; myfile.close(); } return 0; }
/*
 * Stores the element-wise product of two local arrays through the
 * caller-supplied pointer PA, then passes all three to bar().
 * Always returns 0.
 */
int main1 (float *pa)
{
  float pb[N] __attribute__ ((__aligned__(16))) =
    {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57};
  float pc[N] __attribute__ ((__aligned__(16))) =
    {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19};
  int k;

  /* Not vectorizable: pa may alias pb and/or pc, since their addresses
     escape.  */
  for (k = 0; k < N; k++)
    pa[k] = pb[k] * pc[k];

  bar (pa, pb, pc);

  return 0;
}
/*
 * Stores b[k+1] * c[k+1] into pa[k] for k in [0, n/2), reading the two
 * local arrays through pointers, then passes all three pointers to
 * bar().  Always returns 0.
 */
int main1 (int n , float *pa)
{
  float b[N] __attribute__ ((__aligned__(16))) =
    {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57};
  float c[N] __attribute__ ((__aligned__(16))) =
    {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19};
  float *pb = b;
  float *pc = c;
  int k;

  for (k = 0; k < n/2; k++)
    pa[k] = pb[k + 1] * pc[k + 1];

  bar (pa, pb, pc);

  return 0;
}
/*
 * Issue SIF BIOS call `callno' with `arg' and wait for its completion
 * callback.
 *
 * The call is retried up to 100 times, 20000 delay units apart, while it
 * returns non-zero.  Completion is then polled up to 10000 times, 100
 * delay units apart, between _sif_call_start() and _sif_call_end().
 * On success the result field of the argument block is stored in
 * *result and 0 is returned; on either timeout a message is printed and
 * -1 is returned.
 */
int
sifbios_rpc_call(int callno, void *arg, int *result)
{
	volatile int done = 0;
	int retry;
	struct {
		int result;
		void *arg;
		void (*callback)(void *, int);
		volatile void *callback_arg;
	} __attribute__((__packed__, __aligned__(4))) request = {
		.arg = arg,
		.callback = sifbios_rpc_callback,
		.callback_arg = (volatile void *)&done,
	};

	/* call SIF BIOS */
	retry = 100;
	for (;;) {
		if (CALL(int, callno, &request) == 0)
			break;
		if (--retry <= 0)
			break;
		delay(20000);	/* .02 sec. for slow IOP */
	}
	if (retry == 0) {
		printf("SIF BIOS call %d failed\n", callno);
		return (-1);
	}

	/* wait IOP response (1 sec.) */
	_sif_call_start();
	retry = 10000;
	for (;;) {
		if (done)
			break;
		if (--retry <= 0)
			break;
		delay(100);
	}
	_sif_call_end();

	if (retry == 0) {
		printf("IOP not respond (callno = %d)\n", callno);
		return (-1);
	}

	*result = request.result;

	return (0);
}

/*
 * Completion callback: sets the int that `arg' points to to 1.
 * The `result' parameter is not used.
 */
void
sifbios_rpc_callback(void *arg, int result)
{
	int *flag = (int *)arg;

	*flag = 1;
}
/*
 * Test driver: provides a maximally aligned output array and passes it
 * to main1().
 */
int main (void)
{
  float a[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));

  check_vect ();

  main1 (a);

  return 0;
}
int main(int argc, char** argv){ int i, j, ws, fd, o_direct; if (argc != 4){ printf(ARGS_ERROR); return -1; } if (typeCheck(argv[1]) == -1){ return -1; } if (!(strcmp(argv[2], "1") == 0 || strcmp(argv[2], "0") == 0)){ printf(ARG_ERROR, argv[2]); return -1; } ws = atoi(argv[3]); if (!ws){ // ws == 0 printf(ARG_ERROR, argv[3]); return -1; } static char buf[MB] __attribute__((__aligned__(4096))); for (j = 0; j < MB; j++) buf[i] = 'a' + (random() % 26); struct timeval t1, t2; //referance: http://stackoverflow.com/questions/2150291/how-do-i-measure-a-time-interval-in-c, first answer double elapsedTime; gettimeofday(&t1, NULL); // start timer o_direct = atoi(argv[2]); if (o_direct) fd = open(argv[1], O_WRONLY | O_DIRECT, S_IRWXU | S_IRWXG | S_IRWXO); else fd = open(argv[1], O_WRONLY, S_IRWXU | S_IRWXG | S_IRWXO); if (fd == -1){ printf(OPEN_ERROR, argv[2], strerror(errno)); return -1; } int repeats = (128 * MB) / (ws * KB); for (i = 0; i < repeats; i++){ int offset = (random() % repeats) * ws; if (lseek(fd, offset, SEEK_SET) == (off_t)-1){ printf(SEEK_ERROR, argv[1], strerror(errno)); close(fd); return -1; } if (write(fd, buf, ws) == -1){ printf(WRITE_ERROR, argv[1], strerror(errno)); close(fd); return -1; } } close(fd); gettimeofday(&t2, NULL); elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms printf(THROUGHPUT, elapsedTime); return 0; }
/*
 * Passes two local arrays to foo() through pointers, stores
 * b[k+1] * c[k+1] into pa[k] for k in [0, N/2), then passes all three
 * pointers to bar().  Always returns 0.
 */
__attribute__ ((noinline)) int
main1 (float *pa)
{
  float b[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float c[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
  float *pb = b;
  float *pc = c;
  int k;

  foo (pb, pc);

  for (k = 0; k < N/2; k++)
    pa[k] = pb[k + 1] * pc[k + 1];

  bar (pa, pb, pc);

  return 0;
}
/*
 * Test driver: passes N (held in a local variable) and a 16-byte
 * aligned output array to main1().
 */
int main (void)
{
  float a[N] __attribute__ ((__aligned__(16)));
  int n = N;

  check_vect ();

  main1 (n, a);

  return 0;
}
/*
 * Queue a DMA request to SIFBIOS: the transfer array and its element
 * count are placed in a packed, 4-byte aligned argument block whose
 * address is handed to CALL number 18.
 * Returns the queue identifier.
 */
sifdma_id_t
sifdma_queue(struct sifdma_transfer *arg, int n)
{
	struct {
		void *arg;	/* pointer to sifdma_transfer array */
		int n;		/* # of elements */
	} __attribute__((__packed__, __aligned__(4))) bios_args = {
		.arg = arg,
		.n = n,
	};

	return CALL(sifdma_id_t, 18, &bios_args);
}