/*
 * Emit IR for the ternary operator `expr ? stmt1 : stmt2`.
 *
 * Shape of the emitted code:
 *
 *          <code for expr, jumping to its true / false label>
 *   T:     <code for stmt1>
 *          br.s NEXT
 *   F:     <code for stmt2>
 *   NEXT:
 *
 * n       - operator node; operand 0 is the condition, operands 1 and 2 are
 *           the two arms.
 *
 * The node's own `next` label is replaced by a freshly allocated join label,
 * and both arms receive a copy of that label as their `next`.
 */
void ir_ternary(nodeType* n)
{
    nodeType* expr  = get_operand(n,0);
    nodeType* stmt1 = get_operand(n,1);
    nodeType* stmt2 = get_operand(n,2);

    /* Labels are requested in the order true, false, join. */
    set_T(expr,newlabel());
    set_F(expr,newlabel());

    /*
     * Allocate the join label BEFORE handing it to the arms.  The arms used
     * to be filled from n->opr.next right after it had been zeroed, so they
     * always ended up with an empty `next` label.
     */
    memset(n->opr.next,0,16);
    strcpy(n->opr.next,newlabel());

    memset(stmt1->opr.next,0,16);
    strcpy(stmt1->opr.next,n->opr.next);
    memset(stmt2->opr.next,0,16);
    strcpy(stmt2->opr.next,n->opr.next);

    /* Condition: generated with seen_bool_flow raised. */
    seen_bool_flow = 1;
    generate(expr);
    seen_bool_flow = 0;

    /* True arm, followed by a jump over the false arm. */
    debugger("%s:\n",get_T(expr));
    fprintf(output,"%s:\n",get_T(expr));
    prepost_put = 1;
    generate(stmt1);
    prepost_put = 0;
    debugger("br.s %s\n ",n->opr.next);
    fprintf(output,"br.s %s\n",n->opr.next);

    /* False arm. */
    debugger("%s:\n",get_F(expr));
    fprintf(output,"%s:\n",get_F(expr));
    prepost_put = 1;
    generate(stmt2);
    prepost_put = 0;

    /* Join point shared by both arms. */
    debugger("%s:\n",n->opr.next);
    fprintf(output,"%s:\n",n->opr.next);
}
/*
 * Emit IR for an `if (cond) body` statement without an else arm.
 *
 * n - operator node; operand 0 is the condition, operand 1 the body.
 *
 * The condition gets a fresh true label (emitted right before the body) and
 * uses the statement's own `next` label as its false label.  The body
 * receives a copy of the statement's `next` label.
 */
void ir_if(nodeType* n)
{
    nodeType* cond = get_operand(n,0);
    nodeType* body = get_operand(n,1);

    set_T(cond,newlabel());
    set_F(cond,n->opr.next);

    /* Body exits to the same place as the whole statement. */
    memset(body->opr.next,0,16);
    strcpy(body->opr.next,n->opr.next);

    debugger("expr true label:%s\n",get_T(cond));
    debugger("expr false label:%s\n",get_F(cond));

    /* Both flags are raised only while the condition is generated. */
    seen_bool_flow = 1;
    prepost_put = 1;
    generate(cond);
    seen_bool_flow = 0;
    prepost_put = 0;

    /* Place the true label, then the body. */
    debugger("%s:\n",get_T(cond));
    fprintf(output,"%s:\n",get_T(cond));
    generate(body);
}
int main() { // Optimal fold const char* sequence = "CGCAGGGAUACCCGCGCC"; char* structure; float mfe, gfe; structure = seq_fold(sequence, &mfe); printf("%s %s %f\n", sequence, structure, mfe); free(structure); // Ensemble fold structure = seq_pf_fold(sequence, &gfe); printf("%s %s %f\n", sequence, structure, gfe); free(structure); printf("\n"); // Find suboptimal structures SOLUTION* sol = seq_subopt(sequence, 4.0); for(SOLUTION* s = sol; s->structure != NULL; s++) { printf("%s %s %f\n", sequence, s->structure, s->energy); free(s->structure); } free(sol); printf("\n"); // Evaluate fe of a structure (given a sequence)... printf("%f\n", get_T()); const char* test_str = "(((.((.....))))).."; printf("%s %s %f\n", sequence, test_str, seq_eval(sequence, test_str)); // ... and how it changes with temperature set_T(15.0); printf("%f\n", get_T()); printf("%s %s %f\n", sequence, test_str, seq_eval(sequence, test_str)); set_T(37.0); printf("\n"); // Take a not so different sequence with a different optimal structure const char* seed_seq = "AAUAGGGAUACCCGCGCC"; structure = seq_fold(seed_seq, &mfe); printf("%s %s %f\n", seed_seq, structure, mfe); // See that is not even stable on the test fold printf("%s %s %f\n", seed_seq, test_str, seq_eval(seed_seq, test_str)); // Mutate it until you get the test fold... char* seq = malloc(strlen(seed_seq) + 1); strcpy(seq, seed_seq); float dist = str_inverse(seq, test_str, 12345, 0); // ... and confirm it's its ground state structure = seq_fold(seq, &mfe); printf("%s %s %f\n", seq, structure, mfe); free(seq); }
/*
 * Emit IR for a `while (cond) body` loop.
 *
 * Shape of the emitted code:
 *
 *   HEAD:  <code for cond, jumping to its true label or to n's next label>
 *   T:     <code for body>
 *          br.s HEAD
 *
 * n - operator node; operand 0 is the condition, operand 1 the body.
 *
 * While the body is generated, the global break_label holds n's `next`
 * label, the global continue_label holds HEAD, and loop_flag is one higher.
 * All three are put back to their previous values before returning.
 */
void ir_while(nodeType* n)
{
    nodeType* cond = get_operand(n,0);
    nodeType* body = get_operand(n,1);

    /* Label at the top of the loop; requested before the condition's true label. */
    char loop_head[16] = {0};
    strcpy(loop_head,newlabel());

    set_T(cond,newlabel());
    set_F(cond,n->opr.next);

    /* After the body, control goes back to the loop head. */
    memset(body->opr.next,0,16);
    strcpy(body->opr.next,loop_head);

    debugger("%s:\n",loop_head);
    fprintf(output,"%s:\n",loop_head);

    seen_bool_flow = 1;
    prepost_put = 1;
    generate(cond);
    seen_bool_flow = 0;
    prepost_put = 0;

    debugger("%s:\n",get_T(cond));
    fprintf(output,"%s:\n",get_T(cond));

    /* break: remember the enclosing target, then point it past this loop. */
    char saved_break[16] = {0};
    strcpy(saved_break,break_label);
    memset(break_label,0,16);
    loop_flag += 1;
    strcpy(break_label,n->opr.next);

    /* continue: remember the enclosing target, then point it at the loop head. */
    char saved_continue[16] = {0};
    strcpy(saved_continue,continue_label);
    memset(continue_label,0,16);
    strcpy(continue_label,loop_head);
    debugger("CONTINUE LABEL: %s\n",continue_label);

    generate(body);
    debugger("br.s %s\n ",loop_head);
    fprintf(output,"br.s %s\n",loop_head);

    /* Restore the enclosing loop's break / continue context. */
    loop_flag -= 1;
    memset(break_label,0,16);
    strcpy(break_label,saved_break);
    memset(continue_label,0,16);
    strcpy(continue_label,saved_continue);
}
/*
 * Emit jumping code for a boolean operator node.
 *
 * n - operator node with two operands; its true / false labels must already
 *     be set by the caller.
 *
 * BOOL_AND / BOOL_OR pass labels down to the operands and place one fresh
 * label between them.  BOOL_EQ / NEQ generate both operands with
 * seen_bool_flow cleared, then emit a conditional branch to the true label
 * followed by an unconditional branch to the false label.
 */
void ir_bool_flow(nodeType* n)
{
    nodeType* lhs = get_operand(n,0);
    nodeType* rhs = get_operand(n,1);

    debugger("n true label:%s\n",get_T(n));
    debugger("n false label:%s\n",get_F(n));

    switch(n->opr.oper)
    {
    case BOOL_AND:
        /* lhs true -> fresh label placed before rhs; lhs false -> n's false label. */
        set_T(lhs,newlabel());
        set_F(lhs, get_F(n));
        set_T(rhs, get_T(n));
        set_F(rhs, get_F(n));
        generate(lhs);
        debugger("%s:",get_T(lhs));
        fprintf(output,"%s:",get_T(lhs));
        generate(rhs);
        break;

    case BOOL_OR:
        debugger("MATCHED BOOL_OR in ir_bool_flow\n");
        /* lhs true -> n's true label; lhs false -> fresh label placed before rhs. */
        set_T(lhs,n->opr.T);
        set_F(lhs,newlabel());
        set_T(rhs,n->opr.T);
        set_F(rhs,n->opr.F);
        generate(lhs);
        debugger("%s:",get_F(lhs));
        fprintf(output,"%s:",get_F(lhs));
        debugger("seen_bool_flow : %d\n",seen_bool_flow);
        generate(rhs);
        break;

    case BOOL_EQ:
        /* Both operand values are loaded first, so the flag is switched off
           around their generation and switched back on afterwards. */
        seen_bool_flow = 0;
        generate(lhs);
        generate(rhs);
        seen_bool_flow = 1;
        debugger("MATCHED BOOL_EQ in ir_relop_flow\n");
        debugger("beq %s\n",get_T(n));
        fprintf(output,"beq %s\n",get_T(n));
        debugger("br %s\n",get_F(n));
        fprintf(output,"br %s\n",get_F(n));
        break;

    case NEQ:
        debugger("NOT EQUAL TO\n");
        /* Same scheme as BOOL_EQ, branching with bne.un instead. */
        seen_bool_flow = 0;
        generate(lhs);
        generate(rhs);
        seen_bool_flow = 1;
        debugger("MATCHED NEQ in ir_relop_flow\n");
        debugger("bne.un %s\n",get_T(n));
        fprintf(output,"bne.un %s\n",get_T(n));
        debugger("br %s\n",get_F(n));
        fprintf(output,"br %s\n",get_F(n));
        break;

    default:
        debugger("Bool DEFAULT\n");
    }
}
/* * Test the result for the bic calculator. */ int main(int argc, char** argv) { char** samples = read_lines(argv[1]); int depth = strtod(argv[2], NULL); setup_BIC(samples, depth, prob_root, bic_root); print_tree(prob_root, depth); printf("----------------\n"); // print_tree(bic_root, ""); printf("\nlogN=%f\n\n",logN); // int n = size_of_sample(); get_T(prob_root); print_tree(prob_root, depth); Vec c = get_Tvec(prob_root); print_Vec(c); sort_Vec(c); uniquefy_Vec(c); print_Vec(c); for(int i = 0; i < c->len; i++) c->x[i] /= logN; print_Vec(c); Champion_item champs = champion_set_from_vec(c); ITERA(Champion_item, cs, champs, next) { pprint_Tau(cs->tau); printf("\n"); }
// Build a std::pair<T,U> from a GAP object: list element 1 is converted with
// GAP_getter<T>, list element 2 with GAP_getter<U>.
// Throws GAPException when isa(rec) rejects the object.
std::pair<T,U> operator()(Obj rec) const
{
    if(isa(rec))
    {
        GAP_getter<T> first_getter;
        GAP_getter<U> second_getter;
        return std::pair<T,U>(first_getter(ELM_LIST(rec, 1)),
                              second_getter(ELM_LIST(rec, 2)));
    }
    throw GAPException("Invalid attempt to read pair");
}
//get rotation matrix Matrix get_R(Point vrp, Point vpn, Point vup) { //first get the translation matrix from world to view auto mt = get_T(vrp); //we can see vpn_ and vup_ as vectors. such that we can apply them to get_uvn function from q2 auto uvn = get_uvn(vup, vpn); //finally contruct our roation matrix using method 2 on class notes Row r1 = { uvn[0][0],uvn[0][1],uvn[0][2],0 }; Row r2 = { uvn[1][0],uvn[1][1],uvn[1][2],0 }; Row r3 = { uvn[2][0],uvn[2][1],uvn[2][2],0 }; Row r4 = { 0, 0, 0, 1 }; return { r1, r2, r3, r4 }; }
/*
 * Emit jumping code for a relational operator node (LT, GT, LE, GE).
 *
 * n - operator node with two operands; its true / false labels must already
 *     be set by the caller.
 *
 * Both operands are generated with seen_bool_flow cleared.  Then a
 * conditional branch to the true label and an unconditional branch to the
 * false label are written.  seen_bool_flow is put back to the value it had
 * on entry.
 */
void ir_relop_flow(nodeType* n)
{
    int saved_bool_flow = seen_bool_flow;
    seen_bool_flow = 0;

    nodeType* lhs = get_operand(n,0);
    nodeType* rhs = get_operand(n,1);
    generate(lhs);
    generate(rhs);

    switch(n->opr.oper)
    {
    case LT:
        debugger("MATCHED LT in ir_relop_flow\n");
        debugger("blt %s\n",get_T(n));
        fprintf(output,"blt %s\n",get_T(n));
        debugger("br %s\n",get_F(n));
        fprintf(output,"br %s\n",get_F(n));
        break;

    case GT:
        debugger("MATCHED GT in ir_relop_flow\n");
        debugger("bgt %s\n",get_T(n));
        fprintf(output,"bgt %s\n",get_T(n));
        debugger("br %s\n",get_F(n));
        fprintf(output,"br %s\n",get_F(n));
        break;

    case LE:
        debugger("MATCHED LE in ir_relop_flow\n");
        debugger("ble %s\n",get_T(n));
        fprintf(output,"ble %s\n",get_T(n));
        debugger("br %s\n",get_F(n));
        fprintf(output,"br %s\n",get_F(n));
        break;

    case GE:
        debugger("MATCHED GE in ir_relop_flow\n");
        debugger("bge %s\n",get_T(n));
        fprintf(output,"bge %s\n",get_T(n));
        debugger("br %s\n",get_F(n));
        fprintf(output,"br %s\n",get_F(n));
        break;

    default:
        debugger("Relational Flow default\n");
    }

    seen_bool_flow = saved_bool_flow;
}
/*
 * Parse a term:   P { ('*' | '/') P }
 *
 * Operands are read with get_P(); the input cursor is the global cur_c.
 * Operators of this level are folded left to right, so "a/b/c" yields the
 * tree ((a/b)/c).
 *
 * Returns the root of the parsed subtree, or 0 when syntax_errno is set
 * after reading an operand.
 */
node* get_T(){
    /* No placeholder node is allocated up front any more: it used to be
       overwritten by the get_P() result right away and was leaked. */
    node* val = get_P();
    if (syntax_errno) return 0;

    /* Iterate instead of recursing on the right operand, which made the
       operators right-associative. */
    while (*cur_c == '*' || *cur_c == '/'){
        node* operation = node_new();
        operation -> data = *cur_c;
        ++cur_c;
        operation -> left = val;
        operation -> right = get_P();
        if (syntax_errno) return 0;
        val = operation;
    }
    return val;
}
double ymax = 0.0175; /* camera position */ Point VRP = {128.0, 64.0, 250.0}; Vector VPN = {-64.0, 0.0, -186.0}; Vector VUP = {0.0, 1.0, 0.0}; double focal = 0.05; /* focal length simulating 50 mm lens */ Vector Light = {0.577, -0.577, -0.577}; /* light direction */ double Ip = 255.0; /* intensity of the point light source */ /* Transformation from the world to the camera coordinates */ Matrix Mwc = get_M(VRP, VPN, VUP); Matrix Rwc = get_R(VRP, VPN, VUP); Matrix Twc = get_T(VRP); /* Transformation from the camera to the world coordinates */ Matrix Mcw = get_Mi(VRP, VPN, VUP); Matrix Rcw = get_Ri(VRP, VPN, VUP); Matrix Tcw = get_Ti(VRP); int main () { //main program for volume rendering Volume* ct = new Volume; read_from_file("smallHead.den", ct); //print_ct_volume(ct); Volume* color = new Volume; compute_shading_volume(ct, color); //print_ct_volume(color); ImagePanel* img = new ImagePanel;
//this is the world to view final matrix, which is Mwc, also Mwl Matrix get_M(Point vrp, Point vpn, Point vup) { return mul(get_R(vrp, vpn, vup), get_T(vrp)); }
void ParticleAroundWalls::phi_t(arr& phi, arr& J, uint t, const arr& x_bar){ uint T=get_T(), n=dim_x(), k=get_k(); //assert some dimensions CHECK(x_bar.d0==k+1,""); CHECK(x_bar.d1==n,""); CHECK(t<=T,""); //-- transition costs: append to phi if(k==1) phi = x_bar[1]-x_bar[0]; //penalize velocity if(k==2) phi = x_bar[2]-2.*x_bar[1]+x_bar[0]; //penalize acceleration if(k==3) phi = x_bar[3]-3.*x_bar[2]+3.*x_bar[1]-x_bar[0]; //penalize jerk //-- walls: append to phi //Note: here we append to phi ONLY in certain time slices: the dimensionality of phi may very with time slices; see dim_phi(uint t) double eps=.1, power=2.; if(!hardConstrained){ //-- wall costs for(uint i=0;i<n;i++){ //add barrier costs to each dimension if(t==T/4) phi.append(MT::ineqConstraintCost(i+1.-x_bar(k,i), eps, power)); //middle factor: ``greater than i'' if(t==T/2) phi.append(MT::ineqConstraintCost(x_bar(k,i)+i+1., eps, power)); //last factor: ``lower than -i'' if(t==3*T/4) phi.append(MT::ineqConstraintCost(i+1.-x_bar(k,i), eps, power)); //middle factor: ``greater than i'' if(t==T) phi.append(MT::ineqConstraintCost(x_bar(k,i)+i+1., eps, power)); //last factor: ``lower than -i'' } }else{ //-- wall constraints for(uint i=0;i<n;i++){ //add barrier costs to each dimension if(t==T/4) phi.append((i+1.-x_bar(k,i))); //middle factor: ``greater than i'' if(t==T/2) phi.append((x_bar(k,i)+i+1.)); //last factor: ``lower than -i'' if(t==3*T/4) phi.append((i+1.-x_bar(k,i))); //middle factor: ``greater than i'' if(t==T) phi.append((x_bar(k,i)+i+1.)); //last factor: ``lower than -i'' } } uint m=phi.N; CHECK(m==dim_phi(t),""); if(&J){ //we also need to return the Jacobian J.resize(m,k+1,n).setZero(); //-- transition costs for(uint i=0;i<n;i++){ if(k==1){ J(i,1,i) = 1.; J(i,0,i) = -1.; } if(k==2){ J(i,2,i) = 1.; J(i,1,i) = -2.; J(i,0,i) = 1.; } if(k==3){ J(i,3,i) = 1.; J(i,2,i) = -3.; J(i,1,i) = +3.; J(i,0,i) = -1.; } } //-- walls if(!hardConstrained){ for(uint i=0;i<n;i++){ if(t==T/4) J(n+i,k,i) = 
-MT::d_ineqConstraintCost(i+1.-x_bar(k,i), eps, power); if(t==T/2) J(n+i,k,i) = MT::d_ineqConstraintCost(x_bar(k,i)+i+1., eps, power); if(t==3*T/4) J(n+i,k,i) = -MT::d_ineqConstraintCost(i+1.-x_bar(k,i), eps, power); if(t==T) J(n+i,k,i) = MT::d_ineqConstraintCost(x_bar(k,i)+i+1., eps, power); } }else{ for(uint i=0;i<n;i++){ if(t==T/4) J(n+i,k,i) = -1.; if(t==T/2) J(n+i,k,i) = +1.; if(t==3*T/4) J(n+i,k,i) = -1.; if(t==T) J(n+i,k,i) = +1.; } } } }
// Number of entries reported for time slice t by dim_g: dim_x() on the
// slices t==T/4, T/2, 3T/4 and T when hardConstrained is set, otherwise 0.
uint ParticleAroundWalls::dim_g(uint t){
  if(hardConstrained){
    const uint T=get_T();
    const bool onWallSlice = (t==T/4) || (t==T/2) || (t==3*T/4) || (t==T);
    if(onWallSlice) return dim_x();
  }
  return 0;
}
// Length of the feature vector for time slice t: 2*dim_x() on the slices
// t==T/4, T/2, 3T/4 and T, dim_x() on every other slice.
uint ParticleAroundWalls::dim_phi(uint t){
  const uint T=get_T();
  const bool onWallSlice = (t==T/4) || (t==T/2) || (t==3*T/4) || (t==T);
  if(!onWallSlice) return dim_x();
  return 2*dim_x();
}
/*
* Encrypt `blocks` consecutive blocks from `in` into `out` using AVX2.
*
* Each block is read and written as two unaligned 256-bit vectors (64 bytes).
* The main loop handles two blocks per iteration (four loads, four stores);
* a tail loop handles whatever is left, one block at a time.
*
* Per block the sequence is: one key injection, then nine groups of eight
* rounds, each group containing two further key injections.
*
* K and T_64 point at the first element of what get_K() and get_T() return;
* K[0..8] and T_64[0..2] are read (presumably the expanded key and tweak
* words - confirm against the class definition).
*/
void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const u64bit* K = &get_K()[0];
   const u64bit* T_64 = &get_T()[0];

   /* Per-lane left-rotation amounts, one vector for each of the eight rounds
      of a group. */
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   /* One round on a single block: X0 += X1; X1 = rotl(X1, SHL) ^ X0 (the
      rotate is built from a variable left shift OR-ed with a variable right
      shift by 64-SHL); then the 64-bit lanes of X0 and X1 are permuted. */
#define THREEFISH_ROUND(X0, X1, SHL) \
   do { \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
      X0 = _mm256_add_epi64(X0, X1); \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0); \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
   } while(0)

   /* Same round applied to two blocks at once: (X0, X1) and (X2, X3). */
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \
   do { \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
      X0 = _mm256_add_epi64(X0, X1); \
      X2 = _mm256_add_epi64(X2, X3); \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0); \
      X3 = _mm256_xor_si256(X3, X2); \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
   } while(0)

   /* Key injection for a single block: adds key vectors K0/K1, the counter R
      (into X1) and two tweak vectors, then increments R by ONE.
      Uses `T` and `ONE` from the enclosing scope. T0I / T1I are lane indices
      into T: the first permute copies lane T0I of T into the highest lane,
      the second copies lane T1I into the second-highest lane; every other
      lane receives lane 0 of T, which is set to 0 where T is built. */
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
   do { \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0); \
      X1 = _mm256_add_epi64(X1, K1); \
      X1 = _mm256_add_epi64(X1, R); \
      X0 = _mm256_add_epi64(X0, T0); \
      X1 = _mm256_add_epi64(X1, T1); \
      R = _mm256_add_epi64(R, ONE); \
   } while(0)

   /* Key injection for two blocks at once. Here R is folded into T1 first,
      and the sum is then added to both X1 and X3. */
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
   do { \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0); \
      X2 = _mm256_add_epi64(X2, K0); \
      X1 = _mm256_add_epi64(X1, K1); \
      X3 = _mm256_add_epi64(X3, K1); \
      T1 = _mm256_add_epi64(T1, R); \
      X0 = _mm256_add_epi64(X0, T0); \
      X2 = _mm256_add_epi64(X2, T0); \
      X1 = _mm256_add_epi64(X1, T1); \
      X3 = _mm256_add_epi64(X3, T1); \
      R = _mm256_add_epi64(R, ONE); \
   } while(0)

   /* Eight rounds on a single block: four rounds, an injection with
      (K1, K2, T0, T1), four more rounds, an injection with (K2, K3, T2, T0). */
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
   do { \
      THREEFISH_ROUND(X0, X1, ROTATE_1); \
      THREEFISH_ROUND(X0, X1, ROTATE_2); \
      THREEFISH_ROUND(X0, X1, ROTATE_3); \
      THREEFISH_ROUND(X0, X1, ROTATE_4); \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \
      \
      THREEFISH_ROUND(X0, X1, ROTATE_5); \
      THREEFISH_ROUND(X0, X1, ROTATE_6); \
      THREEFISH_ROUND(X0, X1, ROTATE_7); \
      THREEFISH_ROUND(X0, X1, ROTATE_8); \
      THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0); \
   } while(0)

   /* Eight rounds on two blocks at once; same schedule as above. */
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do { \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
      \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0); \
   } while(0)

   /*
   v1.0 key schedule: 9 ymm registers (only need 2 or 3)
   (0,1,2,3),(4,5,6,7) [8]
   then mutating with vpermq
   */
   /* K0..K8: each vector packs four of the nine key words K[0..8], taking
      every second word starting at K[n] and wrapping around after K[8]. */
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   /* Increment for the counter R: 1 in the highest lane, 0 elsewhere. */
   const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   /* Main loop: two blocks per iteration. */
   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      /* Tweak words: lane 3 = T_64[0], lane 2 = T_64[1], lane 1 = T_64[2],
         lane 0 = 0. The lane indices 1/2/3 passed to the macros below select
         from this vector. */
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      /* Counter, restarted at zero for every iteration. */
      __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

      /* NOTE(review): interleave_epi64 / deinterleave_epi64 are defined
         elsewhere; presumably they regroup the block's words into the lane
         layout the round macros expect and back - confirm. */
      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   /* Tail loop: remaining blocks, one at a time, same schedule as above. */
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   /* The helper macros are local to this function. */
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }