コード例 #1
0
ファイル: ir_code.c プロジェクト: shyamupa/x10-compiler
/*
 * Emit code for a ternary expression: operand 0 of n is the condition,
 * operands 1 and 2 are the two arms.
 *
 * Layout of the emitted code:
 *     <condition, branching to its true/false labels>
 *   T:  <arm 1>
 *       br.s NEXT
 *   F:  <arm 2>
 *   NEXT:
 *
 * Fresh labels are requested in the order: true, false, join.
 */
void ir_ternary(nodeType* n)
{
	nodeType* expr = get_operand(n,0);
	nodeType* stmt1 = get_operand(n,1);
	nodeType* stmt2 = get_operand(n,2);
	set_T(expr,newlabel());
	set_F(expr,newlabel());

	/*
	 * Allocate the join label BEFORE handing it to the arms.  Copying
	 * n->opr.next into the arms right after zeroing it (and only then
	 * assigning the label) would leave both arms with an empty "next".
	 */
	memset(n->opr.next,0,16);
	strcpy(n->opr.next,newlabel());
	memset(stmt1->opr.next,0,16);
	memset(stmt2->opr.next,0,16);
	strcpy(stmt1->opr.next,n->opr.next);
	strcpy(stmt2->opr.next,n->opr.next);

	/* The condition is generated with seen_bool_flow raised. */
	seen_bool_flow = 1;
	generate(expr);
	seen_bool_flow = 0;

	/* True arm, followed by a jump over the false arm. */
	debugger("%s:\n",get_T(expr));
	fprintf(output,"%s:\n",get_T(expr));
	prepost_put = 1;
	generate(stmt1);
	prepost_put = 0;
	debugger("br.s %s\n ",n->opr.next);
	fprintf(output,"br.s %s\n",n->opr.next);

	/* False arm falls through into the join label. */
	debugger("%s:\n",get_F(expr));
	fprintf(output,"%s:\n",get_F(expr));
	prepost_put = 1;
	generate(stmt2);
	prepost_put = 0;

	debugger("%s:\n",n->opr.next);
	fprintf(output,"%s:\n",n->opr.next);
}
コード例 #2
0
ファイル: ir_code.c プロジェクト: shyamupa/x10-compiler
/*
 * Emit code for an if statement without an else arm: operand 0 of n is
 * the condition, operand 1 is the guarded statement.
 *
 * The condition gets a fresh true label; its false label is n's own
 * "next" label, which the guarded statement also inherits as its "next".
 */
void ir_if(nodeType* n)
{
	nodeType* cond = get_operand(n,0);
	nodeType* body = get_operand(n,1);

	set_T(cond,newlabel());
	set_F(cond,n->opr.next);

	/* The body continues wherever the whole if statement continues. */
	memset(body->opr.next,0,16);
	strcpy(body->opr.next,n->opr.next);

	debugger("expr true label:%s\n",get_T(cond));
	debugger("expr false label:%s\n",get_F(cond));

	/* Both flags are raised only while the condition is generated. */
	seen_bool_flow = 1;
	prepost_put = 1;
	generate(cond);
	seen_bool_flow = 0;
	prepost_put = 0;

	/* The true label marks the start of the guarded statement. */
	debugger("%s:\n",get_T(cond));
	fprintf(output,"%s:\n",get_T(cond));

	generate(body);
}
コード例 #3
0
ファイル: viennarna.c プロジェクト: gitter-badger/viennaRNA
/*
 * Demo driver: folds a fixed RNA sequence, lists suboptimal structures,
 * evaluates a test structure at two temperatures, and finally runs an
 * inverse fold from a seed sequence towards that test structure.
 *
 * Every structure string returned by seq_fold()/seq_pf_fold() is released
 * with free() once it has been printed.
 *
 * Returns 0 on success, 1 if the working copy of the seed sequence cannot
 * be allocated.
 */
int main()
{
    // Optimal fold
    const char* sequence = "CGCAGGGAUACCCGCGCC";
    char* structure;
    float mfe, gfe;
    structure = seq_fold(sequence, &mfe);
    printf("%s %s %f\n", sequence, structure, mfe);
    free(structure);

    // Ensemble fold
    structure = seq_pf_fold(sequence, &gfe);
    printf("%s %s %f\n", sequence, structure, gfe);
    free(structure);

    printf("\n");

    // Find suboptimal structures; the array ends at an entry whose
    // structure pointer is NULL
    SOLUTION* sol = seq_subopt(sequence, 4.0);
    for(SOLUTION* s = sol; s->structure != NULL; s++)
    {
        printf("%s %s %f\n", sequence, s->structure, s->energy);
        free(s->structure);
    }
    free(sol);

    printf("\n");

    // Evaluate fe of a structure (given a sequence)...
    printf("%f\n", get_T());
    const char* test_str = "(((.((.....)))))..";
    printf("%s %s %f\n", sequence, test_str, seq_eval(sequence, test_str));
    // ... and how it changes with temperature
    set_T(15.0);
    printf("%f\n", get_T());
    printf("%s %s %f\n", sequence, test_str, seq_eval(sequence, test_str));
    set_T(37.0);

    printf("\n");

    // Take a not so different sequence with a different optimal structure
    const char* seed_seq = "AAUAGGGAUACCCGCGCC";
    structure = seq_fold(seed_seq, &mfe);
    printf("%s %s %f\n", seed_seq, structure, mfe);
    // Release this fold before `structure` is reused further down.
    free(structure);

    // See that is not even stable on the test fold
    printf("%s %s %f\n", seed_seq, test_str, seq_eval(seed_seq, test_str));

    // Mutate it until you get the test fold...
    char* seq = malloc(strlen(seed_seq) + 1);
    if (seq == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    strcpy(seq, seed_seq);
    // The returned distance is not reported; seq is modified in place.
    (void)str_inverse(seq, test_str, 12345, 0);
    // ... and confirm it's its ground state
    structure = seq_fold(seq, &mfe);
    printf("%s %s %f\n", seq, structure, mfe);
    free(structure);

    free(seq);
    return 0;
}
コード例 #4
0
ファイル: ir_code.c プロジェクト: shyamupa/x10-compiler
/*
 * Emit code for a while loop: operand 0 of n is the condition, operand 1
 * is the loop body.
 *
 * Layout of the emitted code:
 *   TOP:   <condition; false label is n's "next">
 *   T:     <body>
 *          br.s TOP
 *
 * While the body is generated, the global break_label is n's "next" label
 * and the global continue_label is TOP; both previous values are restored
 * afterwards, and loop_flag is incremented for the duration of the body.
 */
void ir_while(nodeType* n)
{
	nodeType* cond = get_operand(n,0);
	nodeType* body = get_operand(n,1);

	/* Label at the top of the loop, re-entered after every iteration. */
	char loop_top[16] = {0};
	strcpy(loop_top,newlabel());

	set_T(cond,newlabel());
	set_F(cond,n->opr.next);

	/* When the body finishes, control goes back to the loop top. */
	memset(body->opr.next,0,16);
	strcpy(body->opr.next,loop_top);

	debugger("%s:\n",loop_top);
	fprintf(output,"%s:\n",loop_top);

	seen_bool_flow = 1;
	prepost_put = 1;
	generate(cond);
	seen_bool_flow = 0;
	prepost_put = 0;

	debugger("%s:\n",get_T(cond));
	fprintf(output,"%s:\n",get_T(cond));

	/* Save the enclosing break target and install this loop's exit. */
	char saved_break[16] = {0};
	strcpy(saved_break,break_label);
	memset(break_label,0,16);
	loop_flag += 1;
	strcpy(break_label,n->opr.next);

	/* Save the enclosing continue target and install this loop's top. */
	char saved_continue[16] = {0};
	strcpy(saved_continue,continue_label);
	memset(continue_label,0,16);
	strcpy(continue_label,loop_top);
	debugger("CONTINUE LABEL: %s\n",continue_label);

	generate(body);

	debugger("br.s %s\n ",loop_top);
	fprintf(output,"br.s %s\n",loop_top);

	/* Leave the loop: restore the enclosing break/continue targets. */
	loop_flag -= 1;
	memset(break_label,0,16);
	strcpy(break_label,saved_break);
	memset(continue_label,0,16);
	strcpy(continue_label,saved_continue);
}
コード例 #5
0
ファイル: ir_code.c プロジェクト: shyamupa/x10-compiler
/*
 * Emit control-flow code for a boolean node n with two operands.
 *
 *  - BOOL_OR:  the left operand jumps to n's true label or falls to a
 *              fresh label that precedes the right operand; the right
 *              operand uses n's true and false labels.
 *  - BOOL_AND: the left operand falls to a fresh label preceding the right
 *              operand when true, or jumps to n's false label; the right
 *              operand uses n's true and false labels.
 *  - BOOL_EQ / NEQ: both operands are generated with seen_bool_flow
 *              lowered, then a conditional branch to n's true label and an
 *              unconditional branch to n's false label are emitted.
 *
 * Any other operator only produces a debug message.
 * NOTE(review): two debug messages below name ir_relop_flow although they
 * are printed from this function.
 */
void ir_bool_flow(nodeType* n)
{
	nodeType* lhs = get_operand(n,0);
	nodeType* rhs = get_operand(n,1);

	debugger("n true label:%s\n",get_T(n));
	debugger("n false label:%s\n",get_F(n));

	switch(n->opr.oper)
	{
	case BOOL_OR:
	{
		debugger("MATCHED BOOL_OR in ir_bool_flow\n");
		set_T(lhs,n->opr.T);
		set_F(lhs,newlabel());
		set_T(rhs,n->opr.T);
		set_F(rhs,n->opr.F);

		generate(lhs);
		/* The left operand's false label leads into the right operand. */
		debugger("%s:",get_F(lhs));
		fprintf(output,"%s:",get_F(lhs));
		debugger("seen_bool_flow : %d\n",seen_bool_flow);
		generate(rhs);
		break;
	}
	case BOOL_EQ:
	{
		/* Operands are generated with seen_bool_flow lowered so that
		 * the branch instruction below does the comparison; the flag is
		 * raised again afterwards. */
		seen_bool_flow = 0;
		generate(lhs);
		generate(rhs);
		seen_bool_flow = 1;

		debugger("MATCHED BOOL_EQ in ir_relop_flow\n");
		debugger("beq %s\n",get_T(n));
		fprintf(output,"beq %s\n",get_T(n));
		debugger("br %s\n",get_F(n));
		fprintf(output,"br %s\n",get_F(n));
		break;
	}
	case NEQ:
	{
		debugger("NOT EQUAL TO\n");
		/* Same scheme as BOOL_EQ, using bne.un for the comparison. */
		seen_bool_flow = 0;
		generate(lhs);
		generate(rhs);
		seen_bool_flow = 1;

		debugger("MATCHED NEQ in ir_relop_flow\n");
		debugger("bne.un %s\n",get_T(n));
		fprintf(output,"bne.un %s\n",get_T(n));
		debugger("br %s\n",get_F(n));
		fprintf(output,"br %s\n",get_F(n));
		break;
	}
	case BOOL_AND:
	{
		set_T(lhs,newlabel());
		set_F(lhs, get_F(n));
		set_T(rhs, get_T(n));
		set_F(rhs, get_F(n));

		generate(lhs);
		/* The left operand's true label leads into the right operand. */
		debugger("%s:",get_T(lhs));
		fprintf(output,"%s:",get_T(lhs));
		generate(rhs);
		break;
	}
	default:
		debugger("Bool DEFAULT\n");
		break;
	}
}
コード例 #6
0
ファイル: thresh_test.c プロジェクト: arnaldomandel/cte
/*
 * Test the result for the bic calculator.
 *
 * Command line: argv[1] is handed to read_lines() to obtain the samples,
 * argv[2] is parsed as the tree depth.
 * NOTE(review): argc is never checked before argv[1] and argv[2] are read.
 */
int main(int argc, char** argv) {
  char** samples = read_lines(argv[1]);
  // strtod yields a double; the assignment to int truncates it.
  int depth = strtod(argv[2], NULL);
  

  // Build the trees from the samples, then dump the probability tree.
  setup_BIC(samples, depth, prob_root, bic_root);
  print_tree(prob_root, depth);
  printf("----------------\n");
  
//  print_tree(bic_root, "");

  printf("\nlogN=%f\n\n",logN);
  
  // int n = size_of_sample();
  // get_T's return value is discarded here; the tree is printed again
  // right after the call.
  get_T(prob_root);
  print_tree(prob_root, depth);
    

  // Collect values from the tree into a vector, printed at each stage.
  Vec c = get_Tvec(prob_root);
  print_Vec(c);
  
  // Sort, remove duplicates, then divide every entry by logN.
  sort_Vec(c);
  uniquefy_Vec(c);
  print_Vec(c);
  for(int i = 0; i < c->len; i++)
      c->x[i] /= logN;
  print_Vec(c);
  Champion_item champs = champion_set_from_vec(c);
  
  // Print the tau of every item in the champion list.
  ITERA(Champion_item, cs, champs, next) {
      pprint_Tau(cs->tau);
      printf("\n");
  }
コード例 #7
0
 // Read a GAP object as a std::pair<T,U>: list element 1 becomes the first
 // member and list element 2 the second, each converted by the GAP_getter
 // for its type. Throws GAPException when isa(rec) rejects the object.
 std::pair<T,U> operator()(Obj rec) const
 {
   if(!isa(rec))
     throw GAPException("Invalid attempt to read pair");

   GAP_getter<T> read_first;
   GAP_getter<U> read_second;
   return std::pair<T,U>(read_first(ELM_LIST(rec, 1)),
                         read_second(ELM_LIST(rec, 2)));
 }
コード例 #8
0
ファイル: matrix.cpp プロジェクト: gzmask/cs805assignment1
// Build the 4x4 rotation matrix of the world-to-view transform.
//
// The u, v, n axes are obtained from get_uvn(vup, vpn); each axis becomes
// one of the first three rows (with a trailing 0), and the last row is
// (0, 0, 0, 1).
//
// vrp does not influence the rotation; the parameter is kept so existing
// callers keep compiling. The translation part of the transform is built
// separately by get_T(vrp).
Matrix get_R(Point vrp, Point vpn, Point vup) {
  (void)vrp;

  //we can see vpn and vup as vectors, so they can be fed to get_uvn
  auto uvn = get_uvn(vup, vpn);

  //construct the rotation matrix row by row from the u, v, n axes
  Row r1 = { uvn[0][0],uvn[0][1],uvn[0][2],0 };
  Row r2 = { uvn[1][0],uvn[1][1],uvn[1][2],0 };
  Row r3 = { uvn[2][0],uvn[2][1],uvn[2][2],0 };
  Row r4 = { 0, 0, 0, 1 };
  return { r1, r2, r3, r4 };
}
コード例 #9
0
ファイル: ir_code.c プロジェクト: shyamupa/x10-compiler
/*
 * Emit control-flow code for a relational node n (LT, GT, LE, GE).
 *
 * Both operands are generated with seen_bool_flow lowered; then a
 * conditional branch (blt/bgt/ble/bge) to n's true label is emitted,
 * followed by an unconditional branch to n's false label. Any other
 * operator only produces a debug message. The caller's value of
 * seen_bool_flow is restored before returning.
 */
void ir_relop_flow(nodeType* n)
{
	/* Remember the caller's flag and lower it for the operands. */
	int saved_flow = seen_bool_flow;
	seen_bool_flow = 0;

	nodeType* lhs = get_operand(n,0);
	nodeType* rhs = get_operand(n,1);
	generate(lhs);
	generate(rhs);

	switch(n->opr.oper)
	{
	case LT:
	{
		debugger("MATCHED LT in ir_relop_flow\n");
		debugger("blt %s\n",get_T(n));
		fprintf(output,"blt %s\n",get_T(n));
		debugger("br %s\n",get_F(n));
		fprintf(output,"br %s\n",get_F(n));
		break;
	}
	case GT:
	{
		debugger("MATCHED GT in ir_relop_flow\n");
		debugger("bgt %s\n",get_T(n));
		fprintf(output,"bgt %s\n",get_T(n));
		debugger("br %s\n",get_F(n));
		fprintf(output,"br %s\n",get_F(n));
		break;
	}
	case LE:
	{
		debugger("MATCHED LE in ir_relop_flow\n");
		debugger("ble %s\n",get_T(n));
		fprintf(output,"ble %s\n",get_T(n));
		debugger("br %s\n",get_F(n));
		fprintf(output,"br %s\n",get_F(n));
		break;
	}
	case GE:
	{
		debugger("MATCHED GE in ir_relop_flow\n");
		debugger("bge %s\n",get_T(n));
		fprintf(output,"bge %s\n",get_T(n));
		debugger("br %s\n",get_F(n));
		fprintf(output,"br %s\n",get_F(n));
		break;
	}
	default:
		debugger("Relational Flow default\n");
		break;
	}

	seen_bool_flow = saved_flow;
}
コード例 #10
0
ファイル: M_syn.c プロジェクト: proffK/iLab
/*
 * Parse a term: one get_P() operand, optionally followed by '*' or '/'
 * and another term.
 *
 * Reads from the global cursor cur_c and advances it past a consumed
 * operator. Returns the operand node when no operator follows, otherwise
 * a new operator node whose data is the operator character, whose left
 * child is the operand and whose right child is the recursively parsed
 * term. Returns 0 if syntax_errno is set after parsing the operand.
 *
 * NOTE(review): the recursion on the right operand nests chains to the
 * right, i.e. a/b/c is built as a/(b/c) -- confirm this is intended.
 */
node* get_T(){

	/* Take the operand straight from get_P(); allocating a placeholder
	 * node first would only orphan it when val is overwritten. */
	node* val = get_P();

	if (syntax_errno) return 0;
	
	if (*cur_c == '*' || *cur_c == '/'){
		
		node* operation = node_new();
		operation -> data = *cur_c;
		
		++cur_c;
		
		operation -> left = val;
		operation -> right = get_T();
		
		return operation;
	}

	return val;

}
コード例 #11
0
ファイル: main.cpp プロジェクト: gzmask/cs805as3
/* NOTE(review): ymax is not referenced in the visible code; presumably a
   bound of the view window -- confirm against its users. */
double ymax = 0.0175;

/* camera position */
/* VRP, VPN and VUP are the three inputs of every transform built below. */
Point VRP = {128.0, 64.0, 250.0};
Vector VPN = {-64.0, 0.0, -186.0};
Vector VUP = {0.0, 1.0, 0.0};

double focal = 0.05;	/* focal length simulating 50 mm lens */

Vector Light = {0.577, -0.577, -0.577}; /* light direction */
double Ip = 255.0; /* intensity of the point light source */

/* Transformation from the world to the camera coordinates */
/* These globals are initialised by function calls at program start-up,
   before main() runs. */
Matrix Mwc = get_M(VRP, VPN, VUP);
Matrix Rwc = get_R(VRP, VPN, VUP);
Matrix Twc = get_T(VRP);
/* Transformation from the camera to the world coordinates */
Matrix Mcw = get_Mi(VRP, VPN, VUP);
Matrix Rcw = get_Ri(VRP, VPN, VUP);
Matrix Tcw = get_Ti(VRP);

/* Entry point of the volume renderer: reads the CT volume from
   "smallHead.den", computes a shading volume from it, and allocates the
   image panel. */
int main () {

  //main program for volume rendering
  // The volumes and the image panel are allocated with new.
  Volume* ct = new Volume;
  read_from_file("smallHead.den", ct);
  //print_ct_volume(ct);
  Volume* color = new Volume;
  compute_shading_volume(ct, color);
  //print_ct_volume(color);
  ImagePanel* img = new ImagePanel;
0
ファイル: matrix.cpp プロジェクト: gzmask/cs805assignment1
// Full world-to-view matrix (Mwc, also referred to as Mwl): the product
// of the rotation get_R(vrp, vpn, vup) and the translation get_T(vrp),
// in that order.
Matrix get_M(Point vrp, Point vpn, Point vup) {
  const auto rotation = get_R(vrp, vpn, vup);
  const auto translation = get_T(vrp);
  return mul(rotation, translation);
}
コード例 #13
0
ファイル: benchmarks.cpp プロジェクト: ipa-nhg/kukadu
// Feature vector phi (and optionally its Jacobian J) for time slice t.
//
// x_bar holds k+1 consecutive configurations of dimension n. The first
// entries of phi are the k-th order finite difference of x_bar (velocity,
// acceleration or jerk for k = 1, 2, 3). In the time slices T/4, T/2,
// 3T/4 and T, one wall term per dimension is appended: a smoothed
// inequality cost when hardConstrained is false, the raw constraint value
// otherwise. The resulting size is checked against dim_phi(t).
void ParticleAroundWalls::phi_t(arr& phi, arr& J, uint t, const arr& x_bar){
  uint T=get_T(), n=dim_x(), k=get_k();

  //check the shape of the input window and the time index
  CHECK(x_bar.d0==k+1,"");
  CHECK(x_bar.d1==n,"");
  CHECK(t<=T,"");

  //-- transition costs: finite difference of order k
  switch(k){
    case 1: phi = x_bar[1]-x_bar[0]; break; //velocity
    case 2: phi = x_bar[2]-2.*x_bar[1]+x_bar[0]; break; //acceleration
    case 3: phi = x_bar[3]-3.*x_bar[2]+3.*x_bar[1]-x_bar[0]; break; //jerk
    default: break;
  }

  //-- the four time slices that carry wall terms (several may coincide for small T)
  const bool atQuarter      = (t==T/4);
  const bool atHalf         = (t==T/2);
  const bool atThreeQuarter = (t==3*T/4);
  const bool atEnd          = (t==T);

  //-- walls: appended to phi only in the slices above, so the size of phi varies with t
  double eps=.1, power=2.;
  if(hardConstrained){
    //raw constraint values
    for(uint i=0;i<n;i++){
      if(atQuarter)      phi.append((i+1.-x_bar(k,i)));
      if(atHalf)         phi.append((x_bar(k,i)+i+1.));
      if(atThreeQuarter) phi.append((i+1.-x_bar(k,i)));
      if(atEnd)          phi.append((x_bar(k,i)+i+1.));
    }
  }else{
    //smoothed inequality costs
    for(uint i=0;i<n;i++){
      if(atQuarter)      phi.append(MT::ineqConstraintCost(i+1.-x_bar(k,i), eps, power));
      if(atHalf)         phi.append(MT::ineqConstraintCost(x_bar(k,i)+i+1., eps, power));
      if(atThreeQuarter) phi.append(MT::ineqConstraintCost(i+1.-x_bar(k,i), eps, power));
      if(atEnd)          phi.append(MT::ineqConstraintCost(x_bar(k,i)+i+1., eps, power));
    }
  }

  uint m=phi.N;
  CHECK(m==dim_phi(t),"");

  if(&J){ //the caller also asked for the Jacobian
    J.resize(m,k+1,n).setZero();

    //-- transition costs: coefficients of the finite difference
    for(uint i=0;i<n;i++){
      switch(k){
        case 1: J(i,1,i) = 1.;  J(i,0,i) = -1.; break;
        case 2: J(i,2,i) = 1.;  J(i,1,i) = -2.;  J(i,0,i) = 1.; break;
        case 3: J(i,3,i) = 1.;  J(i,2,i) = -3.;  J(i,1,i) = +3.;  J(i,0,i) = -1.; break;
        default: break;
      }
    }

    //-- walls: derivative of each appended term w.r.t. the last configuration
    if(hardConstrained){
      for(uint i=0;i<n;i++){
        if(atQuarter)      J(n+i,k,i) = -1.;
        if(atHalf)         J(n+i,k,i) = +1.;
        if(atThreeQuarter) J(n+i,k,i) = -1.;
        if(atEnd)          J(n+i,k,i) = +1.;
      }
    }else{
      for(uint i=0;i<n;i++){
        if(atQuarter)      J(n+i,k,i) = -MT::d_ineqConstraintCost(i+1.-x_bar(k,i), eps, power);
        if(atHalf)         J(n+i,k,i) =  MT::d_ineqConstraintCost(x_bar(k,i)+i+1., eps, power);
        if(atThreeQuarter) J(n+i,k,i) = -MT::d_ineqConstraintCost(i+1.-x_bar(k,i), eps, power);
        if(atEnd)          J(n+i,k,i) =  MT::d_ineqConstraintCost(x_bar(k,i)+i+1., eps, power);
      }
    }
  }
}
コード例 #14
0
ファイル: benchmarks.cpp プロジェクト: ipa-nhg/kukadu
// Number of constraint entries in time slice t: dim_x() in the slices
// T/4, T/2, 3T/4 and T when hardConstrained is set, 0 in every other case.
uint ParticleAroundWalls::dim_g(uint t){
  if(hardConstrained){
    const uint T=get_T();
    const bool wallSlice = (t==T/4) || (t==T/2) || (t==3*T/4) || (t==T);
    if(wallSlice) return dim_x();
  }
  return 0;
}
コード例 #15
0
ファイル: benchmarks.cpp プロジェクト: ipa-nhg/kukadu
// Size of the feature vector in time slice t: 2*dim_x() in the slices
// T/4, T/2, 3T/4 and T, dim_x() in every other slice.
uint ParticleAroundWalls::dim_phi(uint t){
  const uint T=get_T();
  const bool wallSlice = (t==T/4) || (t==T/2) || (t==3*T/4) || (t==T);
  return wallSlice ? 2*dim_x() : dim_x();
}
コード例 #16
0
ファイル: threefish_avx2.cpp プロジェクト: pierobot/botan
/*
* Encrypt `blocks` consecutive blocks from `in` to `out` using AVX2.
*
* Each block is read and written as two 256-bit vectors (64 bytes) with
* unaligned loads/stores. Blocks are processed two at a time while at
* least two remain; a second loop handles what is left one block at a
* time. Every block goes through an initial key injection followed by
* nine groups of eight rounds, with a key injection after every fourth
* round.
*
* The key words come from get_K() and the tweak words from get_T().
* The helper macros below are local to this function and are #undef'd
* before it returns.
*/
void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const u64bit* K = &get_K()[0];
   const u64bit* T_64 = &get_T()[0];

   // Per-lane rotation counts, one vector per round of an eight-round group.
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

// One round on a single block: X0 += X1, rotate every 64-bit lane of X1
// left by the matching count in SHL (left shift OR'ed with a right shift
// by 64-SHL), X1 ^= X0, then permute the lanes of X0 and X1.
#define THREEFISH_ROUND(X0, X1, SHL)                                                \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

// Same round applied to two blocks at once: (X0, X1) and (X2, X3).
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                           \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X2 = _mm256_add_epi64(X2, X3);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

// Key injection for one block: adds K0 to X0 and K1 plus the counter R to
// X1, then adds two vectors built from lanes T0I / T1I of the tweak
// vector T (a variable named T must be in scope at the expansion site).
// R is advanced by ONE afterwards.
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X1 = _mm256_add_epi64(X1, R);                                              \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      R = _mm256_add_epi64(R, ONE);                                              \
   } while(0)

// Key injection for two blocks at once; here R is folded into T1 once and
// the sum is added to both X1 and X3.
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X2 = _mm256_add_epi64(X2, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X3 = _mm256_add_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, R);                                              \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X2 = _mm256_add_epi64(X2, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      X3 = _mm256_add_epi64(X3, T1);                                             \
      R = _mm256_add_epi64(R, ONE);                                              \
   } while(0)

// Eight rounds on one block: four rounds, key injection with (K1, K2) and
// tweak lanes (T0, T1), four more rounds, key injection with (K2, K3) and
// tweak lanes (T2, T0).
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)        \
   do {                                                        \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                       \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);         \
                                                               \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                       \
      THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0);         \
   } while(0)

// Eight rounds on two blocks at once, same schedule as above.
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do {                                                                  \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1);         \
                                                                         \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0);         \
   } while(0)

   /*
   v1.0 key schedule: 9 ymm registers (only need 2 or 3)
   (0,1,2,3),(4,5,6,7) [8]
   then mutating with vpermq
   */
   // Nine key vectors; Ki gathers the key words K[i], K[i+2], K[i+4],
   // K[i+6] with the indices taken modulo 9.
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   // Increment applied to the counter R after every key injection.
   const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   // Main loop: two blocks (four vectors) per iteration.
   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      // Tweak vector referenced by name inside the INJECT_KEY macros.
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      // Injection counter, restarted at zero for every pair of blocks.
      __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // Tail: whatever the main loop left over, one block per iteration.
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }