void main() { int action,box,i; long success,trial; double x, x_dot, theta, theta_dot,reinf,predicted_value; FILE *fptr; fptr=fopen("rand_restart.txt","w"); x=x_dot=theta=theta_dot=rnd(-BETA,BETA); success=0; trial=1; reinf=0.0; while (success<1000000) /* If the pole doesn't fall during 1-million trials,assume it succcess.*/ { action=get_action(x,x_dot,theta,theta_dot,reinf); cart_pole(action,&x,&x_dot,&theta,&theta_dot); box=get_box(x,x_dot,theta,theta_dot); if (box==-1) { reinf=-1.0; predicted_value = 0.0; q_val[prev_state][prev_action] += ALPHA * (reinf + GAMMA * predicted_value - q_val[prev_state][prev_action]); reset_controller(); x=x_dot=theta=theta_dot=rnd(-BETA,BETA); trial++; printf("At %d success ,try %d trials\n",success,trial); fprintf(fptr,"%d\t%d\n",trial,success); success=0; }else{ success++; reinf=0.0; } } printf("Success at %d trials \n",trial); for (i=0;i<NUM_BOXES;i++) fprintf(fptr,"%g %g\n",q_val[i][0],q_val[i][1]); fclose(fptr); }
// cart_and_pole() was take directly from the pole simulator written // by Richard Sutton and Charles Anderson. int go_cart(Network *net,int max_steps,int thresh) { float x, /* cart position, meters */ x_dot, /* cart velocity */ theta, /* pole angle, radians */ theta_dot; /* pole angular velocity */ int steps=0,y; int random_start=1; double in[5]; //Input loading array double out1; double out2; // double one_degree= 0.0174532; /* 2pi/360 */ // double six_degrees=0.1047192; double twelve_degrees=0.2094384; // double thirty_six_degrees= 0.628329; // double fifty_degrees=0.87266; vector<NNode*>::iterator out_iter; if (random_start) { /*set up random start state*/ x = (lrand48()%4800)/1000.0 - 2.4; x_dot = (lrand48()%2000)/1000.0 - 1; theta = (lrand48()%400)/1000.0 - .2; theta_dot = (lrand48()%3000)/1000.0 - 1.5; } else x = x_dot = theta = theta_dot = 0.0; /*--- Iterate through the action-learn loop. ---*/ while (steps++ < max_steps) { /*-- setup the input layer based on the four iputs --*/ //setup_input(net,x,x_dot,theta,theta_dot); in[0]=1.0; //Bias in[1]=(x + 2.4) / 4.8;; in[2]=(x_dot + .75) / 1.5; in[3]=(theta + twelve_degrees) / .41; in[4]=(theta_dot + 1.0) / 2.0; net->load_sensors(in); //activate_net(net); /*-- activate the network based on the input --*/ //Activate the net //If it loops, exit returning only fitness of 1 step if (!(net->activate())) return 1; /*-- decide which way to push via which output unit is greater --*/ out_iter=net->outputs.begin(); out1=(*out_iter)->activation; ++out_iter; out2=(*out_iter)->activation; if (out1 > out2) y = 0; else y = 1; /*--- Apply action to the simulated cart-pole ---*/ cart_pole(y, &x, &x_dot, &theta, &theta_dot); /*--- Check for failure. If so, return steps ---*/ if (x < -2.4 || x > 2.4 || theta < -twelve_degrees || theta > twelve_degrees) return steps; } return steps; }
int main(int argc, char *argv[]) { float x, /* cart position, meters */ x_dot, /* cart velocity */ theta, /* pole angle, radians */ theta_dot; /* pole angular velocity */ int action; /* 0 for push-left, 1 for push-right */ int steps = 0; /* duration of trial, in 0.02 sec steps */ int failures = 0; /* number of failed trials */ int best_steps = 0; /* number of steps in best trial */ int best_trial = 0; /* trial number of best trial */ void reset_state(float *x, float *x_dot, float *theta, float *theta_dot); void cart_pole(int action, float *x, float *x_dot, float *theta, float *theta_dot); int fail(float x, float x_dot, float theta, float theta_dot); extern int get_action(float x, float x_dot, float theta, float theta_dot, float reinforcement); extern void reset_controller(void); /* extern void print_controller_info(); */ printf("Driver: %s\n", rcs_driver_id); if (TILTED) printf("Pole will have tilted reset,"); else printf("Pole has normal reset,"); if (JUPITER_GRAV) printf(" and \"Jupiter\" gravity.\n"); else printf(" and normal gravity.\n"); if (ECHO_STATE) { echo_file = fopen("poledata", "w"); if (echo_file == NULL) printf("ERROR: Cannot open \"poledata\" for output.\n"); } if (argc > 1) RND_SEED = atoi(argv[1]); else RND_SEED = 0; reset_state(&x, &x_dot, &theta, &theta_dot); /*--- Iterate through the action-learn loop. ---*/ while (steps++ < MAX_STEPS && failures < MAX_FAILURES) { action = get_action(x, x_dot, theta, theta_dot, 0.0); /*--- Apply action to the simulated cart-pole ---*/ cart_pole(action, &x, &x_dot, &theta, &theta_dot); if (fail(x, x_dot, theta, theta_dot)) { failures++; printf("Trial %d was %d steps.\n", failures, steps); if (steps > best_steps) { best_steps = steps; best_trial = failures; } /* Call controller with negative feedback for learning */ get_action(x, x_dot, theta, theta_dot, -1.0); reset_controller(); reset_state(&x, &x_dot, &theta, &theta_dot); steps = 0; } } /* Diagnose result */ if (failures == MAX_FAILURES) { printf("Pole not balanced. Stopping after %d failures.\n",failures); printf("High water mark: %d steps in trial %d.\n\n", best_steps, best_trial); } else printf("Pole balanced successfully for at least %d steps in trial %d.\n\n", steps - 1, failures + 1); /* print_controller_info();*/ if (echo_file != NULL) fclose(echo_file); return 0; }
int main() { int action,box,i; long success,trial; double x, x_dot, theta, theta_dot,reinf,predicted_value; FILE *fptr; FILE *fptr1; fptr=fopen("rand_restart.txt","w"); fptr1=fopen("output.csv","w"); x=x_dot=theta=theta_dot=rnd(-BETA,BETA); double angle; success=0; trial=1; reinf=0.0; double force; double j,k; double best_ALPHA=0; double best_GAMMA=0; while (success<1000000) /* If the pole doesn't fall during 1-million trials,assume it succcess.*/ { //getchar(); action=get_action(x,x_dot,theta,theta_dot,reinf); cart_pole(action,&x,&x_dot,&theta,&theta_dot); //printf("%d") if(action==0) force=10; else if(action==1) force=5; else if(action==2) force=0; else if(action==3) force=-5; else force=-10; fprintf(fptr1,"%.2f,%.2f,%.2f,%.2f,%f\n",x,theta,x_dot,theta_dot,force); angle=theta*180/3.1415926; //printf("x%.2f,angle%.2f,%.2f,%.2f,%d\n",x,angle,x_dot,theta_dot,action); box=get_box(x,x_dot,theta,theta_dot); if (box==-1) { reinf=-1.0; predicted_value = 0.0; q_val[prev_state][prev_action] += ALPHA * (reinf + GAMMA * predicted_value - q_val[prev_state][prev_action]); reset_controller(); x=x_dot=theta=theta_dot=rnd(-BETA,BETA); trial++; //printf("At %d success ,try %d trials\n",success,trial); printf("At trial %d : success--->%d (mean last how long)\n",trial,success); fprintf(fptr,"trials%d\t success%d\n",trial,success); success=0; }else{ success++; reinf=0.0; /*if(success>1000000-2) { printf("asfasdfasdf"); break; }*/ } } printf("If success > 1000000 \n Success at %d trials \n",trial); for (i=0;i<NUM_BOXES;i++) fprintf(fptr,"%g %g\n",q_val[i][0],q_val[i][1]); fclose(fptr); fclose(fptr1); system("pause"); }
main() { float x, /* cart position, meters */ x_dot, /* cart velocity */ theta, /* pole angle, radians */ theta_dot; /* pole angular velocity */ vector w, /* vector of action weights */ v, /* vector of critic weights */ e, /* vector of action weight eligibilities */ xbar; /* vector of critic weight eligibilities */ float p, oldp, rhat, r; int box, i, y, steps = 0, failures=0, failed; printf("Seed? "); scanf("%d",&i); srand(i); //srand (time(NULL)); /*--- Initialize action and heuristic critic weights and traces. ---*/ for (i = 0; i < N_BOXES; i++) w[i] = v[i] = xbar[i] = e[i] = 0.0; /*--- Starting state is (0 0 0 0) ---*/ x = x_dot = theta = theta_dot = 0.0; /*--- Find box in state space containing start state ---*/ box = get_box(x, x_dot, theta, theta_dot); /*--- Iterate through the action-learn loop. ---*/ while (steps++ < MAX_STEPS && failures < MAX_FAILURES) { /*--- Choose action randomly, biased by current weight. ---*/ y = (random < prob_push_right(w[box])); /*--- Update traces. ---*/ e[box] += (1.0 - LAMBDAw) * (y - 0.5); xbar[box] += (1.0 - LAMBDAv); /*--- Remember prediction of failure for current state ---*/ oldp = v[box]; /*--- Apply action to the simulated cart-pole ---*/ cart_pole(y, &x, &x_dot, &theta, &theta_dot); /*--- Get box of state space containing the resulting state. ---*/ box = get_box(x, x_dot, theta, theta_dot); if (box < 0) { /*--- Failure occurred. ---*/ failed = 1; failures++; printf("Trial %d was %d steps.\n", failures, steps); steps = 0; /*--- Reset state to (0 0 0 0). Find the box. ---*/ x = x_dot = theta = theta_dot = 0.0; box = get_box(x, x_dot, theta, theta_dot); /*--- Reinforcement upon failure is -1. Prediction of failure is 0. ---*/ r = -1.0; p = 0.; } else { /*--- Not a failure. ---*/ failed = 0; /*--- Reinforcement is 0. Prediction of failure given by v weight. ---*/ r = 0; p= v[box]; } /*--- Heuristic reinforcement is: current reinforcement + gamma * new failure prediction - previous failure prediction ---*/ rhat = r + GAMMA * p - oldp; for (i = 0; i < N_BOXES; i++) { /*--- Update all weights. ---*/ w[i] += ALPHA * rhat * e[i]; v[i] += BETA * rhat * xbar[i]; if (v[i] < -1.0) v[i] = v[i]; if (failed) { /*--- If failure, zero all traces. ---*/ e[i] = 0.; xbar[i] = 0.; } else { /*--- Otherwise, update (decay) the traces. ---*/ e[i] *= LAMBDAw; xbar[i] *= LAMBDAv; } } } if (failures == MAX_FAILURES) printf("Pole not balanced. Stopping after %d failures.",failures); else printf("Pole balanced successfully for at least %d steps\n", steps); }