int doAllFeatures() { int cloop=0; /* Initial weight factors */ int i, j, h, c, r; for (i=0; i < NMOVIES; i++) { for (r=0; r<5; r++) { for (c=0; c < NFACTORS; c++) { Aic[i][r][c] = 0.02 * randn() - 0.01; // Normal Distribution } } } for (c=0; c < NFACTORS; c++) { for (j=0; j < TOTAL_FEATURES; j++) { //vishid[j][0][i] = 0.02 * randn() - 0.01; // Normal Distribution //vishid[j][1][i] = 0.02 * randn() - 0.01; // Normal Distribution //vishid[j][2][i] = 0.02 * randn() - 0.01; // Normal Distribution //vishid[j][3][i] = 0.02 * randn() - 0.01; // Normal Distribution //vishid[j][4][i] = 0.02 * randn() - 0.01; // Normal Distribution Bcj[c][j] = 0.2/3.0 * randn() - 0.1/3.0; // Normal Distribution } } /* Initial biases */ for(i=0;i<TOTAL_FEATURES;i++) { hidbiases[i]=0.0; } for (j=0; j<NMOVIES; j++) { unsigned int mtot = moviercount[j*5+0] + moviercount[j*5+1] + moviercount[j*5+2] + moviercount[j*5+3] + moviercount[j*5+4]; for (i=0; i<5; i++) { visbiases[j][i] = log( ((double)moviercount[j*5+i]) / ((double) mtot) ); //printf("mrc: %d, mc %d, log:%f frac: %f\n", moviercount[j*5+i], moviecount[j] , log( moviercount[j*5+i] /(double) moviecount[j]), //(moviercount[j*5+i] /(double) moviecount[j]) ); } } /* Optimize current feature */ double nrmse=2., last_rmse=10.; double prmse = 0, last_prmse=0; double s; //double s2; int n; int loopcount=0; double EpsilonW = epsilonw; double EpsilonVB = epsilonvb; double EpsilonHB = epsilonhb; double Momentum = momentum; ZERO(Ainc); ZERO(Binc); ZERO(visbiasinc); ZERO(hidbiasinc); int tSteps = 1; //while ( ((nrmse < (last_rmse-E) && prmse<last_prmse) || loopcount < 14) && loopcount < 80 ) { while ( ((nrmse < (last_rmse-E) ) || loopcount < 14) && loopcount < 80 ) { //if ( loopcount >= 10 ) //tSteps = 1 + loopcount / 5; last_rmse=nrmse; last_prmse=prmse; clock_t t0=clock(); loopcount++; int ntrain = 0; nrmse = 0.0; s = 0.0; //s2 = 0.0; n = 0; if ( loopcount > 5 ) Momentum = finalmomentum; //* CDpos =0, CDneg=0 (matrices) ZERO(Apos); ZERO(Aneg); ZERO(Bpos); ZERO(Bneg); ZERO(poshidact); ZERO(neghidact); ZERO(posvisact); ZERO(negvisact); ZERO(moviecount); int u,m, f; for(u=0;u<NUSERS;u++) { //* CDpos =0, CDneg=0 (matrices) ZERO(negvisprobs); ZERO(nvp2); //* perform steps 1 to 8 int base0=useridx[u][0]; int d0=UNTRAIN(u); int dall=UNALL(u); // For all rated movies, accumulate contributions to hidden units double sumW[TOTAL_FEATURES]; ZERO(sumW); for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; moviecount[m]++; // 1. get one data point from data set. // 2. use values of this data point to set state of visible neurons Si int r=(userent[base0+j]>>USER_LMOVIEMASK)&7; // Add to the bias contribution for set visible units posvisact[m][r] += 1.0; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { // sum_j(W[i][j] * v[0][j])) //sumW[h] += vishid[m][r][h]; sumW[h] += Wij(m,r,h); } } // Sample the hidden units state after computing probabilities for(h=0;h<TOTAL_FEATURES;h++) { // 3. compute Sj for each hidden neuron based on formula above and states of visible neurons Si // poshidprobs[h] = 1./(1 + exp(-V*vishid - hidbiases); // compute Q(h[0][i] = 1 | v[0]) # for binomial units, sigmoid(b[i] + sum_j(W[i][j] * v[0][j])) poshidprobs[h] = 1.0/(1.0 + exp(-sumW[h] - hidbiases[h])); // sample h[0][i] from Q(h[0][i] = 1 | v[0]) if ( poshidprobs[h] > (rand()/(double)(RAND_MAX)) ) { poshidstates[h]=1; poshidact[h] += 1.0; } else { poshidstates[h]=0; } //poshidact[h] += poshidprobs[h]; } // Load up a copy of poshidstates for use in loop for ( h=0; h < TOTAL_FEATURES; h++ ) curposhidstates[h] = poshidstates[h]; // Make T Contrastive Divergence steps int stepT = 0; do { // Determine if this is the last pass through this loop int finalTStep = (stepT+1 >= tSteps); // 5. on visible neurons compute Si using the Sj computed in step3. This is known as reconstruction // for all visible units j: int r; int count = d0; count += useridx[u][1]; // too compute probe errors for(j=0;j<count;j++) { int m=userent[base0+j]&USER_MOVIEMASK; for(h=0;h<TOTAL_FEATURES;h++) { if ( curposhidstates[h] == 1 ) { for(r=0;r<5;r++) { //negvisprobs[m][r] += vishid[m][r][h]; negvisprobs[m][r] += Wij(m,r,h); } } //for(r=0;r<5;r++) //negvisprobs[m][r] += poshidprobs[h] * vishid[m][r][h]; if ( loopcount >= 10 ) { for(r=0;r<5;r++) //nvp2[m][r] += poshidprobs[h] * vishid[m][r][h]; nvp2[m][r] += poshidprobs[h] * Wij(m,r,h); } } // compute P(v[1][j] = 1 | h[0]) # for binomial units, sigmoid(c[j] + sum_i(W[i][j] * h[0][i])) negvisprobs[m][0] = 1./(1 + exp(-negvisprobs[m][0] - visbiases[m][0])); negvisprobs[m][1] = 1./(1 + exp(-negvisprobs[m][1] - visbiases[m][1])); negvisprobs[m][2] = 1./(1 + exp(-negvisprobs[m][2] - visbiases[m][2])); negvisprobs[m][3] = 1./(1 + exp(-negvisprobs[m][3] - visbiases[m][3])); negvisprobs[m][4] = 1./(1 + exp(-negvisprobs[m][4] - visbiases[m][4])); // Normalize probabilities double tsum = negvisprobs[m][0] + negvisprobs[m][1] + negvisprobs[m][2] + negvisprobs[m][3] + negvisprobs[m][4]; if ( tsum != 0 ) { negvisprobs[m][0] /= tsum; negvisprobs[m][1] /= tsum; negvisprobs[m][2] /= tsum; negvisprobs[m][3] /= tsum; negvisprobs[m][4] /= tsum; } if ( loopcount >= 10 ) { nvp2[m][0] = 1./(1 + exp(-nvp2[m][0] - visbiases[m][0])); nvp2[m][1] = 1./(1 + exp(-nvp2[m][1] - visbiases[m][1])); nvp2[m][2] = 1./(1 + exp(-nvp2[m][2] - visbiases[m][2])); nvp2[m][3] = 1./(1 + exp(-nvp2[m][3] - visbiases[m][3])); nvp2[m][4] = 1./(1 + exp(-nvp2[m][4] - visbiases[m][4])); double tsum2 = nvp2[m][0] + nvp2[m][1] + nvp2[m][2] + nvp2[m][3] + nvp2[m][4]; if ( tsum2 != 0 ) { nvp2[m][0] /= tsum2; nvp2[m][1] /= tsum2; nvp2[m][2] /= tsum2; nvp2[m][3] /= tsum2; nvp2[m][4] /= tsum2; } } // sample v[1][j] from P(v[1][j] = 1 | h[0]) double randval = (rand()/(double)(RAND_MAX)); if ( (randval -= negvisprobs[m][0]) <= 0.0 ) negvissoftmax[m] = 0; else if ( (randval -= negvisprobs[m][1]) <= 0.0 ) negvissoftmax[m] = 1; else if ( (randval -= negvisprobs[m][2]) <= 0.0 ) negvissoftmax[m] = 2; else if ( (randval -= negvisprobs[m][3]) <= 0.0 ) negvissoftmax[m] = 3; else //if ( (randval -= negvisprobs[m][4]) <= 0.0 ) negvissoftmax[m] = 4; //negvisact[m*5+0] += negvisprobs[m*5+0]; //negvisact[m*5+1] += negvisprobs[m*5+1]; //negvisact[m*5+2] += negvisprobs[m*5+2]; //negvisact[m*5+3] += negvisprobs[m*5+3]; //negvisact[m*5+4] += negvisprobs[m*5+4]; // if in training data then train on it if ( j < d0 && finalTStep ) negvisact[m][negvissoftmax[m]] += 1.0; } // 6. compute state of hidden neurons Sj again using Si from 5 step. // For all rated movies accumulate contributions to hidden units from sampled visible units ZERO(sumW); for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { //sumW[h] += vishid[m][negvissoftmax[m]][h]; sumW[h] += Wij(m,negvissoftmax[m],h); //sumW[h] += vishid[m][0][h] * negvisprobs[m*5+0]; //sumW[h] += vishid[m][1][h] * negvisprobs[m*5+1]; //sumW[h] += vishid[m][2][h] * negvisprobs[m*5+2]; //sumW[h] += vishid[m][3][h] * negvisprobs[m*5+3]; //sumW[h] += vishid[m][4][h] * negvisprobs[m*5+4]; } } // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { // compute Q(h[1][i] = 1 | v[1]) # for binomial units, sigmoid(b[i] + sum_j(W[i][j] * v[1][j])) neghidprobs[h] = 1./(1 + exp(-sumW[h] - hidbiases[h])); // Experimentally sample the hidden units state again TODO: What is best? if ( neghidprobs[h] > (rand()/(double)(RAND_MAX)) ) { neghidstates[h]=1; if ( finalTStep ) neghidact[h] += 1.0; } else { neghidstates[h]=0; } //if ( finalTStep ) //neghidact[h] += neghidprobs[h]; } // Compute error rmse and prmse before we start iterating on T if ( stepT == 0 ) { // Compute rmse on training data for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; int r=(userent[base0+j]>>USER_LMOVIEMASK)&7; //# Compute some error function like sum of squared difference between Si in 1) and Si in 5) if ( loopcount < 10 ) { double expectedV = negvisprobs[m][1] + 2.0 * negvisprobs[m][2] + 3.0 * negvisprobs[m][3] + 4.0 * negvisprobs[m][4]; double vdelta = (((double)r)-expectedV); nrmse += (vdelta * vdelta); } else { double expectedV = nvp2[m][1] + 2.0 * nvp2[m][2] + 3.0 * nvp2[m][3] + 4.0 * nvp2[m][4]; double vdelta = (((double)r)-expectedV); nrmse += (vdelta * vdelta); } } ntrain+=d0; // Sum up probe rmse int base=useridx[u][0]; for(i=1;i<2;i++) base+=useridx[u][i]; int d=useridx[u][2]; for(i=0; i<d;i++) { int m=userent[base+i]&USER_MOVIEMASK; int r=(userent[base+i]>>USER_LMOVIEMASK)&7; //# Compute some error function like sum of squared difference between Si in 1) and Si in 5) if ( loopcount < 10 ) { double expectedV = negvisprobs[m][1] + 2.0 * negvisprobs[m][2] + 3.0 * negvisprobs[m][3] + 4.0 * negvisprobs[m][4]; double vdelta = (((double)r)-expectedV); s+=vdelta*vdelta; } else { double expectedV = nvp2[m][1] + 2.0 * nvp2[m][2] + 3.0 * nvp2[m][3] + 4.0 * nvp2[m][4]; double vdelta = (((double)r)-expectedV); s+=vdelta*vdelta; } } n+=d; } // If looping again, load the curposvisstates if ( !finalTStep ) { for ( h=0; h < TOTAL_FEATURES; h++ ) curposhidstates[h] = neghidstates[h]; ZERO(negvisprobs); } // 8. repeating multiple times steps 5,6 and 7 compute (Si.Sj)n. Where n is small number and can // increase with learning steps to achieve better accuracy. } while ( ++stepT < tSteps ); // Accumulate contrastive divergence contributions for (Si.Sj)0 and (Si.Sj)T for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; int r=(userent[base0+j]>>USER_LMOVIEMASK)&7; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { if ( poshidstates[h] == 1 ) { // 4. now Si and Sj values can be used to compute (Si.Sj)0 here () means just values not average //* accumulate CDpos = CDpos + (Si.Sj)0 //CDpos[m][r][h] += 1.0; for (c=0; c < NFACTORS; c++) { Apos[m][r][c] += Bcj[c][h]; Bpos[c][h] += Aic[m][r][c]; } } //CDpos[m][r][h] += poshidprobs[h]; // 7. now use Si and Sj to compute (Si.Sj)1 (fig.3) //TODO - This is experimental!!!!!!! //CDneg[m][negvissoftmax[m]][h] += neghidprobs[h]; //CDneg[m][negvissoftmax[m]][h] += (double)neghidstates[h]; if ( neghidstates[h] == 1 ) { for (c=0; c < NFACTORS; c++) { Aneg[m][negvissoftmax[m]][c] += Bcj[c][h]; Bneg[c][h] += Aic[m][negvissoftmax[m]][c]; } } } } // Update weights and biases after batch // //int bsize = 1000; int bsize = 100; if ( ((u+1) % bsize) == 0 || (u+1) == NUSERS ) { int numcases = u % bsize; numcases++; cloop++; //if ( numcases != bsize ) printf("u: %d, numcases: %d\n", u, numcases); // Update A factors for(m=0;m < NMOVIES;m++) { if ( moviecount[m] == 0 ) continue; // for all c factors for(c=0;c < NFACTORS; c++) { // for all softmax int rr; for(rr=0;rr<5;rr++) { //# At the end compute average of CDpos and CDneg by dividing them by number of data points. //# Compute CD = < Si.Sj >0 < Si.Sj >n = CDpos CDneg double Ap = Apos[m][rr][c]; double An = Aneg[m][rr][c]; if ( Ap != 0.0 || An != 0.0 ) { Ap /= ((double)moviecount[m]); An /= ((double)moviecount[m]); // W += epsilon * (h[0] * v[0]' - Q(h[1][.] = 1 | v[1]) * v[1]') //# Update weights and biases W = W + alpha*CD (biases are just weights to neurons that stay always 1.0) //e.g between data and reconstruction. //double preW = vishid[m][rr][h]; Ainc[m][rr][c] = Momentum * Ainc[m][rr][c] + EpsilonW * ((Ap - An) - weightcost * Aic[m][rr][c]); Aic[m][rr][c] += Ainc[m][rr][c]; //if ( cloop % 50 == 0 && c == 7 ) //printf("Aic: %f\t m: %d\t r: %d\t c: %d\n", Aic[m][rr][c], m, rr, c); //printf("W: %f preW: %f, CDp: %f, CDn: %f, m: %d, r: %d, h: %d, nhp: %f, nvp: %f\n", vishid[m][rr][h], preW, CDp, CDn, m, rr, h, //neghidprobs[h],negvisprobs[m*5+rr] //); } } } // Update visible softmax biases // c += epsilon * (v[0] - v[1])$ // for all softmax int rr; for(rr=0;rr<5;rr++) { if ( posvisact[m][rr] != 0.0 || negvisact[m][rr] != 0.0 ) { posvisact[m][rr] /= ((double)moviecount[m]); negvisact[m][rr] /= ((double)moviecount[m]); visbiasinc[m][rr] = Momentum * visbiasinc[m][rr] + EpsilonVB * ((posvisact[m][rr] - negvisact[m][rr])); //visbiasinc[m][rr] = Momentum * visbiasinc[m][rr] + EpsilonVB * ((posvisact[m][rr] - negvisact[m][rr]) - weightcost * visbiases[m][rr]); visbiases[m][rr] += visbiasinc[m][rr]; //printf("vb: %f, pa: %f, na: %f\n", visbiases[(m*5+rr)], posvisact[(m*5+rr)], negvisact[(m*5+rr)]); } } } // Update B factors for(c=0;c<NFACTORS;c++) { // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { //# At the end compute average of CDpos and CDneg by dividing them by number of data points. //# Compute CD = < Si.Sj >0 < Si.Sj >n = CDpos CDneg double Bp = Bpos[c][h]; double Bn = Bneg[c][h]; if ( Bp != 0.0 || Bn != 0.0 ) { Bp /= ((double)numcases); Bn /= ((double)numcases); // W += epsilon * (h[0] * v[0]' - Q(h[1][.] = 1 | v[1]) * v[1]') //# Update weights and biases W = W + alpha*CD (biases are just weights to neurons that stay always 1.0) //e.g between data and reconstruction. //double preW = vishid[m][rr][h]; Binc[c][h] = Momentum * Binc[c][h] + EpsilonW * ((Bp - Bn) - weightcost * Bcj[c][h]); Bcj[c][h] += Binc[c][h]; //if ( cloop % 50 == 0 && h == 7 ) //printf("Bcj: %f\t c: %d\t h: %d\n", Bcj[c][h], c, h); //printf("W: %f preW: %f, CDp: %f, CDn: %f, m: %d, r: %d, h: %d, nhp: %f, nvp: %f\n", vishid[m][rr][h], preW, CDp, CDn, m, rr, h, //neghidprobs[h],negvisprobs[m*5+rr] //); } } } // Update hidden biases // b += epsilon * (h[0] - Q(h[1][.] = 1 | v[1])) for(h=0;h<TOTAL_FEATURES;h++) { if ( poshidact[h] != 0.0 || neghidact[h] != 0.0 ) { //poshidact[h] /= ((double)(numcases*ntrain*5)); //neghidact[h] /= ((double)(numcases*ntrain*5)); poshidact[h] /= ((double)(numcases)); neghidact[h] /= ((double)(numcases)); //poshidact[h] /= ((double)(mcount)); //neghidact[h] /= ((double)(mcount)); hidbiasinc[h] = Momentum * hidbiasinc[h] + EpsilonHB * ((poshidact[h] - neghidact[h])); //hidbiasinc[h] = Momentum * hidbiasinc[h] + EpsilonHB * ((poshidact[h] - neghidact[h]) - weightcost * hidbiases[h]); hidbiases[h] += hidbiasinc[h]; //printf("hb: %f, pa: %f, na: %f, d0:%d\n", hidbiases[h], poshidact[h], neghidact[h], d0); } } ZERO(Apos); ZERO(Aneg); ZERO(Bpos); ZERO(Bneg); ZERO(poshidact); ZERO(neghidact); ZERO(posvisact); ZERO(negvisact); //ZERO(poscnt); //ZERO(negcnt); ZERO(moviecount); //mcount = 0; } } nrmse=sqrt(nrmse/ntrain); prmse = sqrt(s/n); //double prmse2 = sqrt(s2/n); //lg("%f\t%f\t%f\t%f\n",nrmse,prmse,prmse2,(clock()-t0)/(double)CLOCKS_PER_SEC); lg("%f\t%f\t%f\n",nrmse,prmse,(clock()-t0)/(double)CLOCKS_PER_SEC); if ( loopcount > 6 ) { EpsilonW *= 0.90; EpsilonVB *= 0.90; EpsilonHB *= 0.90; } //else if ( loopcount > 5 ) { //EpsilonW *= 0.82; //EpsilonVB *= 0.82; //EpsilonHB *= 0.82; //} //printf("dd: %d %d %d %d %d\n", dd[0], dd[1], dd[2], dd[3], dd[4]); } /* Perform a final iteration in which the errors are clipped and stored */ recordErrors(); //if(save_model) { //dappend_bin(fnameV,sV,NMOVIES); //dappend_bin(fnameU,sU,NUSERS); //} return 1; }
int doAllFeatures() { /* Initial weights */ int i, j, h; for (j=0; j<NMOVIES; j++) { for (i=0; i<TOTAL_FEATURES; i++) { vishid[j][0][i] = 0.02 * randn() - 0.01; // Normal Distribution vishid[j][1][i] = 0.02 * randn() - 0.01; // Normal Distribution vishid[j][2][i] = 0.02 * randn() - 0.01; // Normal Distribution vishid[j][3][i] = 0.02 * randn() - 0.01; // Normal Distribution vishid[j][4][i] = 0.02 * randn() - 0.01; // Normal Distribution } } /* Initial biases */ for(i=0;i<TOTAL_FEATURES;i++) { hidbiases[i]=0.0; } for (j=0; j<NMOVIES; j++) { unsigned int mtot = moviercount[j*SOFTMAX+0] + moviercount[j*SOFTMAX+1] + moviercount[j*SOFTMAX+2] + moviercount[j*SOFTMAX+3] + moviercount[j*SOFTMAX+4]; for (i=0; i<SOFTMAX; i++) { visbiases[j][i] = log( ((double)moviercount[j*SOFTMAX+i]) / ((double) mtot) ); } } /* Optimize current feature */ double nrmse=2., last_rmse=10.; double prmse = 0, last_prmse=0; double s; int n; int loopcount=0; double EpsilonW = epsilonw; double EpsilonVB = epsilonvb; double EpsilonHB = epsilonhb; double Momentum = momentum; ZERO(CDinc); ZERO(visbiasinc); ZERO(hidbiasinc); int tSteps = 1; // Iterate through the model while the RMSE is decreasing //while ( ((nrmse < (last_rmse-E) && prmse<last_prmse) || loopcount < 14) && loopcount < 80 ) { while ( ((nrmse < (last_rmse-E) ) || loopcount < 14) && loopcount < 80 ) { if ( loopcount >= 10 ) tSteps = 3 + (loopcount - 10)/5; last_rmse=nrmse; last_prmse=prmse; clock_t t0=clock(); loopcount++; int ntrain = 0; nrmse = 0.0; s = 0.0; n = 0; if ( loopcount > 5 ) Momentum = finalmomentum; //* CDpos =0, CDneg=0 (matrices) ZERO(CDpos); ZERO(CDneg); ZERO(poshidact); ZERO(neghidact); ZERO(posvisact); ZERO(negvisact); ZERO(moviecount); int u,m, f; for(u=0;u<NUSERS;u++) { //* Clear summations for probabilities ZERO(negvisprobs); ZERO(nvp2); //* perform steps 1 to 8 int base0=useridx[u][0]; int d0=UNTRAIN(u); int dall=UNALL(u); // For all rated movies, accumulate contributions to hidden units double sumW[TOTAL_FEATURES]; ZERO(sumW); for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; moviecount[m]++; // 1. get one data point from data set. // 2. use values of this data point to set state of visible neurons Si int r=(userent[base0+j]>>USER_LMOVIEMASK)&7; // Add to the bias contribution for set visible units posvisact[m][r] += 1.0; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { // sum_j(W[i][j] * v[0][j])) sumW[h] += vishid[m][r][h]; } } // Sample the hidden units state after computing probabilities for(h=0;h<TOTAL_FEATURES;h++) { // 3. compute Sj for each hidden neuron based on formula above and states of visible neurons Si // poshidprobs[h] = 1./(1 + exp(-V*vishid - hidbiases); // compute Q(h[0][i] = 1 | v[0]) # for binomial units, sigmoid(b[i] + sum_j(W[i][j] * v[0][j])) poshidprobs[h] = 1.0/(1.0 + exp(-sumW[h] - hidbiases[h])); // sample h[0][i] from Q(h[0][i] = 1 | v[0]) if ( poshidprobs[h] > (rand()/(double)(RAND_MAX)) ) { poshidstates[h]=1; poshidact[h] += 1.0; } else { poshidstates[h]=0; } } // Load up a copy of poshidstates for use in loop for ( h=0; h < TOTAL_FEATURES; h++ ) curposhidstates[h] = poshidstates[h]; // Make T Contrastive Divergence steps int stepT = 0; do { // Determine if this is the last pass through this loop int finalTStep = (stepT+1 >= tSteps); // 5. on visible neurons compute Si using the Sj computed in step3. This is known as reconstruction // for all visible units j: int r; int count = d0; count += useridx[u][2]; // too compute probe errors for(j=0;j<count;j++) { int m=userent[base0+j]&USER_MOVIEMASK; for(h=0;h<TOTAL_FEATURES;h++) { // Accumulate Weight values for sampled hidden states == 1 if ( curposhidstates[h] == 1 ) { for(r=0;r<SOFTMAX;r++) { negvisprobs[m][r] += vishid[m][r][h]; } } // Compute more accurate probabilites for RMSE reporting if ( stepT == 0 ) { for(r=0;r<SOFTMAX;r++) nvp2[m][r] += poshidprobs[h] * vishid[m][r][h]; } } // compute P(v[1][j] = 1 | h[0]) # for binomial units, sigmoid(c[j] + sum_i(W[i][j] * h[0][i])) // Softmax elements are handled individually here negvisprobs[m][0] = 1./(1 + exp(-negvisprobs[m][0] - visbiases[m][0])); negvisprobs[m][1] = 1./(1 + exp(-negvisprobs[m][1] - visbiases[m][1])); negvisprobs[m][2] = 1./(1 + exp(-negvisprobs[m][2] - visbiases[m][2])); negvisprobs[m][3] = 1./(1 + exp(-negvisprobs[m][3] - visbiases[m][3])); negvisprobs[m][4] = 1./(1 + exp(-negvisprobs[m][4] - visbiases[m][4])); // Normalize probabilities double tsum = negvisprobs[m][0] + negvisprobs[m][1] + negvisprobs[m][2] + negvisprobs[m][3] + negvisprobs[m][4]; if ( tsum != 0 ) { negvisprobs[m][0] /= tsum; negvisprobs[m][1] /= tsum; negvisprobs[m][2] /= tsum; negvisprobs[m][3] /= tsum; negvisprobs[m][4] /= tsum; } // Compute and Normalize more accurate RMSE reporting probabilities if ( stepT == 0) { nvp2[m][0] = 1./(1 + exp(-nvp2[m][0] - visbiases[m][0])); nvp2[m][1] = 1./(1 + exp(-nvp2[m][1] - visbiases[m][1])); nvp2[m][2] = 1./(1 + exp(-nvp2[m][2] - visbiases[m][2])); nvp2[m][3] = 1./(1 + exp(-nvp2[m][3] - visbiases[m][3])); nvp2[m][4] = 1./(1 + exp(-nvp2[m][4] - visbiases[m][4])); double tsum2 = nvp2[m][0] + nvp2[m][1] + nvp2[m][2] + nvp2[m][3] + nvp2[m][4]; if ( tsum2 != 0 ) { nvp2[m][0] /= tsum2; nvp2[m][1] /= tsum2; nvp2[m][2] /= tsum2; nvp2[m][3] /= tsum2; nvp2[m][4] /= tsum2; } } // sample v[1][j] from P(v[1][j] = 1 | h[0]) double randval = (rand()/(double)(RAND_MAX)); if ( (randval -= negvisprobs[m][0]) <= 0.0 ) negvissoftmax[m] = 0; else if ( (randval -= negvisprobs[m][1]) <= 0.0 ) negvissoftmax[m] = 1; else if ( (randval -= negvisprobs[m][2]) <= 0.0 ) negvissoftmax[m] = 2; else if ( (randval -= negvisprobs[m][3]) <= 0.0 ) negvissoftmax[m] = 3; else //if ( (randval -= negvisprobs[m][4]) <= 0.0 ) negvissoftmax[m] = 4; // if in training data then train on it if ( j < d0 && finalTStep ) negvisact[m][negvissoftmax[m]] += 1.0; } // 6. compute state of hidden neurons Sj again using Si from 5 step. // For all rated movies accumulate contributions to hidden units from sampled visible units ZERO(sumW); for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { sumW[h] += vishid[m][negvissoftmax[m]][h]; } } // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { // compute Q(h[1][i] = 1 | v[1]) # for binomial units, sigmoid(b[i] + sum_j(W[i][j] * v[1][j])) neghidprobs[h] = 1./(1 + exp(-sumW[h] - hidbiases[h])); // Sample the hidden units state again. if ( neghidprobs[h] > (rand()/(double)(RAND_MAX)) ) { neghidstates[h]=1; if ( finalTStep ) neghidact[h] += 1.0; } else { neghidstates[h]=0; } } // Compute error rmse and prmse before we start iterating on T if ( stepT == 0 ) { // Compute rmse on training data for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; int r=(userent[base0+j]>>USER_LMOVIEMASK)&7; //# Compute some error function like sum of squared difference between Si in 1) and Si in 5) double expectedV = nvp2[m][1] + 2.0 * nvp2[m][2] + 3.0 * nvp2[m][3] + 4.0 * nvp2[m][4]; double vdelta = (((double)r)-expectedV); nrmse += (vdelta * vdelta); } ntrain+=d0; // Sum up probe rmse int base=useridx[u][0]; for(i=1;i<2;i++) base+=useridx[u][i]; int d=useridx[u][2]; for(i=0; i<d;i++) { int m=userent[base+i]&USER_MOVIEMASK; int r=(userent[base+i]>>USER_LMOVIEMASK)&7; //# Compute some error function like sum of squared difference between Si in 1) and Si in 5) double expectedV = nvp2[m][1] + 2.0 * nvp2[m][2] + 3.0 * nvp2[m][3] + 4.0 * nvp2[m][4]; double vdelta = (((double)r)-expectedV); s+=vdelta*vdelta; } n+=d; } // If looping again, load the curposvisstates if ( !finalTStep ) { for ( h=0; h < TOTAL_FEATURES; h++ ) curposhidstates[h] = neghidstates[h]; ZERO(negvisprobs); } // 8. repeating multiple times steps 5,6 and 7 compute (Si.Sj)n. Where n is small number and can // increase with learning steps to achieve better accuracy. } while ( ++stepT < tSteps ); // Accumulate contrastive divergence contributions for (Si.Sj)0 and (Si.Sj)T for(j=0;j<d0;j++) { int m=userent[base0+j]&USER_MOVIEMASK; int r=(userent[base0+j]>>USER_LMOVIEMASK)&7; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { if ( poshidstates[h] == 1 ) { // 4. now Si and Sj values can be used to compute (Si.Sj)0 here () means just values not average //* accumulate CDpos = CDpos + (Si.Sj)0 CDpos[m][r][h] += 1.0; } // 7. now use Si and Sj to compute (Si.Sj)1 (fig.3) CDneg[m][negvissoftmax[m]][h] += (double)neghidstates[h]; } } // Update weights and biases after batch // int bsize = 100; if ( ((u+1) % bsize) == 0 || (u+1) == NUSERS ) { int numcases = u % bsize; numcases++; // Update weights for(m=0;m<NMOVIES;m++) { if ( moviecount[m] == 0 ) continue; // for all hidden units h: for(h=0;h<TOTAL_FEATURES;h++) { // for all softmax int rr; for(rr=0;rr<SOFTMAX;rr++) { //# At the end compute average of CDpos and CDneg by dividing them by number of data points. //# Compute CD = < Si.Sj >0 < Si.Sj >n = CDpos CDneg double CDp = CDpos[m][rr][h]; double CDn = CDneg[m][rr][h]; if ( CDp != 0.0 || CDn != 0.0 ) { CDp /= ((double)moviecount[m]); CDn /= ((double)moviecount[m]); // W += epsilon * (h[0] * v[0]' - Q(h[1][.] = 1 | v[1]) * v[1]') //# Update weights and biases W = W + alpha*CD (biases are just weights to neurons that stay always 1.0) //e.g between data and reconstruction. CDinc[m][rr][h] = Momentum * CDinc[m][rr][h] + EpsilonW * ((CDp - CDn) - weightcost * vishid[m][rr][h]); vishid[m][rr][h] += CDinc[m][rr][h]; } } } // Update visible softmax biases // c += epsilon * (v[0] - v[1])$ // for all softmax int rr; for(rr=0;rr<SOFTMAX;rr++) { if ( posvisact[m][rr] != 0.0 || negvisact[m][rr] != 0.0 ) { posvisact[m][rr] /= ((double)moviecount[m]); negvisact[m][rr] /= ((double)moviecount[m]); visbiasinc[m][rr] = Momentum * visbiasinc[m][rr] + EpsilonVB * ((posvisact[m][rr] - negvisact[m][rr])); //visbiasinc[m][rr] = Momentum * visbiasinc[m][rr] + EpsilonVB * ((posvisact[m][rr] - negvisact[m][rr]) - weightcost * visbiases[m][rr]); visbiases[m][rr] += visbiasinc[m][rr]; } } } // Update hidden biases // b += epsilon * (h[0] - Q(h[1][.] = 1 | v[1])) for(h=0;h<TOTAL_FEATURES;h++) { if ( poshidact[h] != 0.0 || neghidact[h] != 0.0 ) { poshidact[h] /= ((double)(numcases)); neghidact[h] /= ((double)(numcases)); hidbiasinc[h] = Momentum * hidbiasinc[h] + EpsilonHB * ((poshidact[h] - neghidact[h])); //hidbiasinc[h] = Momentum * hidbiasinc[h] + EpsilonHB * ((poshidact[h] - neghidact[h]) - weightcost * hidbiases[h]); hidbiases[h] += hidbiasinc[h]; } } ZERO(CDpos); ZERO(CDneg); ZERO(poshidact); ZERO(neghidact); ZERO(posvisact); ZERO(negvisact); ZERO(moviecount); } } nrmse=sqrt(nrmse/ntrain); prmse = sqrt(s/n); printf("%f\t%f\t%f\n",nrmse,prmse,(clock()-t0)/(double)CLOCKS_PER_SEC); if ( TOTAL_FEATURES == 200 ) { if ( loopcount > 6 ) { EpsilonW *= 0.90; EpsilonVB *= 0.90; EpsilonHB *= 0.90; } else if ( loopcount > 5 ) { // With 200 hidden variables, you need to slow things down a little more EpsilonW *= 0.50; // This could probably use some more optimization EpsilonVB *= 0.50; EpsilonHB *= 0.50; } else if ( loopcount > 2 ) { EpsilonW *= 0.70; EpsilonVB *= 0.70; EpsilonHB *= 0.70; } } else { // The 100 hidden variable case if ( loopcount > 8 ) { EpsilonW *= 0.92; EpsilonVB *= 0.92; EpsilonHB *= 0.92; } else if ( loopcount > 6 ) { EpsilonW *= 0.90; EpsilonVB *= 0.90; EpsilonHB *= 0.90; } else if ( loopcount > 2 ) { EpsilonW *= 0.78; EpsilonVB *= 0.78; EpsilonHB *= 0.78; } } } /* Perform a final iteration in which the errors are clipped and stored */ recordErrors(); //if(save_model) { //dappend_bin(fnameV,sV,NMOVIES); //dappend_bin(fnameU,sU,NUSERS); //} return 1; }