/*
 * wls_cpu - one weighted least-squares solve on the CPU.
 *
 * Steps, in order:
 *   1. XTWX()    fills the (y_rows+y_cols-1) x (y_rows+y_cols-1) matrix xtwx.
 *   2. XTWXinv() is applied to xtwx (expected to leave the inverse in xtwx;
 *                confirm against its definition).
 *   3. XTWY()    fills the vector xtwy.
 *   4. out_beta_cpu[i] = sum_j xtwx[j*n + i] * xtwy[j], n = y_rows+y_cols-1.
 *
 * Parameters:
 *   y_cols, y_rows  dimensions forwarded to the helpers
 *   wts             weights, forwarded to XTWX() and XTWY()
 *   y               observations, forwarded to XTWY()
 *   out_beta_cpu    output; entries [0, y_rows+y_cols-1) are overwritten
 *
 * The scratch buffers are owned and released by this function.  If one of
 * them cannot be allocated, a message is written to stderr and out_beta_cpu
 * is left untouched.
 */
void wls_cpu(int_t y_cols, int_t y_rows, double* wts, double* y, double* out_beta_cpu){
    const int_t n_params = y_rows + y_cols - 1;
    double *xtwx;
    double *xtwy;
    int_t i, j;

    /* Nothing to solve; also avoids a zero-sized allocation request. */
    if (n_params <= 0) {
        return;
    }

    /* Size is computed in size_t so the square cannot overflow int_t. */
    xtwx = (double *)calloc((size_t)n_params * (size_t)n_params, sizeof(double));
    /* One extra slot compared with n_params, exactly as callers of XTWY() allocate it. */
    xtwy = (double *)calloc((size_t)n_params + 1, sizeof(double));
    if (xtwx == NULL || xtwy == NULL) {
        fprintf(stderr, "wls_cpu: out of memory\n");
        free(xtwx);
        free(xtwy);
        return;
    }

    XTWX(y_rows, y_cols, wts, xtwx);
    XTWXinv(y_rows, y_cols, xtwx);
    XTWY(y_rows, y_cols, wts, y, xtwy);

    /* out_beta = xtwx (after XTWXinv) times xtwy, reading column i of xtwx. */
    for (i = 0; i < n_params; i++) {
        double acc = 0.0;
        for (j = 0; j < n_params; j++) {
            acc += xtwx[j * n_params + i] * xtwy[j];
        }
        out_beta_cpu[i] = acc;
    }

    free(xtwx);
    free(xtwy);
}
int main(){ struct timeval start, end; long utime; int_t y_rows = Y_ROWS; int_t y_cols = Y_COLS; double *wts = (double *)calloc((y_rows*y_cols),sizeof(double)); double *y = (double *)calloc((y_rows*y_cols),sizeof(double)); printf("testing here\n"); #ifdef CPU_COMPUTE double *xtwx = (double *)calloc((y_rows+y_cols-1)*(y_rows+y_cols-1),sizeof(double)); double *xtwy = (double *)calloc((y_rows+y_cols),sizeof(double)); double *out_beta = (double *)calloc((y_rows+y_cols),sizeof(double)); #endif double *out_beta_gpu = (double *)calloc((y_rows+y_cols),sizeof(double)); int_t i,j; /* initialize random seed: */ srand (time(NULL)); for(j=0; j<y_cols; j++){ for(i=0; i<y_rows; i++){ wts[j*y_rows+i] = rand()%(RANDOM_MAX - RANDOM_MIN + 1) + RANDOM_MIN; wts[j*y_rows+i]=1.0/wts[j*y_rows+i]; y[j*y_rows+i] = rand()%(RANDOM_MAX - RANDOM_MIN + 1) + RANDOM_MIN; y[j*y_rows+i]=1.0/y[j*y_rows+i]; } } #ifdef CPU_COMPUTE printf("\n Starting CPU Computation\n"); gettimeofday(&start, NULL); //XTWX start XTWX(y_rows,y_cols,wts,xtwx); printf("\nXTWX done\n"); fflush(stdout); //XTWXinv XTWXinv(y_rows, y_cols,xtwx); printf("\nXTWXinv done\n"); fflush(stdout); //XTWY XTWY(y_rows, y_cols, wts,y, xtwy); printf("\nXTWY done\n"); fflush(stdout); //OUT_BETA for (i=0;i < y_rows+y_cols-1; i++){ out_beta[i] = 0.0; for (j=0;j < y_rows+y_cols -1; j++){ out_beta[i] += xtwx[j*(y_rows+y_cols -1)+i]*xtwy[j]; } } gettimeofday(&end, NULL); utime = ((end.tv_sec - start.tv_sec) * 1000000 + end.tv_usec - start.tv_usec); printf("\n CPU Computation done \n"); printf("\nTime CPU = %ld us\n",utime); #endif printf("\n Starting GPU Computation\n"); //GPU start gettimeofday(&start, NULL); wls_gpu(y_cols, y_rows, wts, y, out_beta_gpu); gettimeofday(&end, NULL); utime = ((end.tv_sec - start.tv_sec) * 1000000 + end.tv_usec - start.tv_usec); printf("\n GPU Computation done \n"); printf("\nTime GPU = %ld us\n",utime); #if 0 //check int_t M_size = y_cols+y_rows-1; //check for A double Ainv_err = 0.0; for(i=0; i<y_cols; i++){ 
Ainv_err+=fabs(((1.0/h_Ainv[i]) - xtwx[i*M_size+i])/xtwx[i*M_size+i]); } Ainv_err/=y_cols; printf("\nError Ainv = %e\n",Ainv_err); //check for B double B_err = 0.0; for(j=0; j<(y_rows-1); j++){ for(i=0; i<y_cols; i++){ B_err+=fabs((h_B[j*y_cols+i] - xtwx[(y_cols+j)*M_size+i])/xtwx[(y_cols+j)*M_size+i]); } } B_err/=(y_cols*(y_rows-1)); printf("\nError B = %e\n",B_err); //check for D double D_err = 0.0; for(j=0; j<(y_rows-1); j++){ for(i=0; i<(y_rows-1); i++){ D_err+=fabs((h_D[j*(y_rows-1)+i] - xtwx[(y_cols+j)*M_size+y_cols+i])/xtwx[(y_cols+j)*M_size+y_cols+i]); } } D_err/=((y_rows-1)*(y_rows-1)); printf("\nError D = %e\n",D_err); //check for Q double Q_err = 0.0; for(j=0; j<(y_rows-1); j++){ for(i=0; i<y_cols; i++){ Q_err+=fabs((h_Q[j*y_cols+i] - xtwx[(y_cols+j)*M_size+i])/xtwx[(y_cols+j)*M_size+i]); } } Q_err/=(y_cols*(y_rows-1)); printf("\nError Q = %e\n",Q_err); //check for S double S_err = 0.0; for(j=0; j<(y_rows-1); j++){ for(i=0; i<(y_rows-1); i++){ S_err+=fabs((h_S[j*(y_rows-1)+i] - xtwx[(y_cols+j)*M_size+y_cols+i])/xtwx[(y_cols+j)*M_size+y_cols+i]); } } S_err/=((y_rows-1)*(y_rows-1)); printf("\nError S = %e\n",S_err); #endif #ifdef CPU_COMPUTE //check for out_beta double out_beta_err = 0.0; for(j=0; j<(y_rows+y_cols-1); j++){ out_beta_err+=fabs((out_beta_gpu[j] - out_beta[j])/out_beta[j]); } out_beta_err/=(y_rows+y_cols-1); printf("\nError Out_beta_error = %e\n",out_beta_err); #endif free(wts); free(y); #ifdef CPU_COMPUTE free(xtwx); free(xtwy); free(out_beta); #endif free(out_beta_gpu); return 0; }
/*
 * XTWY_R - pointer-argument front end for XTWY().
 *
 * Reads the two dimensions through their pointers and forwards them by value,
 * together with the unchanged buffer pointers, to XTWY().  The all-pointer
 * signature looks intended for callers that can only pass addresses (e.g. a
 * foreign-function interface) - confirm against the call sites.
 *
 *   rows, cols    pointers to the matrix dimensions (each read once)
 *   out_weights   weights buffer, forwarded as is
 *   y             observations buffer, forwarded as is
 *   xtwy          output buffer, forwarded as is
 */
void XTWY_R(int *rows, int *cols, double *out_weights, double *y,double *xtwy)
{
    const int n_rows = *rows;
    const int n_cols = *cols;

    XTWY(n_rows, n_cols, out_weights, y, xtwy);
}
/*
 * rlm_wfit_anova_engine - iteratively reweighted least squares for a
 * row-plus-column effects model, with prior weights w.
 *
 * y is indexed as y[j*y_rows + i] (row i, column j); resids and weights use
 * the same layout.
 *
 * Procedure as implemented below:
 *   1. Unless `initialized` is non-zero, the working weights start as w.
 *   2. Starting values: a weighted column sweep followed by a weighted row
 *      sweep of y.  out_beta[0 .. y_cols-1] receives the column estimates and
 *      out_beta[y_cols .. y_cols+y_rows-2] the first y_rows-1 row estimates.
 *   3. Up to max_iter iterations:
 *        - scale = med_abs(resids)/0.6745 when *input_scale < 0, otherwise
 *          *input_scale; iteration stops if |scale| < 1e-10;
 *        - weights = w * PsiFn(resid/scale, psi_k, 0);
 *        - weighted least squares via XTWX(), XTWXinv(), XTWY();
 *        - residuals are recomputed; the last row uses minus the sum of the
 *          other row effects as its effect;
 *        - iteration stops when irls_delta(old, new) < 1e-4.
 *   4. The final scale is written back to input_scale[0].
 *
 * Parameters:
 *   y             observations, y_rows * y_cols values
 *   y_rows,y_cols dimensions
 *   input_scale   in: fixed scale, or negative to estimate it; out: scale used
 *   w             prior weights, y_rows * y_cols values
 *   out_beta      output coefficients, y_rows + y_cols - 1 values
 *   out_resids    output residuals (also used as working storage)
 *   out_weights   output weights (also used as working storage; read as the
 *                 starting weights when `initialized` is non-zero)
 *   PsiFn, psi_k  weight function and its tuning constant
 *   max_iter      iteration cap
 *   initialized   non-zero if out_weights already holds starting weights
 */
void rlm_wfit_anova_engine(double *y, int y_rows, int y_cols, double *input_scale, double *w, double *out_beta, double *out_resids, double *out_weights,double (* PsiFn)(double, double, int), double psi_k,int max_iter, int initialized){

  int i,j,iter;
  /* double tol = 1e-7; */
  const double acc = 1e-4;
  double scale = 0.0;
  double conv;
  double endprobe;
  double sumweights;

  /* Element count is an integer quantity; it serves as loop bound and as the
     length handed to med_abs() and irls_delta(). */
  const int n_obs = y_rows*y_cols;
  const int n_params = y_rows + y_cols - 1;

  double *wts = out_weights;
  double *resids = out_resids;

  double *old_resids = Calloc(n_obs,double);
  double *rowmeans = Calloc(y_rows,double);
  double *xtwx = Calloc(n_params*n_params,double);
  double *xtwy = Calloc((y_rows+y_cols),double);

  if (!initialized){
    /* initially use the prior weights */
    for (i=0; i < n_obs; i++){
      wts[i] = w[i]*1.0;
    }
  }

  /* starting matrix */
  for (i=0; i < y_rows; i++){
    for (j=0; j < y_cols; j++){
      resids[j*y_rows + i] = y[j*y_rows + i];
    }
  }

  /* sweep columns (ie chip effects) */
  for (j=0; j < y_cols; j++){
    out_beta[j] = 0.0;
    sumweights = 0.0;
    for (i=0; i < y_rows; i++){
      out_beta[j] += wts[j*y_rows + i]* resids[j*y_rows + i];
      sumweights += wts[j*y_rows + i];
    }
    out_beta[j]/=sumweights;
    for (i=0; i < y_rows; i++){
      resids[j*y_rows + i] = resids[j*y_rows + i] - out_beta[j];
    }
  }

  /* sweep rows (ie probe effects) */
  for (i=0; i < y_rows; i++){
    rowmeans[i] = 0.0;
    sumweights = 0.0;
    for (j=0; j < y_cols; j++){
      rowmeans[i] += wts[j*y_rows + i]* resids[j*y_rows + i];
      sumweights += wts[j*y_rows + i];
    }
    rowmeans[i]/=sumweights;
    for (j=0; j < y_cols; j++){
      resids[j*y_rows + i] = resids[j*y_rows + i] - rowmeans[i];
    }
  }

  /* only the first y_rows-1 row effects are stored as coefficients */
  for (i=0; i < y_rows-1; i++){
    out_beta[i+y_cols] = rowmeans[i];
  }

  for (iter = 0; iter < max_iter; iter++){

    if (*input_scale < 0){
      scale = med_abs(resids,n_obs)/0.6745;
    } else {
      scale = *input_scale;
    }

    if (fabs(scale) < 1e-10){
      /*printf("Scale too small \n"); */
      break;
    }

    for (i=0; i < n_obs; i++){
      old_resids[i] = resids[i];
    }

    for (i=0; i < n_obs; i++){
      wts[i] = w[i]*PsiFn(resids[i]/scale,psi_k,0);  /* psi_huber(resids[i]/scale,k,0); */
    }

    /* printf("%f\n",scale); */

    /* weighted least squares */
    memset(xtwx,0,(size_t)n_params*(size_t)n_params*sizeof(double));

    XTWX(y_rows,y_cols,wts,xtwx);
    XTWXinv(y_rows, y_cols,xtwx);
    XTWY(y_rows, y_cols, wts,y, xtwy);

    for (i=0; i < n_params; i++){
      out_beta[i] = 0.0;
      for (j=0; j < n_params; j++){
        out_beta[i] += xtwx[j*n_params+i]*xtwy[j];
      }
    }

    /* residuals for all rows except the last */
    for (i=0; i < y_rows-1; i++){
      for (j=0; j < y_cols; j++){
        resids[j*y_rows +i] = y[j*y_rows + i]- (out_beta[j] + out_beta[i + y_cols]);
      }
    }

    /* The last row's effect is minus the sum of the stored row effects.  The
       sum does not depend on the column, so compute it once per iteration. */
    endprobe = 0.0;
    for (i=0; i < y_rows-1; i++){
      endprobe += out_beta[i + y_cols];
    }
    for (j=0; j < y_cols; j++){
      resids[j*y_rows + y_rows-1] = y[j*y_rows + y_rows-1]- (out_beta[j] - endprobe);
    }

    /* check convergence based on residuals */
    conv = irls_delta(old_resids,resids, n_obs);

    if (conv < acc){
      /* printf("Converged \n");*/
      break;
    }
  }

  /* report the scale that corresponds to the final residuals */
  if (*input_scale < 0){
    scale = med_abs(resids,n_obs)/0.6745;
  } else {
    scale = *input_scale;
  }

  Free(xtwx);
  Free(xtwy);
  Free(old_resids);
  Free(rowmeans);

  input_scale[0] = scale;
}