/* Full-search block motion estimation: for 16x16 block number ref_block_num
 * of a row-major 8-bit picture, scan a +/-16 pixel window around the block's
 * home position and report the offset with the smallest mean absolute
 * difference. Assumes MAD() compares ref_block against the 16x16 region
 * whose top-left pixel is at the given picture pointer. */
void GetVector(byte *picture, int width, int height,
               byte ref_block[16][16], int ref_block_num)
{
    int i, j, j_min, i_max, j_max, refX, refY;
    int vectorX = 0, vectorY = 0;
    int MAD0, mad;
    int blocks_per_row = width / 16;

    /* Top-left pixel of the reference block in the picture. */
    refX = (ref_block_num % blocks_per_row) * 16;
    refY = (ref_block_num / blocks_per_row) * 16;

    /* Clamp the search window so every candidate block stays inside the
     * picture: a 16x16 block starting at (i, j) needs i <= width - 16 and
     * j <= height - 16. */
    i     = refX - 16;
    j_min = refY - 16;
    i_max = refX + 16;
    j_max = refY + 16;
    if (i < 0) i = 0;
    if (j_min < 0) j_min = 0;
    if (i_max > width - 16)  i_max = width - 16;
    if (j_max > height - 16) j_max = height - 16;

    /* The score to beat: the MAD at the reference position itself. */
    MAD0 = MAD((byte *)ref_block, picture + refY * width + refX);

    for (; i <= i_max; i++) {
        for (j = j_min; j <= j_max; j++) {  /* j must restart for every column */
            mad = MAD((byte *)ref_block, picture + j * width + i);
            if (mad < MAD0) {
                MAD0 = mad;
                vectorX = i - refX;
                vectorY = j - refY;
            }
        }
    }
    printf("block %d : (%d,%d)\n", ref_block_num, vectorX, vectorY);
}
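/* GetVector() above relies on a MAD() helper that returns the mean absolute
 * difference between the reference block and a candidate block. A minimal
 * sketch under that reading follows; `picture_width` is a hypothetical
 * global, since the call sites pass only two pointers and a real
 * implementation needs the row stride of the picture from somewhere. */
static int MAD(byte *ref, byte *cand)
{
    extern int picture_width;  /* hypothetical: row stride of the picture */
    int sum = 0;
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int d = ref[y * 16 + x] - cand[y * picture_width + x];
            sum += d < 0 ? -d : d;  /* accumulate |difference| */
        }
    }
    return sum / 256;  /* mean over the 256 pixels of the block */
}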
void refine_fast_s(double **x, double *y, double *weights, int n, int p,
                   double *res, double *tmp, double *tmp2,
                   double **tmp_mat, double **tmp_mat2, double *beta_cand,
                   int kk, int conv, double b, double rhoc,
                   double *is, double *beta_ref, double *scale)
{
    /*
     * weights  = vector of length n
     * res      = vector of length n
     * x        = matrix with the data
     * y        = vector with responses
     * tmp      = aux vector of length n
     * tmp2     = aux vector of length n
     * tmp_mat  = aux matrix p x p
     * tmp_mat2 = aux matrix p x (p+1)
     */
    void fast_s_irwls(double **x, double *y, double *weights, int n, int p,
                      double *beta_ref, double **tmp_mat,
                      double *tmp, double *tmp2);
    double norm_diff(double *x, double *y, int n);
    double norm(double *x, int n);
    int lu(double **a, int *P, double *x);
    void get_weights_rhop(double *r, double s, int n, double rhoc, double *w);
    void r_sum_w_x(double **x, double *w, int n, int p,
                   double *tmp, double *sum);
    void r_sum_w_x_xprime(double **x, double *w, int n, int p,
                          double **tmp, double **ans);
    double loss_rho(double *r, double scale, int n, int p, double rhoc);
    double MAD(double *a, int n, int center, double *tmp, double *tmp2);
    double vecprime_vec(double *a, double *b, int n);

    register int i, j;
    int zeroes = 0;
    double initial_scale = *is, s0;

    for (j = 0; j < n; j++)
        if (fabs(res[j] = y[j] - vecprime_vec(x[j], beta_cand, p)) < ZERO)
            zeroes++;

    /* if "perfect fit", return it with a 0 assoc. scale */
    /* if( zeroes > (((double)n + (double)p)/2.) ) */
    if (zeroes > ((double)n / 2.)) {
        // Rprintf("\nToo many zeroes, refine_fast_s\n");
        for (i = 0; i < p; i++)
            beta_ref[i] = beta_cand[i];
        *scale = 0.0;
        return;
    }

    if (initial_scale < 0.0)
        initial_scale = MAD(res, n, 0, tmp, tmp2);
    s0 = initial_scale;

    if (conv > 0)
        kk = MAX_ITER_FAST_S;

    for (i = 0; i < kk; i++) {
        /* one step for the scale */
        s0 = s0 * sqrt(loss_rho(res, s0, n, p, rhoc) / b);
        /* compute weights for IRWLS */
        get_weights_rhop(res, s0, n, rhoc, weights);
        /* compute the matrix for IRWLS */
        r_sum_w_x_xprime(x, weights, n, p, tmp_mat, tmp_mat2);
        /* compute the vector for IRWLS */
        for (j = 0; j < n; j++)
            weights[j] = weights[j] * y[j];
        r_sum_w_x(x, weights, n, p, tmp, tmp2);
        for (j = 0; j < p; j++)
            tmp_mat2[j][p] = tmp2[j];
        /* solve the system for IRWLS */
        lu(tmp_mat2, &p, beta_ref);
        /* check for convergence */
        if (conv > 0 &&
            norm_diff(beta_cand, beta_ref, p) / norm(beta_cand, p) < EPSILON) {
            // Rprintf("\nRelative norm less than EPSILON\n");
            break;
        }
        for (j = 0; j < n; j++)
            res[j] = y[j] - vecprime_vec(x[j], beta_ref, p);
        for (j = 0; j < p; j++)
            beta_cand[j] = beta_ref[j];
    }
    *scale = s0;
}
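/* refine_fast_s() declares and uses a MAD() scale estimator: the median
 * absolute deviation of the residuals about `center`, rescaled so it is
 * consistent with the standard deviation for Gaussian data. A plausible
 * sketch under that reading follows; `my_median` is a hypothetical helper
 * (it may reorder its input, which is fine for the scratch buffers). */
double MAD(double *a, int n, int center, double *tmp, double *tmp2)
{
    double my_median(double *x, int n);  /* hypothetical median helper */
    int i;
    for (i = 0; i < n; i++)
        tmp[i] = a[i] - center;          /* deviations from the given center */
    for (i = 0; i < n; i++)
        tmp2[i] = fabs(tmp[i]);          /* absolute deviations */
    /* 1/0.6745 (about 1.4826) makes the MAD consistent with sigma
     * under normality. */
    return my_median(tmp2, n) / 0.6745;
}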
void gen8_vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
                                                    struct brw_reg dst,
                                                    struct brw_reg *src)
{
   vec4_instruction *ir = (vec4_instruction *) instruction;

   if (dst.width == BRW_WIDTH_4) {
      /* This happens in attribute fixups for "dual instanced" geometry
       * shaders, since they use attributes that are vec4's. Since the exec
       * width is only 4, it's essential that the caller set
       * force_writemask_all in order to make sure the instruction is
       * executed regardless of which channels are enabled.
       */
      assert(ir->force_writemask_all);

      /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
       * the following register region restrictions (from Graphics BSpec:
       * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
       * > Register Region Restrictions):
       *
       *    1. ExecSize must be greater than or equal to Width.
       *
       *    2. If ExecSize = Width and HorzStride != 0, VertStride must be
       *       set to Width * HorzStride.
       */
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BRW_GENERAL_REGISTER_FILE)
            src[i] = stride(src[i], 4, 4, 1);
      }
   }

   switch (ir->opcode) {
   case BRW_OPCODE_MOV:  MOV(dst, src[0]); break;
   case BRW_OPCODE_ADD:  ADD(dst, src[0], src[1]); break;
   case BRW_OPCODE_MUL:  MUL(dst, src[0], src[1]); break;
   case BRW_OPCODE_MACH: MACH(dst, src[0], src[1]); break;
   case BRW_OPCODE_MAD:  MAD(dst, src[0], src[1], src[2]); break;
   case BRW_OPCODE_FRC:  FRC(dst, src[0]); break;
   case BRW_OPCODE_RNDD: RNDD(dst, src[0]); break;
   case BRW_OPCODE_RNDE: RNDE(dst, src[0]); break;
   case BRW_OPCODE_RNDZ: RNDZ(dst, src[0]); break;
   case BRW_OPCODE_AND:  AND(dst, src[0], src[1]); break;
   case BRW_OPCODE_OR:   OR(dst, src[0], src[1]); break;
   case BRW_OPCODE_XOR:  XOR(dst, src[0], src[1]); break;
   case BRW_OPCODE_NOT:  NOT(dst, src[0]); break;
   case BRW_OPCODE_ASR:  ASR(dst, src[0], src[1]); break;
   case BRW_OPCODE_SHR:  SHR(dst, src[0], src[1]); break;
   case BRW_OPCODE_SHL:  SHL(dst, src[0], src[1]); break;
   case BRW_OPCODE_CMP:  CMP(dst, ir->conditional_mod, src[0], src[1]); break;
   case BRW_OPCODE_SEL:  SEL(dst, src[0], src[1]); break;
   case BRW_OPCODE_DPH:  DPH(dst, src[0], src[1]); break;
   case BRW_OPCODE_DP4:  DP4(dst, src[0], src[1]); break;
   case BRW_OPCODE_DP3:  DP3(dst, src[0], src[1]); break;
   case BRW_OPCODE_DP2:  DP2(dst, src[0], src[1]); break;
   case BRW_OPCODE_F32TO16: F32TO16(dst, src[0]); break;
   case BRW_OPCODE_F16TO32: F16TO32(dst, src[0]); break;
   case BRW_OPCODE_LRP:  LRP(dst, src[0], src[1], src[2]); break;

   case BRW_OPCODE_BFREV:
      /* BFREV only supports UD type for src and dst. */
      BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
            retype(src[0], BRW_REGISTER_TYPE_UD));
      break;
   case BRW_OPCODE_FBH:
      /* FBH only supports UD type for dst. */
      FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_FBL:
      /* FBL only supports UD type for dst. */
      FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_CBIT:
      /* CBIT only supports UD type for dst. */
      CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;

   case BRW_OPCODE_ADDC: ADDC(dst, src[0], src[1]); break;
   case BRW_OPCODE_SUBB: SUBB(dst, src[0], src[1]); break;
   case BRW_OPCODE_BFE:  BFE(dst, src[0], src[1], src[2]); break;
   case BRW_OPCODE_BFI1: BFI1(dst, src[0], src[1]); break;
   case BRW_OPCODE_BFI2: BFI2(dst, src[0], src[1], src[2]); break;

   case BRW_OPCODE_IF:       IF(ir->predicate); break;
   case BRW_OPCODE_ELSE:     ELSE(); break;
   case BRW_OPCODE_ENDIF:    ENDIF(); break;
   case BRW_OPCODE_DO:       DO(); break;
   case BRW_OPCODE_BREAK:    BREAK(); break;
   case BRW_OPCODE_CONTINUE: CONTINUE(); break;
   case BRW_OPCODE_WHILE:    WHILE(); break;

   case SHADER_OPCODE_RCP:  MATH(BRW_MATH_FUNCTION_INV, dst, src[0]); break;
   case SHADER_OPCODE_RSQ:  MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]); break;
   case SHADER_OPCODE_SQRT: MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]); break;
   case SHADER_OPCODE_EXP2: MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]); break;
   case SHADER_OPCODE_LOG2: MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]); break;
   case SHADER_OPCODE_SIN:  MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]); break;
   case SHADER_OPCODE_COS:  MATH(BRW_MATH_FUNCTION_COS, dst, src[0]); break;
   case SHADER_OPCODE_POW:
      MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
      MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
      break;
   case SHADER_OPCODE_INT_REMAINDER:
      MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      generate_tex(ir, dst);
      break;

   case VS_OPCODE_URB_WRITE:
      generate_urb_write(ir, true);
      break;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      generate_scratch_read(ir, dst, src[0]);
      break;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      generate_scratch_write(ir, dst, src[0], src[1]);
      break;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      generate_pull_constant_load(ir, dst, src[0], src[1]);
      break;

   case GS_OPCODE_URB_WRITE:
      generate_urb_write(ir, false);
      break;
   case GS_OPCODE_THREAD_END:
      generate_gs_thread_end(ir);
      break;
   case GS_OPCODE_SET_WRITE_OFFSET:
      generate_gs_set_write_offset(dst, src[0], src[1]);
      break;
   case GS_OPCODE_SET_VERTEX_COUNT:
      generate_gs_set_vertex_count(dst, src[0]);
      break;
   case GS_OPCODE_SET_DWORD_2_IMMED:
      generate_gs_set_dword_2_immed(dst, src[0]);
      break;
   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
      generate_gs_prepare_channel_masks(dst);
      break;
   case GS_OPCODE_SET_CHANNEL_MASKS:
      generate_gs_set_channel_masks(dst, src[0]);
      break;

   case SHADER_OPCODE_SHADER_TIME_ADD:
      assert(!"XXX: Missing Gen8 vec4 support for INTEL_DEBUG=shader_time");
      break;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
      assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_ATOMIC");
      break;
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_SURFACE_READ");
      break;
   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
      assert(!"VS_OPCODE_UNPACK_FLAGS_SIMD4X2 should not be used on Gen8+.");
      break;

   default:
      if (ir->opcode < (int) ARRAY_SIZE(opcode_descs)) {
         _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
                       opcode_descs[ir->opcode].name);
      } else {
         _mesa_problem(ctx, "Unsupported opcode %d in VS", ir->opcode);
      }
      abort();
   }
}
void RunStatsCommand(ProgramData *p, int lcindex, int threadindex, _Stats *s)
{
  int i, j, k, Npct;
  double *tmpdata = NULL;

  /* Empty light curve: report 0.0 for every requested statistic. */
  if (p->NJD[threadindex] <= 0) {
    for (i = 0, k = 0; i < s->Nvar; i++) {
      for (j = 0; j < s->Nstats; j++, k++)
        s->statsout[threadindex][k] = 0.0;
    }
    return;
  }

  if ((tmpdata = (double *) malloc(p->NJD[threadindex]*sizeof(double))) == NULL)
    error(ERR_MEMALLOC);

  for (i = 0, k = 0; i < s->Nvar; i++) {
    if (s->vars[i]->vectortype != VARTOOLS_VECTORTYPE_LC)
      error(ERR_BADVARIABLETYPE_STATSCOMMAND);

    /* Gather this variable's per-point values into the scratch vector. */
    for (j = 0; j < p->NJD[threadindex]; j++)
      tmpdata[j] = EvaluateVariable_Double(lcindex, threadindex, j, s->vars[i]);

    Npct = 0;
    for (j = 0; j < s->Nstats; j++, k++) {
      switch (s->statstocalc[j]) {
      case VARTOOLS_STATSTYPE_MEAN:
        s->statsout[threadindex][k] = getmean(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_WEIGHTEDMEAN:
        s->statsout[threadindex][k] =
          getweightedmean(p->NJD[threadindex], tmpdata, p->sig[threadindex]);
        break;
      case VARTOOLS_STATSTYPE_MEDIAN:
        s->statsout[threadindex][k] = median(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_MEDIAN_WEIGHT:
        s->statsout[threadindex][k] =
          median_weight(p->NJD[threadindex], tmpdata, p->sig[threadindex]);
        break;
      case VARTOOLS_STATSTYPE_STDDEV:
        s->statsout[threadindex][k] = stddev(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_MEDDEV:
        s->statsout[threadindex][k] = meddev(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_MEDMEDDEV:
        s->statsout[threadindex][k] = medmeddev(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_MAD:
        s->statsout[threadindex][k] = MAD(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_KURTOSIS:
        s->statsout[threadindex][k] = kurtosis(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_SKEWNESS:
        s->statsout[threadindex][k] = skewness(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_PERCENTILE:
        s->statsout[threadindex][k] =
          percentile(p->NJD[threadindex], tmpdata, s->pctval[Npct]);
        Npct++;
        break;
      case VARTOOLS_STATSTYPE_PERCENTILE_WEIGHT:
        s->statsout[threadindex][k] =
          percentile_weight(p->NJD[threadindex], tmpdata,
                            p->sig[threadindex], s->pctval[Npct]);
        Npct++;
        break;
      case VARTOOLS_STATSTYPE_MAXIMUM:
        s->statsout[threadindex][k] = getmaximum(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_MINIMUM:
        s->statsout[threadindex][k] = getminimum(p->NJD[threadindex], tmpdata);
        break;
      case VARTOOLS_STATSTYPE_SUM:
        s->statsout[threadindex][k] = getsum(p->NJD[threadindex], tmpdata);
        break;
      default:
        error(ERR_CODEERROR);
      }
    }
  }
  if (tmpdata != NULL)
    free(tmpdata);
}
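/* The VARTOOLS_STATSTYPE_MAD branch above calls MAD(n, data). A minimal
 * sketch of that statistic -- the median absolute deviation from the median,
 * scaled by 1.4826 so it estimates sigma for Gaussian data -- follows. It
 * reuses the median() helper already called by the MEDIAN branch (assumed
 * not to modify its input, as the repeated use of tmpdata above implies)
 * and allocates its own scratch buffer for the absolute deviations. */
double MAD(int n, double *data)
{
  double med, ret;
  double *dev;
  int i;
  if ((dev = (double *) malloc(n * sizeof(double))) == NULL)
    error(ERR_MEMALLOC);
  med = median(n, data);            /* same helper as the MEDIAN branch */
  for (i = 0; i < n; i++)
    dev[i] = fabs(data[i] - med);   /* absolute deviations from the median */
  ret = 1.4826 * median(n, dev);    /* consistency factor for Gaussian data */
  free(dev);
  return ret;
}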