uint8_t ti89_get_byte(uint32_t adr) { // RAM access if(IN_BOUNDS(0x000000, adr, 0x1fffff)) { return get_b(tihw.ram, adr, RAM_SIZE_TI89 - 1); } // FLASH access else if(IN_BOUNDS(0x200000, adr, 0x5fffff)) { return get_b(tihw.rom, adr, ROM_SIZE_TI89 - 1) | wsm.ret_or; } // memory-mapped I/O else if(IN_BOUNDS(0x600000, adr, 0x6fffff)) { return io_get_byte(adr); } // memory-mapped I/O (hw2) else if(IN_RANGE(adr, 0x700000, IO2_SIZE_TI89)) { return io2_get_byte(adr); } return 0x14; }
/**
 * Weighted sum of one colour channel over the 3x3 neighbourhood of (x, y).
 *
 * Each of the nine mask entries mask[0..8] is multiplied with the channel
 * value sampled at (x - i, y + j), where i = k / 3 - 1 and j = k % 3 - 1 for
 * mask index k. Note the sign difference between the two axes: the x offset
 * is subtracted while the y offset is added.
 *
 * @param image       pixel buffer handed through to get_r/get_g/get_b
 * @param widthBytes  row stride handed through to get_r/get_g/get_b
 * @param type        channel selector: 1 = red, 2 = green, 3 = blue
 * @param x, y        centre of the neighbourhood
 * @return            the accumulated sum (not normalised); 0 for any other type
 */
double Mask::involution(BYTE* image,int widthBytes,int type,int x,int y)
{
    double acc = 0;

    // k walks the mask row by row, which is the same order as the original
    // nested i/j loops, so the floating-point accumulation order is unchanged.
    for (int k = 0; k < 9; ++k)
    {
        const int i = k / 3 - 1;
        const int j = k % 3 - 1;

        switch (type)
        {
        case 1: // r
            acc += get_r(image, widthBytes, x - i, y + j) * mask[k];
            break;
        case 2: // g
            acc += get_g(image, widthBytes, x - i, y + j) * mask[k];
            break;
        case 3: // b
            acc += get_b(image, widthBytes, x - i, y + j) * mask[k];
            break;
        default:
            break;
        }
    }

    return acc;
}
uint8_t ti89_get_byte(uint32_t adr) { // RAM access if(IN_BOUNDS(0x000000, adr, 0x1fffff)) { return get_b(ram, adr, RAM_SIZE_TI89 - 1); } // FLASH access else if(IN_BOUNDS(0x200000, adr, 0x5fffff)) { return get_b(rom, adr, ROM_SIZE_TI89 - 1); } return 0x14; }
/**
 * Fetch the three coefficients and store them through the output pointers.
 *
 * get_a(), get_b() and get_c() are called in that order, and each result is
 * stored before the next getter runs.
 *
 * @param a  receives get_a(); must not be null
 * @param b  receives get_b(); must not be null
 * @param c  receives get_c(); must not be null
 */
void get_coefs(double *a, double *b, double *c)
{
    const double coef_a = get_a();
    *a = coef_a;

    const double coef_b = get_b();
    *b = coef_b;

    const double coef_c = get_c();
    *c = coef_c;
}
/*
 * Build the transformed representation of one YPbPr block.
 *
 * Pb and Pr are copied from the block's avg_Pb / avg_Pr fields; the a, b, c
 * and d fields are filled from get_a(), get_b(), get_c() and get_d(), called
 * in that order on the same block.
 */
CVC_trans discrete_transform(CVC *YPbPr)
{
    CVC_trans result;

    /* chroma averages are passed through unchanged */
    result.Pb = YPbPr->avg_Pb;
    result.Pr = YPbPr->avg_Pr;

    /* the four coefficients come from the get_* helpers */
    result.a = get_a(YPbPr);
    result.b = get_b(YPbPr);
    result.c = get_c(YPbPr);
    result.d = get_d(YPbPr);

    return result;
}
/**
 * Fill Sigma[0..N-1] from SOCSigma.
 *
 * Symmetric case:   Sigma[i] = U*n + SOCSigma[i]
 * Asymmetric case:  Sigma[i] = U*n + SOCSigma[i] / (1 - b * SOCSigma[i]),
 *                   with b obtained once from get_b().
 */
void SIAM::get_Sigma()
{
    if (SymmetricCase)
    {
        for (int k = 0; k < N; k++)
            Sigma[k] = U*n + SOCSigma[k];
        return;
    }

    printf("going through asymmetric\n");
    double b = get_b();
    for (int k = 0; k < N; k++)
        Sigma[k] = U*n + SOCSigma[k] / ( (double)1.0 - complex<double>(b) * SOCSigma[k] );
}
/**
 * Dump the grid as a plain-text PPM (P3) image.
 *
 * Each cell is normalised to bias = grid[x][y] / max and converted to a
 * colour with get_r/get_g/get_b. Pixels on a row are separated by a tab, and
 * rows by a newline.
 *
 * The header now carries the real image size (width x height). It used to be
 * hard-coded to "640 420", which produced a malformed file whenever the grid
 * had any other size. For a 640 x 420 grid the output is unchanged.
 *
 * NOTE(review): the output path is hard-coded to one developer's machine;
 * the signature has no parameter to override it.
 * NOTE(review): max == 0 is not guarded against here.
 */
void Analyze::writeGrid ()
{
    std::ofstream fout ("/Users/fred.christensen/Dropbox/school/Parallel/genetic/result.ppm");
    if (!fout)
    {
        // Nothing can be written; skip the per-pixel work.
        return;
    }

    fout << "P3\n" << width << " " << height << "\n255\n";

    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            const double bias = (double)grid[x][y] / (double)max;
            fout << get_r(bias) << " " << get_g(bias) << " " << get_b(bias);

            // Tab between pixels, newline after the last pixel of a row.
            // A plain '\n' avoids flushing the stream once per row.
            if (x < width - 1)
            {
                fout << "\t";
            }
            else
            {
                fout << '\n';
            }
        }
    }
}
void IASIAM::get_G_f_CHM() { get_b(); get_Sigma_f(); for (int i=0; i<N; i++) G_f[i] = 1.0/(mu + ii*omega[i] - Sigma_f[i] - Delta[i]); /*for (int i=0; i<N; i++) { complex<double>* g= new complex<double>[N_CHM]; for (int j=0; j<N_CHM; j++) g[j] = dos[j] / ( mu + ii*omega[i] - omega_CHM[j] - Sigma_f[i]); G_f[i] = TrapezIntegral(N_CHM, g, omega_CHM); delete [] g; }*/ }
/**
 * Fill r->Sigma[0..N-1] from r->SOCSigma.
 *
 * Particle-hole symmetric case:  Sigma[i] = U*n + SOCSigma[i]
 * Otherwise:                     Sigma[i] = U*n + SOCSigma[i] / (1 - b * SOCSigma[i]),
 *                                with b obtained once from get_b().
 *
 * When PatchTailWithAtomicLimit is set, the result is then post-processed by
 * r->PatchAtomicLimitSigma(AtomicCutoff, U).
 */
void IASIAM::get_Sigma()
{
    if (PHSymmetricCase)
    {
        printf("going through symmetric\n");
        for (int k = 0; k < N; k++)
            r->Sigma[k] = U * r->n + r->SOCSigma[k];
    }
    else
    {
        printf("going through asymmetric\n");
        double b = get_b();
        for (int k = 0; k < N; k++)
            r->Sigma[k] = U*r->n + r->SOCSigma[k] / ( 1.0 - b * r->SOCSigma[k] );
    }

    if (PatchTailWithAtomicLimit)
        r->PatchAtomicLimitSigma(AtomicCutoff, U);
}
void SIAM::get_Sigma() { if (!SymmetricCase) { //printf("going through asymmetric\n"); double b = get_b(); #pragma omp parallel for for (int i=0; i<N; i++) r->Sigma[i] = U*r->n + r->SOCSigma[i] / ( 1.0 - b * r->SOCSigma[i] ); } else #pragma omp parallel for for (int i=0; i<N; i++) r->Sigma[i] = U * r->n + r->SOCSigma[i]; }
/**
 * Return the capacity coefficient selected by @p mode at a Gauss point.
 *
 *   Capacity_ww : 1.0 * rho
 *   Capacity_wh : 0.0
 *   Capacity_hw : get_b(w, t) * get_latent(w, t)
 *   Capacity_hh : get_ceff(w, t)
 *
 * For the last two modes, w and t are read from the status' temp field as
 * s.at(2) and s.at(1) respectively (presumably moisture content and
 * temperature; confirm against the material's state layout). An empty field
 * is reported through OOFEM_ERROR, as is any other mode.
 *
 * @param mode   which coefficient to compute
 * @param gp     Gauss point whose status supplies the state
 * @param tStep  unused here
 */
double
HeMoTKMaterial :: computeCapacityCoeff(MatResponseMode mode, GaussPoint *gp, TimeStep *tStep)
{
    // The first two coefficients do not depend on the state.
    if ( mode == Capacity_ww ) {
        return 1.0 * rho;
    }
    if ( mode == Capacity_wh ) {
        return 0.0;
    }

    if ( mode != Capacity_hw && mode != Capacity_hh ) {
        OOFEM_ERROR("Unknown MatResponseMode");
        return 0.0; // to make compiler happy
    }

    // Both remaining modes need the current (w, t) state.
    TransportMaterialStatus *status = static_cast< TransportMaterialStatus * >( this->giveStatus(gp) );
    FloatArray s;
    s = status->giveTempField();
    if ( s.isEmpty() ) {
        OOFEM_ERROR("undefined state vector");
    }

    const double w = s.at(2);
    const double t = s.at(1);

    if ( mode == Capacity_hw ) {
        return get_b(w, t) * get_latent(w, t);
    }
    return get_ceff(w, t);
}
/**
 * Return the capacity coefficient selected by @p mode at a Gauss point.
 *
 *   Capacity_ww : 1.0 * rho
 *   Capacity_wh : 0.0
 *   Capacity_hw : get_b(w, t) * get_latent(w, t)
 *   Capacity_hh : get_ceff(w, t)
 *
 * For the last two modes, w and t are read from the status' temp state
 * vector as s.at(2) and s.at(1) respectively (presumably moisture content
 * and temperature; confirm against the material's state layout). An empty
 * vector is reported through _error, as is any other mode.
 *
 * @param mode    which coefficient to compute
 * @param gp      Gauss point whose status supplies the state
 * @param atTime  unused here
 */
double
HeMoTKMaterial :: computeCapacityCoeff(MatResponseMode mode, GaussPoint *gp, TimeStep *atTime)
{
    // The first two coefficients do not depend on the state.
    if ( mode == Capacity_ww ) {
        return 1.0 * rho;
    }
    if ( mode == Capacity_wh ) {
        return 0.0;
    }

    if ( mode != Capacity_hw && mode != Capacity_hh ) {
        _error("Unknown MatResponseMode");
        return 0.0; // to make compiler happy
    }

    // Both remaining modes need the current (w, t) state.
    TransportMaterialStatus *status = ( TransportMaterialStatus * ) this->giveStatus(gp);
    FloatArray s;
    s = status->giveTempStateVector();
    if ( s.isEmpty() ) {
        _error("computeCapacityCoeff: undefined state vector");
    }

    const double w = s.at(2);
    const double t = s.at(1);

    if ( mode == Capacity_hw ) {
        return get_b(w, t) * get_latent(w, t);
    }
    return get_ceff(w, t);
}
uint8_t ti89t_get_byte(uint32_t adr) { // RAM access if(IN_BOUNDS(0x000000, adr, 0x03ffff) || IN_BOUNDS(0x200000, adr, 0x23ffff) || IN_BOUNDS(0x400000, adr, 0x43ffff)) { return get_b(tihw.ram, adr, 0x03ffff); } // FLASH access else if(IN_BOUNDS(0x800000, adr, 0xbfffff)) { return FlashReadByte(adr); } // memory-mapped I/O else if(IN_BOUNDS(0x600000, adr, 0x6fffff)) { return io_get_byte(adr); } // memory-mapped I/O (hw2) else if(IN_RANGE(adr, 0x700000, IO2_SIZE_TI89T)) { return io2_get_byte(adr); } // memory-mapped I/O (hw3) else if(IN_RANGE(adr, 0x710000, IO3_SIZE_TI89T)) { return io3_get_byte(adr); } return 0x14; }
std::list<Point3D> const LineIntersectionFinder::find_intersections(std::list<Line3D>& lines) const { EventStructure event_structure; StatusStructure status_structure; std::list<Point3D> intersections; for (auto line = lines.begin(); line != lines.end(); ++line) { event_structure.add_event(new StartEvent(line->get_a(), &(*line))); event_structure.add_event(new EndEvent(line->get_b(), &(*line))); } while (!event_structure.empty()) { // std::cout << "Event structure:\n" << event_structure << std::endl; Event* event(event_structure.get_top()); event_structure.delete_top(); Point3D intersection(status_structure.process_event(event, event_structure)); if (intersection.get_z() != -1) intersections.push_back(intersection); // std::cout << "Status structure:\n" << status_structure << std::endl; } return intersections; }
uint calc(int x, int y){ // r,g,b color values around center 'c11' // 00 10 20 // 01 11 21 // 02 12 22 uint c00=get(x-1,y-1); uint c01=get(x-1,y); uint c02=get(x-1,y+1); uint c10=get(x,y-1); uint c11=get(x,y); uint c12=get(x,y+1); uint c20=get(x+1,y-1); uint c21=get(x+1,y); uint c22=get(x+1,y+1); // red uint r00=get_r(x-1,y-1); uint r01=get_r(x-1,y) ; uint r02=get_r(x-1,y+1); uint r10=get_r(x,y-1) ; uint r11=get_r(x,y) ; uint r12=get_r(x,y+1) ; uint r20=get_r(x+1,y-1); uint r21=get_r(x+1,y) ; uint r22=get_r(x+1,y+1); // green uint g00=get_g(x-1,y-1); uint g01=get_g(x-1,y) ; uint g02=get_g(x-1,y+1); uint g10=get_g(x,y-1) ; uint g11=get_g(x,y) ; uint g12=get_g(x,y+1) ; uint g20=get_g(x+1,y-1); uint g21=get_g(x+1,y) ; uint g22=get_g(x+1,y+1); // blue uint b00=get_b(x-1,y-1); uint b01=get_b(x-1,y) ; uint b02=get_b(x-1,y+1); uint b10=get_b(x,y-1) ; uint b11=get_b(x,y) ; uint b12=get_b(x,y+1) ; uint b20=get_b(x+1,y-1); uint b21=get_b(x+1,y) ; uint b22=get_b(x+1,y+1); // center colors of THIS dot uint r=r11; uint g=g11; uint b=b11; // random colors uint zr=rand()%256; uint zg=rand()%256; uint zb=rand()%256; // mean color value of surrounding uint r0=((r10+r01+r21+r12)/4); uint g0=((g10+g01+g21+g12)/4); uint b0=(b10+b01+b21+b12)/4; // Including this dot // uint r0=((r10+r01+r11+r21+r12)/5); // uint g0=((g10+g01+g11+g21+g12)/5); // uint b0=(b10+b01+b11+b21+b12)/5; float fr=r0/256.0; float fg=g0/256.0; float fb=b0/256.0; uint k=128; float h=100.0; ////////// // done with initialization, // now updated the dot's color based on some crazy experimental whatever function // feel free to wildly experiment with this algorithm! 
// POPULATION UPDATE // inspired by Conway's game of life: // if there is a sufficient population, then grow in numbers: if(b0>=100)b=b0*1.01; // if the population is small then shrink if(b0<100)b=b0*(0.99-fr/100); // if the population is too big then collapse if(b0>240)b=0; // repopulate collapsed populations if(b0<3)b=120;//zb*((1.5-f)+fr*f); if(r0>=100)r=r0*1.01; if(r0<100)r=r0*(0.99-fg/100); if(r11>240)r=0; if(r11<3)r=120*(1.2-fr*fg);//zb*((1.5-f)+fr*f); if(g0>=100)g=g0*1.01; if(g0<100)g=g0*(0.99-f*fb/100); if(g11>240){g=0;b=b/2;} if(g11<3)g=120*(1-fb*fr);//zb*((1.5-f)+fr*f); // END OF POPULATION UPDATE if(r>255)r=255; if(g>255)g=255; if(b>255)b=255; return (r<<16)+(g<<8)+b; }
bool BinAsm::finds_labels() { unsigned int lc = 0; unsigned int errc = 0; unsigned int offset = 0; uint16_t opd = 0; std::string err = ""; std::vector<std::string> lines=split_text(_src); std::vector<std::string>::iterator lit = lines.begin(); for (;lit!=lines.end();lit++) { lc++; std::vector<std::string> w=split_line(*lit); if (!w.size()) continue; if (get_op(w[0])) { offset++; if (w.size() < 3) continue; opd=0; get_b(w[1],opd,err); if (opd) offset++; opd=0; get_a(w[2],opd,err); if (opd) offset++; } else if (get_sop(w[0],err) && !err.size()) { offset++; if (w.size() < 2) continue; opd=0; get_a(w[1],opd,err); if (opd) offset++; } else { std::vector<std::string>::iterator wit = w.begin(); Label l; l.line = lc; l.offset = offset; l.name = std::string(); for (;wit!=w.end();wit++) { if (wit->size() < 2) continue; if ((*wit)[wit->size()-1] == ':') { l.name = wit->substr(0,wit->size()-1); break; } else if ((*wit)[0] == ':') { l.name = wit->substr(1,wit->size()); break; } } if (_labels.find(l.name)!=_labels.end()) { err="label " + l.name + " redefined"; print_error(lc, false,err); errc++; } else if (l.name.size()) { _labels[l.name]=l; } } uint16_t* data = new uint16_t[lit->size()]; offset += get_data(*lit, data,err); delete data; } return !errc; }
void BinAsm::save(const std::string& filename) { uint32_t allocsize=0x10000; uint32_t filesize=0; int error_count=0; uint16_t* buff = new uint16_t[allocsize]; unsigned int lc = 0; uint16_t opcode = 0; std::vector<std::string> lines=split_text(_src); std::vector<std::string>::iterator lit = lines.begin(); for (;lit!=lines.end();lit++) { lc++; std::string err = std::string(); std::vector<std::string> w=split_line(*lit); if (!w.size()) continue; opcode = 0; if (filesize > allocsize - lit->size() - 3) { std::cerr << "fatal error: assembling file is too big"; std::cerr << "cannot be used on dcpu-16" << std::endl; break; } if (w[0].size() && (w[0][w[0].size()-1] == ':' || w[0][0] == ':')) { continue; } if ((opcode = get_op(w[0]))) { if (w.size() != 3) { err = "instruction " + w[0]; err += " need 2 arguments"; } else { uint16_t a_word=0, b_word=0; opcode |= ((get_b(w[1],b_word,err) & 0x1F) << 5); if (err.size()) { print_error(lc,false,err); error_count++; } opcode |= (get_a(w[2],a_word,err) << 10); buff[filesize]=opcode; filesize++; if (a_word) { buff[filesize]=a_word; filesize++; } if (b_word) { buff[filesize]=b_word; filesize++; } } } else if ((opcode=get_sop(w[0],err)) && !err.size()) { if (w.size() != 2) { err = "special instruction " + w[0]; err += " need 1 argument"; } else { uint16_t a_word=0; opcode = ((opcode & 0x1F) << 5); opcode |= (get_a(w[1],a_word,err) << 10); buff[filesize]=opcode; filesize++; if (a_word) { buff[filesize]=a_word; filesize++; } } } else { if (w[0]=="dat"||w[0]==".dat"||w[0]=="DAT"||w[0]==".DAT") err=""; filesize += get_data(*lit, &(buff[filesize]),err); } if (err.size()) { print_error(lc,false,err); error_count++; } } if (error_count) { std::cerr << "assembling terminated with " << error_count; std::cerr << " error(s)" << std::endl; } else { FILE* f = fopen(filename.c_str(), "wb"); if (!f) std::cerr << "error: cannot open output file " << filename << std::endl; fswitchendian(buff, filesize); fwrite(buff,2,filesize,f); fclose(f); std::cout << 
"assembling " << filename; std::cout << " terminated final size " << filesize*2; std::cout << " bytes" << std::endl; } delete buff; }
/////////////////////////////////////////////////////////////////////// // Class : CWinDisplay // Method : data // Description : Draw the data onto the screen // Return Value : - // Comments : int CWinDisplay::data(int x,int y,int w,int h,float *d) { int i,j; clampData(w,h,d); for (i=0;i<h;i++) { const float *src = &d[i*w*numSamples]; unsigned int *dest = &imageData[((height-(i+y)-1)*width+x)]; switch(numSamples) { case 0: break; case 1: for (j=0;j<w;j++) { unsigned char d = (unsigned char) (src[0]*255); *dest++ = color(d,d,d,d); src++; } break; case 2: for (j=0;j<w;j++) { const float r = src[0]*src[1]*255 + (1-src[1])*get_r(dest[0]); const float a = src[1]*255 + (1-src[1])*get_a(dest[0]); unsigned char dr = (unsigned char) r; unsigned char da = (unsigned char) a; *dest++ = color(dr,dr,dr,da); src += 2; } break; case 3: for (j=0;j<w;j++) { unsigned char dr = (unsigned char) (src[0]*255); unsigned char dg = (unsigned char) (src[1]*255); unsigned char db = (unsigned char) (src[2]*255); *dest++ = color(dr,dg,db,(unsigned char) 255); src += 3; } break; case 4: for (j=0;j<w;j++) { const float r = src[0]*src[3]*255 + (1-src[3])*get_r(dest[0]); const float g = src[1]*src[3]*255 + (1-src[3])*get_g(dest[0]); const float b = src[2]*src[3]*255 + (1-src[3])*get_b(dest[0]); const float a = src[3]*255 + (1-src[3])*get_a(dest[0]); unsigned char dr = (unsigned char) r; unsigned char dg = (unsigned char) g; unsigned char db = (unsigned char) b; unsigned char da = (unsigned char) a; *dest++ = color(dr,dg,db,da); src += 4; } break; default: for (j=0;j<w;j++) { float r = src[0]*src[3]*255 + (1-src[3])*get_r(*dest); float g = src[1]*src[3]*255 + (1-src[3])*get_g(*dest); float b = src[2]*src[3]*255 + (1-src[3])*get_b(*dest); float a = src[3]*255 + (1-src[3])*get_a(*dest); unsigned char dr = (unsigned char) r; unsigned char dg = (unsigned char) g; unsigned char db = (unsigned char) b; unsigned char da = (unsigned char) a; *dest++ = color(dr,dg,db,da); src += numSamples; } break; } } if 
(active) { if (willRedraw == FALSE) { // Pump messages willRedraw = TRUE; PostMessage(hWnd,WM_PAINT,(WPARAM) this,0); } } return active; }
// Regression test for the SSE1 intrinsics.
//
// Every intrinsic is called on small fixed inputs obtained from the get_*()
// helpers and the result is compared with a hard-coded expectation through
// aeq() (floats), aeqi() (raw 32-bit patterns), aeq64() (64-bit MMX values)
// or Assert(). At the end the function prints "Success!" when numFailures is
// 0, otherwise the number of failed tests.
//
// Notation: vectors in the comments and in the aeq()/aeqi() arguments are
// listed from the highest element to the lowest, so for uarr = [5, 4, 3, 2]
// the value at uarr[0] is 2 and the value at uarr[3] is 5.
//
// Sections guarded by TEST_M64 exercise the 64-bit MMX forms. Sections
// guarded by "#ifndef __EMSCRIPTEN__" are skipped when __EMSCRIPTEN__ is
// defined, for the reasons given in the TODO next to each guard.
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr2 must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr2 must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	// The comi/ucomi checks below compare only the lowest elements: 2 for a, 4 for b, NAN for nan1.
	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	// Each control value is read and written straight back; dummyData only serves as a prefetch target.
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); // Two highest outputs are picked from b, two lowest from a, as selected by the shuffle mask.
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); // Interleave the two highest elements of a and b.
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Interleave the two lowest elements of a and b.

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}