void* allocPages(void* addr, size_t len, size_t align)
{
    RELEASE_ASSERT(len < INT_MAX - align);
    ASSERT(len >= kPageAllocationGranularity);
    ASSERT(!(len & kPageAllocationGranularityOffsetMask));
    ASSERT(align >= kPageAllocationGranularity);
    ASSERT(!(align & kPageAllocationGranularityOffsetMask));
    ASSERT(!(reinterpret_cast<uintptr_t>(addr) & kPageAllocationGranularityOffsetMask));
    size_t alignOffsetMask = align - 1;
    size_t alignBaseMask = ~alignOffsetMask;
    ASSERT(!(reinterpret_cast<uintptr_t>(addr) & alignOffsetMask));
    // If the client passed null as the address, choose a good one.
    if (!addr) {
        addr = getRandomPageBase();
        addr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(addr) & alignBaseMask);
    }

    // The common case, which is also the least work we can do, is that the
    // address and length are suitable. Just try it.
    void* ret = systemAllocPages(addr, len);
    // If the alignment is to our liking, we're done.
    if (!(reinterpret_cast<uintptr_t>(ret) & alignOffsetMask))
        return ret;

    // Annoying. Unmap and map a larger range to be sure to succeed on the
    // second, slower attempt.
    freePages(ret, len);

    size_t tryLen = len + (align - kPageAllocationGranularity);

    // We loop to cater for the unlikely case where another thread maps on top
    // of the aligned location we choose.
    int count = 0;
    while (count++ < 100) {
        ret = systemAllocPages(addr, tryLen);
        // We can now try and trim out a subset of the mapping.
        addr = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(ret) + alignOffsetMask) & alignBaseMask);

        // On POSIX systems, we can trim the oversized mapping to fit exactly.
        // This will always work on POSIX systems.
        if (trimMapping(ret, tryLen, addr, len))
            return addr;

        // On Windows, you can't trim an existing mapping, so we unmap and
        // remap a subset. We used to do this for all platforms, but OSX 10.8
        // has a broken mmap() that ignores address hints for valid, unused
        // addresses.
        freePages(ret, tryLen);
        ret = systemAllocPages(addr, len);
        if (ret == addr)
            return ret;

        // Unlikely race / collision. Do the simple thing and just start again.
        freePages(ret, len);
        addr = getRandomPageBase();
        addr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(addr) & alignBaseMask);
    }
    IMMEDIATE_CRASH();
    return 0;
}
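// A minimal usage sketch for allocPages above. The 2 MB alignment, the length,
// and the round-trip check are illustrative values, not part of the original
// code.
static void allocPagesExample()
{
    size_t len = 4 * kPageAllocationGranularity; // must be a granularity multiple
    size_t align = 2 * 1024 * 1024; // any power of two >= the granularity
    void* p = allocPages(nullptr, len, align); // null hint: pick a randomized base
    // On success, the result honors the alignment mask enforced above.
    ASSERT(!(reinterpret_cast<uintptr_t>(p) & (align - 1)));
    freePages(p, len);
}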
// Trims base to the given length and alignment. On Windows, returns null on
// failure and frees base.
static void* trimMapping(void* base, size_t baseLen, size_t trimLen, uintptr_t align, PageAccessibilityConfiguration pageAccessibility)
{
    size_t preSlack = reinterpret_cast<uintptr_t>(base) & (align - 1);
    if (preSlack)
        preSlack = align - preSlack;
    size_t postSlack = baseLen - preSlack - trimLen;
    ASSERT(baseLen >= trimLen || preSlack || postSlack);
    ASSERT(preSlack < baseLen);
    ASSERT(postSlack < baseLen);
    void* ret = base;

#if OS(POSIX) // On POSIX we can resize the allocation run.
    (void) pageAccessibility;
    if (preSlack) {
        int res = munmap(base, preSlack);
        RELEASE_ASSERT(!res);
        ret = reinterpret_cast<char*>(base) + preSlack;
    }
    if (postSlack) {
        int res = munmap(reinterpret_cast<char*>(ret) + trimLen, postSlack);
        RELEASE_ASSERT(!res);
    }
#else // On Windows we can't resize the allocation run.
    if (preSlack || postSlack) {
        ret = reinterpret_cast<char*>(base) + preSlack;
        freePages(base, baseLen);
        ret = systemAllocPages(ret, trimLen, pageAccessibility);
    }
#endif

    return ret;
}
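// A worked example of the slack arithmetic above, using illustrative numbers
// only (base = 0x5000, align = 0x4000, trimLen = 0x4000, and a baseLen of
// trimLen + align - kPageAllocationGranularity = 0x7000 with a 0x1000
// granularity):
//   preSlack  = align - (0x5000 & 0x3FFF) = 0x4000 - 0x1000 = 0x3000
//   postSlack = 0x7000 - 0x3000 - 0x4000  = 0
// so munmap(0x5000, 0x3000) trims the front, nothing is trimmed from the back,
// and the surviving run starts at 0x8000, which is 0x4000-aligned as required.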
void* allocPages(void* addr, size_t len, size_t align, PageAccessibilityConfiguration pageAccessibility)
{
    ASSERT(len >= kPageAllocationGranularity);
    ASSERT(!(len & kPageAllocationGranularityOffsetMask));
    ASSERT(align >= kPageAllocationGranularity);
    ASSERT(!(align & kPageAllocationGranularityOffsetMask));
    ASSERT(!(reinterpret_cast<uintptr_t>(addr) & kPageAllocationGranularityOffsetMask));
    uintptr_t alignOffsetMask = align - 1;
    uintptr_t alignBaseMask = ~alignOffsetMask;
    ASSERT(!(reinterpret_cast<uintptr_t>(addr) & alignOffsetMask));
    // If the client passed null as the address, choose a good one.
    if (!addr) {
        addr = getRandomPageBase();
        addr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(addr) & alignBaseMask);
    }

    // First try to force an exact-size, aligned allocation from our random base.
    for (int count = 0; count < 3; ++count) {
        void* ret = systemAllocPages(addr, len, pageAccessibility);
        if (kHintIsAdvisory || ret) {
            // If the alignment is to our liking, we're done.
            if (!(reinterpret_cast<uintptr_t>(ret) & alignOffsetMask))
                return ret;
            freePages(ret, len);
#if CPU(32BIT)
            addr = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(ret) + align) & alignBaseMask);
#endif
        } else if (!addr) {
            // We know we're OOM when an unhinted allocation fails.
            return nullptr;
        } else {
#if CPU(32BIT)
            addr = reinterpret_cast<char*>(addr) + align;
#endif
        }

#if !CPU(32BIT)
        // Keep trying random addresses on systems that have a large address space.
        addr = getRandomPageBase();
        addr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(addr) & alignBaseMask);
#endif
    }

    // Map a larger allocation so we can force alignment, but continue
    // randomizing only on 64-bit POSIX.
    size_t tryLen = len + (align - kPageAllocationGranularity);
    RELEASE_ASSERT(tryLen >= len);
    void* ret;

    do {
        // Don't continue to burn cycles on mandatory hints (Windows).
        addr = kHintIsAdvisory ? getRandomPageBase() : nullptr;
        ret = systemAllocPages(addr, tryLen, pageAccessibility);
        // The retries are for Windows, where a race can steal our mapping on
        // resize.
    } while (ret && !(ret = trimMapping(ret, tryLen, len, align, pageAccessibility)));

    return ret;
}
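// A usage sketch for the hint-aware variant above. It assumes PageAccessible
// is a value of PageAccessibilityConfiguration, as in the surrounding
// allocator; reserveAlignedRegion is a hypothetical caller, not original code.
static void* reserveAlignedRegion(size_t len, size_t align)
{
    // A null hint lets allocPages randomize the base. A null result reliably
    // signals address-space exhaustion, per the OOM branch above.
    void* region = allocPages(nullptr, len, align, PageAccessible);
    RELEASE_ASSERT(region);
    return region;
}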
void deleteHeap(Heap* h)
{
    freePages(h->r0copy);
    freePages(h->r2copy);
    freePages(h->r3copy);
    freePages(h->r4copy);
    freePages(h->r5copy);
    freePages(h->r6copy);
    free(h);
}
PageCache::~PageCache()
{
    freePages(&mActivePages);
    freePages(&mFreePages);
}
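// The two cleanup bodies above pair every allocation with a manual freePages
// call. A minimal RAII sketch of that pairing, assuming the three-argument
// allocPages and two-argument freePages from the first allocator above;
// PageSpan is a hypothetical name, not part of any of these codebases.
class PageSpan {
public:
    PageSpan(size_t len, size_t align)
        : m_ptr(allocPages(nullptr, len, align))
        , m_len(len) { }
    ~PageSpan()
    {
        if (m_ptr)
            freePages(m_ptr, m_len);
    }
    void* get() const { return m_ptr; }

private:
    PageSpan(const PageSpan&); // non-copyable: owns the mapping
    PageSpan& operator=(const PageSpan&);
    void* m_ptr;
    size_t m_len;
};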
int dgemmsy_base(dgemmsyBaseArgs* args)
{
    int status = 0; // return value
    int row, col;
    size_t slice_size, ywork_size;
    int p2; // P rounded up to the next multiple of 4
    double *a_slice, *b_slice, *y_work;
    ComputeData cdata;
    TransposeData tdata;
    int slice_n, n, p;
    const double *a, *b;
    double* y;
    int lda, ldb, ldy;
    int transa, transb;
    BlockPattern_2x4_Proc pattern;
    void* pattern_arg;
    double alpha;

    if (args == 0)
        return -2;

    pattern = args->params.pattern;
    pattern_arg = args->params.pattern_arg;
    slice_n = args->params.slice_n;
    transa = args->transa;
    transb = args->transb;
    n = args->n;
    p = args->p;
    a = args->a;
    lda = args->lda;
    b = args->b;
    ldb = args->ldb;
    y = args->y;
    ldy = args->ldy;
    alpha = args->alpha;

    // Check dimensions
    if (slice_n < 8)
        return -1;
    if (n <= 0 || p <= 0)
        return -1;
    p2 = (p + 3) & 0x7FFFFFFC;
    // printf("N=%d P=%d LDA=%d LDB=%d LDY=%d P2=%d\n", n, p, lda, ldb, ldy, p2);

    // Check other arguments
    if (a == 0 || b == 0 || y == 0 || pattern == 0)
        return -5;

    // Allocate memory
    slice_size = slice_n * p2 * sizeof(double); // Slice size in bytes
    ywork_size = p2 * p2 * sizeof(double); // Result size in bytes
    a_slice = (double*)allocPages(slice_size);
    b_slice = (double*)allocPages(slice_size);
    y_work = (double*)allocPages(ywork_size);
    if (a_slice == 0 || b_slice == 0 || y_work == 0) {
        status = -3;
        goto END;
    }
    memset(y_work, 0, ywork_size);

    // Loop on all slices and accumulate products
    initComputeData(&cdata, p2, slice_n, a_slice, b_slice, y_work);
    for (row = 0; row < n; row += slice_n) {
        int s = slice_n;
        if (row + s > n)
            s = n - row; // Limit size of last slice if needed
        // 2-pack A slice
        if (transa)
            tpack_2(s, p, a + lda * row, lda, slice_n, p2, a_slice);
        else
            npack_2(s, p, a + row, lda, slice_n, p2, a_slice);
        // 4-pack B slice
        if (transb)
            tpack_4(s, p, b + ldb * row, ldb, slice_n, p2, b_slice);
        else
            npack_4(s, p, b + row, ldb, slice_n, p2, b_slice);
        pattern(pattern_arg, p2, Compute_visitor, &cdata);
    }
    cleanupComputeData(&cdata);

    // Complete the result by symmetry
    initTransposeData(&tdata, p2, y_work);
    pattern(pattern_arg, p2, Transpose_visitor, &tdata);
    cleanupTransposeData(&tdata);

    // Combine and store the (untransposed) result. If we are multithreading,
    // we must protect the update with a mutex, since all threads will update
    // the same Y.
    if (args->yMutex != 0) {
        int locked = pthread_mutex_lock(args->yMutex);
        if (locked != 0)
            status = -4;
    }
    for (col = 0; col < p; col++) {
        double* yy = y + ldy * col;
        double* yy_work = y_work + p2 * col;
#if USE_AXPY
        cblas_daxpy(p, alpha, yy_work, 1, yy, 1);
#else
        if (alpha == 1) {
            for (row = 0; row < p; row++)
                yy[row] += yy_work[row];
        } else {
            for (row = 0; row < p; row++)
                yy[row] += alpha * yy_work[row];
        }
#endif
    }
    if (args->yMutex != 0) {
        int unlocked = pthread_mutex_unlock(args->yMutex);
        if (unlocked != 0)
            status = -4;
    }

END: // Cleanup
    freePages(y_work);
    freePages(a_slice);
    freePages(b_slice);
    return status;
}
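// A minimal single-threaded usage sketch for dgemmsy_base. The field names
// come from the accesses in the function above; the slice size and leading
// dimensions are illustrative choices, and runDgemmsy is a hypothetical
// caller, not original code.
static int runDgemmsy(const double* a, const double* b, double* y, int n, int p,
    BlockPattern_2x4_Proc pattern)
{
    dgemmsyBaseArgs args;
    args.params.pattern = pattern; // block-visit order callback
    args.params.pattern_arg = 0;
    args.params.slice_n = 32; // must be >= 8, per the dimension check above
    args.transa = 0; // untransposed inputs: take the "npack" paths
    args.transb = 0;
    args.n = n;
    args.p = p;
    args.a = a;
    args.lda = n; // column-major leading dimensions
    args.b = b;
    args.ldb = n;
    args.y = y;
    args.ldy = p;
    args.alpha = 1.0;
    args.yMutex = 0; // single-threaded: no locking around the Y update
    return dgemmsy_base(&args); // 0 on success, negative on error
}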