Example #1
static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    g->frame_malloc.pool_begin = 
        (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);
    g->frame_malloc.pool_end = 
        g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;
    c->p = g->frame_malloc.pool_begin;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}
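Every chunk grabbed from the OS here is threaded onto g->frame_malloc.pool_list through a pool_cons cell, which is what makes a later bulk release possible. A minimal teardown sketch under that assumption (the helper name is hypothetical; it assumes __cilkrts_free is the deallocator paired with __cilkrts_malloc):

static void free_pool_list(global_state_t *g)
{
    /* Hypothetical sketch: walk the cons list built by extend_global_pool,
       freeing each chunk and then the cons cell itself. */
    struct pool_cons *c = g->frame_malloc.pool_list;
    while (c) {
        struct pool_cons *next = c->cdr;
        __cilkrts_free(c->p);   /* the FRAME_MALLOC_CHUNK allocation */
        __cilkrts_free(c);      /* the cons cell */
        c = next;
    }
    g->frame_malloc.pool_list = NULL;
}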
Example #2
/**
 * Allocate memory for the list of logged events.
 *
 * This function will read through the file and count the number of records
 * so it can estimate how big a buffer to allocate for the array of replay
 * entries.  It will then rewind the file to the beginning so it can be
 * loaded into memory.
 *
 * @param w The worker we're loading the file for.
 * @param f The file of replay data we're scanning.
 */
static
void allocate_replay_list(__cilkrts_worker *w, FILE *f)
{
    // Count the number of entries - yeah, it's a hack, but it lets me
    // allocate the space all at once instead of in chunks
    char buf[1024];
    int entries = 1;    // Include "LAST" node

    while (! feof(f))
    {
        if (fgets(buf, 1024, f))
        {
            // Skip the Workers record - should only be in file for Worker 0
            if (0 != strncmp(PED_TYPE_STR_WORKERS, buf, sizeof(PED_TYPE_STR_WORKERS)-1))
                entries++;
        }
    }

    w->l->replay_list_root =
        (replay_entry_t *)__cilkrts_malloc(entries * sizeof(replay_entry_t));
    w->l->replay_list_root[entries - 1].m_type = ped_type_last;

    // Reset the file to the beginning
    rewind(f);
}
Example #3
/*
 * Save TBB interop information from the cilk_fiber.  It will get picked
 * up the next time the thread is bound to the runtime.
 */
void cilk_fiber_tbb_interop_save_info_from_stack(cilk_fiber *fiber)
{
    __cilk_tbb_stack_op_thunk *saved_thunk;
    cilk_fiber_data* fdata;

    if (NULL == fiber)
        return;

    fdata = cilk_fiber_get_data(fiber);
    // If there is no TBB interop data, just return
    if (NULL == fdata->stack_op_routine)
        return;
    
    saved_thunk = __cilkrts_get_tls_tbb_interop();

    // If there is not already space allocated, allocate some.
    if (NULL == saved_thunk) {
        saved_thunk = (__cilk_tbb_stack_op_thunk*)
            __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk));
        __cilkrts_set_tls_tbb_interop(saved_thunk);
    }

    saved_thunk->routine = fdata->stack_op_routine;
    saved_thunk->data = fdata->stack_op_data;
}
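This routine and cilk_fiber_tbb_interop_save_stack_op_info in Example #7 both copy a routine/data pair into thread-local storage, so the thunk is plausibly nothing more than that pair. A sketch of the assumed layout (the function-pointer typedef name is an assumption, not confirmed by this listing):

/* Assumed shape of the thunk saved into TLS: a stack-op callback plus
 * the context pointer handed back to it. */
typedef struct __cilk_tbb_stack_op_thunk {
    __cilk_tbb_pfn_stack_op routine;  /* callback invoked on stack ops */
    void                   *data;     /* context passed to the callback */
} __cilk_tbb_stack_op_thunk;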
Example #4
    /**
     * Load data read from the log into the entry
     */
    bool load(const char *type, const char *pedigee_str, int32_t value1, int32_t value2)
    {
        // Convert the type into an enum
        if (0 == strcmp(type, PED_TYPE_STR_STEAL))
        {
            m_type = ped_type_steal;
            m_value = (int16_t)value1;   // Victim
        }
        else
        {
            m_value = -1;      // Victim not valid
            if (0 == strcmp(type, PED_TYPE_STR_SYNC))
                m_type = ped_type_sync;
            else if (0 == strcmp(type, PED_TYPE_STR_ORPHANED))
                m_type = ped_type_orphaned;
            else
            {
                m_type = ped_type_unknown;
                return false;
            }
        }

        // Parse the pedigree
        m_pedigree_len = 0;

        const char *p = pedigee_str;
        char *end;

        uint64_t temp_pedigree[PEDIGREE_BUFF_SIZE/2];

        while(1)
        {
            temp_pedigree[m_pedigree_len++] = (uint64_t)strtol(p, &end, 10);
            if ('\0' == *end)
                break;
            p = end + 1;
        }

        // Allocate memory to hold the pedigree.
        // Copy the pedigree in reverse order since that's the order we'll
        // traverse it
        m_reverse_pedigree =
            (uint64_t *)__cilkrts_malloc(sizeof(uint64_t) * m_pedigree_len);
        for (int n = 0; n < m_pedigree_len; n++)
            m_reverse_pedigree[n] = temp_pedigree[(m_pedigree_len - 1) - n];

        return true;
    }
Example #5
cilk_fiber* cilk_fiber::allocate_from_thread()
{
    void* retmem = __cilkrts_malloc(sizeof(cilk_fiber_sysdep));
    CILK_ASSERT(retmem);
    cilk_fiber_sysdep* ret = ::new(retmem) cilk_fiber_sysdep(from_thread);

    // A fiber allocated from a thread begins with a reference count
    // of 2.  The first reference is for having been created, and the
    // second is for currently running.
    //
    // Suspending this fiber will decrement the count down to 1.
    ret->init_ref_count(2);

#if SUPPORT_GET_CURRENT_FIBER    
    // We're creating the main fiber for this thread. Set this fiber as the
    // current fiber.
    cilkos_set_tls_cilk_fiber(ret);
#endif
    return ret;
}
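Because the fiber is constructed with placement new on raw __cilkrts_malloc memory, releasing it requires an explicit destructor call before the memory is freed; plain delete would be wrong. A hypothetical teardown sketch (the helper name is invented; it assumes __cilkrts_free pairs with __cilkrts_malloc):

static void destroy_sysdep_fiber(cilk_fiber_sysdep* f)
{
    // Placement-new'd objects are not deleted: run the destructor by
    // hand, then release the raw allocation.
    f->~cilk_fiber_sysdep();
    __cilkrts_free(f);
}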
Example #6
cilk_fiber* cilk_fiber::allocate_from_heap(std::size_t stack_size)
{
    // Case 1: pool is NULL.  Create a new fiber from the heap.
    // No need for locks here.
    cilk_fiber_sysdep* ret =
        (cilk_fiber_sysdep*) __cilkrts_malloc(sizeof(cilk_fiber_sysdep));

    // Error condition.  If we failed to allocate a fiber from the
    // heap, we are in trouble.
    if (!ret)
        return NULL;

    ::new(ret) cilk_fiber_sysdep(stack_size);

    CILK_ASSERT(0 == ret->m_flags);
    CILK_ASSERT(NULL == ret->m_pending_remove_ref);
    CILK_ASSERT(NULL == ret->m_pending_pool);
    ret->init_ref_count(1);
    return ret;
}
Example #7
/*
 * Save TBB interop information for an unbound thread.  It will get picked
 * up when the thread is bound to the runtime.
 */
void cilk_fiber_tbb_interop_save_stack_op_info(__cilk_tbb_stack_op_thunk o)
{
    __cilk_tbb_stack_op_thunk *saved_thunk =
        __cilkrts_get_tls_tbb_interop();

    DBG_STACK_OPS("Calling save_stack_op; o.routine=%p, o.data=%p, saved_thunk=%p\n",
                  o.routine, o.data, saved_thunk);

    // If there is not already space allocated, allocate some.
    if (NULL == saved_thunk) {
        saved_thunk = (__cilk_tbb_stack_op_thunk*)
            __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk));
        __cilkrts_set_tls_tbb_interop(saved_thunk);
    }

    *saved_thunk = o;

    DBG_STACK_OPS ("Unbound Thread %04x: tbb_interop_save_stack_op_info - saved info\n",
                   cilkos_get_current_thread_id());
}
Example #8
void cilk_fiber_pool_init(cilk_fiber_pool* pool,
                          cilk_fiber_pool* parent,
                          size_t           stack_size,
                          unsigned         buffer_size,
                          int              alloc_max,
                          int              is_shared)
{
#if FIBER_DEBUG >= 1    
    fprintf(stderr, "fiber_pool_init, pool=%p, parent=%p, alloc_max=%u\n",
            pool, parent, alloc_max);
#endif

    pool->lock       = (is_shared ? spin_mutex_create() : NULL);
    pool->parent     = parent;
    pool->stack_size = stack_size;
    pool->max_size   = buffer_size;
    pool->size       = 0;
    pool->total      = 0;
    pool->high_water = 0;
    pool->alloc_max  = alloc_max;
    pool->fibers     =
        (cilk_fiber**) __cilkrts_malloc(buffer_size * sizeof(cilk_fiber*));
    CILK_ASSERT(NULL != pool->fibers);

#ifdef __MIC__
#define PREALLOCATE_FIBERS
#endif
    
#ifdef PREALLOCATE_FIBERS
    // Pre-allocate 1/4 of fibers in the pools ahead of time.  This
    // value is somewhat arbitrary.  It was chosen to be less than the
    // threshold (of about 3/4) of fibers to keep in the pool when
    // transferring fibers to the parent.
    
    int pre_allocate_count = buffer_size/4;
    for (pool->size = 0; pool->size < pre_allocate_count; pool->size++) {
        pool->fibers[pool->size] = cilk_fiber::allocate_from_heap(pool->stack_size);
    }
#endif
}
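cilk_fiber_pool_init pairs naturally with a teardown that drains whatever the pool still caches and then releases the buffer and lock allocated above. A sketch under that assumption (fiber_destroy is a hypothetical stand-in for whatever releases one cached fiber):

static void fiber_pool_destroy_sketch(cilk_fiber_pool* pool)
{
    for (int i = 0; i < pool->size; ++i)
        fiber_destroy(pool->fibers[i]);  // hypothetical per-fiber release
    __cilkrts_free(pool->fibers);        // buffer allocated in cilk_fiber_pool_init
    if (pool->lock)
        spin_mutex_destroy(pool->lock);  // counterpart of spin_mutex_create
}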
Example #9
void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
    int bucket;
    void *mem;

    /* if too large, or if no worker, fall back to __cilkrts_malloc()  */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
        return __cilkrts_malloc(size);
    }

    START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);

        while (!(mem = pop(&w->l->free_list[bucket]))) {
            /* get a batch of frames from the global pool */
            START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
                allocate_batch(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);

    return mem;
}
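The fast path above pops recycled frames from a per-worker, per-bucket free list. A plausible shape for that list is an intrusive singly linked list threaded through the freed blocks themselves, so a cached frame costs no extra memory; the struct and pop() below are a sketch consistent with the pop(&w->l->free_list[bucket]) call, not the runtime's confirmed definitions:

struct free_list {
    struct free_list *cdr;   /* next free block, reusing the block's own bytes */
};

static void *pop(struct free_list **list)
{
    struct free_list *f = *list;
    if (f)
        *list = f->cdr;      /* unlink the head block and hand it out */
    return f;
}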
Example #10
COMMON_SYSDEP
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;    
    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        return 0;
    }
    
    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread w.
        pedigree_tls = (__cilkrts_pedigree*)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);
        
        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
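The two-node layout means that walking parent pointers from the returned leaf always terminates at the per-thread root, whose parent is NULL. A small illustrative walk under that assumption:

#include <stdio.h>

static void print_pedigree_sketch(void)
{
    /* Walk from the TLS leaf up the parent chain to the root node
       built above; uses only the rank/parent fields set there. */
    const __cilkrts_pedigree *node = __cilkrts_get_tls_pedigree_leaf(1);
    for ( ; node; node = node->parent)
        printf("rank %llu\n", (unsigned long long)node->rank);
}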
Example #11
__CILKRTS_BEGIN_EXTERN_C

/**
 * @brief Returns the global state object.  If called for the first time,
 * initializes the user-settable values in the global state, but does not
 * initialize the rest of the structure.
 */
global_state_t* cilkg_get_user_settable_values()
{
    // Environment variable value.  More than big enough for a 64-bit signed
    // integer.
    char envstr[24];

    // Abbreviating &global_state_singleton as g is not only shorter, it also
    // facilitates grepping for the string "g->", which appears ubiquitously
    // in the runtime code.
    global_state_t* g = &global_state_singleton;

    // TBD: We need synchronization around this loop to prevent
    // multiple threads from initializing this data.
    if (! cilkg_user_settable_values_initialized)
    {
        size_t len;

        // Preserve stealing disabled since it may have been set by the
        // debugger
        int stealing_disabled = g->stealing_disabled;

        // All fields will be zero until set.
        std::memset(g, 0, sizeof(global_state_t));

        // Fetch the number of cores.  There must be at least 1, since we're
        // executing on *something*, aren't we!?
        int hardware_cpu_count = __cilkrts_hardware_cpu_count();
        CILK_ASSERT(hardware_cpu_count > 0);

        bool under_ptool = __cilkrts_running_under_sequential_ptool();
        if (under_ptool)
            hardware_cpu_count = 1;

        g->stealing_disabled        = stealing_disabled;
        g->under_ptool              = under_ptool;
        g->force_reduce             = 0;   // Default Off
        g->P                        = hardware_cpu_count;   // Defaults to hardware CPU count
        g->max_user_workers         = 0;   // 0 unless set by user
        g->fiber_pool_size          = 7;   // Arbitrary default
        
        g->global_fiber_pool_size   = 3 * 3 * g->P;  // Arbitrary default
        // 3*P was the default size of the worker array (including
        // space for extra user workers).  This parameter was chosen
        // to match previous versions of the runtime.

        if (4 == sizeof(void *))
            g->max_stacks           = 1200; // About 1.2 GB on 32-bit machines
        else
            g->max_stacks           = 2400; // About 2.4 GB on 64-bit machines

        // If we have 2400 1MB stacks, that is about 2.4 GB.  If we reach
        // this limit on a single-socket machine, we may have other
        // problems.  Is 2400 too small for large multicore machines?

        // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a
        // value independent of P.  When running on a Xeon Phi with
        // small values of P, I recall seeing a few microbenchmarks
        // (e.g., fib) where a limit of 10*P seemed to be
        // unnecessarily slowing things down.
        // 
        // That being said, the code has changed sufficiently that
        // this observation may no longer be true.
        //
        // Note: in general, the worst-case number of stacks required
        // for a Cilk computation with spawn depth "d" on P workers is
        // O(Pd).  Code with unbalanced recursion may run into issues
        // with this stack usage.

        g->max_steal_failures       = 128; // TBD: depend on max_workers?
        g->stack_size               = 0;   // 0 unless set by the user

        // Assume no record or replay log for now
        g->record_replay_file_name  = NULL;
        g->record_or_replay         = RECORD_REPLAY_NONE;  // may be overridden by the user below

        if (always_force_reduce())
            g->force_reduce = true;
        else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_FORCE_REDUCE"))
            store_bool(&g->force_reduce, envstr);

        if (under_ptool)
            g->P = 1;  // Ignore environment variable if under cilkscreen
        else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_NWORKERS"))
            // Set P to environment variable, but limit to no less than 1
            // and no more than 16 times the number of hardware threads.
            store_int(&g->P, envstr, 1, 16 * hardware_cpu_count);

        if (cilkos_getenv(envstr, sizeof(envstr), "CILK_MAX_USER_WORKERS"))
            // Set max_user_workers to environment variable, but limit to no
            // less than 1 and no more than 16 times the number of hardware
            // threads.  If not specified, defaults (somewhat arbitrarily) to
            // the larger of 3 and twice the number of hardware threads.
            store_int(&g->max_user_workers, envstr, 1, 16*hardware_cpu_count);

        if (cilkos_getenv(envstr, sizeof(envstr), "CILK_STEAL_FAILURES"))
            // Set the number of times a worker should fail to steal before
            // it looks to see whether it should suspend itself.
            store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX);

        // Compute the total number of workers to allocate.  Subtract one from
        // nworkers and user workers so that the first user worker isn't
        // factored in twice.
        //
        // total_workers must be computed now to support __cilkrts_get_total_workers
        g->total_workers = g->P + calc_max_user_workers(g) - 1;

#ifdef CILK_RECORD_REPLAY
        // RecordReplay: See if we've been asked to replay a log
        len = cilkos_getenv(envstr, 0, "CILK_REPLAY_LOG");
        if (len > 0)
        {
            len += 1;    // Allow for trailing NUL
            g->record_or_replay = REPLAY_LOG;
            g->record_replay_file_name = (char *)__cilkrts_malloc(len);
            cilkos_getenv(g->record_replay_file_name, len, "CILK_REPLAY_LOG");
        }

        // RecordReplay: See if we've been asked to record a log
        len = cilkos_getenv(envstr, 0, "CILK_RECORD_LOG");
        if (len > 0)
        {
            if (RECORD_REPLAY_NONE != g->record_or_replay)
                cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n");
            else
            {
                len += 1;    // Allow for trailing NUL
                g->record_or_replay = RECORD_LOG;
                g->record_replay_file_name = (char *)__cilkrts_malloc(len);
                cilkos_getenv(g->record_replay_file_name, len, "CILK_RECORD_LOG");
            }
        }
#endif
        
        cilkg_user_settable_values_initialized = true;
    }

    return g;
}
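The environment parsing above leans on store_int (and store_bool) to clamp values into a sane range. A sketch of the assumed helper shape, matching the calls store_int(&g->P, envstr, 1, 16 * hardware_cpu_count) and store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX); this is an assumption about the helper, not its confirmed definition:

#include <cstdlib>
#include <cerrno>

template <typename INT_T>
static void store_int(INT_T *out, const char *s, INT_T min, INT_T max)
{
    errno = 0;
    char *end;
    long long v = std::strtoll(s, &end, 0);
    if (errno != 0 || end == s)
        return;                      // leave *out unchanged on parse failure
    if (v < (long long)min) v = min; // clamp into [min, max]
    if (v > (long long)max) v = max;
    *out = (INT_T)v;
}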
Example #12
spin_mutex* spin_mutex_create() 
{
    spin_mutex* mutex = (spin_mutex*)__cilkrts_malloc(sizeof(spin_mutex));
    spin_mutex_init(mutex);
    return mutex;
}
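spin_mutex_create allocates with __cilkrts_malloc, so a matching destroy must free with __cilkrts_free. A minimal counterpart sketch, assuming a bare spin lock needs no OS-level teardown beyond releasing its memory:

void spin_mutex_destroy(spin_mutex* mutex)
{
    /* Counterpart of spin_mutex_create(); assumes no further cleanup
       is required before freeing. */
    __cilkrts_free(mutex);
}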