/* We have only one thread that ever re-initialises GPUs, thus if any GPU * init command fails due to a completely wedged GPU, the thread will never * return, unable to harm other GPUs. If it does return, it means we only had * a soft failure and then the reinit_gpu thread is ready to tackle another * GPU */ void *reinit_gpu(void *userdata) { struct thr_info *mythr = userdata; struct cgpu_info *cgpu; struct thr_info *thr; struct timeval now; char name[256]; int thr_id; int gpu; pthread_detach(pthread_self()); select_cgpu: cgpu = tq_pop(mythr->q, NULL); if (!cgpu) goto out; if (clDevicesNum() != nDevs) { applog(LOG_WARNING, "Hardware not reporting same number of active devices, will not attempt to restart GPU"); goto out; } gpu = cgpu->device_id; for (thr_id = 0; thr_id < mining_threads; ++thr_id) { thr = &thr_info[thr_id]; cgpu = thr->cgpu; if (cgpu->api != &opencl_api) continue; if (dev_from_id(thr_id) != gpu) continue; thr = &thr_info[thr_id]; if (!thr) { applog(LOG_WARNING, "No reference to thread %d exists", thr_id); continue; } thr->rolling = thr->cgpu->rolling = 0; /* Reports the last time we tried to revive a sick GPU */ gettimeofday(&thr->sick, NULL); if (!pthread_cancel(thr->pth)) { applog(LOG_WARNING, "Thread %d still exists, killing it off", thr_id); } else applog(LOG_WARNING, "Thread %d no longer exists", thr_id); } for (thr_id = 0; thr_id < mining_threads; ++thr_id) { int virtual_gpu; thr = &thr_info[thr_id]; cgpu = thr->cgpu; if (cgpu->api != &opencl_api) continue; if (dev_from_id(thr_id) != gpu) continue; virtual_gpu = cgpu->virtual_gpu; /* Lose this ram cause we may get stuck here! */ //tq_freeze(thr->q); thr->q = tq_new(); if (!thr->q) quit(1, "Failed to tq_new in reinit_gpu"); /* Lose this ram cause we may dereference in the dying thread! */ //free(clState); applog(LOG_INFO, "Reinit GPU thread %d", thr_id); clStates[thr_id] = initCl(virtual_gpu, name, sizeof(name)); if (!clStates[thr_id]) { applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id); goto select_cgpu; } applog(LOG_INFO, "initCl() finished. Found %s", name); if (unlikely(thr_info_create(thr, NULL, miner_thread, thr))) { applog(LOG_ERR, "thread %d create failed", thr_id); return NULL; } applog(LOG_WARNING, "Thread %d restarted", thr_id); } gettimeofday(&now, NULL); get_datestamp(cgpu->init, &now); for (thr_id = 0; thr_id < mining_threads; ++thr_id) { thr = &thr_info[thr_id]; cgpu = thr->cgpu; if (cgpu->api != &opencl_api) continue; if (dev_from_id(thr_id) != gpu) continue; tq_push(thr->q, &ping); } goto select_cgpu; out: return NULL; }
/* We have only one thread that ever re-initialises GPUs, thus if any GPU * init command fails due to a completely wedged GPU, the thread will never * return, unable to harm other GPUs. If it does return, it means we only had * a soft failure and then the reinit_gpu thread is ready to tackle another * GPU */ void *reinit_gpu(void *userdata) { struct thr_info *mythr = (struct thr_info *)userdata; struct cgpu_info *cgpu; struct thr_info *thr; struct timeval now; char name[256]; int thr_id; int gpu; pthread_detach(pthread_self()); select_cgpu: cgpu = (struct cgpu_info *)tq_pop(mythr->q, NULL); if (!cgpu) goto out; if (clDevicesNum() != nDevs) { applog(LOG_WARNING, "Hardware not reporting same number of active devices, will not attempt to restart GPU"); goto out; } gpu = cgpu->device_id; rd_lock(&mining_thr_lock); for (thr_id = 0; thr_id < mining_threads; ++thr_id) { thr = mining_thr[thr_id]; cgpu = thr->cgpu; if (cgpu->drv->drv_id != DRIVER_opencl) continue; if (dev_from_id(thr_id) != gpu) continue; thr->rolling = thr->cgpu->rolling = 0; /* Reports the last time we tried to revive a sick GPU */ cgtime(&thr->sick); if (!pthread_kill(thr->pth, 0)) { applog(LOG_WARNING, "Thread %d still exists, killing it off", thr_id); cg_completion_timeout(&thr_info_cancel_join, thr, 5000); thr->cgpu->drv->thread_shutdown(thr); } else applog(LOG_WARNING, "Thread %d no longer exists", thr_id); } rd_unlock(&mining_thr_lock); rd_lock(&mining_thr_lock); for (thr_id = 0; thr_id < mining_threads; ++thr_id) { int virtual_gpu; thr = mining_thr[thr_id]; cgpu = thr->cgpu; if (cgpu->drv->drv_id != DRIVER_opencl) continue; if (dev_from_id(thr_id) != gpu) continue; virtual_gpu = cgpu->virtual_gpu; /* Lose this ram cause we may get stuck here! */ //tq_freeze(thr->q); thr->q = tq_new(); if (!thr->q) quit(1, "Failed to tq_new in reinit_gpu"); /* Lose this ram cause we may dereference in the dying thread! */ //free(clState); applog(LOG_INFO, "Reinit GPU thread %d", thr_id); clStates[thr_id] = initCl(virtual_gpu, name, sizeof(name), &cgpu->algorithm); if (!clStates[thr_id]) { applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id); goto select_cgpu; } applog(LOG_INFO, "initCl() finished. Found %s", name); if (unlikely(thr_info_create(thr, NULL, miner_thread, thr))) { applog(LOG_ERR, "thread %d create failed", thr_id); return NULL; } applog(LOG_WARNING, "Thread %d restarted", thr_id); } rd_unlock(&mining_thr_lock); cgtime(&now); get_datestamp(cgpu->init, sizeof(cgpu->init), &now); rd_lock(&mining_thr_lock); for (thr_id = 0; thr_id < mining_threads; ++thr_id) { thr = mining_thr[thr_id]; cgpu = thr->cgpu; if (cgpu->drv->drv_id != DRIVER_opencl) continue; if (dev_from_id(thr_id) != gpu) continue; cgsem_post(&thr->sem); } rd_unlock(&mining_thr_lock); goto select_cgpu; out: return NULL; }