int req_gpuctrl_svr( struct batch_request *preq) { int rc = PBSE_NONE; char *nodename = NULL; char *gpuid = NULL; int gpumode = -1; int reset_perm = -1; int reset_vol = -1; char log_buf[LOCAL_LOG_BUF_SIZE+1]; int local_errno = 0; struct pbsnode *pnode = NULL; int gpuidx = -1; int conn; if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0) { rc = PBSE_PERM; snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)"); req_reject(rc, 0, preq, NULL, log_buf); return rc; } nodename = preq->rq_ind.rq_gpuctrl.rq_momnode; gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid; gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode; reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm; reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol; if (LOGLEVEL >= 7) { sprintf( log_buf, "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); log_ext(-1, __func__, log_buf, LOG_INFO); } /* validate mom node exists */ pnode = find_nodebyname(nodename); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL); return PBSE_UNKNODE; } /* validate that the node is up */ if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))||(pnode->nd_power_state != POWER_STATE_RUNNING)) { rc = PBSE_UNKREQ; sprintf(log_buf,"Node %s is not available",pnode->nd_name); req_reject(rc, 0, preq, NULL, log_buf); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate that the node has real gpus not virtual */ if (!pnode->nd_gpus_real) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate the gpuid exists */ if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* validate that we have a real request */ if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1)) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "No action specified"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* for mode changes validate the mode with the driver_version */ if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2)) { rc = PBSE_UNKREQ; req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3"); unlock_node(pnode, __func__, NULL, LOGLEVEL); return rc; } /* we need to relay request to the mom for processing */ /* have MOM attempt to change the gpu mode */ preq->rq_orgconn = preq->rq_conn; /* restore client socket */ unlock_node(pnode, __func__, NULL, LOGLEVEL); conn = svr_connect( pnode->nd_addrs[0], pbs_mom_port, &local_errno, NULL, NULL); if (conn >= 0) { if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE) req_reject(rc, 0, preq, NULL, NULL); else process_gpu_request_reply(preq); } else { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom"); } return rc; }
void req_gpuctrl( struct batch_request *preq) { char *id = "req_gpuctrl"; char *nodename = NULL; char *gpuid = NULL; int gpumode = -1; int reset_perm = -1; int reset_vol = -1; #ifdef NVIDIA_GPUS struct pbsnode *pnode = NULL; int gpuidx = -1; int rc = 0; int conn; #endif /* NVIDIA_GPUS */ if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0) { req_reject(PBSE_PERM, 0, preq, NULL, NULL); return; } nodename = preq->rq_ind.rq_gpuctrl.rq_momnode; gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid; gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode; reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm; reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol; #ifdef NVIDIA_GPUS if (LOGLEVEL >= 7) { sprintf( log_buffer, "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); log_ext(-1, id, log_buffer, LOG_INFO); } /* validate mom node exists */ pnode = find_nodebyname(nodename); if (pnode == NULL) { req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL); return; } /* validate that the node is up */ if (pnode->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN)) { sprintf( log_buffer, "Node %s is not available", pnode->nd_name); req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer); return; } /* validate that the node has real gpus not virtual */ if (!pnode->nd_gpus_real) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus"); return; } /* validate the gpuid exists */ if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node"); return; } /* validate that we have a real request */ if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1)) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified"); return; } /* for mode changes validate the mode with the driver_version */ if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2)) { req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU driver version does not support mode 3"); return; } /* we need to relay request to the mom for processing */ /* have MOM attempt to change the gpu mode */ preq->rq_orgconn = preq->rq_conn; /* restore client socket */ conn = svr_connect( pnode->nd_addrs[0], pbs_mom_port, process_Dreply, ToServerDIS); if (conn >= 0) { if ((rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL)) != 0) { req_reject(rc, 0, preq, NULL, NULL); } } else { req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom"); } #else sprintf( log_buffer, "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d", nodename, gpuid, gpumode, reset_perm, reset_vol); if (LOGLEVEL >= 3) { log_ext(-1, id, log_buffer, LOG_INFO); } req_reject(PBSE_NOSUP, 0, preq, NULL, NULL); #endif /* NVIDIA_GPUS */ return; }
int is_gpustat_get( struct pbsnode *np, /* I (modified) */ unsigned int &i, std::vector<std::string> &status_info) { pbs_attribute temp; const char *gpuid = NULL; char log_buf[LOCAL_LOG_BUF_SIZE]; int gpuidx = -1; std::stringstream gpuinfo; int need_delimiter = FALSE; int reportedgpucnt = 0; int startgpucnt = 0; int drv_ver = 0; if (np == NULL) { sprintf(log_buf, "Invalid parameter for np passed to is_gpustat_get"); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, log_buf); return(PBSE_BAD_PARAMETER); } if (LOGLEVEL >= 7) { sprintf(log_buf, "received gpu status from node %s", np->nd_name); log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf); } /* save current gpu count for node */ startgpucnt = np->nd_ngpus; /* * Before filling the "temp" pbs_attribute, initialize it. * The second and third parameter to decode_arst are never * used, so just leave them empty. (GBS) */ memset(&temp, 0, sizeof(temp)); if (decode_arst(&temp, NULL, NULL, NULL, 0)) { DBPRT(("is_gpustat_get: cannot initialize attribute\n")); return(DIS_NOCOMMIT); } i++; for (; i < status_info.size(); i++) { /* add the info to the "temp" attribute */ const char *str = status_info[i].c_str(); /* get timestamp */ if (!strncmp(str, "timestamp=", 10)) { if (decode_arst(&temp, NULL, NULL, str, 0)) { DBPRT(("is_gpustat_get: cannot add attributes\n")); free_arst(&temp); move_past_gpu_status(i, status_info); return(DIS_NOCOMMIT); } continue; } /* get driver version, if there is one */ if (!strncmp(str, "driver_ver=", 11)) { if (decode_arst(&temp, NULL, NULL, str, 0)) { DBPRT(("is_gpustat_get: cannot add attributes\n")); free_arst(&temp); move_past_gpu_status(i, status_info); return(DIS_NOCOMMIT); } drv_ver = atoi(str + 11); continue; } else if (!strcmp(str, END_GPU_STATUS)) { break; } /* gpuid must come before the rest or we will be in trouble */ if (!strncmp(str, "gpuid=", 6)) { if (gpuinfo.str().size() > 0) { if (decode_arst(&temp, NULL, NULL, gpuinfo.str().c_str(), 0)) { DBPRT(("is_gpustat_get: cannot add attributes\n")); free_arst(&temp); move_past_gpu_status(i, status_info); return(DIS_NOCOMMIT); } gpuinfo.str(""); } gpuid = &str[6]; /* * Get this gpus index, if it does not yet exist then find an empty entry. * We need to allow for the gpu status results being returned in * different orders since the nvidia order may change upon mom's reboot */ gpuidx = gpu_entry_by_id(np, gpuid, TRUE); if (gpuidx == -1) { /* * Failure - we could not get / create a nd_gpusn entry for this gpu, * log an error message. */ if (LOGLEVEL >= 3) { sprintf(log_buf, "Failed to get/create entry for gpu %s on node %s\n", gpuid, np->nd_name); log_ext(-1, __func__, log_buf, LOG_DEBUG); } free_arst(&temp); move_past_gpu_status(i, status_info); return(DIS_SUCCESS); } gpuinfo << "gpu[" << gpuidx << "]=gpu_id=" << gpuid << ";"; need_delimiter = FALSE; reportedgpucnt++; np->nd_gpusn[gpuidx].driver_ver = drv_ver; /* mark that this gpu node is not virtual */ np->nd_gpus_real = TRUE; /* * if we have not filled in the gpu_id returned by the mom node * then fill it in */ if ((gpuidx >= 0) && (np->nd_gpusn[gpuidx].gpuid == NULL)) { np->nd_gpusn[gpuidx].gpuid = strdup(gpuid); } } else { if (need_delimiter) { gpuinfo << ";"; } gpuinfo << str; need_delimiter = TRUE; } /* check current gpu mode and determine gpu state */ if (!memcmp(str, "gpu_mode=", 9)) { if ((!memcmp(str + 9, "Normal", 6)) || (!memcmp(str + 9, "Default", 7))) { np->nd_gpusn[gpuidx].mode = gpu_normal; if (gpu_has_job(np, gpuidx)) { np->nd_gpusn[gpuidx].state = gpu_shared; } else { np->nd_gpusn[gpuidx].inuse = 0; np->nd_gpusn[gpuidx].state = gpu_unallocated; } } else if ((!memcmp(str + 9, "Exclusive", 9)) || (!memcmp(str + 9, "Exclusive_Thread", 16))) { np->nd_gpusn[gpuidx].mode = gpu_exclusive_thread; if (gpu_has_job(np, gpuidx)) { np->nd_gpusn[gpuidx].state = gpu_exclusive; } else { np->nd_gpusn[gpuidx].inuse = 0; np->nd_gpusn[gpuidx].state = gpu_unallocated; } } else if (!memcmp(str + 9, "Exclusive_Process", 17)) { np->nd_gpusn[gpuidx].mode = gpu_exclusive_process; if (gpu_has_job(np, gpuidx)) { np->nd_gpusn[gpuidx].state = gpu_exclusive; } else { np->nd_gpusn[gpuidx].inuse = 0; np->nd_gpusn[gpuidx].state = gpu_unallocated; } } else if (!memcmp(str + 9, "Prohibited", 10)) { np->nd_gpusn[gpuidx].mode = gpu_prohibited; np->nd_gpusn[gpuidx].state = gpu_unavailable; } else { /* unknown mode, default to prohibited */ np->nd_gpusn[gpuidx].mode = gpu_prohibited; np->nd_gpusn[gpuidx].state = gpu_unavailable; if (LOGLEVEL >= 3) { sprintf(log_buf, "GPU %s has unknown mode on node %s", gpuid, np->nd_name); log_ext(-1, __func__, log_buf, LOG_DEBUG); } } /* add gpu_mode so it gets added to the pbs_attribute */ if (need_delimiter) { gpuinfo << ";"; } switch (np->nd_gpusn[gpuidx].state) { case gpu_unallocated: gpuinfo << "gpu_state=Unallocated"; break; case gpu_shared: gpuinfo << "gpu_state=Shared"; break; case gpu_exclusive: gpuinfo << "gpu_state=Exclusive"; break; case gpu_unavailable: gpuinfo << "gpu_state=Unavailable"; break; } } } /* end of while disrst */ if (gpuinfo.str().size() > 0) { if (decode_arst(&temp, NULL, NULL, gpuinfo.str().c_str(), 0)) { DBPRT(("is_gpustat_get: cannot add attributes\n")); free_arst(&temp); move_past_gpu_status(i, status_info); return(DIS_NOCOMMIT); } } /* maintain the gpu count, if it has changed we need to update the nodes file */ if (reportedgpucnt != startgpucnt) { np->nd_ngpus = reportedgpucnt; /* update the nodes file */ update_nodes_file(np); } node_gpustatus_list(&temp, np, ATR_ACTION_ALTER); move_past_gpu_status(i, status_info); return(DIS_SUCCESS); } /* END is_gpustat_get() */