Ejemplo n.º 1
0
int req_gpuctrl_svr(
    
  struct batch_request *preq)

  {
  int rc = PBSE_NONE;
  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
  char   log_buf[LOCAL_LOG_BUF_SIZE+1];
  int    local_errno = 0;
  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    conn;

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    rc = PBSE_PERM;
    snprintf(log_buf, LOCAL_LOG_BUF_SIZE,
        "invalid permissions (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)");
    req_reject(rc, 0, preq, NULL, log_buf);
    return rc;
    }

  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

  if (LOGLEVEL >= 7)
    {
    sprintf(
      log_buf,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, __func__, log_buf, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return PBSE_UNKNODE;
    }

  /* validate that the node is up */

  if ((pnode->nd_state & (INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))||(pnode->nd_power_state != POWER_STATE_RUNNING))
    {
    rc = PBSE_UNKREQ;
    sprintf(log_buf,"Node %s is not available",pnode->nd_name);
    req_reject(rc, 0, preq, NULL, log_buf);
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "Not allowed for virtual gpus");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU ID does not exist on node");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* validate that we have a real request */

  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "No action specified");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    rc = PBSE_UNKREQ;
    req_reject(rc, 0, preq, NULL, "GPU driver version does not support mode 3");
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    return rc;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  unlock_node(pnode, __func__, NULL, LOGLEVEL);
  conn = svr_connect(
           pnode->nd_addrs[0],
           pbs_mom_port,
           &local_errno,
           NULL,
           NULL);
    

  if (conn >= 0)
    {
    if ((rc = issue_Drequest(conn, preq)) != PBSE_NONE)
      req_reject(rc, 0, preq, NULL, NULL);
    else
      process_gpu_request_reply(preq);
    }
  else
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    }

  return rc;
  }
Ejemplo n.º 2
0
void req_gpuctrl(

  struct batch_request *preq)

  {
  char   *id = "req_gpuctrl";

  char  *nodename = NULL;
  char  *gpuid = NULL;
  int    gpumode = -1;
  int    reset_perm = -1;
  int    reset_vol = -1;
#ifdef NVIDIA_GPUS
  struct pbsnode *pnode = NULL;
  int    gpuidx = -1;
  int    rc = 0;
  int    conn;
#endif  /* NVIDIA_GPUS */

  if ((preq->rq_perm &
       (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD | ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) == 0)
    {
    req_reject(PBSE_PERM, 0, preq, NULL, NULL);
    return;
    }

  nodename = preq->rq_ind.rq_gpuctrl.rq_momnode;
  gpuid = preq->rq_ind.rq_gpuctrl.rq_gpuid;
  gpumode = preq->rq_ind.rq_gpuctrl.rq_gpumode;
  reset_perm = preq->rq_ind.rq_gpuctrl.rq_reset_perm;
  reset_vol = preq->rq_ind.rq_gpuctrl.rq_reset_vol;

#ifdef NVIDIA_GPUS

  if (LOGLEVEL >= 7)
    {
    sprintf(
      log_buffer,
      "GPU control request for node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

    log_ext(-1, id, log_buffer, LOG_INFO);
    }

  /* validate mom node exists */

  pnode = find_nodebyname(nodename);

  if (pnode == NULL)
    {
    req_reject(PBSE_UNKNODE, 0, preq, NULL, NULL);
    return;
    }

  /* validate that the node is up */

  if (pnode->nd_state & (INUSE_DELETED | INUSE_DOWN | INUSE_OFFLINE | INUSE_UNKNOWN))
    {
    sprintf(
      log_buffer,
      "Node %s is not available",
      pnode->nd_name);
    req_reject(PBSE_UNKREQ, 0, preq, NULL, log_buffer);
    return;
    }


  /* validate that the node has real gpus not virtual */

  if (!pnode->nd_gpus_real)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Not allowed for virtual gpus");
    return;
    }

  /* validate the gpuid exists */

  if ((gpuidx = gpu_entry_by_id(pnode, gpuid, FALSE)) == -1)
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU ID does not exist on node");
    return;
    }

  /* validate that we have a real request */

  if ((gpumode == -1) && (reset_perm == -1) && (reset_vol == -1))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "No action specified");
    return;
    }

  /* for mode changes validate the mode with the driver_version */

  if ((pnode->nd_gpusn[gpuidx].driver_ver == 260) && (gpumode > 2))
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "GPU driver version does not support mode 3");
    return;
    }

  /* we need to relay request to the mom for processing */
  /* have MOM attempt to change the gpu mode */

  preq->rq_orgconn = preq->rq_conn;  /* restore client socket */

  conn = svr_connect(
           pnode->nd_addrs[0],
           pbs_mom_port,
           process_Dreply,
           ToServerDIS);

  if (conn >= 0)
    {
    if ((rc = issue_Drequest(conn, preq, process_gpu_request_reply, NULL)) != 0)
      {
      req_reject(rc, 0, preq, NULL, NULL);
      }
    }
  else
    {
    req_reject(PBSE_UNKREQ, 0, preq, NULL, "Failed to get connection to mom");
    }

#else

    sprintf(
      log_buffer,
      "GPU control request not supported: node %s gpuid %s mode %d reset_perm %d reset_vol %d",
      nodename,
      gpuid,
      gpumode,
      reset_perm,
      reset_vol);

  if (LOGLEVEL >= 3)
    {
      log_ext(-1, id, log_buffer, LOG_INFO);
    }

  req_reject(PBSE_NOSUP, 0, preq, NULL, NULL);

#endif  /* NVIDIA_GPUS */

  return;
  }
Ejemplo n.º 3
0
int is_gpustat_get(

  struct pbsnode           *np,      /* I (modified) */
  unsigned int             &i,
  std::vector<std::string> &status_info)

  {
  pbs_attribute      temp;
  const char        *gpuid = NULL;
  char               log_buf[LOCAL_LOG_BUF_SIZE];
  int                gpuidx = -1;
  std::stringstream  gpuinfo;
  int                need_delimiter = FALSE;
  int                reportedgpucnt = 0;
  int                startgpucnt = 0;
  int                drv_ver = 0;

  if (np == NULL)
    {
    sprintf(log_buf, "Invalid parameter for np  passed to is_gpustat_get");
    log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    return(PBSE_BAD_PARAMETER);
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "received gpu status from node %s", np->nd_name);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  /* save current gpu count for node */
  startgpucnt = np->nd_ngpus;

  /*
   *  Before filling the "temp" pbs_attribute, initialize it.
   *  The second and third parameter to decode_arst are never
   *  used, so just leave them empty. (GBS)
   */

  memset(&temp, 0, sizeof(temp));

  if (decode_arst(&temp, NULL, NULL, NULL, 0))
    {
    DBPRT(("is_gpustat_get:  cannot initialize attribute\n"));

    return(DIS_NOCOMMIT);
    }

  i++;

  for (; i < status_info.size(); i++)
    {
    /* add the info to the "temp" attribute */
    const char *str = status_info[i].c_str();

    /* get timestamp */
    if (!strncmp(str, "timestamp=", 10))
      {
      if (decode_arst(&temp, NULL, NULL, str, 0))
        {
        DBPRT(("is_gpustat_get: cannot add attributes\n"));

        free_arst(&temp);
        move_past_gpu_status(i, status_info);

        return(DIS_NOCOMMIT);
        }
      continue;
      }

    /* get driver version, if there is one */
    if (!strncmp(str, "driver_ver=", 11))
      {
      if (decode_arst(&temp, NULL, NULL, str, 0))
        {
        DBPRT(("is_gpustat_get: cannot add attributes\n"));

        free_arst(&temp);
        move_past_gpu_status(i, status_info);

        return(DIS_NOCOMMIT);
        }
      drv_ver = atoi(str + 11);
      continue;
      }
    else if (!strcmp(str, END_GPU_STATUS))
      {
      break;
      }

    /* gpuid must come before the rest or we will be in trouble */

    if (!strncmp(str, "gpuid=", 6))
      {
      if (gpuinfo.str().size() > 0)
        {
        if (decode_arst(&temp, NULL, NULL, gpuinfo.str().c_str(), 0))
          {
          DBPRT(("is_gpustat_get: cannot add attributes\n"));

          free_arst(&temp);
          move_past_gpu_status(i, status_info);

          return(DIS_NOCOMMIT);
          }

        gpuinfo.str("");
        }

      gpuid = &str[6];

      /*
       * Get this gpus index, if it does not yet exist then find an empty entry.
       * We need to allow for the gpu status results being returned in
       * different orders since the nvidia order may change upon mom's reboot
       */

      gpuidx = gpu_entry_by_id(np, gpuid, TRUE);
      if (gpuidx == -1)
        {
        /*
         * Failure - we could not get / create a nd_gpusn entry for this gpu,
         * log an error message.
         */

        if (LOGLEVEL >= 3)
          {
          sprintf(log_buf,
            "Failed to get/create entry for gpu %s on node %s\n",
            gpuid,
            np->nd_name);

          log_ext(-1, __func__, log_buf, LOG_DEBUG);
          }

        free_arst(&temp);
        move_past_gpu_status(i, status_info);

        return(DIS_SUCCESS);
        }

      gpuinfo << "gpu[" << gpuidx << "]=gpu_id=" << gpuid << ";";
      need_delimiter = FALSE;
      reportedgpucnt++;
      np->nd_gpusn[gpuidx].driver_ver = drv_ver;

      /* mark that this gpu node is not virtual */
      np->nd_gpus_real = TRUE;
      
      /*
       * if we have not filled in the gpu_id returned by the mom node
       * then fill it in
       */
      if ((gpuidx >= 0) && (np->nd_gpusn[gpuidx].gpuid == NULL))
        {
        np->nd_gpusn[gpuidx].gpuid = strdup(gpuid);
        }      

      }
    else
      {
      if (need_delimiter)
        {
        gpuinfo << ";";
        }
     
      gpuinfo << str;
      
      need_delimiter = TRUE;
      }

    /* check current gpu mode and determine gpu state */
    
    if (!memcmp(str, "gpu_mode=", 9))
      {
      if ((!memcmp(str + 9, "Normal", 6)) || (!memcmp(str + 9, "Default", 7)))
        {
        np->nd_gpusn[gpuidx].mode = gpu_normal;
        if (gpu_has_job(np, gpuidx))
          {
          np->nd_gpusn[gpuidx].state = gpu_shared;
          }
        else
          {
          np->nd_gpusn[gpuidx].inuse = 0;
          np->nd_gpusn[gpuidx].state = gpu_unallocated;
          }
        }
      else if ((!memcmp(str + 9, "Exclusive", 9)) ||
              (!memcmp(str + 9, "Exclusive_Thread", 16)))
        {
        np->nd_gpusn[gpuidx].mode = gpu_exclusive_thread;
        if (gpu_has_job(np, gpuidx))
          {
          np->nd_gpusn[gpuidx].state = gpu_exclusive;
          }
        else
          {
          np->nd_gpusn[gpuidx].inuse = 0;
          np->nd_gpusn[gpuidx].state = gpu_unallocated;
          }
        }
      else if (!memcmp(str + 9, "Exclusive_Process", 17))
        {
        np->nd_gpusn[gpuidx].mode = gpu_exclusive_process;
        if (gpu_has_job(np, gpuidx))
          {
          np->nd_gpusn[gpuidx].state = gpu_exclusive;
          }
        else
          {
          np->nd_gpusn[gpuidx].inuse = 0;
          np->nd_gpusn[gpuidx].state = gpu_unallocated;
          }
        }
      else if (!memcmp(str + 9, "Prohibited", 10))
        {
        np->nd_gpusn[gpuidx].mode = gpu_prohibited;
        np->nd_gpusn[gpuidx].state = gpu_unavailable;
        }
      else
        {
        /* unknown mode, default to prohibited */
        np->nd_gpusn[gpuidx].mode = gpu_prohibited;
        np->nd_gpusn[gpuidx].state = gpu_unavailable;
        if (LOGLEVEL >= 3)
          {
          sprintf(log_buf,
            "GPU %s has unknown mode on node %s",
            gpuid,
            np->nd_name);

          log_ext(-1, __func__, log_buf, LOG_DEBUG);
          }
        }
 
      /* add gpu_mode so it gets added to the pbs_attribute */

      if (need_delimiter)
        {
        gpuinfo << ";";
        }

      switch (np->nd_gpusn[gpuidx].state)
        {
        case gpu_unallocated:

          gpuinfo << "gpu_state=Unallocated";
          break;

        case gpu_shared:

          gpuinfo << "gpu_state=Shared";
          break;

        case gpu_exclusive:

          gpuinfo << "gpu_state=Exclusive";
          break;

        case gpu_unavailable:

          gpuinfo << "gpu_state=Unavailable";
          break;
        }
      }

    } /* end of while disrst */

  if (gpuinfo.str().size() > 0)
    {
    if (decode_arst(&temp, NULL, NULL, gpuinfo.str().c_str(), 0))
      {
      DBPRT(("is_gpustat_get: cannot add attributes\n"));
      
      free_arst(&temp);
      move_past_gpu_status(i, status_info);

      return(DIS_NOCOMMIT);
      }
    }

  /* maintain the gpu count, if it has changed we need to update the nodes file */

  if (reportedgpucnt != startgpucnt)
    {
    np->nd_ngpus = reportedgpucnt;

    /* update the nodes file */
    update_nodes_file(np);
    }

  node_gpustatus_list(&temp, np, ATR_ACTION_ALTER);
  move_past_gpu_status(i, status_info);

  return(DIS_SUCCESS);
  }  /* END is_gpustat_get() */