/* * get the machine arch from LoadLeveler * Will return NULL on error or a arch string that needs to be freed * (some code from the IBM documentation, licensed as above) */ static char* orte_ras_loadleveler_get_host_arch(char * hostname) { LL_element *queryObject, *machine; int rc, obj_count, err_code; char * hostlist[2]; char * arch; /* Initialize the query: Machine query */ queryObject = ll_query(MACHINES); if(NULL == queryObject) { return NULL; } /* Set query parameters: query specific machines by name */ hostlist[0] = hostname; hostlist[1] = NULL; rc = ll_set_request(queryObject, QUERY_HOST, hostlist, ALL_DATA); if(0 != rc) { return NULL; } /* Get the machine objects from the LoadL_negotiator (central manager) daemon */ machine = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code); if(NULL == machine || 1 != obj_count) { return NULL; } /* Process the machine object */ rc = ll_get_data(machine, LL_MachineArchitecture, &arch); if(0 != rc) { return NULL; } /* Free objects obtained from Negotiator */ ll_free_objs(queryObject); /* Free query element */ ll_deallocate(queryObject); return arch; }
main(int argc, char *argv[]) { LL_element *query,*machines; int rc,number,err; int cpus,i; int *cpulist; int pools,*poollist; LL_element *adapter; int windows,*windowlist; query = ll_query(MACHINES); if (!query) { exit(1); } rc=ll_set_request(query,QUERY_ALL,NULL,ALL_DATA); if (rc) { exit(1); } machines=ll_get_objs(query,LL_CM,NULL,&number,&err); if (machines == NULL) { exit(1); } if (!rc){ printf("INT_TYPES:NUMBER=%d\n", number); } rc = ll_get_data(machines,LL_MachineCPUs,&cpus); if (!rc){ printf("INT_TYPES:CPUS=%d\n",cpus); } rc = ll_get_data(machines,LL_MachineCPUList,&cpulist); if (!rc){ printf("INT_TYPES:CPU_LIST="); for(i=0;i != cpus;i++) { printf("%d:",cpulist[i]); } printf("\n"); } rc = ll_get_data(machines, LL_MachinePoolListSize,&pools); if (!rc){ printf("INT_TYPES:POOLS=%d\n",pools); } rc = ll_get_data(machines,LL_MachinePoolList,&poollist); if (!rc){ printf("INT_TYPES:POOL_LIST="); for(i=0;i != pools;i++) { printf("%d:",poollist[i]); } printf("\n"); } rc = ll_get_data(machines, LL_MachineGetFirstAdapter,&adapter); if ( adapter != NULL ) { rc=ll_get_data(adapter, LL_AdapterTotalWindowCount,&windows); while (adapter != NULL && windows == 0 ) { rc=ll_get_data(machines, LL_MachineGetNextAdapter,&adapter); rc=ll_get_data(adapter, LL_AdapterTotalWindowCount,&windows); } if ( windows != 0 ) { printf("INT_TYPES:WINDOWS=%d\n",windows); printf("INT_TYPES:WINDOW_LIST="); rc=ll_get_data(adapter, LL_AdapterWindowList,&windowlist); for(i=0;i != windows;i++) { printf("%d:",windowlist[i]); } printf("\n"); } } }
/* * get the hostlist from LoadLeveler * *hostlist should either by NULL or a valid argv and *num_hosts * should be 0 or the number of elements in the hostlist argv */ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist) { LL_element *queryObject = NULL, *job = NULL, *step = NULL; LL_element *node = NULL, *task = NULL, *task_instance = NULL; int rc, obj_count, err_code, ll_master_task, job_step_count; char *ll_step_id= NULL, *job_step_list[2], *task_machine_name = NULL; char *schedd_host_name = NULL; int step_mode; /* Get the step ID from LOADL_STEP_ID environment variable. */ if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: could not get LOADL_STEP_ID " "from environment!"); return ORTE_ERROR; } job_step_list[0] = ll_step_id; job_step_list[1] = NULL; /* STEP 1: Get Job object from Central Manager to find out the name of the * Schedd daemon that handles this job. In a Multicluster environment we * can not get the schedd name from the job step id. */ /* Initialize the LL API. Specify that query type is JOBS. */ if(NULL == (queryObject = ll_query(JOBS))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_query faild on JOBS!"); return ORTE_ERROR; } /* Specify that this is a QUERY_STEPID type of query. */ rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA); if(0 > rc) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_set request failed: error " "%d!", rc); return ORTE_ERROR; } /* Get a Job object from LoadL_schedd that contains the relevant job step */ job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code); if(NULL == job) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_objs faild!"); return ORTE_ERROR; } if (obj_count != 1) { /* Only 1 Job object is expected. */ opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_objs: expected one job " "to match, got %d!", obj_count); return ORTE_ERROR; } if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc); return ORTE_ERROR; } if (schedd_host_name != NULL) { job_step_list[0] = ll_step_id; job_step_list[1] = NULL; } else { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_objs() Error: Could not " "determine managing schedd for job %s.\n", job_step_list[0]); return ORTE_ERROR; } ll_free_objs(queryObject); ll_deallocate(queryObject); /* STEP 2: Get Job object from Schedd that manages this job step. */ /* Only schedd query gives us all the relevant task instance info. */ /* Initialize the LL API. Specify that query type is JOBS. */ if(NULL == (queryObject = ll_query(JOBS))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_query faild on JOBS!"); return ORTE_ERROR; } /* Specify that this is a QUERY_STEPID type of query. */ rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA); if(0 != rc) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_set request failed: error " "%d!", rc); return ORTE_ERROR; } /* Get a Job object from LoadL_schedd that contains the relevant job step */ job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count, &err_code); if(NULL == job) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_set request failed: error " "%d!", rc); return ORTE_ERROR; } if (obj_count != 1) { /* Only 1 Job object is expected. */ opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_objs: expected one job " "to match, got %d!", obj_count); return ORTE_ERROR; } if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc); return ORTE_ERROR; } if (job_step_count != 1) { /* Only 1 Job Step object is expected. */ opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_objs: expected one job " "step to match, got %d!", obj_count); return ORTE_ERROR; } step = NULL; if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc); return ORTE_ERROR; } if(NULL == step) { opal_output(orte_ras_base.ras_output, "ll_get_data() Error: Unable to obtain Job Step " "information.\n"); return ORTE_ERROR; } step_mode = -1; if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_StepParallelMode. RC= %d!", rc); return ORTE_ERROR; } /* Serial job step: step_mode==0; Parallel: step_mode==1; Others:2,3,4. */ if ((step_mode != 0) && (step_mode != 1)) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: We support only Serial and " "Parallel LoadLeveler job types. PVM, NQS, and Blue Gene" "jobs are not supported by the LoadLeveler RAS!"); return ORTE_ERROR; } if(step_mode == 0) { /* serial job */ node = NULL; if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_StepGetFirstNode. RC= %d!", rc); return ORTE_ERROR; } task = NULL; if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_NodeGetFirstTask. RC= %d!", rc); return ORTE_ERROR; } task_instance = NULL; rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance); if(0 != rc) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_TaskGetFirstInstance. RC= %d!", rc); return ORTE_ERROR; } task_machine_name = NULL; if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName, &task_machine_name))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_TaskInstanceMachineName. RC= %d!", rc); return ORTE_ERROR; } opal_argv_append(num_hosts, hostlist, task_machine_name); } else { /* parallel job */ node = NULL; if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_StepGetFirstNode. RC= %d!", rc); return ORTE_ERROR; } while(NULL != node) { /* Loop through the "Node" objects. */ task = NULL; if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure on " "LL_NodeGetFirstTask. RC= %d!", rc); return ORTE_ERROR; } while(task) { /* Loop through the "Task" objects. */ ll_master_task = 0; rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task); if(0 != rc) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure" " on LL_TaskIsMaster. RC= %d!", rc); return ORTE_ERROR; } /* The "master task" Task object is a LoadLeveler abstraction * and is not relevant here. Look at only Task objects that * are not "master".*/ if (!ll_master_task) { task_instance = NULL; if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: " "failure on LL_TaskGetFirstTaskInstance. " " RC= %d!", rc); return ORTE_ERROR; } /* Loop through the "Task Instance" objects. */ while (task_instance) { task_machine_name = NULL; rc = ll_get_data(task_instance, LL_TaskInstanceMachineName, &task_machine_name); if(0 != rc) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data:" " failure on LL_TaskInstanceMachineName" "RC= %d!", rc); return ORTE_ERROR; } opal_argv_append(num_hosts, hostlist, task_machine_name); task_instance = NULL; rc = ll_get_data(task, LL_TaskGetNextTaskInstance, &task_instance); if(0 != rc) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data:" " failure on LL_TaskGetNextInstance. " "RC= %d!", rc); return ORTE_ERROR; } } } task = NULL; if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: " "failure on LL_NodeGetNextTask. RC= %d!", rc); return ORTE_ERROR; } } node = NULL; if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) { opal_output(orte_ras_base.ras_output, "ras:loadleveler:allocate: ll_get_data: failure " "on LL_StepGetNextNode. RC= %d!", rc); return ORTE_ERROR; } } } ll_free_objs(queryObject); ll_deallocate(queryObject); return ORTE_SUCCESS; }