Exemple #1
0
/*
 * Send an ALPS_APP_LLI_ALPS_REQ_EXITING request over the ALPS LLI
 * channel and read back the response status.  The LLI lock is held
 * for the whole request/response exchange and released before return.
 *
 * Returns ORTE_SUCCESS when every step succeeds,
 * ORTE_ERR_FILE_WRITE_FAILURE when the lock, the request or the unlock
 * fails, and ORTE_ERR_FILE_READ_FAILURE when the response cannot be
 * read or carries a non-OK status.
 */
int
orte_ess_alps_sync_complete(void)
{
    int ret = ORTE_SUCCESS;
    int lli_ret = 0;
    int alps_status = 0;
    size_t alps_count;

    lli_ret = alps_app_lli_lock();
    /* Test the lock's own return value; 'ret' is still ORTE_SUCCESS here. */
    if (0 != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_lock returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_WRITE_FAILURE;
        goto fn_exit;
    }

    lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_EXITING, NULL, 0);
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_put_request returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_WRITE_FAILURE;
        goto fn_exit_w_lock;
    }

    /*
     * Both the call itself and the status it reports must be OK.
     * alps_status starts at 0, so looking at the status alone could
     * mistake a failed call for a successful response.
     */
    lli_ret = alps_app_lli_get_response(&alps_status, &alps_count);
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret ||
        ALPS_APP_LLI_ALPS_STAT_OK != alps_status) {
        /* Report whichever of the two values signalled the failure. */
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_get_response returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) ? lli_ret : alps_status));
        ret = ORTE_ERR_FILE_READ_FAILURE;
        goto fn_exit_w_lock;
    }

   fn_exit_w_lock:
    /* Reached on success and on every failure after the lock was taken. */
    lli_ret = alps_app_lli_unlock();
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_unlock returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_WRITE_FAILURE;
    }

   fn_exit:
    return ret;
}
Exemple #2
0
/*
 * Look up the first PE (rank) placed on this node.
 *
 * The application id is fetched over the ALPS LLI channel
 * (ALPS_APP_LLI_ALPS_REQ_APID) and then handed to
 * alps_get_placement_info(); the layout's firstPe field is stored in
 * *first_rank.  The LLI lock is held from the request until the
 * common unlock at the end of the function.
 *
 * first_rank  [out] receives orted_layout.firstPe on success; it is
 *             left untouched on failure.
 *
 * Returns ORTE_SUCCESS on success, ORTE_ERR_BAD_PARAM when first_rank
 * is NULL, ORTE_ERR_FILE_WRITE_FAILURE when the lock, request or
 * unlock fails, ORTE_ERR_FILE_READ_FAILURE when the response cannot be
 * read, and ORTE_ERROR when the placement query fails.
 */
int
orte_ess_alps_get_first_rank_on_node(int *first_rank)
{
    int alps_status = 0;
    uint64_t apid;
    size_t alps_count;
    int ret = ORTE_SUCCESS;
    int lli_ret = 0, place_ret;
    alpsAppLayout_t orted_layout;

    if (first_rank == NULL) {
        ret = ORTE_ERR_BAD_PARAM;
        goto fn_exit;
    }

    /*
     * First get our apid
     */

    lli_ret = alps_app_lli_lock();
    /* Test the lock's own return value; 'ret' is still ORTE_SUCCESS here. */
    if (0 != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_lock returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_WRITE_FAILURE;
        goto fn_exit;
    }

    lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0);
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_put_request - APID returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_WRITE_FAILURE;
        goto fn_exit_w_lock;
    }

    /*
     * Both the call itself and the status it reports must be OK.
     * alps_status starts at 0, so looking at the status alone could
     * mistake a failed call for a successful response.
     */
    lli_ret = alps_app_lli_get_response(&alps_status, &alps_count);
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret ||
        ALPS_APP_LLI_ALPS_STAT_OK != alps_status) {
        /* Report whichever of the two values signalled the failure. */
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_get_response returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) ? lli_ret : alps_status));
        ret = ORTE_ERR_FILE_READ_FAILURE;
        goto fn_exit_w_lock;
    }

    lli_ret = alps_app_lli_get_response_bytes(&apid, sizeof(apid));
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_get_response_bytes returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_READ_FAILURE;
        goto fn_exit_w_lock;
    }

    /* Only the layout is needed; every optional output list is skipped. */
    place_ret = alps_get_placement_info(apid,
                                        &orted_layout,
                                        NULL,
                                        NULL,
                                        NULL,
                                        NULL,
                                        NULL,
                                        NULL,
                                        NULL,
                                        NULL,
                                        NULL);
    if (1 != place_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_get_placement_info returned %d (%s)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, strerror(errno)));
        ret = ORTE_ERROR;
        /* The LLI lock is still held here, so leave through the unlock path. */
        goto fn_exit_w_lock;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                           "%s ess:alps: alps_get_placement_info returned %d first pe on node is %d",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, orted_layout.firstPe));
    *first_rank = orted_layout.firstPe;

   fn_exit_w_lock:
    /* Reached on success and on every failure after the lock was taken. */
    lli_ret = alps_app_lli_unlock();
    if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
        OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
                             "%s ess:alps: alps_app_lli_unlock returned %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
        ret = ORTE_ERR_FILE_WRITE_FAILURE;
    }

   fn_exit:
    return ret;
}
Exemple #3
0
/*
 * One-time ALPS initialisation for the GNI provider.
 *
 * While holding the ALPS LLI lock this function:
 *   1. fetches the application id into gnix_apid,
 *   2. fetches the GNI RDMA credentials and stores the first
 *      ptag/cookie pair in gnix_app_ptag / gnix_app_cookie,
 *   3. queries the placement info into the gnix_app* file-scope
 *      variables and records gnix_appLayout.numPesHere in
 *      gnix_pes_on_node,
 *   4. sets alps_init so later calls return early.
 *
 * Returns 0 on success, -FI_EIO when an ALPS call fails and
 * -FI_ENOMEM when the credentials buffer cannot be allocated.  When
 * alps_init is already set, the value returned by alps_app_lli_lock()
 * is passed back unchanged.
 */
static int __gnix_alps_init(void)
{
	int ret = FI_SUCCESS;
	int alps_status = 0;
	size_t alps_count;
	alpsAppLLIGni_t *rdmacred_rsp = NULL;
	alpsAppGni_t *rdmacred_buf = NULL;

	/* lli_lock doesn't return anything useful */
	ret = alps_app_lli_lock();

	if (alps_init) {
		/* alps lli lock protects alps_init for now */
		alps_app_lli_unlock();
		return ret;
	}

	/*
	 * First get our apid
	 */
	ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0);
	if (ret != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC, "lli put failed, ret=%d(%s)\n", ret,
			  strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	/*
	 * Check the call's own return value before trusting its outputs:
	 * alps_status starts at 0 and alps_count is uninitialised.
	 */
	ret = alps_app_lli_get_response(&alps_status, &alps_count);
	if (ret != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC,
			  "lli get response failed, ret=%d(%s)\n",
			  ret, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}
	if (alps_status != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC, "lli get response failed, "
			  "alps_status=%d(%s)\n", alps_status,
			  strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	ret = alps_app_lli_get_response_bytes(&gnix_apid, sizeof(gnix_apid));
	if (ret != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC,
			  "lli get response failed, ret=%d(%s)\n",
			  ret, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	/*
	 * now get the GNI rdma credentials info
	 */
	ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_GNI, NULL, 0);
	if (ret != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC, "lli put failed, ret=%d(%s)\n",
			  ret, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	/*
	 * alps_count sizes the allocation below, so a failed call must
	 * not be allowed to fall through with a stale or unset count.
	 */
	ret = alps_app_lli_get_response(&alps_status, &alps_count);
	if (ret != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC,
			  "lli get response failed, ret=%d(%s)\n",
			  ret, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}
	if (alps_status != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC,
			  "lli get response failed, alps_status=%d(%s)\n",
			  alps_status, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	/* Zero-filled buffer of alps_count bytes for the credentials reply. */
	rdmacred_rsp = calloc(1, alps_count);
	if (rdmacred_rsp == NULL) {
		ret = -FI_ENOMEM;
		goto err;
	}

	ret = alps_app_lli_get_response_bytes(rdmacred_rsp, alps_count);
	if (ret != ALPS_APP_LLI_ALPS_STAT_OK) {
		GNIX_WARN(FI_LOG_FABRIC,
			  "lli get response failed, ret=%d(%s)\n",
			  ret, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	rdmacred_buf = (alpsAppGni_t *) rdmacred_rsp->u.buf;

	/*
	 * just use the first ptag/cookie for now
	 */

	gnix_app_ptag = rdmacred_buf[0].ptag;
	gnix_app_cookie = rdmacred_buf[0].cookie;

	/*
	 * alps_get_placement_info(uint64_t apid, alpsAppLayout_t *appLayout,
	 *	int **placementList, int **targetNids, int **targetPes,
	 *	int **targetLen, struct in_addr **targetIps, int **startPe,
	 *	int **totalPes, int **nodePes, int **peCpus);
	 */
	ret = alps_get_placement_info(gnix_apid, &gnix_appLayout,
				      &gnix_app_placementList,
				      &gnix_app_targetNids,
				      &gnix_app_targetPes,
				      &gnix_app_targetLen,
				      &gnix_app_targetIps,
				      &gnix_app_startPe,
				      &gnix_app_totalPes,
				      &gnix_app_nodePes,
				      &gnix_app_peCpus);
	if (ret != 1) {
		GNIX_WARN(FI_LOG_FABRIC,
			  "alps_get_placement_info failed, ret=%d(%s)\n",
			  ret, strerror(errno));
		ret = -FI_EIO;
		goto err;
	}

	gnix_pes_on_node = gnix_appLayout.numPesHere;

	alps_init = true;

	ret = 0;
err:
	/* Shared exit for success and failure: drop the lock and the buffer. */
	alps_app_lli_unlock();
	free(rdmacred_rsp);	/* free(NULL) is a no-op */

	return ret;
}