gaspi_return_t pgaspi_write_list_notify (const gaspi_number_t num, gaspi_segment_id_t * const segment_id_local, gaspi_offset_t * const offset_local, const gaspi_rank_t rank, gaspi_segment_id_t * const segment_id_remote, gaspi_offset_t * const offset_remote, gaspi_size_t * const size, const gaspi_segment_id_t segment_id_notification, const gaspi_notification_id_t notification_id, const gaspi_notification_t notification_value, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { if(num == 0) return GASPI_ERR_INV_NUM; if(notification_value == 0) return GASPI_ERR_INV_NOTIF_VAL; #ifdef DEBUG gaspi_verify_init("gaspi_write_list_notify"); gaspi_verify_queue(queue); gaspi_number_t n; for(n = 0; n < num; n++) { gaspi_verify_local_off(offset_local[n], segment_id_local[n]); gaspi_verify_remote_off(offset_remote[n], segment_id_remote[n], rank); gaspi_verify_comm_size(size[n], segment_id_local[n], segment_id_remote[n], rank, GASPI_MAX_TSIZE_C); } #endif gaspi_return_t eret = GASPI_ERROR; if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms)) return GASPI_TIMEOUT; if( GASPI_ENDPOINT_DISCONNECTED == glb_gaspi_ctx.ep_conn[rank].cstat ) { eret = pgaspi_connect((gaspi_rank_t) rank, timeout_ms); if ( eret != GASPI_SUCCESS) { goto endL; } } eret = pgaspi_dev_write_list_notify(num, segment_id_local, offset_local, rank, segment_id_remote, offset_remote, (unsigned int *)size, segment_id_notification, notification_id, notification_value, queue); glb_gaspi_ctx.ne_count_c[queue] += (int) (num + 1); endL: unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return eret; }
gaspi_return_t pgaspi_gpu_write_notify(const gaspi_segment_id_t segment_id_local, const gaspi_offset_t offset_local, const gaspi_rank_t rank, const gaspi_segment_id_t segment_id_remote, const gaspi_offset_t offset_remote, const gaspi_size_t size, const gaspi_notification_id_t notification_id, const gaspi_notification_t notification_value, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { gaspi_verify_init("gaspi_gpu_write_notify"); gaspi_verify_local_off(offset_local, segment_id_local, size); gaspi_verify_remote_off(offset_remote, segment_id_remote, rank, size); gaspi_verify_queue(queue); gaspi_verify_comm_size(size, segment_id_local, segment_id_remote, rank, GASPI_MAX_TSIZE_C); if( notification_value == 0 ) { gaspi_printf("Zero is not allowed as notification value."); return GASPI_ERR_INV_NOTIF_VAL; } gaspi_return_t eret = GASPI_ERROR; gaspi_context_t * const gctx = &glb_gaspi_ctx; if(lock_gaspi_tout (&gctx->lockC[queue], timeout_ms)) return GASPI_TIMEOUT; if( GASPI_ENDPOINT_DISCONNECTED == gctx->ep_conn[rank].cstat ) { eret = pgaspi_connect((gaspi_rank_t) rank, timeout_ms); if ( eret != GASPI_SUCCESS) { goto endL; } } eret = pgaspi_dev_gpu_write_notify(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size, notification_id, notification_value, queue, timeout_ms); if( eret != GASPI_SUCCESS ) { /* gctx->qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT; */ goto endL; } /* GPI2_STATS_INC_COUNT(GASPI_STATS_COUNTER_NUM_WRITE_NOT, 1); */ /* GPI2_STATS_INC_COUNT(GASPI_STATS_COUNTER_BYTES_WRITE, size); */ endL: unlock_gaspi (&gctx->lockC[queue]); return eret; }
gaspi_return_t pgaspi_write_notify (const gaspi_segment_id_t segment_id_local, const gaspi_offset_t offset_local, const gaspi_rank_t rank, const gaspi_segment_id_t segment_id_remote, const gaspi_offset_t offset_remote, const gaspi_size_t size, const gaspi_notification_id_t notification_id, const gaspi_notification_t notification_value, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { gaspi_verify_init("gaspi_write_notify"); gaspi_verify_local_off(offset_local, segment_id_local); gaspi_verify_remote_off(offset_remote, segment_id_remote, rank); gaspi_verify_queue(queue); gaspi_verify_comm_size(size, segment_id_local, segment_id_remote, rank, GASPI_MAX_TSIZE_C); if(notification_value == 0) return GASPI_ERR_INV_NOTIF_VAL; gaspi_return_t eret = GASPI_ERROR; if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms)) return GASPI_TIMEOUT; if( GASPI_ENDPOINT_DISCONNECTED == glb_gaspi_ctx.ep_conn[rank].cstat ) { eret = pgaspi_connect((gaspi_rank_t) rank, timeout_ms); if ( eret != GASPI_SUCCESS) { goto endL; } } eret = pgaspi_dev_write_notify(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size, notification_id, notification_value, queue); glb_gaspi_ctx.ne_count_c[queue] += 2; endL: unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return eret; }
gaspi_return_t pgaspi_notify (const gaspi_segment_id_t segment_id_remote, const gaspi_rank_t rank, const gaspi_notification_id_t notification_id, const gaspi_notification_t notification_value, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { gaspi_verify_init("gaspi_notify"); gaspi_verify_segment(segment_id_remote); gaspi_verify_null_ptr(glb_gaspi_ctx.rrmd[segment_id_remote]); gaspi_verify_rank(rank); gaspi_verify_queue(queue); if(notification_value == 0) return GASPI_ERR_INV_NOTIF_VAL; gaspi_return_t eret = GASPI_ERROR; if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms)) return GASPI_TIMEOUT; if( GASPI_ENDPOINT_DISCONNECTED == glb_gaspi_ctx.ep_conn[rank].cstat ) { eret = pgaspi_connect((gaspi_rank_t) rank, timeout_ms); if ( eret != GASPI_SUCCESS) { goto endL; } } eret = pgaspi_dev_notify(segment_id_remote, rank, notification_id, notification_value, queue); glb_gaspi_ctx.ne_count_c[queue]++; endL: unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return eret; }
gaspi_return_t pgaspi_passive_send (const gaspi_segment_id_t segment_id_local, const gaspi_offset_t offset_local, const gaspi_rank_t rank, const gaspi_size_t size, const gaspi_timeout_t timeout_ms) { gaspi_verify_init("gaspi_passive_send"); gaspi_verify_local_off(offset_local, segment_id_local, size); gaspi_verify_comm_size(size, segment_id_local, segment_id_local, glb_gaspi_ctx.rank, GASPI_MAX_TSIZE_P); gaspi_verify_rank(rank); gaspi_return_t eret = GASPI_ERROR; if( lock_gaspi_tout (&glb_gaspi_ctx.lockPS, timeout_ms) ) { return GASPI_TIMEOUT; } if( GASPI_ENDPOINT_DISCONNECTED == glb_gaspi_ctx.ep_conn[rank].cstat ) { eret = pgaspi_connect((gaspi_rank_t) rank, timeout_ms); if( eret != GASPI_SUCCESS ) { goto endL; } } eret = pgaspi_dev_passive_send(segment_id_local, offset_local, rank, size, glb_gaspi_ctx.ne_count_p, timeout_ms); if( eret == GASPI_ERROR ) { glb_gaspi_ctx.qp_state_vec[GASPI_PASSIVE_QP][rank] = GASPI_STATE_CORRUPT; } endL: unlock_gaspi (&glb_gaspi_ctx.lockPS); return eret; }
gaspi_return_t pgaspi_read (const gaspi_segment_id_t segment_id_local, const gaspi_offset_t offset_local, const gaspi_rank_t rank, const gaspi_segment_id_t segment_id_remote, const gaspi_offset_t offset_remote, const gaspi_size_t size, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { gaspi_verify_init("gaspi_read"); gaspi_verify_local_off(offset_local, segment_id_local); gaspi_verify_remote_off(offset_remote, segment_id_remote, rank); gaspi_verify_queue(queue); gaspi_verify_comm_size(size, segment_id_local, segment_id_remote, rank, GASPI_MAX_TSIZE_C); gaspi_return_t eret = GASPI_ERROR; if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms)) return GASPI_TIMEOUT; if( GASPI_ENDPOINT_DISCONNECTED == glb_gaspi_ctx.ep_conn[rank].cstat ) { eret = pgaspi_connect((gaspi_rank_t) rank, timeout_ms); if ( eret != GASPI_SUCCESS) { goto endL; } } eret = pgaspi_dev_read(segment_id_local, offset_local, rank, segment_id_remote,offset_remote, (unsigned int) size, queue); glb_gaspi_ctx.ne_count_c[queue]++; endL: unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return eret; }
gaspi_return_t pgaspi_proc_init (const gaspi_timeout_t timeout_ms) { gaspi_return_t eret = GASPI_ERROR; int i; const int num_queues = (int) glb_gaspi_cfg.queue_num; if(lock_gaspi_tout (&glb_gaspi_ctx_lock, timeout_ms)) return GASPI_TIMEOUT; if(glb_gaspi_sn_init == 0) { glb_gaspi_ctx.lockPS.lock = 0; glb_gaspi_ctx.lockPR.lock = 0; for (i = 0; i < num_queues; i++) glb_gaspi_ctx.lockC[i].lock = 0; memset (&glb_gaspi_ctx, 0, sizeof (gaspi_context)); struct utsname mbuf; if (uname (&mbuf) == 0) { snprintf (glb_gaspi_ctx.mtyp, 64, "%s", mbuf.machine); } //timing glb_gaspi_ctx.mhz = gaspi_get_cpufreq (); if (glb_gaspi_ctx.mhz == 0.0f) { gaspi_print_error ("Failed to get CPU frequency"); goto errL; } glb_gaspi_ctx.cycles_to_msecs = 1.0f / (glb_gaspi_ctx.mhz * 1000.0f); //handle environment if(gaspi_handle_env(&glb_gaspi_ctx)) { gaspi_print_error("Failed to handle environment"); eret = GASPI_ERR_ENV; goto errL; } //start sn_backend if(pthread_create(&glb_gaspi_ctx.snt, NULL, gaspi_sn_backend, NULL) != 0) { gaspi_print_error("Failed to create SN thread"); goto errL; } glb_gaspi_sn_init = 1; }//glb_gaspi_sn_init if(glb_gaspi_ctx.procType == MASTER_PROC) { if(glb_gaspi_dev_init == 0) { if(access (glb_gaspi_ctx.mfile, R_OK) == -1) { gaspi_print_error ("Incorrect permissions of machinefile"); eret = GASPI_ERR_ENV; goto errL; } //read hostnames char *line = NULL; size_t len = 0; int read; FILE *fp = fopen (glb_gaspi_ctx.mfile, "r"); if (fp == NULL) { gaspi_print_error("Failed to open machinefile"); eret = GASPI_ERR_ENV; goto errL; } glb_gaspi_ctx.tnc = 0; while ((read = getline (&line, &len, fp)) != -1) { //we assume a single hostname per line if ((read < 2) || (read > 64)) continue; glb_gaspi_ctx.tnc++; if (glb_gaspi_ctx.tnc >= GASPI_MAX_NODES) break; } rewind (fp); free (glb_gaspi_ctx.hn_poff); glb_gaspi_ctx.hn_poff = (char *) calloc (glb_gaspi_ctx.tnc, 65); if(glb_gaspi_ctx.hn_poff == NULL) { gaspi_print_error("Debug: Failed to allocate memory"); goto errL; } glb_gaspi_ctx.poff = glb_gaspi_ctx.hn_poff + glb_gaspi_ctx.tnc * 64; int id = 0; while((read = getline (&line, &len, fp)) != -1) { //we assume a single hostname per line if((read < 2) || (read >= 64)) continue; int inList = 0; for(i = 0; i < id; i++) { //already in list ? //TODO: 64? 63? Magic numbers -> just get cacheline from system or define as such const int hnlen = MAX (strlen (glb_gaspi_ctx.hn_poff + i * 64), MIN (strlen (line) - 1, 63)); if(strncmp (glb_gaspi_ctx.hn_poff + i * 64, line, hnlen) == 0) { inList++; } } glb_gaspi_ctx.poff[id] = inList; strncpy (glb_gaspi_ctx.hn_poff + id * 64, line, MIN (read - 1, 63)); id++; if(id >= GASPI_MAX_NODES) break; } fclose (fp); free (line); //master glb_gaspi_ctx.rank = 0; free(glb_gaspi_ctx.sockfd); glb_gaspi_ctx.sockfd = (int *) malloc (glb_gaspi_ctx.tnc * sizeof (int)); if(glb_gaspi_ctx.sockfd == NULL) { gaspi_print_error("Failed to allocate memory"); eret = GASPI_ERR_MEMALLOC; goto errL; } for(i = 0; i < glb_gaspi_ctx.tnc; i++) glb_gaspi_ctx.sockfd[i] = -1; }//glb_gaspi_dev_init }//MASTER_PROC else if(glb_gaspi_ctx.procType != WORKER_PROC) { gaspi_print_error ("Invalid node type (GASPI_TYPE)"); eret = GASPI_ERR_ENV; goto errL; } if( 0 != gaspi_sn_broadcast_topology(&glb_gaspi_ctx, GASPI_BLOCK) ) { gaspi_print_error("Failed topology broadcast"); eret = GASPI_ERROR; goto errL; } if( (eret = pgaspi_init_core()) != GASPI_SUCCESS ) { goto errL; } /* Unleash SN thread */ __sync_fetch_and_add( &gaspi_master_topo_data, 1); gaspi_init_collectives(); glb_gaspi_init = 1; unlock_gaspi (&glb_gaspi_ctx_lock); if(glb_gaspi_cfg.build_infrastructure) { /* configuration tells us to pre-connect */ if( GASPI_TOPOLOGY_STATIC == glb_gaspi_cfg.build_infrastructure ) { for(i = glb_gaspi_ctx.rank; i >= 0; i--) { if( (eret = pgaspi_connect((gaspi_rank_t) i, timeout_ms)) != GASPI_SUCCESS ) { goto errL; } } } eret = pgaspi_group_all_local_create(timeout_ms); if(eret == GASPI_SUCCESS) { eret = gaspi_barrier(GASPI_GROUP_ALL, timeout_ms); } else { gaspi_print_error("Failed to create GASPI_GROUP_ALL."); } } else /* dont build_infrastructure */ { /* just reserve GASPI_GROUP_ALL */ glb_gaspi_ctx.group_cnt = 1; glb_gaspi_group_ctx[GASPI_GROUP_ALL].id = -2;//disable eret = GASPI_SUCCESS; } #ifdef GPI2_CUDA /* init GPU counts */ glb_gaspi_ctx.use_gpus = 0; glb_gaspi_ctx.gpu_count = 0; #endif return eret; errL: unlock_gaspi (&glb_gaspi_ctx_lock); return eret; }