int sclient_read_rbs(sclient_t * clt, cid_t cid, sid_t sid, uint8_t layout, uint8_t spare, sid_t dist_set[ROZOFS_SAFE_MAX], fid_t fid, bid_t bid, uint32_t nb_proj, uint32_t * nb_proj_recv, bin_t * bins) { int status = -1; sp_read_ret_t *ret = 0; sp_read_arg_t args; DEBUG_FUNCTION; // Fill request args.cid = cid; args.sid = sid; args.layout = layout; args.spare = spare; memcpy(args.dist_set, dist_set, sizeof (sid_t) * ROZOFS_SAFE_MAX); memcpy(args.fid, fid, sizeof (fid_t)); args.bid = bid; args.nb_proj = nb_proj; if (!(clt->rpcclt.client) || !(ret = sp_read_1(&args, clt->rpcclt.client))) { clt->status = 0; warning("sclient_read_rbs failed: storage read failed " "(no response from storage server: %s)", clt->host); errno = EPROTO; goto out; } if (ret->status != 0) { errno = ret->sp_read_ret_t_u.error; if (errno == ENOENT) { // Receive a response but the file // is not on storage // This is possible when it's just be removed *nb_proj_recv = 0; status = 0; goto out; } else { severe("sclient_read_rbs failed (error from %s): (%s)", clt->host, strerror(errno)); goto out; } } // XXX ret->sp_read_ret_t_u.bins.bins_len is coherent // XXX could we avoid memcpy ?? memcpy(bins, ret->sp_read_ret_t_u.rsp.bins.bins_val, ret->sp_read_ret_t_u.rsp.bins.bins_len); *nb_proj_recv = ret->sp_read_ret_t_u.rsp.bins.bins_len / ((rozofs_get_max_psize(layout) * sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t)); status = 0; out: if (ret) xdr_free((xdrproc_t) xdr_sp_read_ret_t, (char *) ret); return status; }
/*
** Service routine of the sp_write request (non blocking flavour).
**
** The projections are taken directly from the receive buffer of the request
** and handed to storage_write(). The reply is always built in the receive
** buffer, which becomes the transmit buffer of the context.
**
** @param pt        : decoded arguments of the request (sp_write_arg_t)
** @param req_ctx_p : RPC context of the request; released before returning
*/
void sp_write_1_svc_nb(void * pt, rozorpc_srv_ctx_t *req_ctx_p) {
    sp_write_arg_t * args = (sp_write_arg_t *) pt;
    static sp_write_ret_t ret;
    storage_t *storage = 0;
    // Variable to be used in a later version.
    uint8_t version = 0;
    char *projections;

    /*
    ** locate the bins: they are still in the received buffer
    */
    int bins_offset = storage_get_position_of_first_byte2write_from_write_req();
    projections = (char*) ruc_buf_getPayload(req_ctx_p->recv_buf);
    projections += bins_offset;

    DEBUG_FUNCTION;

    START_PROFILING_IO(write, args->nb_proj * rozofs_get_max_psize(args->layout)
            * sizeof (bin_t));

    ret.status = SP_FAILURE;

    // Get the storage for the couple (cid;sid)
    storage = storaged_lookup(args->cid, args->sid);
    if (storage == 0) {
        ret.sp_write_ret_t_u.error = errno;
    }
    // Write projections
    else if (storage_write(storage, args->layout, (sid_t *) args->dist_set,
            args->spare, (unsigned char *) args->fid, args->bid,
            args->nb_proj, version, &ret.sp_write_ret_t_u.file_size,
            (bin_t *) projections) <= 0) {
        ret.sp_write_ret_t_u.error = errno;
    }
    else {
        ret.status = SP_SUCCESS;
    }

    /*
    ** the receive buffer is re-used to send the reply
    */
    req_ctx_p->xmitBuf = req_ctx_p->recv_buf;
    req_ctx_p->recv_buf = NULL;
    rozorpc_srv_forward_reply(req_ctx_p, (char*) &ret);

    /*
    ** release the context
    */
    rozorpc_srv_release_context(req_ctx_p);

    STOP_PROFILING(write);
    return;
}
int sclient_write(sclient_t * clt, cid_t cid, sid_t sid, uint8_t layout, uint8_t spare, sid_t dist_set[ROZOFS_SAFE_MAX], fid_t fid, bid_t bid, uint32_t nb_proj, const bin_t * bins) { int status = -1; sp_write_ret_t *ret = 0; sp_write_arg_t args; DEBUG_FUNCTION; // Fill request args.cid = cid; args.sid = sid; args.layout = layout; args.spare = spare; memcpy(args.dist_set, dist_set, sizeof (sid_t) * ROZOFS_SAFE_MAX); memcpy(args.fid, fid, sizeof (uuid_t)); args.bid = bid; args.nb_proj = nb_proj; args.bins.bins_len = nb_proj * (rozofs_get_max_psize(layout) * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t)); args.bins.bins_val = (char *) bins; if (!(clt->rpcclt.client) || !(ret = sp_write_1(&args, clt->rpcclt.client))) { clt->status = 0; warning("sclient_write failed: no response from storage server" " (%s, %u, %u)", clt->host, clt->port, sid); errno = EPROTO; goto out; } if (ret->status != 0) { severe("sclient_write failed: storage write response failure (%s)", strerror(errno)); errno = ret->sp_write_ret_t_u.error; goto out; } status = 0; out: if (ret) xdr_free((xdrproc_t) xdr_sp_write_ret_t, (char *) ret); return status; }
/**
*
 Refresh the per-block header table of a projection context from the bins
 that have just been received for that projection.

 This is needed because a read can return fewer blocks than requested:
 without it the headers kept in memory for the missing blocks would be stale.

 A timestamp of 0 is found in two situations, which are told apart with the
 effective length stored in the table:
   - hole in the file (block reserved but never written): the block was
     returned with a timestamp of 0, its effective length is set to ROZOFS_BSIZE
   - end of file: the block was not returned at all, its effective length
     is set to 0

 @param prj_ctx_p : pointer to the projection context
 @param layout : layout associated with the file
 @param number_of_blocks_returned : number of blocks present in the projection
 @param number_of_blocks_requested : number of blocks that were requested
 @param raw_file_size : raw file size of the projection file, as reported by the storage

 @retval none
*/
void rozofs_storcli_transform_update_headers(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                             uint8_t  layout,
                                             uint32_t number_of_blocks_returned,
                                             uint32_t number_of_blocks_requested,
                                             uint64_t raw_file_size)
{
    /*
    ** distance, in bins, between two consecutive blocks of the projection
    */
    const size_t bins_per_block = rozofs_get_max_psize(layout)
                                + (sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t));
    uint32_t blk;

    prj_ctx_p->raw_file_size = raw_file_size;

    /*
    ** blocks that have been returned: copy the timestamp found in the block header
    */
    for (blk = 0; blk < number_of_blocks_returned; blk++)
    {
        rozofs_stor_bins_hdr_t *received_hdr_p =
            (rozofs_stor_bins_hdr_t*)(prj_ctx_p->bins + (bins_per_block * blk));

        prj_ctx_p->block_hdr_tab[blk].s.timestamp = received_hdr_p->s.timestamp;
        /*
        ** a null timestamp on a returned block denotes a hole: full block length
        */
        prj_ctx_p->block_hdr_tab[blk].s.effective_length =
            (received_hdr_p->s.timestamp == 0) ? ROZOFS_BSIZE
                                               : received_hdr_p->s.effective_length;
    }

    /*
    ** blocks that have not been returned: assume end of file
    */
    for (blk = number_of_blocks_returned; blk < number_of_blocks_requested; blk++)
    {
        prj_ctx_p->block_hdr_tab[blk].s.timestamp        = 0;
        prj_ctx_p->block_hdr_tab[blk].s.effective_length = 0;
    }
}
/*
**
  That function is called when all the projections are ready to be sent.

  It selects one storage (load balancing group) per projection, transforms the
  last block when "data" is provided, then sends one SP_TRUNCATE request per
  projection. When the selection or the sending fails, an error reply is sent
  for the request and the root context is released.

  @param working_ctx_p: pointer to the root context associated with the top level write request
  @param data : pointer to the data of the last block to truncate (NULL when
                there is no partial last block to transform)

*/
void rozofs_storcli_truncate_req_processing_exec(rozofs_storcli_ctx_t *working_ctx_p, char * data)
{
  storcli_truncate_arg_t *storcli_truncate_rq_p = (storcli_truncate_arg_t*)&working_ctx_p->storcli_truncate_arg;
  uint8_t layout = storcli_truncate_rq_p->layout;
  uint8_t rozofs_forward;
  uint8_t rozofs_safe;
  uint8_t projection_id;
  int     storage_idx;
  int     error = 0;
  rozofs_storcli_lbg_prj_assoc_t  *lbg_assoc_p = working_ctx_p->lbg_assoc_tb;
  rozofs_storcli_projection_ctx_t *prj_cxt_p   = working_ctx_p->prj_ctx;

  rozofs_forward = rozofs_get_rozofs_forward(layout);
  rozofs_safe    = rozofs_get_rozofs_safe(layout);

  /*
  ** set the current state of each load balancing group belonging to the rozofs_safe group
  */
  for (storage_idx = 0; storage_idx < rozofs_safe; storage_idx++)
  {
    /*
    ** Check the state of the load Balancing group
    */
    rozofs_storcli_lbg_prj_insert_lbg_state(lbg_assoc_p,
                                            storage_idx,
                                            NORTH_LBG_GET_STATE(lbg_assoc_p[storage_idx].lbg_id));
  }
  /*
  ** Now find out a selectable lbg_id for each projection
  */
  for (projection_id = 0; projection_id < rozofs_forward; projection_id++)
  {
    if (rozofs_storcli_select_storage_idx_for_write ( working_ctx_p,rozofs_forward, rozofs_safe,projection_id) < 0)
    {
       /*
       ** there is not enough valid storage !!
       */
       error = EIO;
       goto fail;
    }
  }
  /*
  ** Let's transform the data to write
  */
  working_ctx_p->truncate_bins_len = 0;
  if (data != NULL)
  {
    STORCLI_START_KPI(storcli_kpi_transform_forward);

    rozofs_storcli_transform_forward(working_ctx_p->prj_ctx,
                                     layout,
                                     0,
                                     1,
                                     working_ctx_p->timestamp,
                                     storcli_truncate_rq_p->last_seg,
                                     data);
    STORCLI_STOP_KPI(storcli_kpi_transform_forward,0);
    /*
    ** one projection of the last block: header + bins
    */
    working_ctx_p->truncate_bins_len = rozofs_get_max_psize(layout)*sizeof(bin_t) + sizeof(rozofs_stor_bins_hdr_t);
  }
  /*
  ** We have enough storage, so initiate the transaction towards the storage for each
  ** projection
  */
  for (projection_id = 0; projection_id < rozofs_forward; projection_id++)
  {
     sp_truncate_arg_no_bins_t *request;
     sp_truncate_arg_no_bins_t  truncate_prj_args;
     void  *xmit_buf;
     int ret;

     xmit_buf = prj_cxt_p[projection_id].prj_buf;
     if (xmit_buf == NULL)
     {
       /*
       ** fatal error since the resource control already took place
       */
       error = EIO;
       goto fail;
     }
     /*
     ** fill partially the common header
     */
retry:
     request   = &truncate_prj_args;
     request->cid = storcli_truncate_rq_p->cid;
     request->sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,prj_cxt_p[projection_id].stor_idx);
     request->layout        = layout;
     /*
     ** the projection is sent to a spare storage when its storage index is
     ** beyond the first rozofs_forward storages
     */
     if (prj_cxt_p[projection_id].stor_idx >= rozofs_forward) request->spare = 1;
     else request->spare = 0;
     memcpy(request->dist_set, storcli_truncate_rq_p->dist_set, ROZOFS_SAFE_MAX*sizeof (uint8_t));
     memcpy(request->fid, storcli_truncate_rq_p->fid, sizeof (sp_uuid_t));
     request->proj_id        = projection_id;
     request->bid            = storcli_truncate_rq_p->bid;
     request->last_seg       = storcli_truncate_rq_p->last_seg;
     request->last_timestamp = working_ctx_p->timestamp;

     request->len = working_ctx_p->truncate_bins_len;

     uint32_t  lbg_id = rozofs_storcli_lbg_prj_get_lbg(working_ctx_p->lbg_assoc_tb,prj_cxt_p[projection_id].stor_idx);
     STORCLI_START_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),truncate_prj,0);
     /*
     ** caution we might have a direct reply if there is a direct error at load balancing group while
     ** attempting to send the RPC message-> typically a disconnection of the TCP connection
     ** As a consequence the response fct 'rozofs_storcli_truncate_req_processing_cbk) can be called
     ** prior returning from rozofs_sorcli_send_rq_common')
     ** anticipate the status of the xmit state of the projection and lock the section to
     ** avoid a reply error before returning from rozofs_sorcli_send_rq_common()
     ** --> need to take care because the write context is released after the reply error sent to rozofsmount
     */
     working_ctx_p->write_ctx_lock = 1;
     prj_cxt_p[projection_id].prj_state = ROZOFS_PRJ_WR_IN_PRG;

     ret =  rozofs_sorcli_send_rq_common(lbg_id,ROZOFS_TMR_GET(TMR_STORAGE_PROGRAM),STORAGE_PROGRAM,STORAGE_VERSION,SP_TRUNCATE,
                                         (xdrproc_t) xdr_sp_truncate_arg_no_bins_t, (caddr_t) request,
                                          xmit_buf,
                                          working_ctx_p->read_seqnum,
                                          (uint32_t) projection_id,
                                          working_ctx_p->truncate_bins_len,
                                          rozofs_storcli_truncate_req_processing_cbk,
                                         (void*)working_ctx_p);
     working_ctx_p->write_ctx_lock = 0;
     if (ret < 0)
     {
       /*
       ** the communication with the storage seems to be wrong (more than TCP connection temporary down
       ** attempt to select a new storage
       **
       */
       if (rozofs_storcli_select_storage_idx_for_write (working_ctx_p,rozofs_forward,rozofs_safe,projection_id) < 0)
       {
         /*
         ** Out of storage !!-> too many storages are down.
         ** The error code must be set here: it is still 0 at that point and
         ** would otherwise be reported as such in the error reply.
         */
         error = EIO;
         goto fail;
       }
       /*
       ** retry for that projection with a new storage index: WARNING: we assume that xmit buffer has not been released !!!
       */
       goto retry;
     }
     else
     {
       /*
       ** check if the state has not been changed: -> it might be possible to get a direct error
       */
       if (prj_cxt_p[projection_id].prj_state == ROZOFS_PRJ_WR_ERROR)
       {
          error = prj_cxt_p[projection_id].errcode;
          goto fail;
       }
     }
  }

  return;

fail:
  /*
  ** common error exit: no usable storage, missing xmit buffer or direct
  ** error while sending. Reply with the error to the requester.
  */
  rozofs_storcli_write_reply_error(working_ctx_p,error);
  /*
  ** release the root transaction context
  */
  STORCLI_STOP_NORTH_PROF(working_ctx_p,truncate,0);
  rozofs_storcli_release_context(working_ctx_p);
  return;
}
/**
*
 Forward transform of a buffer starting at "data". That buffer MUST be
 ROZOFS_BSIZE aligned.

 For each ROZOFS_BSIZE block of the input, one projection (block header
 followed by the bins) is produced in the bins buffer of each of the first
 rozofs_forward projection contexts. A block detected as empty by
 rozofs_data_block_check_empty() is not transformed: only a zeroed header
 (projection id, timestamp and effective length) is written for it.

 Notice that the first_block_idx offset applies to the output transform
 buffers only, not to the input buffer pointed by "data".

 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout of the file
 * @param first_block_idx: index of the first block in the output buffers
 * @param number_of_blocks: number of blocks to transform
 * @param timestamp: timestamp stored in the header of each non empty block
 * @param last_block_size: effective length of the last block
 * @param *data: pointer to the source data that must be transformed
 *
 * @return: always 0
 */
int rozofs_storcli_transform_forward(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     uint64_t timestamp,
                                     uint16_t last_block_size,
                                     char *data)
{
    projection_t fwd_prj[ROZOFS_SAFE_MAX];   /* projections used to transform one block */
    uint16_t prj = 0;
    uint32_t blk = 0;
    uint8_t  forward = rozofs_get_rozofs_forward(layout);
    uint8_t  inverse = rozofs_get_rozofs_inverse(layout);
    int      block_is_empty = 0;

    /*
    ** angles and size of each projection only depend on the layout
    */
    for (prj = 0; prj < forward; prj++)
    {
        fwd_prj[prj].angle.p = rozofs_get_angles_p(layout,prj);
        fwd_prj[prj].angle.q = rozofs_get_angles_q(layout,prj);
        fwd_prj[prj].size    = rozofs_get_psizes(layout,prj);
    }

    for (blk = 0; blk < number_of_blocks; blk++)
    {
        char *src_block = data + (blk * ROZOFS_BSIZE);

        block_is_empty = rozofs_data_block_check_empty(src_block, ROZOFS_BSIZE);

        for (prj = 0; prj < forward; prj++)
        {
            rozofs_stor_bins_hdr_t *hdr_p;

            /*
            ** memory area of the projection for that block: header first
            */
            fwd_prj[prj].bins = prj_ctx_p[prj].bins +
                ((rozofs_get_max_psize(layout)+(sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t)))* (first_block_idx+blk));
            hdr_p = (rozofs_stor_bins_hdr_t*)fwd_prj[prj].bins;

            if (block_is_empty)
            {
                /*
                ** empty user data block: a cleared header is enough
                */
                hdr_p->s.projection_id    = 0;
                hdr_p->s.timestamp        = 0;
                hdr_p->s.effective_length = 0;
                continue;
            }

            hdr_p->s.projection_id = prj;
            hdr_p->s.timestamp     = timestamp;
            /*
            ** every block is ROZOFS_BSIZE long, but the last one
            */
            hdr_p->s.effective_length =
                (blk == (number_of_blocks-1)) ? last_block_size : ROZOFS_BSIZE;

            /*
            ** the bins start right after the header
            */
            fwd_prj[prj].bins += sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t);
        }

        if (block_is_empty != 0)
        {
            /*
            ** nothing to transform
            */
            continue;
        }
        /*
        ** Apply the erasure code transform for the block blk+first_block_idx
        */
        transform_forward((pxl_t *) src_block,
                          inverse,
                          ROZOFS_BSIZE / inverse / sizeof (pxl_t),
                          forward, fwd_prj);
    }
    return 0;
}
/**
*
 Inverse transform: rebuild user data blocks in "data" from the projections
 that have been read. The "data" buffer MUST be ROZOFS_BSIZE aligned.

 Blocks whose state is already ROZOFS_BLK_TRANSFORM_DONE are skipped. For the
 other ones the decision is taken from the timestamp and effective length
 found in block_ctx_p (this function only reads them):
   - timestamp 0 and effective length ROZOFS_BSIZE: hole in the file, the
     block is filled with 0
   - timestamp 0 and effective length 0: end of file, the processing stops
   - otherwise: the block is rebuilt from rozofs_inverse projections and the
     part beyond the effective length is cleared

 Notice that the first_block_idx offset applies to the "data" buffer only,
 not to the bins buffers of the projections.

 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout of the file
 * @param first_block_idx: index of the first block in the "data" buffer
 * @param number_of_blocks: number of blocks to process
 * @param *block_ctx_p: per block context (state, timestamp, effective length)
 * @param *data: pointer to the buffer where the rebuilt data are stored
 * @param *number_of_blocks_p: returned number of blocks: index of the block
          on which the end of file has been found, number_of_blocks otherwise
 * @param *rozofs_storcli_prj_idx_table: for each block, ROZOFS_SAFE_MAX entries
          giving the index in prj_ctx_p of the projections to use
 *
 * @return: always 0
 */
int rozofs_storcli_transform_inverse(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     rozofs_storcli_inverse_block_t *block_ctx_p,
                                     char *data,
                                     uint32_t *number_of_blocks_p,
                                     uint8_t  *rozofs_storcli_prj_idx_table)
{
    projection_t rozofs_inv_projections[ROZOFS_SAFE_MAX];
    projection_t *projections = rozofs_inv_projections;
    uint32_t block_idx;
    uint16_t projection_id = 0;
    int prj_ctx_idx;
    int prj_count;
    uint8_t rozofs_inverse = rozofs_get_rozofs_inverse(layout);

    *number_of_blocks_p = 0;

    for (block_idx = 0; block_idx < number_of_blocks; block_idx++)
    {
        char *block_data_p = data + (ROZOFS_BSIZE * (first_block_idx + block_idx));

        if (block_ctx_p[block_idx].state == ROZOFS_BLK_TRANSFORM_DONE)
        {
            /*
            ** transformation has already been done for that block of ROZOFS_BSIZE size
            ** check the next one
            */
            continue;
        }
        /*
        ** Check the case of the file that has no data (there is a hole in the file), this is indicated by
        ** reporting a timestamp of 0
        */
        if ((block_ctx_p[block_idx].timestamp == 0) && (block_ctx_p[block_idx].effective_length == ROZOFS_BSIZE))
        {
            /*
            ** clear the memory
            */
            ROZOFS_STORCLI_STATS(ROZOFS_STORCLI_EMPTY_READ);
            memset(block_data_p,0,ROZOFS_BSIZE);
            block_ctx_p[block_idx].state = ROZOFS_BLK_TRANSFORM_DONE;
            continue;
        }
        if ((block_ctx_p[block_idx].timestamp == 0) && (block_ctx_p[block_idx].effective_length == 0))
        {
            /*
            ** we have reached end of file: block_idx blocks precede it
            */
            block_ctx_p[block_idx].state = ROZOFS_BLK_TRANSFORM_DONE;
            *number_of_blocks_p = block_idx;
            return 0;
        }

        /*
        ** Here we have to take care, since the index of the projection_id use to address
        ** prj_ctx_p is NOT the real projection_id. The projection ID is found in the header of
        ** each bins, so for a set of projections pointed by bins, we might have a different
        ** projection id in the header of the projections contains in the bins array that has
        ** been read!!
        */
        for (prj_count = 0; prj_count < rozofs_inverse; prj_count++)
        {
            /*
            ** Get the pointer to the beginning of the projection and extract the projection Id
            */
            prj_ctx_idx = rozofs_storcli_prj_idx_table[ROZOFS_SAFE_MAX*block_idx+prj_count];

            rozofs_stor_bins_hdr_t *rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t*)(prj_ctx_p[prj_ctx_idx].bins
                    +((rozofs_get_max_psize(layout)+(sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t))) * block_idx));

            /*
            ** Extract the projection_id from the header
            ** and Fill the table of projections for the block block_idx
            */
            projection_id = rozofs_bins_hdr_p->s.projection_id;

            projections[prj_count].angle.p = rozofs_get_angles_p(layout,projection_id);
            projections[prj_count].angle.q = rozofs_get_angles_q(layout,projection_id);
            projections[prj_count].size    = rozofs_get_psizes(layout,projection_id);
            /*
            ** the bins start right after the header
            */
            projections[prj_count].bins    = (bin_t*)(rozofs_bins_hdr_p+1);
        }

        // Inverse data for the block (first_block_idx + block_idx)
        transform_inverse_inline((pxl_t *) block_data_p,
                                 rozofs_inverse,
                                 ROZOFS_BSIZE / rozofs_inverse / sizeof (pxl_t),
                                 rozofs_inverse, projections);
        /*
        ** indicate that transform has been done for the projection
        */
        block_ctx_p[block_idx].state = ROZOFS_BLK_TRANSFORM_DONE;
        /*
        ** check the case of a block that is not full: need to zero's that part
        */
        if (block_ctx_p[block_idx].effective_length < ROZOFS_BSIZE)
        {
            /*
            ** clear the memory
            */
            char *raz_p = block_data_p + block_ctx_p[block_idx].effective_length;
            memset( raz_p,0,(ROZOFS_BSIZE-block_ctx_p[block_idx].effective_length) );
        }
    }
    /*
    ** the inverse transform is finished for all the blocks
    */
    *number_of_blocks_p = number_of_blocks;
    return 0;
}
/*
** Service routine of the sp_read request (non blocking flavour).
**
** A transmit buffer is allocated from storage_xmit_buffer_pool_p and the
** projections are read by storage_read() directly in its payload. When no
** buffer is available, the receive buffer is used to send the error reply.
**
** @param pt        : decoded arguments of the request (sp_read_arg_t)
** @param req_ctx_p : RPC context of the request; released before returning
*/
void sp_read_1_svc_nb(void * pt, rozorpc_srv_ctx_t *req_ctx_p) {
    sp_read_arg_t * args = (sp_read_arg_t *) pt;
    static sp_read_ret_t ret;
    storage_t *st = 0;
    int position;
    uint8_t *pbuf;
    /*
    ** storage_read() returns the length through a size_t pointer. A local of
    ** the right type is used instead of casting the address of bins_len, so
    ** that the store always has the size of the destination.
    ** NOTE(review): bins_len is presumably the u_int generated by rpcgen,
    ** as the original cast to (size_t *) suggests -- confirm.
    */
    size_t len_read = 0;

    START_PROFILING_IO(read, args->nb_proj * rozofs_get_max_psize(args->layout)
            * sizeof (bin_t));

    ret.status = SP_FAILURE;
    /*
    ** allocate a buffer for the response
    */
    req_ctx_p->xmitBuf = ruc_buf_getBuffer(storage_xmit_buffer_pool_p);
    if (req_ctx_p->xmitBuf == NULL)
    {
        severe("Out of memory STORAGE_NORTH_LARGE_POOL");
        ret.sp_read_ret_t_u.error = ENOMEM;
        /*
        ** re-use the receive buffer to send the error reply
        */
        req_ctx_p->xmitBuf = req_ctx_p->recv_buf;
        req_ctx_p->recv_buf = NULL;
        goto error;
    }

    // Get the storage for the couple (cid;sid)
    if ((st = storaged_lookup(args->cid, args->sid)) == 0) {
        ret.sp_read_ret_t_u.error = errno;
        goto error;
    }
    /*
    ** set the pointer to the bins
    */
    position = storage_get_position_of_first_byte2write_from_read_req();
    pbuf = (uint8_t*)ruc_buf_getPayload(req_ctx_p->xmitBuf);
    /*
    ** clear the length of the bins and set the pointer where data must be returned
    */
    ret.sp_read_ret_t_u.rsp.bins.bins_val = (char *)(pbuf+position);
    ret.sp_read_ret_t_u.rsp.bins.bins_len = 0;
#if 0 // for future usage with distributed cache
    /*
    ** clear the optimization array
    */
    ret.sp_read_ret_t_u.rsp.optim.optim_val = (char*)sp_optim;
    ret.sp_read_ret_t_u.rsp.optim.optim_len = 0;
#endif
    // Read projections
    if (storage_read(st, args->layout, (sid_t *) args->dist_set, args->spare,
            (unsigned char *) args->fid, args->bid, args->nb_proj,
            (bin_t *) ret.sp_read_ret_t_u.rsp.bins.bins_val,
            &len_read,
            &ret.sp_read_ret_t_u.rsp.file_size) != 0) {
        ret.sp_read_ret_t_u.error = errno;
        goto error;
    }
    ret.sp_read_ret_t_u.rsp.bins.bins_len = len_read;

    ret.status = SP_SUCCESS;
    storaged_srv_forward_read_success(req_ctx_p,&ret);
    /*
    ** check the case of the readahead
    */
    storage_check_readahead();
    goto out;

error:
    rozorpc_srv_forward_reply(req_ctx_p,(char*)&ret);
    /*
    ** release the context
    */
out:
    rozorpc_srv_release_context(req_ctx_p);
    STOP_PROFILING(read);
    return;
}
int storage_truncate(storage_t * st, uint8_t layout, sid_t * dist_set, uint8_t spare, fid_t fid, tid_t proj_id,bid_t bid,uint8_t version,uint16_t last_seg,uint64_t last_timestamp) { int status = -1; char path[FILENAME_MAX]; int fd = -1; off_t bins_file_offset = 0; uint16_t rozofs_max_psize = 0; uint8_t write_file_hdr = 0; bid_t bid_truncate; size_t nb_write = 0; size_t length_to_write = 0; rozofs_stor_bins_hdr_t bins_hdr; // Build the full path of directory that contains the bins file storage_map_distribution(st, layout, dist_set, spare, path); // Check that this directory already exists, otherwise it will be create if (access(path, F_OK) == -1) { if (errno == ENOENT) { // If the directory doesn't exist, create it if (mkdir(path, ROZOFS_ST_DIR_MODE) != 0) { if (errno != EEXIST) { // The directory is not created !!! severe("mkdir failed (%s) : %s", path, strerror(errno)); goto out; } // Well someone else has created the directory in the meantime } } else { goto out; } } // Build the path of bins file storage_map_projection(fid, path); // Check that this file already exists if (access(path, F_OK) == -1) write_file_hdr = 1; // We must write the header // Open bins file fd = open(path, ROZOFS_ST_BINS_FILE_FLAG, ROZOFS_ST_BINS_FILE_MODE); if (fd < 0) { severe("open failed (%s) : %s", path, strerror(errno)); goto out; } // If we write the bins file for the first time, we must write the header if (write_file_hdr) { // Prepare file header rozofs_stor_bins_file_hdr_t file_hdr; memcpy(file_hdr.dist_set_current, dist_set, ROZOFS_SAFE_MAX * sizeof (sid_t)); memset(file_hdr.dist_set_next, 0, ROZOFS_SAFE_MAX * sizeof (sid_t)); file_hdr.layout = layout; file_hdr.version = version; // Write the header for this bins file nb_write = pwrite(fd, &file_hdr, sizeof (file_hdr), 0); if (nb_write != sizeof (file_hdr)) { severe("pwrite failed: %s", strerror(errno)); goto out; } } // Compute the offset from the truncate rozofs_max_psize = rozofs_get_max_psize(layout); bid_truncate = bid; if 
(last_seg!= 0) bid_truncate+=1; bins_file_offset = ROZOFS_ST_BINS_FILE_HDR_SIZE + (bid_truncate) * (rozofs_max_psize * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t)); status = ftruncate(fd, bins_file_offset); if (status < 0) goto out; /* ** Check the case of the last segment */ if (last_seg!= 0) { bins_hdr.s.timestamp = last_timestamp; bins_hdr.s.effective_length = last_seg; bins_hdr.s.projection_id = proj_id; bins_hdr.s.version = version; length_to_write = sizeof(rozofs_stor_bins_hdr_t); bins_file_offset = ROZOFS_ST_BINS_FILE_HDR_SIZE + (bid) * (rozofs_max_psize * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t)); nb_write = pwrite(fd, &bins_hdr, length_to_write, bins_file_offset); if (nb_write != length_to_write) { severe("pwrite failed on last segment: %s", strerror(errno)); goto out; } } out: if (fd != -1) close(fd); return status; }
int storage_read(storage_t * st, uint8_t layout, sid_t * dist_set, uint8_t spare, fid_t fid, bid_t bid, uint32_t nb_proj, bin_t * bins, size_t * len_read, uint64_t *file_size) { int status = -1; char path[FILENAME_MAX]; int fd = -1; size_t nb_read = 0; size_t length_to_read = 0; off_t bins_file_offset = 0; uint16_t rozofs_max_psize = 0; struct stat sb; // Build the full path of directory that contains the bins file storage_map_distribution(st, layout, dist_set, spare, path); // Build the path of bins file storage_map_projection(fid, path); // Open bins file fd = open(path, ROZOFS_ST_BINS_FILE_FLAG, ROZOFS_ST_BINS_FILE_MODE); if (fd < 0) { DEBUG("open failed (%s) : %s", path, strerror(errno)); goto out; } // Compute the offset and length to read rozofs_max_psize = rozofs_get_max_psize(layout); bins_file_offset = ROZOFS_ST_BINS_FILE_HDR_SIZE + bid * ((off_t) (rozofs_max_psize * sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t)); length_to_read = nb_proj * (rozofs_max_psize * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t)); // Read nb_proj * (projection + header) nb_read = pread(fd, bins, length_to_read, bins_file_offset); // Check error if (nb_read == -1) { severe("pread failed: %s", strerror(errno)); goto out; } // Check the length read if ((nb_read % (rozofs_max_psize * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t))) != 0) { char fid_str[37]; uuid_unparse(fid, fid_str); severe("storage_read failed (FID: %s): read inconsistent length", fid_str); errno = EIO; goto out; } // Update the length read *len_read = nb_read; // Stat file for return the size of bins file after the read operation if (fstat(fd, &sb) == -1) { severe("fstat failed: %s", strerror(errno)); goto out; } *file_size = sb.st_size; // Read is successful status = 0; out: if (fd != -1) close(fd); return status; }
int storage_write(storage_t * st, uint8_t layout, sid_t * dist_set, uint8_t spare, fid_t fid, bid_t bid, uint32_t nb_proj, uint8_t version, uint64_t *file_size, const bin_t * bins) { int status = -1; char path[FILENAME_MAX]; int fd = -1; size_t nb_write = 0; size_t length_to_write = 0; off_t bins_file_offset = 0; uint16_t rozofs_max_psize = 0; uint8_t write_file_hdr = 0; struct stat sb; rozofs_max_psize = rozofs_get_max_psize(layout); // Build the full path of directory that contains the bins file storage_map_distribution(st, layout, dist_set, spare, path); // Check that this directory already exists, otherwise it will be create if (access(path, F_OK) == -1) { if (errno == ENOENT) { // If the directory doesn't exist, create it if (mkdir(path, ROZOFS_ST_DIR_MODE) != 0) { if (errno != EEXIST) { // The directory is not created !!! severe("mkdir failed (%s) : %s", path, strerror(errno)); goto out; } // Well someone else has created the directory in the meantime } } else { goto out; } } // Build the path of bins file storage_map_projection(fid, path); // Check that this file already exists if (access(path, F_OK) == -1) write_file_hdr = 1; // We must write the header // Open bins file fd = open(path, ROZOFS_ST_BINS_FILE_FLAG, ROZOFS_ST_BINS_FILE_MODE); if (fd < 0) { severe("open failed (%s) : %s", path, strerror(errno)); goto out; } // If we write the bins file for the first time, we must write the header if (write_file_hdr) { // Prepare file header rozofs_stor_bins_file_hdr_t file_hdr; memcpy(file_hdr.dist_set_current, dist_set, ROZOFS_SAFE_MAX * sizeof (sid_t)); memset(file_hdr.dist_set_next, 0, ROZOFS_SAFE_MAX * sizeof (sid_t)); file_hdr.layout = layout; file_hdr.version = version; // Write the header for this bins file nb_write = pwrite(fd, &file_hdr, sizeof (file_hdr), 0); if (nb_write != sizeof (file_hdr)) { severe("pwrite failed: %s", strerror(errno)); goto out; } } // Compute the offset and length to write bins_file_offset = ROZOFS_ST_BINS_FILE_HDR_SIZE + bid * 
(rozofs_max_psize * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t)); length_to_write = nb_proj * (rozofs_max_psize * sizeof (bin_t) + sizeof (rozofs_stor_bins_hdr_t)); // Write nb_proj * (projection + header) nb_write = pwrite(fd, bins, length_to_write, bins_file_offset); if (nb_write != length_to_write) { severe("pwrite failed: %s", strerror(errno)); goto out; } /** * insert in the fid cache the written section */ // storage_build_ts_table_from_prj_header((char*)bins,nb_proj,rozofs_max_psize,buf_ts_storage_write); // storio_cache_insert(fid,bid,nb_proj,buf_ts_storage_write,0); // Stat file for return the size of bins file after the write operation if (fstat(fd, &sb) == -1) { severe("fstat failed: %s", strerror(errno)); goto out; } *file_size = sb.st_size; // Write is successful status = length_to_write; out: if (fd != -1) close(fd); return status; }
int read_data_file() { int status = -1; uint64_t size = 0; int block_idx = 0; int idx =0; int count; rozofs_stor_bins_hdr_t * rozofs_bins_hdr_p; rozofs_stor_bins_footer_t * rozofs_bins_foot_p; char * loc_read_bins_p = NULL; int forward = rozofs_get_rozofs_forward(layout); // int inverse = rozofs_get_rozofs_inverse(layout); uint16_t disk_block_size; uint16_t max_block_size = (rozofs_get_max_psize(layout,bsize)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); char * p; int empty,valid; int prj_id; int nb_ts; uint64_t ts[32]; int ts_count[32]; // Allocate memory for reading loc_read_bins_p = xmalloc(max_block_size); for (idx=0; idx < nb_file; idx++) { if (strcmp(filename[idx],"NULL") == 0) { fd[idx] = -1; } else { fd[idx] = open(filename[idx],O_RDWR); if (fd < 0) { severe("Can not open file %s %s",filename[idx],strerror(errno)); goto out; } } } printf (" ______ __________ "); for (idx=0; idx < nb_file; idx++) printf (" __________________ ______ ____ "); printf ("\n"); printf("| %4s | %8s |","Blk","Offset"); for (idx=0; idx < nb_file; idx++) printf("| %16s | %4s | %2s |", "Time stamp", "lgth", "id"); printf ("\n"); printf ("|______|__________|"); for (idx=0; idx < nb_file; idx++) printf ("|__________________|______|____|"); printf ("\n"); if (block_number == -1) block_idx = 0; else block_idx = block_number; count = 1; empty = 0; while ( count ) { valid = 0; count = 0; nb_ts = 0; p = &LINE[0]; p += sprintf(p,"| %4d | %8d ",block_idx+firstBlock,(block_idx+firstBlock)*bbytes); for (idx=0; idx < nb_file; idx++) { if (fd[idx] == -1) { p += sprintf(p,"%32s"," "); continue; } if (idx >= forward) disk_block_size = rozofs_get_max_psize_in_msg(layout, bsize); else disk_block_size = rozofs_get_psizes_on_disk(layout,bsize,idx); size = pread(fd[idx],loc_read_bins_p,disk_block_size,block_idx*disk_block_size); if (size != disk_block_size) { p += sprintf(p,"|__________________|______|____|"); close(fd[idx]); fd[idx] = -1; } else { count++; 
rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t *)loc_read_bins_p; prj_id = rozofs_bins_hdr_p->s.projection_id; if (prj_id >= forward) { valid = 1; p += sprintf(p,"|| xxxxxxxxxxxxxxxx | xxxx | %2d ",prj_id); } else { disk_block_size = (rozofs_get_psizes(layout,bsize,prj_id)*sizeof (bin_t)); disk_block_size += sizeof (rozofs_stor_bins_hdr_t); rozofs_bins_foot_p = (rozofs_stor_bins_footer_t *) ((char*) rozofs_bins_hdr_p + disk_block_size); if (rozofs_bins_hdr_p->s.timestamp == 0) { p += sprintf(p,"|| %16d | .... | %2d ",0,prj_id); } else if (rozofs_bins_foot_p->timestamp != rozofs_bins_hdr_p->s.timestamp) { valid = 1; p += sprintf(p,"|--%16.16llu----------%2d-", (long long unsigned int)rozofs_bins_hdr_p->s.timestamp, prj_id); } else if (rozofs_bins_hdr_p->s.timestamp == 0) { p += sprintf(p,"|| %16d | .... | %2d ",0,prj_id); } else { valid = 1; p += sprintf(p,"|| %16llu | %4d | %2d ", (unsigned long long)rozofs_bins_hdr_p->s.timestamp, rozofs_bins_hdr_p->s.effective_length, rozofs_bins_hdr_p->s.projection_id); int tsidx; for (tsidx=0; tsidx< nb_ts; tsidx++) { if (ts[tsidx] == rozofs_bins_hdr_p->s.timestamp) { ts_count[tsidx]++; break; } } if (tsidx == nb_ts) { ts[tsidx] = rozofs_bins_hdr_p->s.timestamp; ts_count[tsidx] = 1; nb_ts++; } } } } } if (valid) { if (empty) { printf("... 
%d blocks...\n",empty); empty = 0; } int best=-1,tsidx; for (tsidx=0; tsidx< nb_ts; tsidx++) { if (ts_count[tsidx] > best) best = ts_count[tsidx]; } printf("%s%s\n",LINE, (best<forward)?"<<<<----":"|"); } else { empty++; } block_idx++; if (block_number!=-1) break; } printf ("|______|__________|\n"); if (block_number!=-1) { for (idx=0; idx < nb_file; idx++) { if (idx < forward) { disk_block_size = (rozofs_get_psizes(layout,bsize,idx)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); } else { disk_block_size = (rozofs_get_max_psize(layout,bsize)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); } size = pread(fd[idx],loc_read_bins_p,disk_block_size,block_number*disk_block_size); if (size != disk_block_size) { printf("Can not read block %d of %s\n", block_number, filename[idx]); continue; } FILE * f; char fname[128]; sprintf(fname,"block_%d_dist_%d.txt", block_number, idx); f = fopen(fname,"w"); if (f == NULL) { printf ("Can not create file %s",fname); continue; } printf("- %s\n",fname); fprintf(f,"%s Block %d size %d\n", filename[idx], block_number, disk_block_size); rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t *)loc_read_bins_p; fprintf(f,"Block header : TS %llu SZ %d PRJ %d CRC32 0x%x\n", (long long unsigned int)rozofs_bins_hdr_p->s.timestamp, rozofs_bins_hdr_p->s.effective_length, rozofs_bins_hdr_p->s.projection_id, rozofs_bins_hdr_p->s.filler); rozofs_bins_foot_p = (rozofs_stor_bins_footer_t *) (loc_read_bins_p + disk_block_size); rozofs_bins_foot_p--; fprintf(f,"Block footer : TS %llu %s\n", (long long unsigned int)rozofs_bins_foot_p->timestamp, (rozofs_bins_hdr_p->s.timestamp==rozofs_bins_foot_p->timestamp)?"":" !!!!!!"); hexdump(f,loc_read_bins_p, 0, disk_block_size); fclose(f); } } status = 0; out: // This spare file used to exist but is not needed any more for (idx=0; idx < nb_file; idx++) { if (fd[idx] != -1) close(fd[idx]); } if (loc_read_bins_p != NULL) { //free(loc_read_bins_p); 
loc_read_bins_p = NULL; } return status; }