/**
*
 Tell whether all the write repair requests have been answered: the
 rozofs_forward first projection contexts of the layout are scanned, looking
 for one that is still in the ROZOFS_PRJ_WR_IN_PRG state

 @param layout : layout association with the file
 @param prj_cxt_p: pointer to the projection context (working array)

 @retval 1 if no projection is in the ROZOFS_PRJ_WR_IN_PRG state
 @retval 0 if at least one projection is still in the ROZOFS_PRJ_WR_IN_PRG state
*/
static inline int rozofs_storcli_all_prj_write_repair_check(uint8_t layout,rozofs_storcli_projection_ctx_t *prj_cxt_p)
{
  /*
  ** number of projection contexts to look at for that layout
  */
  uint8_t nb_forward = rozofs_get_rozofs_forward(layout);
  int idx = 0;

  while (idx < nb_forward)
  {
    if (prj_cxt_p[idx].prj_state == ROZOFS_PRJ_WR_IN_PRG)
    {
      /*
      ** a request is still in progress for that projection
      */
      return 0;
    }
    idx++;
  }
  return 1;
}
/**
*
 Tell whether the truncate has been acknowledged for all the projections: the
 rozofs_forward first projection contexts of the layout must all be in the
 ROZOFS_PRJ_WR_DONE state

 @param layout : layout association with the file
 @param prj_cxt_p: pointer to the projection context (working array)
 @param *distribution: obsolete, not used by the function

 @retval 1 if rozofs_forward is not null and every projection is in the ROZOFS_PRJ_WR_DONE state
 @retval 0 otherwise
*/
static inline int rozofs_storcli_all_prj_truncate_check(uint8_t layout,rozofs_storcli_projection_ctx_t *prj_cxt_p,dist_t *distribution)
{
  uint8_t nb_forward = rozofs_get_rozofs_forward(layout);
  int idx;

  /*
  ** nothing can be considered as done when there is no projection at all
  */
  if (nb_forward == 0) return 0;

  for (idx = 0; idx < nb_forward; idx++)
  {
    /*
    ** one projection not yet done is enough to give the answer
    */
    if (prj_cxt_p[idx].prj_state != ROZOFS_PRJ_WR_DONE) return 0;
  }
  return 1;
}
/*
** Send the regenerated projections towards the storages for which a crc error
** has been detected.
**
** The success reply of the read request is sent first. Then one write repair
** request (SP_WRITE_REPAIR2 or SP_WRITE_REPAIR according to
** storcli_storage_supports_repair2) is submitted for each projection whose
** crc_err_bitmap is not empty. The context is released on exit unless at least
** one repair request is still in progress.

  @param working_ctx_p: pointer to the root context associated with the top level read request
*/
void rozofs_storcli_write_repair_req_processing(rozofs_storcli_ctx_t *working_ctx_p)
{
  storcli_read_arg_t *read_rq_p = (storcli_read_arg_t*)&working_ctx_p->storcli_read_arg;
  rozofs_storcli_projection_ctx_t *prj_tb = working_ctx_p->prj_ctx;
  uint8_t layout = read_rq_p->layout;
  uint8_t bsize = read_rq_p->bsize;
  int psize_in_msg = rozofs_get_max_psize_in_msg(layout,bsize);
  uint8_t nb_forward = rozofs_get_rozofs_forward(layout);
  sp_write_repair_arg_no_bins_t repair_args;
  sp_write_repair2_arg_no_bins_t repair2_args;
  uint8_t projection_id;
  int error = 0;

  /*
  ** check if the buffer is still valid: rozofsmount might have timed out and
  ** re-allocated the buffer located in shared memory for another transaction.
  ** The control only takes place when a shared memory is associated with the context
  */
  if (working_ctx_p->shared_mem_p != NULL)
  {
    uint32_t *owner_xid_p = (uint32_t*)working_ctx_p->shared_mem_p;
    if (*owner_xid_p != working_ctx_p->src_transaction_id)
    {
      /*
      ** the source has aborted the request
      */
      error = EPROTO;
    }
  }
  /*
  ** send back the response of the read request towards rozofsmount
  */
  rozofs_storcli_read_reply_success(working_ctx_p);
  /*
  ** allocate a new sequence number for the working context to avoid any confusion
  ** with a late response of the previous read request
  */
  working_ctx_p->read_seqnum = rozofs_storcli_allocate_read_seqnum();
  /*
  ** the requester has released the buffer: the repaired data might be wrong, so
  ** nothing is written on the storages
  */
  if (error) goto fail;

  for (projection_id = 0; projection_id < nb_forward; projection_id++)
  {
    rozofs_storcli_projection_ctx_t *prj = &prj_tb[projection_id];
    void *xmit_buf;
    uint32_t lbg_id;
    uint32_t opcode;
    xdrproc_t encode_fct;
    caddr_t rpc_args;
    int nb_blocks;
    int bins_len;
    int send_status;

    /*
    ** skip the projections for which no error has been detected
    */
    if (storcli_storage_supports_repair2)
    {
      if (ROZOFS_BITMAP64_TEST_ALL0(prj->crc_err_bitmap)) continue;
    }
    else if (prj->crc_err_bitmap[0] == 0)
    {
      continue;
    }

    xmit_buf = prj->prj_buf;
    if (xmit_buf == NULL)
    {
      /*
      ** fatal error since the ressource control already took place
      */
      error = EIO;
      goto fail;
    }
    /*
    ** fill the arguments of the request according to the version supported by the storage
    */
    if (storcli_storage_supports_repair2)
    {
      repair2_args.cid = read_rq_p->cid;
      repair2_args.sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,prj->stor_idx);
      repair2_args.layout = read_rq_p->layout;
      repair2_args.bsize = read_rq_p->bsize;
      /*
      ** the case of spare 1 must not occur because repair is done for the optimal distribution only
      */
      repair2_args.spare = (prj->stor_idx >= nb_forward) ? 1 : 0;
      memcpy(repair2_args.dist_set, read_rq_p->dist_set, ROZOFS_SAFE_MAX_STORCLI*sizeof (uint8_t));
      memcpy(repair2_args.fid, read_rq_p->fid, sizeof (sp_uuid_t));
      repair2_args.proj_id = rozofs_storcli_get_mojette_proj_id(read_rq_p->dist_set,repair2_args.sid,nb_forward);
      repair2_args.bid = read_rq_p->bid;
      repair2_args.bitmap[0] = prj->crc_err_bitmap[0];
      repair2_args.bitmap[1] = prj->crc_err_bitmap[1];
      repair2_args.bitmap[2] = prj->crc_err_bitmap[2];
      nb_blocks = ROZOFS_BITMAP64_NB_BIT1(repair2_args.bitmap);
      repair2_args.nb_proj = nb_blocks;
      /*
      ** the length of the bins part MUST be in bytes
      */
      bins_len = (psize_in_msg * nb_blocks);
      repair2_args.len = bins_len;

      opcode = SP_WRITE_REPAIR2;
      encode_fct = (xdrproc_t) xdr_sp_write_repair2_arg_no_bins_t;
      rpc_args = (caddr_t) &repair2_args;
    }
    else
    {
      repair_args.cid = read_rq_p->cid;
      repair_args.sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,prj->stor_idx);
      repair_args.layout = read_rq_p->layout;
      repair_args.bsize = read_rq_p->bsize;
      /*
      ** the case of spare 1 must not occur because repair is done for the optimal distribution only
      */
      repair_args.spare = (prj->stor_idx >= nb_forward) ? 1 : 0;
      memcpy(repair_args.dist_set, read_rq_p->dist_set, ROZOFS_SAFE_MAX_STORCLI*sizeof (uint8_t));
      memcpy(repair_args.fid, read_rq_p->fid, sizeof (sp_uuid_t));
      repair_args.proj_id = rozofs_storcli_get_mojette_proj_id(read_rq_p->dist_set,repair_args.sid,nb_forward);
      repair_args.bid = read_rq_p->bid;
      repair_args.bitmap = prj->crc_err_bitmap[0];
      nb_blocks = ROZOFS_BITMAP64_NB_BIT1_FUNC((uint8_t*)&repair_args.bitmap,8);
      repair_args.nb_proj = nb_blocks;
      /*
      ** the length of the bins part MUST be in bytes
      */
      bins_len = (psize_in_msg * nb_blocks);
      repair_args.len = bins_len;

      opcode = SP_WRITE_REPAIR;
      encode_fct = (xdrproc_t) xdr_sp_write_repair_arg_no_bins_t;
      rpc_args = (caddr_t) &repair_args;
    }

    lbg_id = rozofs_storcli_lbg_prj_get_lbg(working_ctx_p->lbg_assoc_tb,prj->stor_idx);
    STORCLI_START_NORTH_PROF(prj,repair_prj,bins_len);
    /*
    ** caution: the callback rozofs_storcli_write_repair_req_processing_cbk can be called
    ** before returning from rozofs_sorcli_send_rq_common() when there is a direct error at
    ** load balancing group level (typically a disconnection of the TCP connection).
    ** So the state of the projection is anticipated and the section is locked to avoid
    ** the release of the context before the return of rozofs_sorcli_send_rq_common()
    */
    working_ctx_p->write_ctx_lock = 1;
    prj->prj_state = ROZOFS_PRJ_WR_IN_PRG;

    send_status = rozofs_sorcli_send_rq_common(lbg_id,ROZOFS_TMR_GET(TMR_STORAGE_PROGRAM),STORAGE_PROGRAM,STORAGE_VERSION,opcode,
                                               encode_fct, rpc_args,
                                               xmit_buf,
                                               working_ctx_p->read_seqnum,
                                               (uint32_t) projection_id,
                                               bins_len,
                                               rozofs_storcli_write_repair_req_processing_cbk,
                                               (void*)working_ctx_p);
    working_ctx_p->write_ctx_lock = 0;

    if (send_status < 0)
    {
      /*
      ** there is no retry, just keep on with a potential other projection to repair
      */
      STORCLI_ERR_PROF(repair_prj_err);
      STORCLI_STOP_NORTH_PROF(prj,repair_prj,0);
      prj->prj_state = ROZOFS_PRJ_WR_ERROR;
      continue;
    }
    /*
    ** the request has been submitted, but the state might have been changed by a direct error
    */
    if (prj->prj_state == ROZOFS_PRJ_WR_ERROR)
    {
      /*
      ** that projection cannot be repaired, go on with the next one
      */
      STORCLI_STOP_NORTH_PROF(prj,repair_prj,0);
    }
  }
  /*
  ** when some write repair request is pending, the context is kept until the end
  ** of the repair (answer from the storage node)
  */
  if (rozofs_storcli_all_prj_write_repair_check(read_rq_p->layout, working_ctx_p->prj_ctx) == 0)
  {
    return;
  }

fail:
  /*
  ** release the root transaction context
  */
  STORCLI_STOP_NORTH_PROF(working_ctx_p,repair,0);
  rozofs_storcli_release_context(working_ctx_p);
  return;
}
/**
  Initial write repair request

  Here it is assumed that storcli is working with the context that has been
  allocated for the read transaction. For each forward projection of the layout,
  the buffer used for the read is kept (one is taken from the
  ROZOFS_STORCLI_SOUTH_LARGE_POOL pool when there is none), the projections in
  error are regenerated and the sending of the repaired projections is started.

  When no buffer can be found, the success reply of the read request is sent and
  the context is released without any repair.

  @param working_ctx_p: pointer to the working context of a read transaction

  @retval none
*/
void rozofs_storcli_repair_req_init(rozofs_storcli_ctx_t *working_ctx_p)
{
  storcli_read_arg_t *read_rq_p = (storcli_read_arg_t*)&working_ctx_p->storcli_read_arg;
  uint8_t nb_forward;
  int idx;

  STORCLI_START_NORTH_PROF(working_ctx_p,repair,0);
  /*
  ** the data to write are the data that have been decoded by the read
  */
  working_ctx_p->data_write_p = working_ctx_p->data_read_p;
  /*
  ** one large buffer is needed per projection that will be written on storage:
  ** the buffers that have been allocated for the read are kept
  */
  nb_forward = rozofs_get_rozofs_forward(read_rq_p->layout);
  for (idx = 0; idx < nb_forward; idx++)
  {
    rozofs_storcli_projection_ctx_t *prj = &working_ctx_p->prj_ctx[idx];
    uint8_t *payload;
    int position;

    prj->prj_state = ROZOFS_PRJ_WR_IDLE;
    if (prj->prj_buf == NULL)
    {
      prj->prj_buf = ruc_buf_getBuffer(ROZOFS_STORCLI_SOUTH_LARGE_POOL);
    }
    if (prj->prj_buf == NULL)
    {
      /*
      ** that situation MUST not occur since there the same number of receive buffer and working context!!
      */
      severe("out of large buffer");
      /*
      ** send back the response of the read request towards rozofsmount
      ** and release the root transaction context
      */
      rozofs_storcli_read_reply_success(working_ctx_p);
      STORCLI_STOP_NORTH_PROF(working_ctx_p,repair,0);
      rozofs_storcli_release_context(working_ctx_p);
      return;
    }
    /*
    ** set the pointer to the bins: the offset depends on the version of the
    ** repair request (compatibility between new clients and old storages)
    */
    position = storcli_storage_supports_repair2
             ? rozofs_storcli_repair2_get_position_of_first_byte2write()
             : rozofs_storcli_repair_get_position_of_first_byte2write();
    payload = (uint8_t*)ruc_buf_getPayload(prj->prj_buf);
    prj->bins = (bin_t*)(payload+position);
  }
  /*
  ** now regenerate the projections that were in error
  */
  rozofs_storcli_transform_forward_repair(working_ctx_p,
                                          read_rq_p->layout,
                                          read_rq_p->nb_proj,
                                          (char *)working_ctx_p->data_write_p);
  /*
  ** starts the sending of the repaired projections
  */
  rozofs_storcli_write_repair_req_processing(working_ctx_p);
}
/**
  Regenerate the projections for which a crc error has been detected.

  The buffer starting at "data" contains the decoded blocks. For each of the
  rozofs_forward first projection contexts that has a non empty crc_err_bitmap
  and that is not associated with a spare sid, the blocks flagged in the bitmap
  are transformed again. The regenerated projections are stored contiguously in
  the bins of the projection context, in the order of the flagged blocks.

 *
 * @param *working_ctx_p: storcli working context
 * @param layout: layout association with the file
 * @param number_of_blocks: number of blocks available in data
 * @param *data: pointer to the source data that must be transformed
 *
 * @return: none
 */
void rozofs_storcli_transform_forward_repair(rozofs_storcli_ctx_t *working_ctx_p, uint8_t layout, uint32_t number_of_blocks, char *data)
{
  projection_t projections[ROZOFS_SAFE_MAX_STORCLI];  /* table of projections used to transform data */
  storcli_read_arg_t *read_rq_p = (storcli_read_arg_t*)&working_ctx_p->storcli_read_arg;
  rozofs_storcli_projection_ctx_t *ctx_tb = &working_ctx_p->prj_ctx[0];
  uint8_t nb_forward = rozofs_get_rozofs_forward(layout);
  uint8_t nb_inverse = rozofs_get_rozofs_inverse(layout);
  uint8_t bsize = read_rq_p->bsize;
  uint32_t bbytes = ROZOFS_BSIZE_BYTES(bsize);
  int prj_size_in_msg = rozofs_get_max_psize_in_msg(layout,bsize);
  uint16_t prj;
  int ctx_idx;

  /*
  ** describe each projection of the layout
  */
  for (prj = 0; prj < nb_forward; prj++)
  {
    projections[prj].angle.p = rozofs_get_angles_p(layout,prj);
    projections[prj].angle.q = rozofs_get_angles_q(layout,prj);
    projections[prj].size = rozofs_get_128bits_psizes(layout,bsize,prj);
  }
  /*
  ** now go through the projection contexts to find out if there is something to regenerate
  */
  for (ctx_idx = 0; ctx_idx < nb_forward; ctx_idx++)
  {
    rozofs_storcli_projection_ctx_t *ctx = &ctx_tb[ctx_idx];
    int out_slot = 0;   /* index of the next regenerated projection in the bins of the context */
    int moj_prj_id;
    uint8_t sid;
    uint32_t blk;

    if (ROZOFS_BITMAP64_TEST_ALL0(ctx->crc_err_bitmap)) continue;
    /*
    ** Get the sid associated with the projection context and then the
    ** reference of the Mojette projection_id
    */
    sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb, ctx->stor_idx);
    moj_prj_id = rozofs_storcli_get_mojette_proj_id(read_rq_p->dist_set,sid,nb_forward);
    if (moj_prj_id < 0)
    {
      /*
      ** it is the reference of a spare sid, so go to the next projection context
      */
      continue;
    }
    for (blk = 0; blk < number_of_blocks; blk++)
    {
      rozofs_stor_bins_hdr_t *hdr_p;
      char *block_data;

      /*
      ** nothing to generate for the blocks without crc error
      */
      if (ROZOFS_BITMAP64_TEST0(blk,ctx->crc_err_bitmap)) continue;

      block_data = data + (blk * bbytes);
      /*
      ** the projection is regenerated in the next free slot of the context
      */
      projections[moj_prj_id].bins = ctx->bins + (prj_size_in_msg/sizeof(bin_t)* (0+out_slot));
      hdr_p = (rozofs_stor_bins_hdr_t*)projections[moj_prj_id].bins;

      if (rozofs_data_block_check_empty(block_data, bbytes))
      {
        /*
        ** the user data block is empty: no need to transform, the header is cleared
        */
        hdr_p->s.projection_id = 0;
        hdr_p->s.timestamp = 0;
        hdr_p->s.effective_length = 0;
        hdr_p->s.filler = 0;
        hdr_p->s.version = 0;
      }
      else
      {
        rozofs_stor_bins_footer_t *foot_p;
        /*
        ** fill the header of the projection from the context of the block
        */
        hdr_p->s.projection_id = moj_prj_id;
        hdr_p->s.timestamp = working_ctx_p->block_ctx_table[blk].timestamp;
        hdr_p->s.effective_length = working_ctx_p->block_ctx_table[blk].effective_length;
        hdr_p->s.filler = 0;
        hdr_p->s.version = 0;
        /*
        ** update the pointer to point out the first bins
        */
        projections[moj_prj_id].bins += sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t);
        /*
        ** Apply the erasure code transform for the block
        */
        transform128_forward_one_proj((pxl_t *) block_data,
                                      nb_inverse,
                                      bbytes / nb_inverse / sizeof (pxl_t),
                                      moj_prj_id, projections);
        /*
        ** add the footer at the end of the repaired projection
        */
        foot_p = (rozofs_stor_bins_footer_t*) (projections[moj_prj_id].bins
                                               + rozofs_get_psizes(layout,bsize,moj_prj_id));
        foot_p->timestamp = hdr_p->s.timestamp;
      }
      out_slot++;
    }
  }
}
/**
  Retry the sending of the truncate request of one projection

  The state of the load balancing groups is refreshed and a new storage is
  searched for the projection. When no other storage can be selected, the
  request is sent again towards the same storage provided that
  same_storage_retry_acceptable is asserted and that the retry counter of the
  projection has not reached ROZOFS_STORCLI_MAX_RETRY.

  When the retry is not possible, an error reply is sent to the requester and
  the context is released, unless write_ctx_lock is asserted (in that case the
  function just returns).

  @param working_ctx_p: pointer to the root context associated with the truncate request
  @param projection_id: index of the projection context to process
  @param same_storage_retry_acceptable: assert to 1 when a retry towards the same storage is acceptable

  @retval none
*/
void rozofs_storcli_truncate_projection_retry(rozofs_storcli_ctx_t *working_ctx_p,uint8_t projection_id,int same_storage_retry_acceptable)
{
    uint8_t rozofs_safe;
    uint8_t rozofs_forward;
    uint8_t layout;
    storcli_truncate_arg_t *storcli_truncate_rq_p = (storcli_truncate_arg_t*)&working_ctx_p->storcli_truncate_arg;
    int error=0;
    int storage_idx;
    rozofs_storcli_projection_ctx_t *prj_cxt_p = working_ctx_p->prj_ctx;
    rozofs_storcli_lbg_prj_assoc_t *lbg_assoc_p = working_ctx_p->lbg_assoc_tb;
    sp_truncate_arg_no_bins_t truncate_prj_args;
    sp_truncate_arg_no_bins_t *request = &truncate_prj_args;
    void *xmit_buf;
    int ret;

    layout = storcli_truncate_rq_p->layout;
    rozofs_safe = rozofs_get_rozofs_safe(layout);
    rozofs_forward = rozofs_get_rozofs_forward(layout);
    /*
    ** Now update the state of each load balancing group since it might be possible
    ** that some experience a state change
    */
    for (storage_idx = 0; storage_idx < rozofs_safe; storage_idx++)
    {
      rozofs_storcli_lbg_prj_insert_lbg_state(lbg_assoc_p,
                                              storage_idx,
                                              NORTH_LBG_GET_STATE(lbg_assoc_p[storage_idx].lbg_id));
    }
    /*
    ** attempt to select a new storage
    */
    if (rozofs_storcli_select_storage_idx_for_write (working_ctx_p,rozofs_forward,rozofs_safe,projection_id) < 0)
    {
      /*
      ** Cannot select a new storage: double check if the retry on the same storage is
      ** acceptable. When it is the case, check if the max retry has not been yet reached.
      ** Otherwise the request is rejected
      */
      if (same_storage_retry_acceptable == 0)
      {
        error = EIO;
        prj_cxt_p[projection_id].errcode = error;
        goto reply_error;
      }
      if (++prj_cxt_p[projection_id].retry_cpt >= ROZOFS_STORCLI_MAX_RETRY)
      {
        error = EIO;
        prj_cxt_p[projection_id].errcode = error;
        goto reply_error;
      }
    }
    /*
    ** either there is a new storage or the retry counter is not exhausted
    */
    xmit_buf = prj_cxt_p[projection_id].prj_buf;
    if (xmit_buf == NULL)
    {
      /*
      ** fatal error since the ressource control already took place
      */
      error = EFAULT;
      prj_cxt_p[projection_id].errcode = error;
      goto reply_error;
    }
    /*
    ** submit the request until a load balancing group accepts it or there is
    ** no more storage that can be selected
    */
    for (;;)
    {
      /*
      ** fill partially the common header
      */
      request->cid = storcli_truncate_rq_p->cid;
      request->sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,prj_cxt_p[projection_id].stor_idx);
      request->layout = layout;
      if (prj_cxt_p[projection_id].stor_idx >= rozofs_forward) request->spare = 1;
      else request->spare = 0;
      memcpy(request->dist_set, storcli_truncate_rq_p->dist_set, ROZOFS_SAFE_MAX*sizeof (uint8_t));
      memcpy(request->fid, storcli_truncate_rq_p->fid, sizeof (sp_uuid_t));
      request->proj_id = projection_id;
      request->bid = storcli_truncate_rq_p->bid;
      request->last_seg = storcli_truncate_rq_p->last_seg;
      request->last_timestamp = working_ctx_p->timestamp;
      /*
      ** Bins len has been saved in the working context
      */
      request->len = working_ctx_p->truncate_bins_len;

      uint32_t lbg_id = rozofs_storcli_lbg_prj_get_lbg(working_ctx_p->lbg_assoc_tb,prj_cxt_p[projection_id].stor_idx);
      /*
      ** the lock is incremented (and not set) since this procedure might be called after a
      ** synchronous transaction failure while the initial procedure that triggers the writing
      ** of the projections is still running: the lock might be already asserted.
      ** The xmit state of the projection is anticipated since the ERROR status might be set
      ** on a synchronous transaction failure: setting IN_PRG after the submission could
      ** overwrite the ERROR state.
      */
      working_ctx_p->write_ctx_lock++;
      prj_cxt_p[projection_id].prj_state = ROZOFS_PRJ_WR_IN_PRG;

      STORCLI_START_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),truncate_prj,0);
      ret = rozofs_sorcli_send_rq_common(lbg_id,ROZOFS_TMR_GET(TMR_STORAGE_PROGRAM),STORAGE_PROGRAM,STORAGE_VERSION,SP_TRUNCATE,
                                         (xdrproc_t) xdr_sp_truncate_arg_no_bins_t, (caddr_t) request,
                                         xmit_buf,
                                         working_ctx_p->read_seqnum,
                                         (uint32_t) projection_id,
                                         working_ctx_p->truncate_bins_len,
                                         rozofs_storcli_truncate_req_processing_cbk,
                                         (void*)working_ctx_p);
      working_ctx_p->write_ctx_lock--;
      if (ret >= 0) break;
      /*
      ** the communication with the storage seems to be wrong (more than TCP connection
      ** temporary down): attempt to select a new storage
      */
      STORCLI_STOP_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),truncate_prj,0);
      if (rozofs_storcli_select_storage_idx_for_write (working_ctx_p,rozofs_forward,rozofs_safe,projection_id) < 0)
      {
        /*
        ** Out of storage !!-> too many storages are down.
        ** The error code must be set here: otherwise the error reply would be
        ** built with a null error code.
        */
        error = EIO;
        prj_cxt_p[projection_id].errcode = error;
        goto reply_error;
      }
      /*
      ** retry for that projection with a new storage index:
      ** WARNING: we assume that xmit buffer has not been released !!!
      */
    }
    /*
    ** OK, the buffer has been accepted by the load balancing group, check if there was a direct failure for
    ** that transaction
    */
    if (prj_cxt_p[projection_id].prj_state == ROZOFS_PRJ_WR_ERROR)
    {
      error = prj_cxt_p[projection_id].errcode;
      goto reply_error;
    }
    return;
    /*
    **_____________________________________________
    **  Exception cases
    **_____________________________________________
    */
reply_error:
    /*
    ** caution -> reply error is only generated if the ctx_lock is 0
    */
    if (working_ctx_p->write_ctx_lock != 0) return;
    /*
    ** we fall in that case when we run out of storage or out of resource
    */
    rozofs_storcli_write_reply_error(working_ctx_p,error);
    /*
    ** release the root transaction context
    */
    STORCLI_STOP_NORTH_PROF(working_ctx_p,truncate,0);
    rozofs_storcli_release_context(working_ctx_p);
    return;
}
/**
  Initial truncate request

  A working context is allocated, the RPC truncate request found in recv_buf is
  decoded in it, the load balancing group/projection association table is built
  and one large buffer is allocated per forward projection. The request is then
  inserted in the serialization service (stc_rng_insert): it is processed right
  away unless there is a collision with a request already running for the same fid.

  On failure an error reply is sent with recv_buf and the context, when it has
  been allocated, is released.

  @param socket_ctx_idx: reference of the socket on which the request has been received (needed for the reply)
  @param recv_buf: buffer that contains the received RPC message
  @param rozofs_storcli_remote_rsp_cbk: callback for sending out the response

  @retval none
*/
void rozofs_storcli_truncate_req_init(uint32_t socket_ctx_idx, void *recv_buf,rozofs_storcli_resp_pf_t rozofs_storcli_remote_rsp_cbk)
{
   rozofs_rpc_call_hdr_with_sz_t *com_hdr_p;
   rozofs_storcli_ctx_t *working_ctx_p = NULL;
   int i;
   uint32_t msg_len;   /* length of the rpc messsage including the header length */
   storcli_truncate_arg_t *storcli_truncate_rq_p = NULL;
   rozofs_rpc_call_hdr_t hdr;   /* structure that contains the rpc header in host format */
   int len;            /* effective length of application message */
   uint8_t *pmsg;      /* pointer to the first available byte in the application message */
   uint32_t header_len;
   XDR xdrs;
   int errcode = EINVAL;
   /*
   ** allocate a context for the duration of the truncate
   */
   working_ctx_p = rozofs_storcli_alloc_context();
   if (working_ctx_p == NULL)
   {
     /*
     ** that situation MUST not occur since there the same number of receive buffer and working context!!
     */
     severe("out of working read/write saved context");
     goto failure;
   }
   storcli_truncate_rq_p = &working_ctx_p->storcli_truncate_arg;
   STORCLI_START_NORTH_PROF(working_ctx_p,truncate,0);
   /*
   ** Get the full length of the message and adjust it the the length of the applicative part (RPC header+application msg)
   */
   msg_len = ruc_buf_getPayloadLen(recv_buf);
   msg_len -=sizeof(uint32_t);
   /*
   ** save the reference of the received socket since it will be needed for sending back the
   ** response
   */
   working_ctx_p->socketRef = socket_ctx_idx;
   working_ctx_p->user_param = NULL;
   working_ctx_p->recv_buf = recv_buf;
   working_ctx_p->response_cbk = rozofs_storcli_remote_rsp_cbk;
   /*
   ** Get the payload of the receive buffer and set the pointer to the array that describes the truncate request
   */
   com_hdr_p = (rozofs_rpc_call_hdr_with_sz_t*) ruc_buf_getPayload(recv_buf);
   memcpy(&hdr,&com_hdr_p->hdr,sizeof(rozofs_rpc_call_hdr_t));
   /*
   ** swap the rpc header
   */
   scv_call_hdr_ntoh(&hdr);
   pmsg = rozofs_storcli_set_ptr_on_nfs_call_msg((char*)&com_hdr_p->hdr,&header_len);
   if (pmsg == NULL)
   {
     errcode = EFAULT;
     goto failure;
   }
   /*
   ** map the memory on the first applicative RPC byte available and prepare to decode:
   ** notice that we will not call XDR_FREE since the application MUST
   ** provide a pointer for storing the file handle
   */
   len = msg_len - header_len;
   xdrmem_create(&xdrs,(char*)pmsg,len,XDR_DECODE);
   /*
   ** store the source transaction id needed for the reply
   */
   working_ctx_p->src_transaction_id = hdr.hdr.xid;
   /*
   ** decode the RPC message of the truncate request
   */
   if (xdr_storcli_truncate_arg_t(&xdrs,storcli_truncate_rq_p) == FALSE)
   {
     /*
     ** decoding error
     */
     errcode = EFAULT;
     severe("rpc trucnate request decoding error");
     goto failure;
   }
   /*
   ** init of the load balancing group/ projection association table:
   ** That table is ordered: the first corresponds to the storage associated with projection 0, second with 1, etc..
   ** When build that table, we MUST consider the value of the base which is associated with the distribution
   */
   uint8_t rozofs_safe = rozofs_get_rozofs_safe(storcli_truncate_rq_p->layout);
   int lbg_in_distribution = 0;
   for (i = 0; i <rozofs_safe ; i ++)
   {
     /*
     ** Get the load balancing group associated with the sid
     */
     int lbg_id = rozofs_storcli_get_lbg_for_sid(storcli_truncate_rq_p->cid,storcli_truncate_rq_p->dist_set[i]);
     if (lbg_id < 0)
     {
       /*
       ** there is no associated between the sid and the lbg. It is typically the case
       ** when a new cluster has been added to the configuration and the client does not
       ** know yet the configuration change
       */
       severe("sid is unknown !! %d\n",storcli_truncate_rq_p->dist_set[i]);
       continue;
     }
     rozofs_storcli_lbg_prj_insert_lbg_and_sid(working_ctx_p->lbg_assoc_tb,lbg_in_distribution,
                                               lbg_id,
                                               storcli_truncate_rq_p->dist_set[i]);
     rozofs_storcli_lbg_prj_insert_lbg_state(working_ctx_p->lbg_assoc_tb,
                                             lbg_in_distribution,
                                             NORTH_LBG_GET_STATE(working_ctx_p->lbg_assoc_tb[lbg_in_distribution].lbg_id));
     lbg_in_distribution++;
     if (lbg_in_distribution == rozofs_safe) break;
   }
   /*
   ** allocate a small buffer that will be used for sending the response to the truncate request
   */
   working_ctx_p->xmitBuf = ruc_buf_getBuffer(ROZOFS_STORCLI_NORTH_SMALL_POOL);
   if (working_ctx_p->xmitBuf == NULL)
   {
     /*
     ** the control MUST address the buffer that has just been allocated and not
     ** the working context, which is known to be valid at that point.
     ** That situation MUST not occur since there the same number of receive buffer and working context!!
     */
     errcode = ENOMEM;
     severe("out of small buffer");
     goto failure;
   }
   /*
   ** allocate a sequence number for the working context (same aas for read)
   */
   working_ctx_p->read_seqnum = rozofs_storcli_allocate_read_seqnum();
   /*
   ** set now the working variable specific for handling the truncate
   ** we re-use the structure used for writing even if nothing is written
   */
   uint8_t forward_projection = rozofs_get_rozofs_forward(storcli_truncate_rq_p->layout);
   for (i = 0; i < forward_projection; i++)
   {
     working_ctx_p->prj_ctx[i].prj_state = ROZOFS_PRJ_READ_IDLE;
     working_ctx_p->prj_ctx[i].prj_buf = ruc_buf_getBuffer(ROZOFS_STORCLI_SOUTH_LARGE_POOL);
     if (working_ctx_p->prj_ctx[i].prj_buf == NULL)
     {
       /*
       ** that situation MUST not occur since there the same number of receive buffer and working context!!
       */
       errcode = ENOMEM;
       severe("out of large buffer");
       goto failure;
     }
     /*
     ** increment inuse counter on each buffer since we might need to re-use that packet in case
     ** of retransmission
     */
     working_ctx_p->prj_ctx[i].inuse_valid = 1;
     ruc_buf_inuse_increment(working_ctx_p->prj_ctx[i].prj_buf);
     /*
     ** set the pointer to the bins
     */
     int position = rozofs_storcli_get_position_of_first_byte2write_in_truncate();
     uint8_t *pbuf = (uint8_t*)ruc_buf_getPayload(working_ctx_p->prj_ctx[i].prj_buf);

     working_ctx_p->prj_ctx[i].bins = (bin_t*)(pbuf+position);
   }
   /*
   ** Prepare for request serialization
   */
   memcpy(working_ctx_p->fid_key, storcli_truncate_rq_p->fid, sizeof (sp_uuid_t));
   working_ctx_p->opcode_key = STORCLI_TRUNCATE;
   {
     /*
     ** lock all the file for a truncate: the number of blocks is set to the
     ** greatest 64 bits value
     */
     uint64_t nb_blocks = 0;
     nb_blocks--;
     int ret;
     ret = stc_rng_insert((void*)working_ctx_p,
                          STORCLI_READ,working_ctx_p->fid_key,
                          0,nb_blocks,
                          &working_ctx_p->sched_idx);
     if (ret == 0)
     {
       /*
       ** there is a current request that is processed with the same fid and there is a collision
       */
       return;
     }
     /*
     ** no request pending with that fid, so we can process it right away
     */
     rozofs_storcli_truncate_req_processing(working_ctx_p);
     return;
   }
   /*
   **_____________________________________________
   **  Exception cases
   **_____________________________________________
   */
   /*
   ** there was a failure while attempting to allocate a memory ressource.
   */
failure:
   /*
   ** send back the response with the appropriated error code.
   ** note: The received buffer (rev_buf) is
   ** intended to be released by this service in case of error or the TCP transmitter
   ** once it has been passed to the TCP stack.
   */
   rozofs_storcli_reply_error_with_recv_buf(socket_ctx_idx,recv_buf,NULL,rozofs_storcli_remote_rsp_cbk,errcode);
   /*
   ** check if the root context was allocated. Free it if is exist
   */
   if (working_ctx_p != NULL)
   {
     /*
     ** remove the reference to the recvbuf to avoid releasing it twice
     */
     STORCLI_STOP_NORTH_PROF(working_ctx_p,truncate,0);
     working_ctx_p->recv_buf = NULL;
     rozofs_storcli_release_context(working_ctx_p);
   }
   return;
}
/*
** Select a storage for every projection of a truncate request and submit one
** SP_TRUNCATE request per projection.
**
** When `data` is not NULL it is first run through
** rozofs_storcli_transform_forward() and truncate_bins_len is set to the size
** of one projection (max psize bins + bins header); otherwise
** truncate_bins_len stays at 0.
**
** On any failure (no selectable storage, missing transmit buffer, projection
** found in ROZOFS_PRJ_WR_ERROR right after the send) an error reply is issued
** with rozofs_storcli_write_reply_error() and the working context is released.

 @param working_ctx_p: pointer to the root context associated with the top level truncate request
 @param data         : pointer to the data of the last block to truncate (NULL when there is none)
*/
void rozofs_storcli_truncate_req_processing_exec(rozofs_storcli_ctx_t *working_ctx_p, char * data)
{
  storcli_truncate_arg_t *storcli_truncate_rq_p = (storcli_truncate_arg_t*)&working_ctx_p->storcli_truncate_arg;
  uint8_t layout = storcli_truncate_rq_p->layout;
  uint8_t rozofs_forward;
  uint8_t rozofs_safe;
  uint8_t projection_id;
  int     storage_idx;
  int     error = 0;
  rozofs_storcli_lbg_prj_assoc_t  *lbg_assoc_p = working_ctx_p->lbg_assoc_tb;
  rozofs_storcli_projection_ctx_t *prj_cxt_p   = working_ctx_p->prj_ctx;

  rozofs_forward = rozofs_get_rozofs_forward(layout);
  rozofs_safe    = rozofs_get_rozofs_safe(layout);

  /*
  ** set the current state of each load balancing group belonging to the rozofs_safe group
  */
  for (storage_idx = 0; storage_idx < rozofs_safe; storage_idx++)
  {
    rozofs_storcli_lbg_prj_insert_lbg_state(lbg_assoc_p,
                                            storage_idx,
                                            NORTH_LBG_GET_STATE(lbg_assoc_p[storage_idx].lbg_id));
  }
  /*
  ** Now find out a selectable lbg_id for each projection
  */
  for (projection_id = 0; projection_id < rozofs_forward; projection_id++)
  {
    if (rozofs_storcli_select_storage_idx_for_write(working_ctx_p,rozofs_forward,rozofs_safe,projection_id) < 0)
    {
      /*
      ** there is not enough valid storage
      */
      error = EIO;
      goto reply_error;
    }
  }
  /*
  ** Transform the data of the last block, when there is one
  */
  working_ctx_p->truncate_bins_len = 0;
  if (data != NULL)
  {
    STORCLI_START_KPI(storcli_kpi_transform_forward);
    rozofs_storcli_transform_forward(working_ctx_p->prj_ctx,
                                     layout,
                                     0,
                                     1,
                                     working_ctx_p->timestamp,
                                     storcli_truncate_rq_p->last_seg,
                                     data);
    STORCLI_STOP_KPI(storcli_kpi_transform_forward,0);
    working_ctx_p->truncate_bins_len = rozofs_get_max_psize(layout)*sizeof(bin_t) + sizeof(rozofs_stor_bins_hdr_t);
  }
  /*
  ** We have enough storage, so initiate the transaction towards the storage for each
  ** projection
  */
  for (projection_id = 0; projection_id < rozofs_forward; projection_id++)
  {
    sp_truncate_arg_no_bins_t *request;
    sp_truncate_arg_no_bins_t  truncate_prj_args;
    void *xmit_buf;
    int   ret;

    xmit_buf = prj_cxt_p[projection_id].prj_buf;
    if (xmit_buf == NULL)
    {
      /*
      ** fatal error since the resource control already took place
      */
      error = EIO;
      goto reply_error;
    }
    /*
    ** fill the request; this is redone on each retry because the storage
    ** index of the projection changes when a new storage is selected
    */
retry:
    request = &truncate_prj_args;
    request->cid    = storcli_truncate_rq_p->cid;
    request->sid    = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,prj_cxt_p[projection_id].stor_idx);
    request->layout = layout;
    if (prj_cxt_p[projection_id].stor_idx >= rozofs_forward) request->spare = 1;
    else request->spare = 0;
    memcpy(request->dist_set, storcli_truncate_rq_p->dist_set, ROZOFS_SAFE_MAX*sizeof (uint8_t));
    memcpy(request->fid, storcli_truncate_rq_p->fid, sizeof (sp_uuid_t));
    request->proj_id        = projection_id;
    request->bid            = storcli_truncate_rq_p->bid;
    request->last_seg       = storcli_truncate_rq_p->last_seg;
    request->last_timestamp = working_ctx_p->timestamp;
    request->len            = working_ctx_p->truncate_bins_len;

    uint32_t lbg_id = rozofs_storcli_lbg_prj_get_lbg(working_ctx_p->lbg_assoc_tb,prj_cxt_p[projection_id].stor_idx);
    STORCLI_START_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),truncate_prj,0);
    /*
    ** caution: we might have a direct reply if there is a direct error at load balancing group level
    ** while attempting to send the RPC message (typically a disconnection of the TCP connection).
    ** As a consequence the response function rozofs_storcli_truncate_req_processing_cbk() can be called
    ** before rozofs_sorcli_send_rq_common() returns.
    ** Anticipate the xmit state of the projection and lock the section to avoid a reply error
    ** before returning from rozofs_sorcli_send_rq_common()
    ** --> need to take care because the write context is released after the reply error sent to rozofsmount
    */
    working_ctx_p->write_ctx_lock = 1;
    prj_cxt_p[projection_id].prj_state = ROZOFS_PRJ_WR_IN_PRG;

    ret = rozofs_sorcli_send_rq_common(lbg_id,ROZOFS_TMR_GET(TMR_STORAGE_PROGRAM),STORAGE_PROGRAM,STORAGE_VERSION,SP_TRUNCATE,
                                       (xdrproc_t) xdr_sp_truncate_arg_no_bins_t, (caddr_t) request,
                                       xmit_buf,
                                       working_ctx_p->read_seqnum,
                                       (uint32_t) projection_id,
                                       working_ctx_p->truncate_bins_len,
                                       rozofs_storcli_truncate_req_processing_cbk,
                                       (void*)working_ctx_p);
    working_ctx_p->write_ctx_lock = 0;
    if (ret < 0)
    {
      /*
      ** the communication with the storage seems to be wrong (more than a TCP connection
      ** temporarily down): attempt to select a new storage
      */
      if (rozofs_storcli_select_storage_idx_for_write(working_ctx_p,rozofs_forward,rozofs_safe,projection_id) < 0)
      {
        /*
        ** Out of storage: too many storages are down.
        ** The error code must be set here, otherwise the reply would carry 0.
        */
        error = EIO;
        goto reply_error;
      }
      /*
      ** retry for that projection with a new storage index.
      ** WARNING: it is assumed that the xmit buffer has not been released, need to double check!
      */
      goto retry;
    }
    /*
    ** check if the state has not been changed: it might be possible to get a direct error
    */
    if (prj_cxt_p[projection_id].prj_state == ROZOFS_PRJ_WR_ERROR)
    {
      error = prj_cxt_p[projection_id].errcode;
      goto reply_error;
    }
  }
  return;

reply_error:
  /*
  ** report the failure to the requester, then release the root transaction context
  */
  rozofs_storcli_write_reply_error(working_ctx_p,error);
  STORCLI_STOP_NORTH_PROF(working_ctx_p,truncate,0);
  rozofs_storcli_release_context(working_ctx_p);
  return;
}
/**
 Rebuild user data blocks from the received projections (inverse transform).

 Blocks whose context is already in ROZOFS_BLK_TRANSFORM_DONE are skipped.
 A block with a null timestamp and a full effective length is a hole: the
 output block is zeroed. A block with a null timestamp and a null effective
 length marks the end of file: processing stops there. Any other block is
 rebuilt with transform_inverse_proc(), and the part of the output block
 located beyond its effective length is zeroed.

 The first_block_idx offset applies to the output buffer pointed by "data".

 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout of the file
 * @param bsize: block size selector, as expected by ROZOFS_BSIZE_BYTES()
 * @param first_block_idx: index of the first block in the output buffer
 * @param number_of_blocks: number of blocks to process
 * @param *block_ctx_p: per block context (state, timestamp, effective length)
 * @param *data: pointer to the output buffer
 * @param *number_of_blocks_p: returns the number of blocks processed
          (index of the end of file block when one is met)
 * @param *rozofs_storcli_prj_idx_table: array of the projection indexes used
          by the inverse process (ROZOFS_SAFE_MAX_STORCLI entries per block)
 *
 * @return: always 0
 */
int rozofs_storcli_transform_inverse(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout, uint32_t bsize,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     rozofs_storcli_inverse_block_t *block_ctx_p,
                                     char *data,
                                     uint32_t *number_of_blocks_p,
                                     uint8_t *rozofs_storcli_prj_idx_table)
{
    int blk;

    *number_of_blocks_p = 0;

    uint8_t  inverse  = rozofs_get_rozofs_inverse(layout);
    uint8_t  forward  = rozofs_get_rozofs_forward(layout);
    uint8_t  safe     = rozofs_get_rozofs_safe(layout);
    uint32_t bbytes   = ROZOFS_BSIZE_BYTES(bsize);
    int      prj_size = rozofs_get_max_psize_in_msg(layout,bsize);

    for (blk = 0; blk < number_of_blocks; blk++)
    {
        rozofs_storcli_inverse_block_t *cur = &block_ctx_p[blk];

        /*
        ** nothing to do when the block has already been rebuilt
        */
        if (cur->state == ROZOFS_BLK_TRANSFORM_DONE)
        {
            continue;
        }

        if (cur->timestamp == 0)
        {
            if (cur->effective_length == bbytes)
            {
                /*
                ** hole in the file: return a block full of zeros
                */
                ROZOFS_STORCLI_STATS(ROZOFS_STORCLI_EMPTY_READ);
                memset(data + (bbytes * (first_block_idx + blk)), 0, bbytes);
                cur->state = ROZOFS_BLK_TRANSFORM_DONE;
                continue;
            }
            if (cur->effective_length == 0)
            {
                /*
                ** end of file: report the number of blocks located before it
                */
                cur->state = ROZOFS_BLK_TRANSFORM_DONE;
                *number_of_blocks_p = blk;
                return 0;
            }
        }

        /*
        ** The index used to address prj_ctx_p is NOT the real projection id:
        ** the projection id is found in the header of each bins, so the
        ** per block index table is handed to the inverse procedure.
        */
        transform_inverse_proc(&rozofs_storcli_prj_idx_table[ROZOFS_SAFE_MAX_STORCLI*blk],
                               prj_ctx_p,
                               prj_size,
                               layout,
                               bbytes,
                               first_block_idx,
                               blk,
                               data);
        cur->state = ROZOFS_BLK_TRANSFORM_DONE;

        /*
        ** partial block: clear what is beyond the effective length
        */
        if (cur->effective_length < bbytes)
        {
            char *tail_p = data + (bbytes * (first_block_idx + blk)) + cur->effective_length;
            memset(tail_p, 0, (bbytes - cur->effective_length));
        }
    }

    /*
    ** every requested block has been processed
    */
    *number_of_blocks_p = number_of_blocks;

    /*
    ** Check whether a block should be repaired
    */
    rozofs_storcli_check_block_2_repair(prj_ctx_p,
                                        inverse,
                                        forward,
                                        safe,
                                        prj_size,
                                        number_of_blocks,
                                        block_ctx_p);
    return 0;
}
/**
 Apply the forward transform to a buffer starting at "data".

 For each block and each of the rozofs_forward projections, the bins header
 is filled in the projection buffer. For a non empty block the bins footer
 timestamp is written as well, and transform_forward_proc() is then called
 once for the block. When rozofs_data_block_check_empty() reports an empty
 block, only zeroed headers are written and no transform is applied.

 The first_block_idx offset applies to the output projection buffers only,
 not to the input buffer pointed by "data".

 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout of the file
 * @param bsize: block size selector, as expected by ROZOFS_BSIZE_BYTES()
 * @param first_block_idx: index of the first block in the projection buffers
 * @param number_of_blocks: number of blocks to transform
 * @param timestamp: timestamp written in the header and footer of each projection
 * @param last_block_size: effective length of the last block
 * @param *data: pointer to the source data that must be transformed
 *
 * @return: always 0
 */
int rozofs_storcli_transform_forward(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout,uint32_t bsize,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     uint64_t timestamp,
                                     uint16_t last_block_size,
                                     char *data)
{
    projection_t  rozofs_fwd_projections[ROZOFS_SAFE_MAX_STORCLI];
    projection_t *projections = rozofs_fwd_projections; /* table of projections used to transform data */
    uint16_t projection_id = 0;
    uint32_t i = 0;
    uint8_t  rozofs_forward = rozofs_get_rozofs_forward(layout);
    int      empty_block = 0;
    uint32_t bbytes = ROZOFS_BSIZE_BYTES(bsize);
    int      prj_size_in_msg = rozofs_get_max_psize_in_msg(layout,bsize);

    /* For each block to send */
    for (i = 0; i < number_of_blocks; i++)
    {
        empty_block = rozofs_data_block_check_empty(data + (i * bbytes), bbytes);

        /* seek bins for each projection */
        for (projection_id = 0; projection_id < rozofs_forward; projection_id++)
        {
            rozofs_stor_bins_hdr_t    *rozofs_bins_hdr_p;
            rozofs_stor_bins_footer_t *rozofs_bins_foot_p;

            /*
            ** Indicates the memory area where the transformed data must be stored
            */
            projections[projection_id].bins = prj_ctx_p[projection_id].bins +
                                              (prj_size_in_msg/sizeof(bin_t)) * (first_block_idx+i);
            rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t*)projections[projection_id].bins;

            /*
            ** empty user data block: no need to transform, only reset the header
            */
            if (empty_block)
            {
                rozofs_bins_hdr_p->s.projection_id    = 0;
                rozofs_bins_hdr_p->s.timestamp        = 0;
                rozofs_bins_hdr_p->s.effective_length = 0;
                rozofs_bins_hdr_p->s.filler           = 0;
                rozofs_bins_hdr_p->s.version          = 0;
                continue;
            }
            /*
            ** fill the header of the projection
            */
            rozofs_bins_hdr_p->s.projection_id = projection_id;
            rozofs_bins_hdr_p->s.timestamp     = timestamp;
            rozofs_bins_hdr_p->s.filler        = 0;
            rozofs_bins_hdr_p->s.version       = 0;
            /*
            ** set the effective size of the block: full block size except for the last block
            */
            if (i == (number_of_blocks-1))
            {
                rozofs_bins_hdr_p->s.effective_length = last_block_size;
            }
            else
            {
                rozofs_bins_hdr_p->s.effective_length = bbytes;
            }
            /*
            ** move the pointer to the first bins, then stamp the footer that
            ** follows the bins of that projection
            */
            projections[projection_id].bins += sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t);
            rozofs_bins_foot_p = (rozofs_stor_bins_footer_t*)
                                 (projections[projection_id].bins + rozofs_get_psizes(layout,bsize,projection_id));
            rozofs_bins_foot_p->timestamp = timestamp;
        }
        /*
        ** do not apply transform for empty block
        */
        if (empty_block == 0)
        {
            transform_forward_proc(layout,data + (i * bbytes),bbytes,projections);
        }
    }
    return 0;
}
/**
 Forward transform of a buffer made of ROZOFS_BSIZE blocks starting at "data".

 The angles and size of each of the rozofs_forward projections are loaded
 first. Then, for every block, the bins header of each projection is filled
 in the projection buffer and transform_forward() is called for the block.
 When rozofs_data_block_check_empty() reports an empty block, only zeroed
 headers are written and the transform is skipped.

 The first_block_idx offset applies to the output projection buffers only,
 not to the input buffer pointed by "data".

 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout of the file
 * @param first_block_idx: index of the first block in the projection buffers
 * @param number_of_blocks: number of blocks to transform
 * @param timestamp: timestamp written in the header of each projection
 * @param last_block_size: effective length of the last block
 * @param *data: pointer to the source data that must be transformed
 *
 * @return: always 0
 */
int rozofs_storcli_transform_forward(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     uint64_t timestamp,
                                     uint16_t last_block_size,
                                     char *data)
{
    projection_t fwd_prj[ROZOFS_SAFE_MAX];
    uint16_t prj;
    uint32_t blk;
    uint8_t  forward = rozofs_get_rozofs_forward(layout);
    uint8_t  inverse = rozofs_get_rozofs_inverse(layout);
    int      is_empty;

    /*
    ** load the characteristics of each projection
    */
    for (prj = 0; prj < forward; prj++)
    {
        fwd_prj[prj].angle.p = rozofs_get_angles_p(layout,prj);
        fwd_prj[prj].angle.q = rozofs_get_angles_q(layout,prj);
        fwd_prj[prj].size    = rozofs_get_psizes(layout,prj);
    }

    /*
    ** process the blocks one by one
    */
    for (blk = 0; blk < number_of_blocks; blk++)
    {
        is_empty = rozofs_data_block_check_empty(data + (blk * ROZOFS_BSIZE), ROZOFS_BSIZE);

        for (prj = 0; prj < forward; prj++)
        {
            rozofs_stor_bins_hdr_t *hdr_p;

            /*
            ** locate the area of that block in the projection buffer:
            ** one slot is the maximum projection size plus the bins header
            */
            fwd_prj[prj].bins = prj_ctx_p[prj].bins +
                                ((rozofs_get_max_psize(layout)+(sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t)))* (first_block_idx+blk));
            hdr_p = (rozofs_stor_bins_hdr_t*)fwd_prj[prj].bins;

            if (is_empty)
            {
                /*
                ** empty block: reset the header, there is nothing to transform
                */
                hdr_p->s.projection_id    = 0;
                hdr_p->s.timestamp        = 0;
                hdr_p->s.effective_length = 0;
            }
            else
            {
                hdr_p->s.projection_id = prj;
                hdr_p->s.timestamp     = timestamp;
                /*
                ** only the last block may be shorter than ROZOFS_BSIZE
                */
                if (blk == (number_of_blocks-1))
                {
                    hdr_p->s.effective_length = last_block_size;
                }
                else
                {
                    hdr_p->s.effective_length = ROZOFS_BSIZE;
                }
                /*
                ** skip the header so that bins points to the first bin
                */
                fwd_prj[prj].bins += sizeof(rozofs_stor_bins_hdr_t)/sizeof(bin_t);
            }
        }

        if (!is_empty)
        {
            /*
            ** Apply the erasure code transform for the block blk+first_block_idx
            */
            transform_forward((pxl_t *) (data + (blk * ROZOFS_BSIZE)),
                              inverse,
                              ROZOFS_BSIZE / inverse / sizeof (pxl_t),
                              forward, fwd_prj);
        }
    }
    return 0;
}
/*
** Display the blocks of one chunk file, either as a table (one line per
** block) or as a hexadecimal dump of each block when dump_data is set.
**
** The size of one block on disk is first set to the maximum projection size.
** For a non spare file it is then refined from the projection id, which is
** taken from the header file distribution (versions 2 and 1), from the prjid
** global, or from the header of the first block of the chunk file.
**
** Reads the file scope variables dump_data, prjid, first, last and uses the
** file scope buffer as read area.
**
** @param fid        : FID of the file, used to compute the expected CRC32 of a block
** @param path       : path of the chunk file to read
** @param hdr        : header file content (layout, bsize, version, distribution)
** @param spare      : 0 when the chunk is not a spare one
** @param firstBlock : block number of the first block of the chunk file
*/
void read_chunk_file(uuid_t fid, char * path, rozofs_stor_bins_file_hdr_vall_t * hdr, int spare, uint64_t firstBlock) {
  uint16_t rozofs_disk_psize;
  int      fd;
  rozofs_stor_bins_hdr_t * pH;
  int      nb_read;
  uint32_t bbytes = ROZOFS_BSIZE_BYTES(hdr->v0.bsize);
  const char * crc32_string;
  uint64_t offset;
  uint64_t bid;
  int      idx;
  int      done = 0;

  if (dump_data == 0) {
    printf ("+------------+------------------+------------+----+------+-------+--------------------------------------------\n");
    printf ("| %10s | %16s | %10s | %2s | %4s | %5s | %s\n", "block#","file offset", "prj offset", "pj", "size", "crc32", "date");
    printf ("+------------+------------------+------------+----+------+-------+--------------------------------------------\n");
  }

  // Open bins file
  fd = open(path, ROZOFS_ST_NO_CREATE_FILE_FLAG, ROZOFS_ST_BINS_FILE_MODE_RO);
  if (fd < 0) {
    printf("open(%s) %s\n",path,strerror(errno));
    return;
  }

  /*
  ** Retrieve the projection size on disk
  */
  rozofs_disk_psize = rozofs_get_max_psize_in_msg(hdr->v0.layout,hdr->v0.bsize);
  if (spare==0) {

    /* Header version 2. Find the sid in the distribution */
    if (hdr->v0.version == 2) {
      int fwd = rozofs_get_rozofs_forward(hdr->v2.layout);
      for (idx=0; idx< fwd;idx++) {
        if (hdr->v2.distrib[idx] != hdr->v2.sid) continue;
        rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v2.layout,hdr->v2.bsize,idx);
        break;
      }
    }
    /* Header version 1. Find the sid in the current distribution */
    else if (hdr->v0.version == 1) {
      int fwd = rozofs_get_rozofs_forward(hdr->v1.layout);
      for (idx=0; idx< fwd;idx++) {
        if (hdr->v1.dist_set_current[idx] != hdr->v1.sid) continue;
        rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v1.layout,hdr->v1.bsize,idx);
        break;
      }
    }
    /* Projection id given as parameter */
    else if (prjid != -1) {
      rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v0.layout,hdr->v0.bsize,prjid);
    }
    /* Version 0 without projection given as parameter */
    else {
      // Read the header of the 1rst block to get the projection id
      nb_read = pread(fd, buffer, sizeof(rozofs_stor_bins_hdr_t), 0);
      if (nb_read<0) {
        printf("pread(%s) %s\n",path,strerror(errno));
        goto out;
      }
      pH = (rozofs_stor_bins_hdr_t*)buffer;
      if (pH->s.timestamp == 0) {
        printf("Can not tell projection id\n");
        goto out;
      }
      rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v0.layout,hdr->v0.bsize,pH->s.projection_id);
    }
  }

  /*
  ** Where to start reading from: the beginning of the chunk, unless the
  ** first requested block is located further in the chunk
  */
  if (first <= firstBlock) {
    offset = 0;
  }
  else {
    offset = (first-firstBlock)*rozofs_disk_psize;
  }

  /*
  ** Reading blocks, up to 32 at a time
  */
  nb_read = 1;
  while (nb_read && !done) {

    nb_read = pread(fd, buffer, rozofs_disk_psize*32, offset);
    if (nb_read<0) {
      printf("pread(%s) %s\n",path,strerror(errno));
      goto out;
    }
    /* Only complete blocks are processed */
    nb_read = (nb_read / rozofs_disk_psize);

    for (idx=0; idx<nb_read; idx++) {

      pH = (rozofs_stor_bins_hdr_t*) &buffer[idx*rozofs_disk_psize];

      bid = (offset/rozofs_disk_psize)+idx+firstBlock;

      if (bid < first) continue;
      if (bid > last) {
        /* Block numbers only grow: nothing more to display in this file */
        done = 1;
        break;
      }

      /*
      ** The CRC32 is stored in the filler field of the header; the field is
      ** cleared while the CRC32 of the block is recomputed, then restored.
      ** A null stored value means that no CRC32 was set.
      */
      uint32_t save_crc32 = pH->s.filler;
      pH->s.filler = 0;
      uint32_t crc32=0;

      if (save_crc32 == 0) {
        crc32_string = "NONE";
      }
      else {
        crc32 = fid2crc32((uint32_t *)fid)+bid-firstBlock;
        crc32 = crc32c(crc32,(char *) pH, rozofs_disk_psize);
        if (crc32 != save_crc32) crc32_string = "ERROR";
        else                     crc32_string = "OK";
      }
      pH->s.filler = save_crc32;

      if (dump_data == 0) {
        printf ("| %10llu | %16llu | %10llu | %2d | %4d | %5s | %s\n",
                (long long unsigned int)bid,
                (long long unsigned int)bbytes * bid,
                (long long unsigned int)offset+(idx*rozofs_disk_psize),
                pH->s.projection_id,
                pH->s.effective_length,
                crc32_string,
                ts2string(pH->s.timestamp));
      }
      else {
        printf("_________________________________________________________________________________________\n");
        printf("Block# %llu / file offset %llu / projection offset %llu\n",
               (unsigned long long)bid, (unsigned long long)(bbytes * bid), (unsigned long long)(offset+(idx*rozofs_disk_psize)));
        printf("prj id %d / length %d / CRC %s / time stamp %s\n",
               pH->s.projection_id,pH->s.effective_length,crc32_string, ts2string(pH->s.timestamp));
        printf("_________________________________________________________________________________________\n");
        /* No data to dump for a block with a null projection id and a null timestamp */
        if ((pH->s.projection_id == 0)&&(pH->s.timestamp==0)) continue;
        hexdump(pH, (offset+(idx*rozofs_disk_psize)), rozofs_disk_psize);
      }
    }
    offset += (nb_read*rozofs_disk_psize);
  }

  if (dump_data == 0) {
    printf ("+------------+------------------+------------+----+------+-------+--------------------------------------------\n");
  }

out:
  /* Single exit point once the file is opened, so that it is always closed */
  close(fd);
}
int read_data_file() { int status = -1; uint64_t size = 0; int block_idx = 0; int idx =0; int count; rozofs_stor_bins_hdr_t * rozofs_bins_hdr_p; rozofs_stor_bins_footer_t * rozofs_bins_foot_p; char * loc_read_bins_p = NULL; int forward = rozofs_get_rozofs_forward(layout); // int inverse = rozofs_get_rozofs_inverse(layout); uint16_t disk_block_size; uint16_t max_block_size = (rozofs_get_max_psize(layout,bsize)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); char * p; int empty,valid; int prj_id; int nb_ts; uint64_t ts[32]; int ts_count[32]; // Allocate memory for reading loc_read_bins_p = xmalloc(max_block_size); for (idx=0; idx < nb_file; idx++) { if (strcmp(filename[idx],"NULL") == 0) { fd[idx] = -1; } else { fd[idx] = open(filename[idx],O_RDWR); if (fd < 0) { severe("Can not open file %s %s",filename[idx],strerror(errno)); goto out; } } } printf (" ______ __________ "); for (idx=0; idx < nb_file; idx++) printf (" __________________ ______ ____ "); printf ("\n"); printf("| %4s | %8s |","Blk","Offset"); for (idx=0; idx < nb_file; idx++) printf("| %16s | %4s | %2s |", "Time stamp", "lgth", "id"); printf ("\n"); printf ("|______|__________|"); for (idx=0; idx < nb_file; idx++) printf ("|__________________|______|____|"); printf ("\n"); if (block_number == -1) block_idx = 0; else block_idx = block_number; count = 1; empty = 0; while ( count ) { valid = 0; count = 0; nb_ts = 0; p = &LINE[0]; p += sprintf(p,"| %4d | %8d ",block_idx+firstBlock,(block_idx+firstBlock)*bbytes); for (idx=0; idx < nb_file; idx++) { if (fd[idx] == -1) { p += sprintf(p,"%32s"," "); continue; } if (idx >= forward) disk_block_size = rozofs_get_max_psize_in_msg(layout, bsize); else disk_block_size = rozofs_get_psizes_on_disk(layout,bsize,idx); size = pread(fd[idx],loc_read_bins_p,disk_block_size,block_idx*disk_block_size); if (size != disk_block_size) { p += sprintf(p,"|__________________|______|____|"); close(fd[idx]); fd[idx] = -1; } else { count++; 
rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t *)loc_read_bins_p; prj_id = rozofs_bins_hdr_p->s.projection_id; if (prj_id >= forward) { valid = 1; p += sprintf(p,"|| xxxxxxxxxxxxxxxx | xxxx | %2d ",prj_id); } else { disk_block_size = (rozofs_get_psizes(layout,bsize,prj_id)*sizeof (bin_t)); disk_block_size += sizeof (rozofs_stor_bins_hdr_t); rozofs_bins_foot_p = (rozofs_stor_bins_footer_t *) ((char*) rozofs_bins_hdr_p + disk_block_size); if (rozofs_bins_hdr_p->s.timestamp == 0) { p += sprintf(p,"|| %16d | .... | %2d ",0,prj_id); } else if (rozofs_bins_foot_p->timestamp != rozofs_bins_hdr_p->s.timestamp) { valid = 1; p += sprintf(p,"|--%16.16llu----------%2d-", (long long unsigned int)rozofs_bins_hdr_p->s.timestamp, prj_id); } else if (rozofs_bins_hdr_p->s.timestamp == 0) { p += sprintf(p,"|| %16d | .... | %2d ",0,prj_id); } else { valid = 1; p += sprintf(p,"|| %16llu | %4d | %2d ", (unsigned long long)rozofs_bins_hdr_p->s.timestamp, rozofs_bins_hdr_p->s.effective_length, rozofs_bins_hdr_p->s.projection_id); int tsidx; for (tsidx=0; tsidx< nb_ts; tsidx++) { if (ts[tsidx] == rozofs_bins_hdr_p->s.timestamp) { ts_count[tsidx]++; break; } } if (tsidx == nb_ts) { ts[tsidx] = rozofs_bins_hdr_p->s.timestamp; ts_count[tsidx] = 1; nb_ts++; } } } } } if (valid) { if (empty) { printf("... 
%d blocks...\n",empty); empty = 0; } int best=-1,tsidx; for (tsidx=0; tsidx< nb_ts; tsidx++) { if (ts_count[tsidx] > best) best = ts_count[tsidx]; } printf("%s%s\n",LINE, (best<forward)?"<<<<----":"|"); } else { empty++; } block_idx++; if (block_number!=-1) break; } printf ("|______|__________|\n"); if (block_number!=-1) { for (idx=0; idx < nb_file; idx++) { if (idx < forward) { disk_block_size = (rozofs_get_psizes(layout,bsize,idx)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); } else { disk_block_size = (rozofs_get_max_psize(layout,bsize)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); } size = pread(fd[idx],loc_read_bins_p,disk_block_size,block_number*disk_block_size); if (size != disk_block_size) { printf("Can not read block %d of %s\n", block_number, filename[idx]); continue; } FILE * f; char fname[128]; sprintf(fname,"block_%d_dist_%d.txt", block_number, idx); f = fopen(fname,"w"); if (f == NULL) { printf ("Can not create file %s",fname); continue; } printf("- %s\n",fname); fprintf(f,"%s Block %d size %d\n", filename[idx], block_number, disk_block_size); rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t *)loc_read_bins_p; fprintf(f,"Block header : TS %llu SZ %d PRJ %d CRC32 0x%x\n", (long long unsigned int)rozofs_bins_hdr_p->s.timestamp, rozofs_bins_hdr_p->s.effective_length, rozofs_bins_hdr_p->s.projection_id, rozofs_bins_hdr_p->s.filler); rozofs_bins_foot_p = (rozofs_stor_bins_footer_t *) (loc_read_bins_p + disk_block_size); rozofs_bins_foot_p--; fprintf(f,"Block footer : TS %llu %s\n", (long long unsigned int)rozofs_bins_foot_p->timestamp, (rozofs_bins_hdr_p->s.timestamp==rozofs_bins_foot_p->timestamp)?"":" !!!!!!"); hexdump(f,loc_read_bins_p, 0, disk_block_size); fclose(f); } } status = 0; out: // This spare file used to exist but is not needed any more for (idx=0; idx < nb_file; idx++) { if (fd[idx] != -1) close(fd[idx]); } if (loc_read_bins_p != NULL) { //free(loc_read_bins_p); 
loc_read_bins_p = NULL; } return status; }