/**
 * Regenerate the projections on which a CRC error has been detected so that
 * they can be rewritten (repaired).
 *
 * Every scanned projection context whose crc_err_bitmap is not empty is
 * processed: for each block flagged in that bitmap, the projection is rebuilt
 * from the user data found at data + (block number * block size in bytes).
 * The rebuilt projections are packed at the beginning of the bins buffer of
 * the projection context: the n-th flagged block goes in the n-th slot, a slot
 * being rozofs_get_max_psize_in_msg(layout, bsize) bytes long.
 *
 * The block size and the distribution set are taken from the read request
 * saved in the working context (storcli_read_arg). The timestamp and the
 * effective length written in the projection header come from
 * working_ctx_p->block_ctx_table[block number].
 *
 * @param *working_ctx_p: storcli working context
 * @param layout: layout used to get the forward/inverse counts, the angles
 *                and the projection sizes
 * @param number_of_blocks: number of blocks present in "data"
 * @param *data: pointer to the source data that must be transformed
 *
 * @return: none
 */
void rozofs_storcli_transform_forward_repair(rozofs_storcli_ctx_t *working_ctx_p,
                                             uint8_t layout,
                                             uint32_t number_of_blocks,
                                             char *data)
{
    projection_t fwd_prj_tb[ROZOFS_SAFE_MAX_STORCLI];
    projection_t *projections = fwd_prj_tb; /* table of projections used to transform data */
    storcli_read_arg_t *read_rq_p = (storcli_read_arg_t *)&working_ctx_p->storcli_read_arg;
    rozofs_storcli_projection_ctx_t *prj_ctx_p = &working_ctx_p->prj_ctx[0];
    uint8_t rozofs_forward = rozofs_get_rozofs_forward(layout);
    /*
    ** NOTE(review): the number of scanned projection contexts is read with
    ** rozofs_get_rozofs_forward(), not rozofs_get_rozofs_safe(). Kept as is;
    ** confirm that limiting the scan to the forward contexts is intended.
    */
    uint8_t nb_ctx = rozofs_get_rozofs_forward(layout);
    uint8_t rozofs_inverse = rozofs_get_rozofs_inverse(layout);
    uint8_t bsize = read_rq_p->bsize;
    uint32_t bbytes = ROZOFS_BSIZE_BYTES(bsize);
    int prj_size_in_msg = rozofs_get_max_psize_in_msg(layout, bsize);
    uint16_t prj;
    int ctx_idx;

    /*
    ** Describe each forward projection: angles and size
    */
    for (prj = 0; prj < rozofs_forward; prj++) {
        projections[prj].angle.p = rozofs_get_angles_p(layout, prj);
        projections[prj].angle.q = rozofs_get_angles_q(layout, prj);
        projections[prj].size = rozofs_get_128bits_psizes(layout, bsize, prj);
    }

    /*
    ** Go through the projection contexts to find out what must be regenerated
    */
    for (ctx_idx = 0; ctx_idx < nb_ctx; ctx_idx++) {
        rozofs_storcli_projection_ctx_t *ctx_p = &prj_ctx_p[ctx_idx];
        int out_slot = 0; /* next free slot in the bins buffer of that context */
        uint8_t sid;
        int moj_prj_id;
        uint32_t blk;

        if (ROZOFS_BITMAP64_TEST_ALL0(ctx_p->crc_err_bitmap)) {
            /* no CRC error reported for that context */
            continue;
        }

        /*
        ** Get the sid associated with the projection context, then the
        ** Mojette projection id of that sid within the distribution
        */
        sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,
                                                       ctx_p->stor_idx);
        moj_prj_id = rozofs_storcli_get_mojette_proj_id(read_rq_p->dist_set, sid, rozofs_forward);
        if (moj_prj_id < 0) {
            /* it is the reference of a spare sid: go to the next context */
            continue;
        }

        for (blk = 0; blk < number_of_blocks; blk++) {
            char *blk_data;
            projection_t *prj_p;
            rozofs_stor_bins_hdr_t *hdr_p;
            rozofs_stor_bins_footer_t *foot_p;
            int empty_block;

            if (ROZOFS_BITMAP64_TEST0(blk, ctx_p->crc_err_bitmap)) {
                /* nothing to generate for that block */
                continue;
            }

            blk_data = data + (blk * bbytes);
            empty_block = rozofs_data_block_check_empty(blk_data, bbytes);

            /*
            ** The regenerated projection goes in the next free slot of the
            ** bins buffer; the slot starts with the projection header
            */
            prj_p = &projections[moj_prj_id];
            prj_p->bins = ctx_p->bins + ((prj_size_in_msg / sizeof(bin_t)) * out_slot);
            hdr_p = (rozofs_stor_bins_hdr_t *)prj_p->bins;
            out_slot++;

            if (empty_block) {
                /* empty user data block: cleared header, no transform, no footer */
                hdr_p->s.projection_id = 0;
                hdr_p->s.timestamp = 0;
                hdr_p->s.effective_length = 0;
                hdr_p->s.filler = 0;
                hdr_p->s.version = 0;
                continue;
            }

            /*
            ** Fill the header of the projection
            */
            hdr_p->s.projection_id = moj_prj_id;
            hdr_p->s.timestamp = working_ctx_p->block_ctx_table[blk].timestamp;
            hdr_p->s.effective_length = working_ctx_p->block_ctx_table[blk].effective_length;
            hdr_p->s.filler = 0;
            hdr_p->s.version = 0;

            /*
            ** Move to the first bin, right after the header
            */
            prj_p->bins += sizeof(rozofs_stor_bins_hdr_t) / sizeof(bin_t);

            /*
            ** Apply the erasure code transform for that block
            */
            transform128_forward_one_proj((pxl_t *)blk_data,
                                          rozofs_inverse,
                                          bbytes / rozofs_inverse / sizeof(pxl_t),
                                          moj_prj_id,
                                          projections);

            /*
            ** Add the footer at the end of the repaired projection: it
            ** carries the timestamp that has been written in the header
            */
            foot_p = (rozofs_stor_bins_footer_t *)
                     (prj_p->bins + rozofs_get_psizes(layout, bsize, moj_prj_id));
            foot_p->timestamp = hdr_p->s.timestamp;
        }
    }
}
/*
** Send the repaired projections towards the storages.
**
** The read reply is first sent back to the requester and a new read sequence
** number is allocated. Then, for each forward projection context on which a
** CRC error has been recorded, a SP_WRITE_REPAIR2 request (or SP_WRITE_REPAIR
** when storcli_storage_supports_repair2 is not set) is sent with the bins of
** the projection context. There is no retry when a request cannot be sent.
**
** The working context is released by this function when nothing is sent or
** when rozofs_storcli_all_prj_write_repair_check() does not return 0;
** otherwise the function returns with the context still allocated
** (some repair requests are pending).
**
** @param working_ctx_p: pointer to the root context associated with the top level read request
*/
void rozofs_storcli_write_repair_req_processing(rozofs_storcli_ctx_t *working_ctx_p)
{
    storcli_read_arg_t *read_rq_p = (storcli_read_arg_t *)&working_ctx_p->storcli_read_arg;
    rozofs_storcli_projection_ctx_t *prj_cxt_p = working_ctx_p->prj_ctx;
    uint8_t layout = read_rq_p->layout;
    uint8_t bsize = read_rq_p->bsize;
    int prj_size_in_msg = rozofs_get_max_psize_in_msg(layout, bsize);
    uint8_t rozofs_forward = rozofs_get_rozofs_forward(layout);
    sp_write_repair_arg_no_bins_t repair_prj_args;
    sp_write_repair2_arg_no_bins_t repair2_prj_args;
    uint8_t projection_id;
    int error = 0;
    int ret;

    /*
    ** Check if the buffer is still valid: the rozofsmount might have timed out
    ** and re-allocated the buffer located in shared memory for another
    ** transaction (either read or write).
    ** The control takes place only when a shared memory is in use.
    */
    if (working_ctx_p->shared_mem_p != NULL) {
        uint32_t *xid_p = (uint32_t *)working_ctx_p->shared_mem_p;

        if (*xid_p != working_ctx_p->src_transaction_id) {
            /* the source has aborted the request */
            error = EPROTO;
        }
    }

    /*
    ** Send back the response of the read request towards rozofsmount
    */
    rozofs_storcli_read_reply_success(working_ctx_p);

    /*
    ** Allocate a sequence number for the working context: this is mandatory
    ** to avoid any confusion with a late response of the previous read request
    */
    working_ctx_p->read_seqnum = rozofs_storcli_allocate_read_seqnum();

    if (error) {
        /*
        ** The requester has released the buffer and the rozofsmount may use it
        ** for another purpose: the repaired data might be wrong, so do not
        ** write data that could be wrong while carrying a good crc.
        */
        goto fail;
    }

    /*
    ** Initiate the transaction towards the storage for each projection to repair
    */
    for (projection_id = 0; projection_id < rozofs_forward; projection_id++) {
        rozofs_storcli_projection_ctx_t *cur_p = &prj_cxt_p[projection_id];
        void *xmit_buf;
        uint8_t sid;
        uint8_t spare;
        int nb_blocks;
        int bins_len;
        int opcode;
        xdrproc_t encode_fct;
        caddr_t args_p;
        uint32_t lbg_id;

        /*
        ** Skip the projections for which no error has been detected
        */
        if (storcli_storage_supports_repair2) {
            if (ROZOFS_BITMAP64_TEST_ALL0(working_ctx_p->prj_ctx[projection_id].crc_err_bitmap)) {
                continue;
            }
        } else {
            if (working_ctx_p->prj_ctx[projection_id].crc_err_bitmap[0] == 0) {
                continue;
            }
        }

        xmit_buf = cur_p->prj_buf;
        if (xmit_buf == NULL) {
            /* fatal error since the resource control already took place */
            error = EIO;
            goto fail;
        }

        /*
        ** Values shared by both request formats.
        ** The case of spare 1 must not occur because repair is done for the
        ** optimal distribution only.
        */
        sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,
                                                       cur_p->stor_idx);
        spare = (cur_p->stor_idx >= rozofs_forward) ? 1 : 0;

        /*
        ** Fill the request header according to the format supported by the storage
        */
        if (storcli_storage_supports_repair2) {
            sp_write_repair2_arg_no_bins_t *rq2 = &repair2_prj_args;

            rq2->cid = read_rq_p->cid;
            rq2->sid = sid;
            rq2->layout = read_rq_p->layout;
            rq2->bsize = read_rq_p->bsize;
            rq2->spare = spare;
            memcpy(rq2->dist_set, read_rq_p->dist_set, ROZOFS_SAFE_MAX_STORCLI * sizeof (uint8_t));
            memcpy(rq2->fid, read_rq_p->fid, sizeof (sp_uuid_t));
            rq2->proj_id = rozofs_storcli_get_mojette_proj_id(read_rq_p->dist_set, rq2->sid, rozofs_forward);
            rq2->bid = read_rq_p->bid;
            rq2->bitmap[0] = working_ctx_p->prj_ctx[projection_id].crc_err_bitmap[0];
            rq2->bitmap[1] = working_ctx_p->prj_ctx[projection_id].crc_err_bitmap[1];
            rq2->bitmap[2] = working_ctx_p->prj_ctx[projection_id].crc_err_bitmap[2];
            nb_blocks = ROZOFS_BITMAP64_NB_BIT1(rq2->bitmap);
            rq2->nb_proj = nb_blocks;
            /* length of the bins part: MUST be in bytes */
            bins_len = (prj_size_in_msg * nb_blocks);
            rq2->len = bins_len;

            opcode = SP_WRITE_REPAIR2;
            encode_fct = (xdrproc_t) xdr_sp_write_repair2_arg_no_bins_t;
            args_p = (caddr_t) rq2;
        } else {
            sp_write_repair_arg_no_bins_t *rq = &repair_prj_args;

            rq->cid = read_rq_p->cid;
            rq->sid = sid;
            rq->layout = read_rq_p->layout;
            rq->bsize = read_rq_p->bsize;
            rq->spare = spare;
            memcpy(rq->dist_set, read_rq_p->dist_set, ROZOFS_SAFE_MAX_STORCLI * sizeof (uint8_t));
            memcpy(rq->fid, read_rq_p->fid, sizeof (sp_uuid_t));
            rq->proj_id = rozofs_storcli_get_mojette_proj_id(read_rq_p->dist_set, rq->sid, rozofs_forward);
            rq->bid = read_rq_p->bid;
            rq->bitmap = working_ctx_p->prj_ctx[projection_id].crc_err_bitmap[0];
            nb_blocks = ROZOFS_BITMAP64_NB_BIT1_FUNC((uint8_t*)&rq->bitmap,8);
            rq->nb_proj = nb_blocks;
            /* length of the bins part: MUST be in bytes */
            bins_len = (prj_size_in_msg * nb_blocks);
            rq->len = bins_len;

            opcode = SP_WRITE_REPAIR;
            encode_fct = (xdrproc_t) xdr_sp_write_repair_arg_no_bins_t;
            args_p = (caddr_t) rq;
        }

        lbg_id = rozofs_storcli_lbg_prj_get_lbg(working_ctx_p->lbg_assoc_tb, cur_p->stor_idx);
        STORCLI_START_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),repair_prj,bins_len);

        /*
        ** Caution: there might be a direct reply if an error is detected at the
        ** load balancing group level while attempting to send the RPC message
        ** (typically a disconnection of the TCP connection). As a consequence
        ** rozofs_storcli_write_repair_req_processing_cbk() can be called before
        ** returning from rozofs_sorcli_send_rq_common().
        ** So anticipate the xmit state of the projection and lock the section
        ** to avoid a reply error before returning from the send, since the
        ** context is released after the reply error.
        */
        working_ctx_p->write_ctx_lock = 1;
        cur_p->prj_state = ROZOFS_PRJ_WR_IN_PRG;

        ret = rozofs_sorcli_send_rq_common(lbg_id,
                                           ROZOFS_TMR_GET(TMR_STORAGE_PROGRAM),
                                           STORAGE_PROGRAM,
                                           STORAGE_VERSION,
                                           opcode,
                                           encode_fct,
                                           args_p,
                                           xmit_buf,
                                           working_ctx_p->read_seqnum,
                                           (uint32_t) projection_id,
                                           bins_len,
                                           rozofs_storcli_write_repair_req_processing_cbk,
                                           (void*)working_ctx_p);

        working_ctx_p->write_ctx_lock = 0;

        if (ret < 0) {
            /*
            ** There is no retry, just keep on with a potential other projection to repair
            */
            STORCLI_ERR_PROF(repair_prj_err);
            STORCLI_STOP_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),repair_prj,0);
            cur_p->prj_state = ROZOFS_PRJ_WR_ERROR;
            continue;
        }

        /*
        ** The state might have been changed by a direct error: in that case
        ** that projection cannot be repaired, go on with the next one
        */
        if (cur_p->prj_state == ROZOFS_PRJ_WR_ERROR) {
            STORCLI_STOP_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),repair_prj,0);
        }
    }

    /*
    ** Check if there are some write repair requests pending: in such a case
    ** wait for the end of the repair (answer from the storage node)
    */
    ret = rozofs_storcli_all_prj_write_repair_check(read_rq_p->layout, working_ctx_p->prj_ctx);
    if (ret == 0) {
        /* there is some pending write */
        return;
    }

fail:
    /*
    ** Release the root transaction context
    */
    STORCLI_STOP_NORTH_PROF(working_ctx_p,repair,0);
    rozofs_storcli_release_context(working_ctx_p);
    return;
}
/**
 * Apply the inverse transform to rebuild user data blocks in the buffer
 * starting at "data".
 *
 * Block "block_idx" of the request is rebuilt at
 * data + bbytes * (first_block_idx + block_idx), bbytes being
 * ROZOFS_BSIZE_BYTES(bsize). Blocks already in the ROZOFS_BLK_TRANSFORM_DONE
 * state are skipped. A block with a null timestamp is either a hole
 * (effective length equal to the block size: the block is zeroed) or the end
 * of file (effective length 0: the processing stops there). A block whose
 * effective length is shorter than the block size is zero padded.
 *
 * When all the blocks have been processed,
 * rozofs_storcli_check_block_2_repair() is called; it is not called when the
 * end of file is reached.
 *
 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout used to get the inverse/forward/safe counts and the projection size
 * @param bsize: block size as defined in enum ROZOFS_BSIZE_E
 * @param first_block_idx: index of the first block in the output buffer
 * @param number_of_blocks: number of blocks to transform
 * @param *block_ctx_p: per block context (state, timestamp, effective length)
 * @param *data: pointer to the buffer where the rebuilt data is written
 * @param *number_of_blocks_p: pointer to the array where the function returns number of blocks on which the transform was applied
 * @param *rozofs_storcli_prj_idx_table: pointer to the array used for storing the projections index for inverse process
 *
 * @return: always 0
 */
int rozofs_storcli_transform_inverse(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout,
                                     uint32_t bsize,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     rozofs_storcli_inverse_block_t *block_ctx_p,
                                     char *data,
                                     uint32_t *number_of_blocks_p,
                                     uint8_t *rozofs_storcli_prj_idx_table)
{
    uint8_t rozofs_inverse;
    uint8_t rozofs_forward;
    uint8_t rozofs_safe;
    uint32_t bbytes;
    int prj_size_in_msg;
    int blk;

    *number_of_blocks_p = 0;

    rozofs_inverse = rozofs_get_rozofs_inverse(layout);
    rozofs_forward = rozofs_get_rozofs_forward(layout);
    rozofs_safe = rozofs_get_rozofs_safe(layout);
    bbytes = ROZOFS_BSIZE_BYTES(bsize);
    prj_size_in_msg = rozofs_get_max_psize_in_msg(layout, bsize);

    for (blk = 0; blk < number_of_blocks; blk++) {
        rozofs_storcli_inverse_block_t *cur_p = &block_ctx_p[blk];

        if (cur_p->state == ROZOFS_BLK_TRANSFORM_DONE) {
            /* the transformation has already been done for that block */
            continue;
        }

        if (cur_p->timestamp == 0) {
            if (cur_p->effective_length == bbytes) {
                /*
                ** The file has no data there (hole in the file): clear the memory
                */
                ROZOFS_STORCLI_STATS(ROZOFS_STORCLI_EMPTY_READ);
                memset(data + (bbytes * (first_block_idx + blk)), 0, bbytes);
                cur_p->state = ROZOFS_BLK_TRANSFORM_DONE;
                continue;
            }
            if (cur_p->effective_length == 0) {
                /*
                ** End of file reached: report the number of blocks that precede it
                */
                cur_p->state = ROZOFS_BLK_TRANSFORM_DONE;
                *number_of_blocks_p = blk;
                return 0;
            }
        }

        /*
        ** Take care: the index used to address prj_ctx_p is NOT the real
        ** projection id. The projection id is found in the header of each
        ** bins, so the set of projections that has been read might carry
        ** different projection ids in their headers.
        */
        transform_inverse_proc(&rozofs_storcli_prj_idx_table[ROZOFS_SAFE_MAX_STORCLI*blk],
                               prj_ctx_p,
                               prj_size_in_msg,
                               layout,
                               bbytes,
                               first_block_idx,
                               blk,
                               data);

        /*
        ** The transform has been done for that block
        */
        cur_p->state = ROZOFS_BLK_TRANSFORM_DONE;

        /*
        ** Case of a block that is not full: zero the remaining part
        */
        if (cur_p->effective_length < bbytes) {
            char *raz_p = data + (bbytes * (first_block_idx + blk)) + cur_p->effective_length;

            memset(raz_p, 0, (bbytes - cur_p->effective_length));
        }
    }

    /*
    ** The inverse transform is finished for all the blocks
    */
    *number_of_blocks_p = number_of_blocks;

    /*
    ** Check whether a block should be repaired
    */
    rozofs_storcli_check_block_2_repair(prj_ctx_p,
                                        rozofs_inverse,
                                        rozofs_forward,
                                        rozofs_safe,
                                        prj_size_in_msg,
                                        number_of_blocks,
                                        block_ctx_p);
    return 0;
}
/**
 * Apply the forward transform to the blocks of the buffer starting at "data".
 *
 * Block "i" is read at data + i * bbytes, bbytes being
 * ROZOFS_BSIZE_BYTES(bsize). Its projections are written, for each forward
 * projection, in slot (first_block_idx + i) of prj_ctx_p[projection].bins, a
 * slot being rozofs_get_max_psize_in_msg(layout, bsize) bytes long. The
 * first_block_idx offset applies to the output bins buffers only, not to the
 * input buffer pointed by "data".
 *
 * For an empty data block (as reported by rozofs_data_block_check_empty())
 * only a cleared projection header is written: no footer and no transform.
 *
 * @param *prj_ctx_p: pointer to the working array of the projection
 * @param layout: layout used to get the forward count and the projection sizes
 * @param bsize: block size as defined in enum ROZOFS_BSIZE_E
 * @param first_block_idx: index of the first slot to fill in the bins buffers
 * @param number_of_blocks: number of blocks to transform
 * @param timestamp: date written in the header and in the footer of the projections
 * @param last_block_size: effective length of the last block
 * @param *data: pointer to the source data that must be transformed
 *
 * @return: always 0
 */
int rozofs_storcli_transform_forward(rozofs_storcli_projection_ctx_t *prj_ctx_p,
                                     uint8_t layout, uint32_t bsize,
                                     uint32_t first_block_idx,
                                     uint32_t number_of_blocks,
                                     uint64_t timestamp,
                                     uint16_t last_block_size,
                                     char *data)
{
    projection_t fwd_prj_tb[ROZOFS_SAFE_MAX_STORCLI];
    projection_t *projections = fwd_prj_tb; /* table of projections used to transform data */
    uint8_t rozofs_forward = rozofs_get_rozofs_forward(layout);
    uint32_t bbytes = ROZOFS_BSIZE_BYTES(bsize);
    int prj_size_in_msg = rozofs_get_max_psize_in_msg(layout, bsize);
    uint32_t blk;
    uint16_t prj;

    for (blk = 0; blk < number_of_blocks; blk++) {
        char *blk_data = data + (blk * bbytes);
        int empty_block = rozofs_data_block_check_empty(blk_data, bbytes);

        /*
        ** Prepare the header (and footer) of each projection of the block
        */
        for (prj = 0; prj < rozofs_forward; prj++) {
            rozofs_stor_bins_hdr_t *hdr_p;
            rozofs_stor_bins_footer_t *foot_p;

            /*
            ** Memory area where the transformed data must be stored
            */
            projections[prj].bins = prj_ctx_p[prj].bins
                                  + (prj_size_in_msg / sizeof(bin_t)) * (first_block_idx + blk);
            hdr_p = (rozofs_stor_bins_hdr_t *)projections[prj].bins;

            if (empty_block) {
                /* empty user data block: cleared header, nothing to transform */
                hdr_p->s.projection_id = 0;
                hdr_p->s.timestamp = 0;
                hdr_p->s.effective_length = 0;
                hdr_p->s.filler = 0;
                hdr_p->s.version = 0;
            } else {
                /*
                ** Fill the header of the projection. The effective size is
                ** the full block size except for the last block.
                */
                hdr_p->s.projection_id = prj;
                hdr_p->s.timestamp = timestamp;
                hdr_p->s.filler = 0;
                hdr_p->s.version = 0;
                if (blk == (number_of_blocks - 1)) {
                    hdr_p->s.effective_length = last_block_size;
                } else {
                    hdr_p->s.effective_length = bbytes;
                }

                /*
                ** Move to the first bin, then write the footer that follows
                ** the bins of the projection
                */
                projections[prj].bins += sizeof(rozofs_stor_bins_hdr_t) / sizeof(bin_t);
                foot_p = (rozofs_stor_bins_footer_t *)
                         (projections[prj].bins + rozofs_get_psizes(layout, bsize, prj));
                foot_p->timestamp = timestamp;
            }
        }

        /*
        ** Do not apply the transform for an empty block
        */
        if (!empty_block) {
            transform_forward_proc(layout, blk_data, bbytes, projections);
        }
    }
    return 0;
}
/*
** Execute a truncate request: select a storage for each forward projection,
** transform the last block when one is provided, then send a SP_TRUNCATE
** request for each forward projection.
**
** When a request cannot be sent, another storage is selected for that
** projection and the request is sent again. When no storage can be selected
** (initially or on retry), when a transmit buffer is missing, or when a
** projection is found in the ROZOFS_PRJ_WR_ERROR state after the send, an
** error reply is sent with rozofs_storcli_write_reply_error() and the working
** context is released. On the retry path the reported error is EIO, as for
** the initial storage selection.
**
** @param working_ctx_p: pointer to the root context associated with the top level write request
** @param data         : pointer to the data of the last block to truncate.
**                       When NULL, no transform is done and the bins length
**                       given to the requests is 0.
*/
void rozofs_storcli_truncate_req_processing_exec(rozofs_storcli_ctx_t *working_ctx_p, char * data)
{
    storcli_truncate_arg_t *storcli_truncate_rq_p = (storcli_truncate_arg_t*)&working_ctx_p->storcli_truncate_arg;
    uint8_t layout = storcli_truncate_rq_p->layout;
    uint32_t bsize = storcli_truncate_rq_p->bsize;
    uint8_t rozofs_forward;
    uint8_t rozofs_safe;
    uint8_t projection_id;
    int storage_idx;
    int error = 0;
    rozofs_storcli_lbg_prj_assoc_t *lbg_assoc_p = working_ctx_p->lbg_assoc_tb;
    rozofs_storcli_projection_ctx_t *prj_cxt_p = working_ctx_p->prj_ctx;

    rozofs_forward = rozofs_get_rozofs_forward(layout);
    rozofs_safe = rozofs_get_rozofs_safe(layout);

    /*
    ** Set the current state of each load balancing group belonging to the rozofs_safe group
    */
    for (storage_idx = 0; storage_idx < rozofs_safe; storage_idx++) {
        rozofs_storcli_lbg_prj_insert_lbg_state(lbg_assoc_p,
                                                storage_idx,
                                                NORTH_LBG_GET_STATE(lbg_assoc_p[storage_idx].lbg_id));
    }

    /*
    ** Now find out a selectable lbg_id for each projection
    */
    for (projection_id = 0; projection_id < rozofs_forward; projection_id++) {
        if (rozofs_storcli_select_storage_idx_for_write(working_ctx_p, rozofs_forward, rozofs_safe, projection_id) < 0) {
            /*
            ** There are not enough valid storages
            */
            STORCLI_ERR_PROF(truncate_sid_miss);
            error = EIO;
            goto fail;
        }
    }

    /*
    ** Let's transform the data to write
    */
    working_ctx_p->truncate_bins_len = 0;
    if (data != NULL) {
        STORCLI_START_KPI(storcli_kpi_transform_forward);
        rozofs_storcli_transform_forward(working_ctx_p->prj_ctx,
                                         layout,
                                         bsize,
                                         0,
                                         1,
                                         working_ctx_p->timestamp,
                                         storcli_truncate_rq_p->last_seg,
                                         data);
        STORCLI_STOP_KPI(storcli_kpi_transform_forward,0);
        working_ctx_p->truncate_bins_len = rozofs_get_max_psize_in_msg(layout, bsize);
    }

    /*
    ** We have enough storages, so initiate the transaction towards the
    ** storage for each projection
    */
    for (projection_id = 0; projection_id < rozofs_forward; projection_id++) {
        sp_truncate_arg_no_bins_t *request;
        sp_truncate_arg_no_bins_t truncate_prj_args;
        void *xmit_buf;
        int ret;

        xmit_buf = prj_cxt_p[projection_id].prj_buf;
        if (xmit_buf == NULL) {
            /*
            ** Fatal error since the resource control already took place
            */
            error = EIO;
            goto fail;
        }

        /*
        ** Send the request; each time the send fails, select another storage
        ** for that projection and build the request again.
        ** WARNING: it is assumed that the xmit buffer has not been released
        ** when the send fails.
        */
        for (;;) {
            uint32_t lbg_id;

            /*
            ** Fill partially the common header
            */
            request = &truncate_prj_args;
            request->cid = storcli_truncate_rq_p->cid;
            request->sid = (uint8_t) rozofs_storcli_lbg_prj_get_sid(working_ctx_p->lbg_assoc_tb,
                                                                    prj_cxt_p[projection_id].stor_idx);
            request->layout = layout;
            request->bsize = bsize;
            if (prj_cxt_p[projection_id].stor_idx >= rozofs_forward) {
                request->spare = 1;
            } else {
                request->spare = 0;
            }
            memcpy(request->dist_set, storcli_truncate_rq_p->dist_set, ROZOFS_SAFE_MAX_STORCLI*sizeof (uint8_t));
            memcpy(request->fid, storcli_truncate_rq_p->fid, sizeof (sp_uuid_t));
            request->proj_id = projection_id;
            request->bid = storcli_truncate_rq_p->bid;
            request->last_seg = storcli_truncate_rq_p->last_seg;
            request->last_timestamp = working_ctx_p->timestamp;
            request->len = working_ctx_p->truncate_bins_len;

            lbg_id = rozofs_storcli_lbg_prj_get_lbg(working_ctx_p->lbg_assoc_tb,
                                                    prj_cxt_p[projection_id].stor_idx);
            STORCLI_START_NORTH_PROF((&working_ctx_p->prj_ctx[projection_id]),truncate_prj,0);

            /*
            ** Caution: there might be a direct reply if an error is detected
            ** at the load balancing group level while attempting to send the
            ** RPC message (typically a disconnection of the TCP connection).
            ** As a consequence rozofs_storcli_truncate_req_processing_cbk()
            ** can be called before returning from
            ** rozofs_sorcli_send_rq_common().
            ** So anticipate the xmit state of the projection and lock the
            ** section to avoid a reply error before returning from the send,
            ** since the write context is released after the reply error.
            */
            working_ctx_p->write_ctx_lock = 1;
            prj_cxt_p[projection_id].prj_state = ROZOFS_PRJ_WR_IN_PRG;

            ret = rozofs_sorcli_send_rq_common(lbg_id,
                                               ROZOFS_TMR_GET(TMR_STORAGE_PROGRAM),
                                               STORAGE_PROGRAM,
                                               STORAGE_VERSION,
                                               SP_TRUNCATE,
                                               (xdrproc_t) xdr_sp_truncate_arg_no_bins_t,
                                               (caddr_t) request,
                                               xmit_buf,
                                               working_ctx_p->read_seqnum,
                                               (uint32_t) projection_id,
                                               working_ctx_p->truncate_bins_len,
                                               rozofs_storcli_truncate_req_processing_cbk,
                                               (void*)working_ctx_p);
            working_ctx_p->write_ctx_lock = 0;

            if (ret >= 0) {
                break;
            }

            /*
            ** The communication with the storage seems to be wrong (more than
            ** a TCP connection temporarily down): attempt to select a new storage
            */
            if (rozofs_storcli_select_storage_idx_for_write(working_ctx_p, rozofs_forward, rozofs_safe, projection_id) < 0) {
                /*
                ** Out of storage: too many storages are down.
                ** Report EIO as done for the initial selection; without it
                ** the error reply would carry errno 0.
                */
                error = EIO;
                goto fail;
            }
        }

        /*
        ** Check if the state has not been changed: it might be possible to get a direct error
        */
        if (prj_cxt_p[projection_id].prj_state == ROZOFS_PRJ_WR_ERROR) {
            error = prj_cxt_p[projection_id].errcode;
            goto fail;
        }
    }
    return;

fail:
    /*
    ** The truncate cannot be performed: reply with the error
    */
    rozofs_storcli_write_reply_error(working_ctx_p, error);
    /*
    ** Release the root transaction context
    */
    STORCLI_STOP_NORTH_PROF(working_ctx_p,truncate,0);
    rozofs_storcli_release_context(working_ctx_p);
    return;
}
/*
** Display the content of a chunk (bins) file, one line (or one hexadecimal
** dump when dump_data is set) per projection block.
**
** The size of a projection on disk is first determined: for a non spare file
** it depends on the projection identifier, which is found from the sid
** position in the distribution (header versions 2 and 1), from the prjid
** value when it differs from -1, or else from the header of the first block
** of the file. Then the file is read by groups of 32 projections and, for
** each block whose number lies within [first, last], the block number, the
** offsets, the projection id, the effective length, the CRC32 status and the
** time stamp are printed.
**
** The function relies on dump_data, prjid, first, last and buffer, which are
** defined outside of this function.
**
** The file descriptor is closed on every path once the file has been opened.
**
** @param fid        : FID of the file (used as seed of the CRC32 computation)
** @param path       : path of the chunk file to read
** @param hdr        : header file content (layout, block size, version, distribution)
** @param spare      : 0 when the file is not a spare file
** @param firstBlock : number of the first block stored in this chunk file
*/
void read_chunk_file(uuid_t fid, char * path, rozofs_stor_bins_file_hdr_vall_t * hdr, int spare, uint64_t firstBlock) {
  uint16_t rozofs_disk_psize;
  int fd;
  rozofs_stor_bins_hdr_t * pH;
  int nb_read;
  uint32_t bbytes = ROZOFS_BSIZE_BYTES(hdr->v0.bsize);
  char crc32_string[32];
  uint64_t offset;
  uint64_t bid;
  int idx;

  if (dump_data == 0) {
    printf ("+------------+------------------+------------+----+------+-------+--------------------------------------------\n");
    printf ("| %10s | %16s | %10s | %2s | %4s | %5s | %s\n", "block#","file offset", "prj offset", "pj", "size", "crc32", "date");
    printf ("+------------+------------------+------------+----+------+-------+--------------------------------------------\n");
  }

  // Open bins file
  fd = open(path, ROZOFS_ST_NO_CREATE_FILE_FLAG, ROZOFS_ST_BINS_FILE_MODE_RO);
  if (fd < 0) {
    printf("open(%s) %s\n",path,strerror(errno));
    return;
  }

  /*
  ** Retrieve the projection size on disk.
  ** Default is the maximum projection size, which is kept for a spare file.
  */
  rozofs_disk_psize = rozofs_get_max_psize_in_msg(hdr->v0.layout,hdr->v0.bsize);
  if (spare==0) {

    /* Header version 2. Find the sid in the distribution */
    if (hdr->v0.version == 2) {
      int fwd = rozofs_get_rozofs_forward(hdr->v2.layout);
      for (idx=0; idx< fwd;idx++) {
        if (hdr->v2.distrib[idx] != hdr->v2.sid) continue;
        rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v2.layout,hdr->v2.bsize,idx);
        break;
      }
    }
    /* Header version 1. Find the sid in the distribution */
    else if (hdr->v0.version == 1) {
      int fwd = rozofs_get_rozofs_forward(hdr->v1.layout);
      for (idx=0; idx< fwd;idx++) {
        if (hdr->v1.dist_set_current[idx] != hdr->v1.sid) continue;
        rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v1.layout,hdr->v1.bsize,idx);
        break;
      }
    }
    /* Projection id given as parameter */
    else if (prjid != -1) {
      rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v0.layout,hdr->v0.bsize,prjid);
    }
    /* Version 0 without projection given as parameter */
    else {
      // Read 1rst block
      nb_read = pread(fd, buffer, sizeof(rozofs_stor_bins_hdr_t), 0);
      if (nb_read<0) {
        printf("pread(%s) %s\n",path,strerror(errno));
        goto out;
      }
      pH = (rozofs_stor_bins_hdr_t*)buffer;
      if (pH->s.timestamp == 0) {
        printf("Can not tell projection id\n");
        goto out;
      }
      rozofs_disk_psize = rozofs_get_psizes_on_disk(hdr->v0.layout,hdr->v0.bsize,pH->s.projection_id);
    }
  }

  /*
  ** A null projection size can not be processed: the number of projections
  ** read is obtained by dividing by that size
  */
  if (rozofs_disk_psize == 0) {
    printf("Invalid projection size 0 for %s\n",path);
    goto out;
  }

  /*
  ** Where to start reading from
  */
  if (first > firstBlock) {
    offset = (first-firstBlock)*rozofs_disk_psize;
  }
  else {
    offset = 0;
  }

  /*
  ** Reading blocks until nothing more can be read
  */
  nb_read = 1;
  while (nb_read) {

    // Read nb_proj * (projection + header)
    nb_read = pread(fd, buffer, rozofs_disk_psize*32, offset);
    if (nb_read<0) {
      printf("pread(%s) %s\n",path,strerror(errno));
      goto out;
    }
    /* Number of complete projections that have been read */
    nb_read = (nb_read / rozofs_disk_psize);

    for (idx=0; idx<nb_read; idx++) {

      pH = (rozofs_stor_bins_hdr_t*) &buffer[idx*rozofs_disk_psize];

      bid = (offset/rozofs_disk_psize)+idx+firstBlock;

      if (bid < first) continue;
      if (bid > last) break;

      /*
      ** The CRC32 is stored in the filler field of the header and is
      ** computed with that field cleared. A null value means no CRC32.
      */
      uint32_t save_crc32 = pH->s.filler;
      pH->s.filler = 0;
      uint32_t crc32=0;

      if (save_crc32 == 0) {
        sprintf(crc32_string,"NONE");
      }
      else {
        crc32 = fid2crc32((uint32_t *)fid)+bid-firstBlock;
        crc32 = crc32c(crc32,(char *) pH, rozofs_disk_psize);
        if (crc32 != save_crc32) sprintf(crc32_string,"ERROR");
        else sprintf(crc32_string,"OK");
      }
      pH->s.filler = save_crc32;

      if (dump_data == 0) {
        printf ("| %10llu | %16llu | %10llu | %2d | %4d | %5s | %s\n",
                (long long unsigned int)bid,
                (long long unsigned int)bbytes * bid,
                (long long unsigned int)offset+(idx*rozofs_disk_psize),
                pH->s.projection_id,
                pH->s.effective_length,
                crc32_string,
                ts2string(pH->s.timestamp));
      }
      else {
        printf("_________________________________________________________________________________________\n");
        printf("Block# %llu / file offset %llu / projection offset %llu\n",
               (unsigned long long)bid,
               (unsigned long long)(bbytes * bid),
               (unsigned long long)(offset+(idx*rozofs_disk_psize)));
        printf("prj id %d / length %d / CRC %s / time stamp %s\n",
               pH->s.projection_id,pH->s.effective_length,crc32_string,
               ts2string(pH->s.timestamp));
        printf("_________________________________________________________________________________________\n");
        /* Nothing to dump for a block with a null projection id and time stamp */
        if ((pH->s.projection_id == 0)&&(pH->s.timestamp==0)) continue;
        hexdump(pH, (offset+(idx*rozofs_disk_psize)), rozofs_disk_psize);
      }
    }
    offset += (nb_read*rozofs_disk_psize);
  }

  if (dump_data == 0) {
    printf ("+------------+------------------+------------+----+------+-------+--------------------------------------------\n");
  }

out:
  close(fd);
}
int read_data_file() { int status = -1; uint64_t size = 0; int block_idx = 0; int idx =0; int count; rozofs_stor_bins_hdr_t * rozofs_bins_hdr_p; rozofs_stor_bins_footer_t * rozofs_bins_foot_p; char * loc_read_bins_p = NULL; int forward = rozofs_get_rozofs_forward(layout); // int inverse = rozofs_get_rozofs_inverse(layout); uint16_t disk_block_size; uint16_t max_block_size = (rozofs_get_max_psize(layout,bsize)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); char * p; int empty,valid; int prj_id; int nb_ts; uint64_t ts[32]; int ts_count[32]; // Allocate memory for reading loc_read_bins_p = xmalloc(max_block_size); for (idx=0; idx < nb_file; idx++) { if (strcmp(filename[idx],"NULL") == 0) { fd[idx] = -1; } else { fd[idx] = open(filename[idx],O_RDWR); if (fd < 0) { severe("Can not open file %s %s",filename[idx],strerror(errno)); goto out; } } } printf (" ______ __________ "); for (idx=0; idx < nb_file; idx++) printf (" __________________ ______ ____ "); printf ("\n"); printf("| %4s | %8s |","Blk","Offset"); for (idx=0; idx < nb_file; idx++) printf("| %16s | %4s | %2s |", "Time stamp", "lgth", "id"); printf ("\n"); printf ("|______|__________|"); for (idx=0; idx < nb_file; idx++) printf ("|__________________|______|____|"); printf ("\n"); if (block_number == -1) block_idx = 0; else block_idx = block_number; count = 1; empty = 0; while ( count ) { valid = 0; count = 0; nb_ts = 0; p = &LINE[0]; p += sprintf(p,"| %4d | %8d ",block_idx+firstBlock,(block_idx+firstBlock)*bbytes); for (idx=0; idx < nb_file; idx++) { if (fd[idx] == -1) { p += sprintf(p,"%32s"," "); continue; } if (idx >= forward) disk_block_size = rozofs_get_max_psize_in_msg(layout, bsize); else disk_block_size = rozofs_get_psizes_on_disk(layout,bsize,idx); size = pread(fd[idx],loc_read_bins_p,disk_block_size,block_idx*disk_block_size); if (size != disk_block_size) { p += sprintf(p,"|__________________|______|____|"); close(fd[idx]); fd[idx] = -1; } else { count++; 
rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t *)loc_read_bins_p; prj_id = rozofs_bins_hdr_p->s.projection_id; if (prj_id >= forward) { valid = 1; p += sprintf(p,"|| xxxxxxxxxxxxxxxx | xxxx | %2d ",prj_id); } else { disk_block_size = (rozofs_get_psizes(layout,bsize,prj_id)*sizeof (bin_t)); disk_block_size += sizeof (rozofs_stor_bins_hdr_t); rozofs_bins_foot_p = (rozofs_stor_bins_footer_t *) ((char*) rozofs_bins_hdr_p + disk_block_size); if (rozofs_bins_hdr_p->s.timestamp == 0) { p += sprintf(p,"|| %16d | .... | %2d ",0,prj_id); } else if (rozofs_bins_foot_p->timestamp != rozofs_bins_hdr_p->s.timestamp) { valid = 1; p += sprintf(p,"|--%16.16llu----------%2d-", (long long unsigned int)rozofs_bins_hdr_p->s.timestamp, prj_id); } else if (rozofs_bins_hdr_p->s.timestamp == 0) { p += sprintf(p,"|| %16d | .... | %2d ",0,prj_id); } else { valid = 1; p += sprintf(p,"|| %16llu | %4d | %2d ", (unsigned long long)rozofs_bins_hdr_p->s.timestamp, rozofs_bins_hdr_p->s.effective_length, rozofs_bins_hdr_p->s.projection_id); int tsidx; for (tsidx=0; tsidx< nb_ts; tsidx++) { if (ts[tsidx] == rozofs_bins_hdr_p->s.timestamp) { ts_count[tsidx]++; break; } } if (tsidx == nb_ts) { ts[tsidx] = rozofs_bins_hdr_p->s.timestamp; ts_count[tsidx] = 1; nb_ts++; } } } } } if (valid) { if (empty) { printf("... 
%d blocks...\n",empty); empty = 0; } int best=-1,tsidx; for (tsidx=0; tsidx< nb_ts; tsidx++) { if (ts_count[tsidx] > best) best = ts_count[tsidx]; } printf("%s%s\n",LINE, (best<forward)?"<<<<----":"|"); } else { empty++; } block_idx++; if (block_number!=-1) break; } printf ("|______|__________|\n"); if (block_number!=-1) { for (idx=0; idx < nb_file; idx++) { if (idx < forward) { disk_block_size = (rozofs_get_psizes(layout,bsize,idx)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); } else { disk_block_size = (rozofs_get_max_psize(layout,bsize)*sizeof (bin_t)) + sizeof (rozofs_stor_bins_hdr_t) + sizeof (rozofs_stor_bins_footer_t); } size = pread(fd[idx],loc_read_bins_p,disk_block_size,block_number*disk_block_size); if (size != disk_block_size) { printf("Can not read block %d of %s\n", block_number, filename[idx]); continue; } FILE * f; char fname[128]; sprintf(fname,"block_%d_dist_%d.txt", block_number, idx); f = fopen(fname,"w"); if (f == NULL) { printf ("Can not create file %s",fname); continue; } printf("- %s\n",fname); fprintf(f,"%s Block %d size %d\n", filename[idx], block_number, disk_block_size); rozofs_bins_hdr_p = (rozofs_stor_bins_hdr_t *)loc_read_bins_p; fprintf(f,"Block header : TS %llu SZ %d PRJ %d CRC32 0x%x\n", (long long unsigned int)rozofs_bins_hdr_p->s.timestamp, rozofs_bins_hdr_p->s.effective_length, rozofs_bins_hdr_p->s.projection_id, rozofs_bins_hdr_p->s.filler); rozofs_bins_foot_p = (rozofs_stor_bins_footer_t *) (loc_read_bins_p + disk_block_size); rozofs_bins_foot_p--; fprintf(f,"Block footer : TS %llu %s\n", (long long unsigned int)rozofs_bins_foot_p->timestamp, (rozofs_bins_hdr_p->s.timestamp==rozofs_bins_foot_p->timestamp)?"":" !!!!!!"); hexdump(f,loc_read_bins_p, 0, disk_block_size); fclose(f); } } status = 0; out: // This spare file used to exist but is not needed any more for (idx=0; idx < nb_file; idx++) { if (fd[idx] != -1) close(fd[idx]); } if (loc_read_bins_p != NULL) { //free(loc_read_bins_p); 
loc_read_bins_p = NULL; } return status; }