/*
 * Write the storage class table to the given stream.
 *
 * Output layout:
 *   - 1 byte: MASKORGROUP
 *   - for every class with a non-empty name (ids 1..firstneverused-1):
 *       id:16 nleng:8 admin_only:8 create_mode:8 arch_delay:16
 *       create_labelscnt:8 keep_labelscnt:8 arch_labelscnt:8
 *       name[nleng]
 *       create, keep and arch label masks (MASKORGROUP 32-bit words per label)
 *   - 10 zero bytes as the end marker
 *
 * Returns 0x16 without writing anything when fd is NULL, 0xFF after a failed
 * write (a "write error" notice is logged), and 0 on success.
 */
uint8_t sclass_store(bio *fd) {
	uint8_t record[10+MAXSCLASSNLENG+3*(1+9*4*MASKORGROUP)];
	uint8_t *wp;
	uint16_t sid,lab,grp;
	int32_t reclen;

	if (fd==NULL) {
		return 0x16;
	}

	/* stream header: number of 32-bit words stored per label mask */
	wp = record;
	put8bit(&wp,MASKORGROUP);
	if (bio_write(fd,record,1)!=1) {
		syslog(LOG_NOTICE,"write error");
		return 0xFF;
	}

	for (sid=1 ; sid<firstneverused ; sid++) {
		if ((sclasstab[sid].nleng)>0) {
			/* fixed header (10 bytes) + name + all label masks */
			reclen = 10+sclasstab[sid].nleng+(sclasstab[sid].create_labelscnt+sclasstab[sid].keep_labelscnt+sclasstab[sid].arch_labelscnt)*4*MASKORGROUP;

			wp = record;
			put16bit(&wp,sid);
			put8bit(&wp,sclasstab[sid].nleng);
			put8bit(&wp,sclasstab[sid].admin_only);
			put8bit(&wp,sclasstab[sid].create_mode);
			put16bit(&wp,sclasstab[sid].arch_delay);
			put8bit(&wp,sclasstab[sid].create_labelscnt);
			put8bit(&wp,sclasstab[sid].keep_labelscnt);
			put8bit(&wp,sclasstab[sid].arch_labelscnt);

			memcpy(wp,sclasstab[sid].name,sclasstab[sid].nleng);
			wp += sclasstab[sid].nleng;

			for (lab=0 ; lab<sclasstab[sid].create_labelscnt ; lab++) {
				for (grp=0 ; grp<MASKORGROUP ; grp++) {
					put32bit(&wp,sclasstab[sid].create_labelmasks[lab][grp]);
				}
			}
			for (lab=0 ; lab<sclasstab[sid].keep_labelscnt ; lab++) {
				for (grp=0 ; grp<MASKORGROUP ; grp++) {
					put32bit(&wp,sclasstab[sid].keep_labelmasks[lab][grp]);
				}
			}
			for (lab=0 ; lab<sclasstab[sid].arch_labelscnt ; lab++) {
				for (grp=0 ; grp<MASKORGROUP ; grp++) {
					put32bit(&wp,sclasstab[sid].arch_labelmasks[lab][grp]);
				}
			}

			if (bio_write(fd,record,reclen)!=reclen) {
				syslog(LOG_NOTICE,"write error");
				return 0xFF;
			}
		}
	}

	/* end marker: an all-zero 10-byte record header */
	memset(record,0,10);
	if (bio_write(fd,record,10)!=10) {
		syslog(LOG_NOTICE,"write error");
		return 0xFF;
	}
	return 0;
}
/*
 * Reset the download state kept in the connection record and fill in an
 * MLTOMA_REGISTER packet (7 bytes of payload):
 *   format:8 (=1)  VERSMAJ:16  VERSMID:8  VERSMIN:8  Timeout:16
 */
void masterconn_sendregister(masterconn *eptr) {
	uint8_t *wptr;

	/* no download in progress and no open files */
	eptr->logfd = NULL;
	eptr->metafd = -1;
	eptr->downloading = 0;

	wptr = masterconn_createpacket(eptr,MLTOMA_REGISTER,1+4+2);
	put8bit(&wptr,1);
	/* version is sent as 16+8+8 bits */
	put16bit(&wptr,VERSMAJ);
	put8bit(&wptr,VERSMID);
	put8bit(&wptr,VERSMIN);
	put16bit(&wptr,Timeout);
}
void masterproxy_getlocation(uint8_t *masterinfo) { const uint8_t *rptr = masterinfo+10; if (lsock>=0 && get32bit(&rptr)>=0x00010618) { // use proxy only when master version is greater than or equal to 1.6.24 put32bit(&masterinfo,proxyhost); put16bit(&masterinfo,proxyport); } }
/*
 * Reset the download state kept in the connection record and fill in a
 * SLATOMA_REGISTER packet (7 bytes of payload):
 *   format:8 (=1)  VERSMAJ:16  VERSMID:8  VERSMIN:8  Timeout:16
 *
 * When the packet cannot be created the connection mode is set to KILL and
 * nothing is sent.
 */
void masterconn_sendregister(serventry *eptr) {
	uint8_t *wptr;

	/* no download in progress and no open files */
	eptr->logfd = NULL;
	eptr->metafd = -1;
	eptr->downloading = 0;

	wptr = masterconn_createpacket(eptr,SLATOMA_REGISTER,1+4+2);
	if (wptr!=NULL) {
		put8bit(&wptr,1);
		put16bit(&wptr,VERSMAJ);
		put8bit(&wptr,VERSMID);
		put8bit(&wptr,VERSMIN);
		put16bit(&wptr,Timeout);
	} else {
		eptr->mode = KILL;
	}
}
/*
 * Handle a module-info request (logged as CLTOAN_MODULE_INFO).
 *
 * The request must carry no payload; any other length is logged and the
 * connection state is set to CLOSE. The reply is an ANTOCL_MODULE_INFO packet
 * with 21 bytes of payload:
 *   module type:8  VERSMAJ:16  VERSMID:8  VERSMIN:8
 *   chunkserver id:16  meta id:64  master ip:32  master port:16
 */
void csserv_module_info(csserventry *eptr,const uint8_t *data,uint32_t length) {
	uint8_t *reply;

	(void)data;	/* the request has no body */

	if (length==0) {
		reply = csserv_create_packet(eptr,ANTOCL_MODULE_INFO,21);
		put8bit(&reply,MODULE_TYPE_CHUNKSERVER);
		put16bit(&reply,VERSMAJ);
		put8bit(&reply,VERSMID);
		put8bit(&reply,VERSMIN);
		put16bit(&reply,masterconn_getcsid());
		put64bit(&reply,masterconn_getmetaid());
		put32bit(&reply,masterconn_getmasterip());
		put16bit(&reply,masterconn_getmasterport());
	} else {
		syslog(LOG_NOTICE,"CLTOAN_MODULE_INFO - wrong size (%"PRIu32"/0)",length);
		eptr->state = CLOSE;
	}
}
void csserv_idlejob_finished(uint8_t status,void *ijp) { idlejob *ij = (idlejob*)ijp; csserventry *eptr = ij->eptr; uint8_t *ptr; if (eptr) { switch (ij->op) { case IJ_GET_CHUNK_BLOCKS: ptr = csserv_create_packet(eptr,CSTOAN_CHUNK_BLOCKS,8+4+2+1); put64bit(&ptr,ij->chunkid); put32bit(&ptr,ij->version); if (status==MFS_STATUS_OK) { memcpy(ptr,ij->buff,2); ptr+=2; } else { put16bit(&ptr,0); } put8bit(&ptr,status); break; case IJ_GET_CHUNK_CHECKSUM: if (status!=MFS_STATUS_OK) { ptr = csserv_create_packet(eptr,CSTOAN_CHUNK_CHECKSUM,8+4+1); } else { ptr = csserv_create_packet(eptr,CSTOAN_CHUNK_CHECKSUM,8+4+4); } put64bit(&ptr,ij->chunkid); put32bit(&ptr,ij->version); if (status!=MFS_STATUS_OK) { put8bit(&ptr,status); } else { memcpy(ptr,ij->buff,4); } break; case IJ_GET_CHUNK_CHECKSUM_TAB: if (status!=MFS_STATUS_OK) { ptr = csserv_create_packet(eptr,CSTOAN_CHUNK_CHECKSUM_TAB,8+4+1); } else { ptr = csserv_create_packet(eptr,CSTOAN_CHUNK_CHECKSUM_TAB,8+4+4096); } put64bit(&ptr,ij->chunkid); put32bit(&ptr,ij->version); if (status!=MFS_STATUS_OK) { put8bit(&ptr,status); } else { memcpy(ptr,ij->buff,4096); } break; } *(ij->prev) = ij->next; if (ij->next) { ij->next->prev = ij->prev; } } free(ij); }
//封装向Master发送注册信息的packet,存入服务队列 //调用:masterconn_connected() void masterconn_sendregister(masterconn *eptr) { uint8_t *buff; uint32_t chunks,myip; uint16_t myport; uint64_t usedspace,totalspace; uint64_t tdusedspace,tdtotalspace; uint32_t chunkcount,tdchunkcount; myip = csserv_getlistenip(); myport = csserv_getlistenport(); hdd_get_space(&usedspace,&totalspace,&chunkcount,&tdusedspace,&tdtotalspace,&tdchunkcount); chunks = hdd_get_chunks_count(); buff = masterconn_create_attached_packet(eptr,CSTOMA_REGISTER,1+4+4+2+2+8+8+4+8+8+4+chunks*(8+4)); if (buff==NULL) { eptr->mode=KILL; hdd_get_chunks_data(NULL); // unlock return; } put8bit(&buff,4); /* put32bit(&buff,VERSION): */ put16bit(&buff,VERSMAJ); put8bit(&buff,VERSMID); put8bit(&buff,VERSMIN); put32bit(&buff,myip); put16bit(&buff,myport); put16bit(&buff,Timeout); put64bit(&buff,usedspace); put64bit(&buff,totalspace); put32bit(&buff,chunkcount); put64bit(&buff,tdusedspace); put64bit(&buff,tdtotalspace); put32bit(&buff,tdchunkcount); if (chunks>0) { hdd_get_chunks_data(buff); } else { hdd_get_chunks_data(NULL); // unlock } }
/*
 * Reset the download state and fill in an MLTOMA_REGISTER packet.
 *
 * Two formats are produced:
 *   format 2 (currentlogversion>0):
 *     format:8 VERSMAJ:16 VERSMID:8 VERSMIN:8 Timeout:16 currentlogversion:64
 *   format 1 (otherwise):
 *     format:8 VERSMAJ:16 VERSMID:8 VERSMIN:8 Timeout:16
 */
void masterconn_sendregister(masterconn *eptr) {
	uint8_t *wptr;
	uint8_t withlogversion;

	eptr->downloading = 0;
	eptr->metafd = -1;

	withlogversion = (currentlogversion>0)?1:0;

	wptr = masterconn_createpacket(eptr,MLTOMA_REGISTER,withlogversion?(1+4+2+8):(1+4+2));
	put8bit(&wptr,withlogversion?2:1);
	/* fields shared by both formats */
	put16bit(&wptr,VERSMAJ);
	put8bit(&wptr,VERSMID);
	put8bit(&wptr,VERSMIN);
	put16bit(&wptr,Timeout);
	if (withlogversion) {
		put64bit(&wptr,currentlogversion);
	}
}
/*
 * Attach a CSTOMA_REGISTER packet to the connection.
 *
 * Old mode (eptr->new_register_mode == 0), format 50:
 *   format:8 VERSHEX:32 ip:32 port:16 timeout:16   (timeout is 10 when Timeout is 0)
 *
 * New mode, format 60:
 *   format:8 [digest:16 bytes] VERSHEX:32 ip:32 port:16 Timeout:16 csid:16
 *   usedspace:64 totalspace:64 chunkcount:32
 *   tdusedspace:64 tdtotalspace:64 tdchunkcount:32
 * The 16-byte MD5 digest is present only when a random blob was received
 * (eptr->gotrndblob) and AuthCode is set; it is computed over
 * rndblob[0..15] + AuthCode + rndblob[16..31].
 */
void masterconn_sendregister(masterconn *eptr) {
	uint8_t *wptr;
	uint32_t listenip;
	uint16_t listenport;
	uint64_t used,total;
	uint64_t tdused,tdtotal;
	uint32_t chunkcnt,tdchunkcnt;
	md5ctx digestctx;

	listenip = csserv_getlistenip();
	listenport = csserv_getlistenport();

	if (!eptr->new_register_mode) {
#ifdef MFSDEBUG
		syslog(LOG_NOTICE,"register ver. 5 - init");
#endif
		wptr = masterconn_create_attached_packet(eptr,CSTOMA_REGISTER,1+4+4+2+2);
		put8bit(&wptr,50);
		put32bit(&wptr,VERSHEX);
		put32bit(&wptr,listenip);
		put16bit(&wptr,listenport);
		if (Timeout>0) {
			put16bit(&wptr,Timeout);
		} else {
			put16bit(&wptr,10);
		}
		return;
	}

#ifdef MFSDEBUG
	syslog(LOG_NOTICE,"register ver. 6 - init + space info");
#endif
	hdd_get_space(&used,&total,&chunkcnt,&tdused,&tdtotal,&tdchunkcnt);

	if (eptr->gotrndblob && AuthCode) {
		/* authenticated variant - 16 extra bytes for the digest */
		wptr = masterconn_create_attached_packet(eptr,CSTOMA_REGISTER,1+16+4+4+2+2+2+8+8+4+8+8+4);
		put8bit(&wptr,60);
		md5_init(&digestctx);
		md5_update(&digestctx,eptr->rndblob,16);
		md5_update(&digestctx,(const uint8_t *)AuthCode,strlen(AuthCode));
		md5_update(&digestctx,eptr->rndblob+16,16);
		md5_final(wptr,&digestctx);
		wptr += 16;
	} else {
		wptr = masterconn_create_attached_packet(eptr,CSTOMA_REGISTER,1+4+4+2+2+2+8+8+4+8+8+4);
		put8bit(&wptr,60);
	}

	put32bit(&wptr,VERSHEX);
	put32bit(&wptr,listenip);
	put16bit(&wptr,listenport);
	put16bit(&wptr,Timeout);
	put16bit(&wptr,masterconn_getcsid());
	put64bit(&wptr,used);
	put64bit(&wptr,total);
	put32bit(&wptr,chunkcnt);
	put64bit(&wptr,tdused);
	put64bit(&wptr,tdtotal);
	put32bit(&wptr,tdchunkcnt);
}
uint32_t sclass_list_entries(uint8_t *buff,uint8_t longmode) { uint32_t sclassid; uint32_t ret; uint32_t i,og; ret = 0; for (sclassid=1 ; sclassid<firstneverused ; sclassid++) { if (sclasstab[sclassid].nleng>0) { if (buff==NULL) { ret += sclasstab[sclassid].nleng+1; if (longmode&1) { ret += (sclasstab[sclassid].create_labelscnt + sclasstab[sclassid].keep_labelscnt + sclasstab[sclassid].arch_labelscnt)*4*MASKORGROUP+7; } } else { put8bit(&buff,sclasstab[sclassid].nleng); memcpy(buff,sclasstab[sclassid].name,sclasstab[sclassid].nleng); buff+=sclasstab[sclassid].nleng; if (longmode&1) { put8bit(&buff,sclasstab[sclassid].admin_only); put8bit(&buff,sclasstab[sclassid].create_mode); put16bit(&buff,sclasstab[sclassid].arch_delay); put8bit(&buff,sclasstab[sclassid].create_labelscnt); put8bit(&buff,sclasstab[sclassid].keep_labelscnt); put8bit(&buff,sclasstab[sclassid].arch_labelscnt); for (i=0 ; i<sclasstab[sclassid].create_labelscnt ; i++) { for (og=0 ; og<MASKORGROUP ; og++) { put32bit(&buff,sclasstab[sclassid].create_labelmasks[i][og]); } } for (i=0 ; i<sclasstab[sclassid].keep_labelscnt ; i++) { for (og=0 ; og<MASKORGROUP ; og++) { put32bit(&buff,sclasstab[sclassid].keep_labelmasks[i][og]); } } for (i=0 ; i<sclasstab[sclassid].arch_labelscnt ; i++) { for (og=0 ; og<MASKORGROUP ; og++) { put32bit(&buff,sclasstab[sclassid].arch_labelmasks[i][og]); } } } } } } return ret; }
/*
 * Handle a version request (logged as ANTOAN_GET_VERSION).
 *
 * The request is either empty or carries a 32-bit message id; any other
 * length is logged and the connection state is set to CLOSE.
 * The ANTOAN_VERSION reply contains:
 *   [msgid:32 - only when the request had one]
 *   VERSMAJ:16 VERSMID:8 VERSMIN:8  version string (VERSSTR, no terminator)
 */
void csserv_get_version(csserventry *eptr,const uint8_t *data,uint32_t length) {
	static const char vstring[] = VERSSTR;
	size_t vleng;
	uint32_t msgid = 0;
	uint8_t *reply;

	if (length!=0 && length!=4) {
		syslog(LOG_NOTICE,"ANTOAN_GET_VERSION - wrong size (%"PRIu32"/4|0)",length);
		eptr->state = CLOSE;
		return;
	}

	vleng = strlen(vstring);
	if (length==4) {
		msgid = get32bit(&data);
	}
	reply = csserv_create_packet(eptr,ANTOAN_VERSION,((length==4)?4:0)+4+vleng);
	if (length==4) {
		/* echo the message id back to the requester */
		put32bit(&reply,msgid);
	}
	put16bit(&reply,VERSMAJ);
	put8bit(&reply,VERSMID);
	put8bit(&reply,VERSMIN);
	memcpy(reply,vstring,vleng);
}
uint32_t job_replicate_simple(void (*callback)(uint8_t status,void *extra),void *extra,uint64_t chunkid,uint32_t version,uint32_t ip,uint16_t port) { jobpool* jp = globalpool; chunk_rp_args *args; uint8_t *ptr; ptr = malloc(sizeof(chunk_rp_args)+18); passert(ptr); args = (chunk_rp_args*)ptr; ptr += sizeof(chunk_rp_args); args->chunkid = chunkid; args->version = version; args->srccnt = 1; args->xormasks[0] = UINT32_C(0x88888888); args->xormasks[1] = UINT32_C(0x44444444); args->xormasks[2] = UINT32_C(0x22222222); args->xormasks[3] = UINT32_C(0x11111111); put64bit(&ptr,chunkid); put32bit(&ptr,version); put32bit(&ptr,ip); put16bit(&ptr,port); return job_new(jp,OP_REPLICATE,args,callback,extra); }
static int master_register(int rfd,uint32_t cuid) { uint32_t i; const uint8_t *rptr; uint8_t *wptr,regbuff[8+73]; wptr = regbuff; put32bit(&wptr,CLTOMA_FUSE_REGISTER); put32bit(&wptr,73); memcpy(wptr,FUSE_REGISTER_BLOB_ACL,64); wptr+=64; put8bit(&wptr,REGISTER_TOOLS); put32bit(&wptr,cuid); put16bit(&wptr,VERSMAJ); put8bit(&wptr,VERSMID); put8bit(&wptr,VERSMIN); if (tcpwrite(rfd,regbuff,8+73)!=8+73) { printf("register to master: send error\n"); return -1; } if (tcpread(rfd,regbuff,9)!=9) { printf("register to master: receive error\n"); return -1; } rptr = regbuff; i = get32bit(&rptr); if (i!=MATOCL_FUSE_REGISTER) { printf("register to master: wrong answer (type)\n"); return -1; } i = get32bit(&rptr); if (i!=1) { printf("register to master: wrong answer (length)\n"); return -1; } if (*rptr) { printf("register to master: %s\n",mfsstrerr(*rptr)); return -1; } return 0; }
/*
 * Remember the chunkserver id and meta id and store them in the file
 * "chunkserverid.mfs" (opened relative to the current working directory).
 *
 * Nothing happens when both values already match the stored ones. A zero
 * csid / metaid never overwrites the current value. The file holds 10 bytes:
 * ChunkServerID:16 MetaID:64. Open and write errors are only logged.
 */
static inline void masterconn_setcsid(uint16_t csid,uint64_t metaid) {
	uint8_t filedata[10];
	uint8_t *out;
	int idfd;

	if (ChunkServerID==csid && MetaID==metaid) {
		return;	/* nothing changed */
	}
	if (csid>0) {
		ChunkServerID = csid;
	}
	if (metaid>0) {
		MetaID = metaid;
	}

	out = filedata;
	put16bit(&out,ChunkServerID);
	put64bit(&out,MetaID);

	idfd = open("chunkserverid.mfs",O_CREAT | O_TRUNC | O_RDWR,0666);
	if (idfd<0) {
		syslog(LOG_WARNING,"can't store chunkserver id (open error)");
		return;
	}
	if (write(idfd,filedata,10)!=10) {
		syslog(LOG_WARNING,"can't store chunkserver id (write error)");
	}
	close(idfd);
}
int main(void) { uint64_t buff[2]; uint8_t *wp; const uint8_t *rp; uint32_t i; mfstest_init(); wp = (uint8_t*)buff; for (i=0 ; i<16 ; i++) { wp[i] = ((15-i)*0x10)+i; } mfstest_start(getbit_uneven); rp = (uint8_t*)buff; mfstest_assert_uint8_eq(get8bit(&rp),0xF0); mfstest_assert_uint16_eq(get16bit(&rp),0xE1D2); mfstest_assert_uint32_eq(get32bit(&rp),0xC3B4A596); mfstest_assert_uint64_eq(get64bit(&rp),0x8778695A4B3C2D1E); mfstest_end(); mfstest_start(getbit_even); rp = (uint8_t*)buff; mfstest_assert_uint64_eq(get64bit(&rp),0xF0E1D2C3B4A59687); mfstest_assert_uint32_eq(get32bit(&rp),0x78695A4B); mfstest_assert_uint16_eq(get16bit(&rp),0x3C2D); mfstest_assert_uint8_eq(get8bit(&rp),0x1E); mfstest_end(); wp = (uint8_t*)buff; for (i=0; i<16 ; i++) { wp[i] = 0; } put8bit(&wp,0xF0); put16bit(&wp,0xE1D2); put32bit(&wp,0xC3B4A596); put64bit(&wp,0x8778695A4B3C2D1E); put8bit(&wp,0x0F); mfstest_start(putbit_uneven); rp = (uint8_t*)buff; for (i=0 ; i<16 ; i++) { mfstest_assert_uint8_eq(rp[i],((15-i)*0x10)+i); } mfstest_end(); wp = (uint8_t*)buff; for (i=0; i<16 ; i++) { wp[i] = 0; } put64bit(&wp,0xF0E1D2C3B4A59687); put32bit(&wp,0x78695A4B); put16bit(&wp,0x3C2D); put8bit(&wp,0x1E); put8bit(&wp,0x0F); mfstest_start(putbit_even); rp = (uint8_t*)buff; for (i=0 ; i<16 ; i++) { mfstest_assert_uint8_eq(rp[i],((15-i)*0x10)+i); } mfstest_end(); mfstest_return(); }
/*
 * Produce detailed information about all defined storage classes (those with
 * a non-empty name).
 *
 * buff == NULL - nothing is written; the number of bytes needed is returned.
 * buff != NULL - the data is written to buff and 0 is returned.
 *
 * Layout: server count:16, then for every class
 *   id:8 nleng:8 name[nleng] files:32 directories:32
 *   six 64-bit chunk counters in the order
 *     keep-under, arch-under, keep-exact, arch-exact, keep-over, arch-over
 *   admin_only:8 create_mode:8 arch_delay:16
 *   for create, keep and arch: can-be-fulfilled:8 labelscnt:8
 *   for every create, keep and arch label: MASKORGROUP 32-bit mask words
 *     followed by the 16-bit result of matocsserv_servers_with_labelsets()
 */
uint32_t sclass_info(uint8_t *buff) {
	uint32_t total,sid,lab,grp;
	uint64_t keepunder,keepexact,keepover;
	uint64_t archunder,archexact,archover;

	if (buff==NULL) {
		total = 2;	/* server count */
		for (sid=1 ; sid<firstneverused ; sid++) {
			if (sclasstab[sid].nleng>0) {
				/* fixed part: 20 bytes + three pairs of 64-bit counters */
				total += 20 + 3 * 16;
				total += sclasstab[sid].nleng;
				/* every label: mask words + 16-bit server count */
				total += (((uint32_t)sclasstab[sid].create_labelscnt) + ((uint32_t)sclasstab[sid].keep_labelscnt) + ((uint32_t)sclasstab[sid].arch_labelscnt)) * ( MASKORGROUP * 4 + 2 );
			}
		}
		return total;
	}

	chunk_labelset_can_be_fulfilled(0,NULL); // init server list
	put16bit(&buff,matocsserv_servers_count());

	for (sid=1 ; sid<firstneverused ; sid++) {
		if (sclasstab[sid].nleng>0) {
			put8bit(&buff,sid);
			put8bit(&buff,sclasstab[sid].nleng);
			memcpy(buff,sclasstab[sid].name,sclasstab[sid].nleng);
			buff += sclasstab[sid].nleng;
			put32bit(&buff,sclasstab[sid].files);
			put32bit(&buff,sclasstab[sid].directories);

			chunk_sclass_counters(sid,0,sclasstab[sid].keep_labelscnt,&keepunder,&keepexact,&keepover);
			chunk_sclass_counters(sid,1,sclasstab[sid].arch_labelscnt,&archunder,&archexact,&archover);
			put64bit(&buff,keepunder);
			put64bit(&buff,archunder);
			put64bit(&buff,keepexact);
			put64bit(&buff,archexact);
			put64bit(&buff,keepover);
			put64bit(&buff,archover);

			put8bit(&buff,sclasstab[sid].admin_only);
			put8bit(&buff,sclasstab[sid].create_mode);
			put16bit(&buff,sclasstab[sid].arch_delay);

			put8bit(&buff,chunk_labelset_can_be_fulfilled(sclasstab[sid].create_labelscnt,sclasstab[sid].create_labelmasks));
			put8bit(&buff,sclasstab[sid].create_labelscnt);
			put8bit(&buff,chunk_labelset_can_be_fulfilled(sclasstab[sid].keep_labelscnt,sclasstab[sid].keep_labelmasks));
			put8bit(&buff,sclasstab[sid].keep_labelscnt);
			put8bit(&buff,chunk_labelset_can_be_fulfilled(sclasstab[sid].arch_labelscnt,sclasstab[sid].arch_labelmasks));
			put8bit(&buff,sclasstab[sid].arch_labelscnt);

			for (lab=0 ; lab<sclasstab[sid].create_labelscnt ; lab++) {
				for (grp=0 ; grp<MASKORGROUP ; grp++) {
					put32bit(&buff,sclasstab[sid].create_labelmasks[lab][grp]);
				}
				put16bit(&buff,matocsserv_servers_with_labelsets(sclasstab[sid].create_labelmasks[lab]));
			}
			for (lab=0 ; lab<sclasstab[sid].keep_labelscnt ; lab++) {
				for (grp=0 ; grp<MASKORGROUP ; grp++) {
					put32bit(&buff,sclasstab[sid].keep_labelmasks[lab][grp]);
				}
				put16bit(&buff,matocsserv_servers_with_labelsets(sclasstab[sid].keep_labelmasks[lab]));
			}
			for (lab=0 ; lab<sclasstab[sid].arch_labelscnt ; lab++) {
				for (grp=0 ; grp<MASKORGROUP ; grp++) {
					put32bit(&buff,sclasstab[sid].arch_labelmasks[lab][grp]);
				}
				put16bit(&buff,matocsserv_servers_with_labelsets(sclasstab[sid].arch_labelmasks[lab]));
			}
		}
	}
	return 0;
}
/* main working thread | glock:UNLOCKED */ void* write_worker(void *arg) { uint32_t z1,z2,z3; uint8_t *data; int fd; int i; struct pollfd pfd[2]; uint32_t sent,rcvd; uint8_t recvbuff[21]; uint8_t sendbuff[32]; #ifdef HAVE_WRITEV struct iovec siov[2]; #endif uint8_t pipebuff[1024]; uint8_t *wptr; const uint8_t *rptr; uint32_t reccmd; uint32_t recleng; uint64_t recchunkid; uint32_t recwriteid; uint8_t recstatus; #ifdef WORKER_DEBUG uint32_t partialblocks; uint32_t bytessent; char debugchain[200]; uint32_t cl; #endif const uint8_t *cp,*cpe; uint32_t chainip[10]; uint16_t chainport[10]; uint16_t chainelements; uint16_t chindx; uint32_t ip; uint16_t port; uint32_t srcip; uint64_t mfleng; uint64_t maxwroffset; uint64_t chunkid; uint32_t version; uint32_t nextwriteid; const uint8_t *chain; uint32_t chainsize; const uint8_t *csdata; uint32_t csdatasize; uint8_t westatus; uint8_t wrstatus; int status; uint8_t waitforstatus; uint8_t havedata; struct timeval start,now,lastrcvd,lrdiff; uint8_t cnt; inodedata *id; cblock *cb,*rcb; // inodedata *id; chainelements = 0; (void)arg; for (;;) { for (cnt=0 ; cnt<chainelements ; cnt++) { csdb_writedec(chainip[cnt],chainport[cnt]); } chainelements=0; // get next job queue_get(jqueue,&z1,&z2,&data,&z3); id = (inodedata*)data; pthread_mutex_lock(&glock); if (id->datachainhead) { chindx = id->datachainhead->chindx; } else { syslog(LOG_WARNING,"writeworker got inode with no data to write !!!"); chindx = 0xFFFF; status = EINVAL; // this should never happen, so status is not important - just anything } status = id->status; pthread_mutex_unlock(&glock); if (status) { write_job_end(id,status,0); continue; } // syslog(LOG_NOTICE,"file: %"PRIu32", index: %"PRIu16" - debug1",id->inode,chindx); // get chunk data from master wrstatus = fs_writechunk(id->inode,chindx,&mfleng,&chunkid,&version,&csdata,&csdatasize); if (wrstatus!=STATUS_OK) { syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16" - fs_writechunk returns status 
%d",id->inode,chindx,wrstatus); if (wrstatus!=ERROR_LOCKED) { if (wrstatus==ERROR_ENOENT) { write_job_end(id,EBADF,0); } else if (wrstatus==ERROR_QUOTA) { write_job_end(id,EDQUOT,0); } else if (wrstatus==ERROR_NOSPACE) { write_job_end(id,ENOSPC,0); } else { id->trycnt++; if (id->trycnt>=maxretries) { if (wrstatus==ERROR_NOCHUNKSERVERS) { write_job_end(id,ENOSPC,0); } else { write_job_end(id,EIO,0); } } else { write_delayed_enqueue(id,1+(id->trycnt<30)?(id->trycnt/3):10); } } } else { write_delayed_enqueue(id,1+(id->trycnt<30)?(id->trycnt/3):10); } continue; // get next job } if (csdata==NULL || csdatasize==0) { syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - there are no valid copies",id->inode,chindx,chunkid,version); id->trycnt+=6; if (id->trycnt>=maxretries) { write_job_end(id,ENXIO,0); } else { write_delayed_enqueue(id,60); } continue; } cp = csdata; cpe = csdata+csdatasize; while (cp<cpe && chainelements<10) { chainip[chainelements] = get32bit(&cp); chainport[chainelements] = get16bit(&cp); csdb_writeinc(chainip[chainelements],chainport[chainelements]); chainelements++; } chain = csdata; ip = get32bit(&chain); port = get16bit(&chain); chainsize = csdatasize-6; gettimeofday(&start,NULL); /* if (csdatasize>CSDATARESERVE) { csdatasize = CSDATARESERVE; } memcpy(wrec->csdata,csdata,csdatasize); wrec->csdatasize=csdatasize; while (csdatasize>=6) { tmpip = get32bit(&csdata); tmpport = get16bit(&csdata); csdatasize-=6; csdb_writeinc(tmpip,tmpport); } */ // make connection to cs srcip = fs_getsrcip(); cnt=5; while (cnt>0) { fd = tcpsocket(); if (fd<0) { syslog(LOG_WARNING,"can't create tcp socket: %m"); cnt=0; } if (srcip) { if (tcpnumbind(fd,srcip,0)<0) { syslog(LOG_WARNING,"can't bind socket to given ip: %m"); tcpclose(fd); fd=-1; break; } } if (tcpnumtoconnect(fd,ip,port,200)<0) { cnt--; if (cnt==0) { syslog(LOG_WARNING,"can't connect to (%08"PRIX32":%"PRIu16"): %m",ip,port); } tcpclose(fd); fd=-1; } else { cnt=0; } } if 
(fd<0) { fs_writeend(chunkid,id->inode,0); id->trycnt++; if (id->trycnt>=maxretries) { write_job_end(id,EIO,0); } else { write_delayed_enqueue(id,1+(id->trycnt<30)?(id->trycnt/3):10); } continue; } if (tcpnodelay(fd)<0) { syslog(LOG_WARNING,"can't set TCP_NODELAY: %m"); } #ifdef WORKER_DEBUG partialblocks=0; bytessent=0; #endif nextwriteid=1; pfd[0].fd = fd; pfd[1].fd = id->pipe[0]; rcvd = 0; sent = 0; waitforstatus=1; havedata=1; wptr = sendbuff; put32bit(&wptr,CUTOCS_WRITE); put32bit(&wptr,12+chainsize); put64bit(&wptr,chunkid); put32bit(&wptr,version); // debug: syslog(LOG_NOTICE,"writeworker: init packet prepared"); cb = NULL; status = 0; wrstatus = STATUS_OK; lastrcvd.tv_sec = 0; do { gettimeofday(&now,NULL); if (lastrcvd.tv_sec==0) { lastrcvd = now; } else { lrdiff = now; if (lrdiff.tv_usec<lastrcvd.tv_usec) { lrdiff.tv_sec--; lrdiff.tv_usec+=1000000; } lrdiff.tv_sec -= lastrcvd.tv_sec; lrdiff.tv_usec -= lastrcvd.tv_usec; if (lrdiff.tv_sec>=2) { syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was timed out (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1); break; } } if (now.tv_usec<start.tv_usec) { now.tv_sec--; now.tv_usec+=1000000; } now.tv_sec -= start.tv_sec; now.tv_usec -= start.tv_usec; if (havedata==0 && now.tv_sec<5 && waitforstatus<5) { pthread_mutex_lock(&glock); if (cb==NULL) { if (id->datachainhead) { if (id->datachainhead->to-id->datachainhead->from==65536 || waitforstatus<=1) { cb = id->datachainhead; havedata=1; } } } else { if (cb->next) { if (cb->next->chindx==chindx) { if (cb->next->to-cb->next->from==65536 || waitforstatus<=1) { cb = cb->next; havedata=1; } } } else { id->waitingworker=1; } } if (havedata==1) { cb->writeid = nextwriteid++; // debug: syslog(LOG_NOTICE,"writeworker: data packet prepared (writeid:%"PRIu32",pos:%"PRIu16")",cb->writeid,cb->pos); 
waitforstatus++; wptr = sendbuff; put32bit(&wptr,CUTOCS_WRITE_DATA); put32bit(&wptr,24+(cb->to-cb->from)); put64bit(&wptr,chunkid); put32bit(&wptr,cb->writeid); put16bit(&wptr,cb->pos); put16bit(&wptr,cb->from); put32bit(&wptr,cb->to-cb->from); put32bit(&wptr,mycrc32(0,cb->data+cb->from,cb->to-cb->from)); #ifdef WORKER_DEBUG if (cb->to-cb->from<65536) { partialblocks++; } bytessent+=(cb->to-cb->from); #endif sent=0; } pthread_mutex_unlock(&glock); } pfd[0].events = POLLIN | (havedata?POLLOUT:0); pfd[0].revents = 0; pfd[1].events = POLLIN; pfd[1].revents = 0; if (poll(pfd,2,100)<0) { /* correct timeout - in msec */ syslog(LOG_WARNING,"writeworker: poll error: %m"); status=EIO; break; } if (pfd[1].revents&POLLIN) { // used just to break poll - so just read all data from pipe to empty it i = read(id->pipe[0],pipebuff,1024); } if (pfd[0].revents&POLLIN) { i = read(fd,recvbuff+rcvd,21-rcvd); if (i==0) { // connection reset by peer ,读取文件头错误 syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was reset by peer (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1); status=EIO; break; } gettimeofday(&lastrcvd,NULL); rcvd+=i; if (rcvd==21) { rptr = recvbuff; reccmd = get32bit(&rptr); recleng = get32bit(&rptr); recchunkid = get64bit(&rptr); recwriteid = get32bit(&rptr); recstatus = get8bit(&rptr); if (reccmd!=CSTOCU_WRITE_STATUS || recleng!=13) { syslog(LOG_WARNING,"writeworker: got unrecognized packet from chunkserver (cmd:%"PRIu32",leng:%"PRIu32")",reccmd,recleng); status=EIO; break; } if (recchunkid!=chunkid) { syslog(LOG_WARNING,"writeworker: got unexpected packet (expected chunkdid:%"PRIu64",packet chunkid:%"PRIu64")",chunkid,recchunkid); status=EIO; break; } if (recstatus!=STATUS_OK) { syslog(LOG_WARNING,"writeworker: write error: %"PRIu8,recstatus); wrstatus=recstatus; break; } // debug: 
syslog(LOG_NOTICE,"writeworker: received status ok for writeid:%"PRIu32,recwriteid); if (recwriteid>0) { pthread_mutex_lock(&glock); for (rcb = id->datachainhead ; rcb && rcb->writeid!=recwriteid ; rcb=rcb->next) {} if (rcb==NULL) { syslog(LOG_WARNING,"writeworker: got unexpected status (writeid:%"PRIu32")",recwriteid); pthread_mutex_unlock(&glock); status=EIO; break; } if (rcb==cb) { // current block,cb为当前块儿指针 // debug: syslog(LOG_NOTICE,"writeworker: received status for current block"); if (havedata) { // got status ok before all data had been sent - error syslog(LOG_WARNING,"writeworker: got status OK before all data have been sent"); pthread_mutex_unlock(&glock); status=EIO; break; } else { cb = NULL; } } if (rcb->prev) {//将rcb所指块儿从链表中取出 rcb->prev->next = rcb->next; } else { id->datachainhead = rcb->next; } if (rcb->next) { rcb->next->prev = rcb->prev; } else { id->datachaintail = rcb->prev; } maxwroffset = (((uint64_t)(chindx))<<26)+(((uint32_t)(rcb->pos))<<16)+rcb->to; if (maxwroffset>mfleng) { mfleng=maxwroffset; } write_cb_release(rcb);// id->cacheblocks--; if (id->cachewaiting>0) { pthread_cond_broadcast(&(id->cachecond)); } pthread_mutex_unlock(&glock); } waitforstatus--; rcvd=0; } } if (havedata && (pfd[0].revents&POLLOUT)) { if (cb==NULL) { // havedata==1 && cb==NULL means sending first packet (CUTOCS_WRITE) if (sent<20) { #ifdef HAVE_WRITEV //将多个数据存储在一起,将驻留在两个或更多的不连接的缓冲区中的数据一次写出去 if (chainsize>0) { siov[0].iov_base = sendbuff+sent; siov[0].iov_len = 20-sent; siov[1].iov_base = (char*)chain; // discard const (safe - because it's used in writev) siov[1].iov_len = chainsize; i = writev(fd,siov,2); } else { #endif i = write(fd,sendbuff+sent,20-sent); #ifdef HAVE_WRITEV } #endif } else { i = write(fd,chain+(sent-20),chainsize-(sent-20)); } if (i<0) { syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was reset by peer (unfinished writes: %"PRIu8"; try counter: 
%"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1); status=EIO; break; } sent+=i; if (sent==20+chainsize) { havedata=0; } } else { if (sent<32) { #ifdef HAVE_WRITEV siov[0].iov_base = sendbuff+sent; siov[0].iov_len = 32-sent; siov[1].iov_base = cb->data+cb->from; siov[1].iov_len = cb->to-cb->from; i = writev(fd,siov,2); #else i = write(fd,sendbuff+sent,32-sent); #endif } else { i = write(fd,cb->data+cb->from+(sent-32),cb->to-cb->from-(sent-32)); } if (i<0) { syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was reset by peer (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1); status=EIO; break; } sent+=i; if (sent==32+cb->to-cb->from) { havedata=0; } } } } while (waitforstatus>0 && now.tv_sec<10);//////////////////// id->waitingworker=0; tcpclose(fd); #ifdef WORKER_DEBUG gettimeofday(&now,NULL); if (now.tv_usec<start.tv_usec) { now.tv_sec--; now.tv_usec+=1000000; } now.tv_sec -= start.tv_sec; now.tv_usec -= start.tv_usec; cl=0; for (cnt=0 ; cnt<chainelements ; cnt++) { cl+=snprintf(debugchain+cl,200-cl,"%u.%u.%u.%u:%u->",(chainip[cnt]>>24)&255,(chainip[cnt]>>16)&255,(chainip[cnt]>>8)&255,chainip[cnt]&255,chainport[cnt]); } if (cl>=2) { debugchain[cl-2]='\0'; } syslog(LOG_NOTICE,"worker %lu sent %"PRIu32" blocks (%"PRIu32" partial) of chunk %016"PRIX64"_%08"PRIX32", received status for %"PRIu32" blocks (%"PRIu32" lost), bw: %.6lfMB ( %"PRIu32" B / %.0lf us ), chain: %s",(unsigned long)arg,nextwriteid-1,partialblocks,chunkid,version,nextwriteid-1-waitforstatus,waitforstatus,(double)bytessent/((double)(now.tv_sec)*1000000+(double)(now.tv_usec)),bytessent,((double)(now.tv_sec)*1000000+(double)(now.tv_usec)),debugchain); #endif for (cnt=0 ; cnt<10 ; cnt++) { westatus = fs_writeend(chunkid,id->inode,mfleng); if (westatus!=STATUS_OK) { usleep(100000+(10000<<cnt)); } else { 
break; } } if (westatus!=STATUS_OK) { write_job_end(id,ENXIO,0); } else if (status!=0 || wrstatus!=STATUS_OK) { if (wrstatus!=STATUS_OK) { // convert MFS status to OS errno if (wrstatus==ERROR_NOSPACE) { status=ENOSPC; } else { status=EIO; } } id->trycnt++; if (id->trycnt>=maxretries) { write_job_end(id,status,0); } else { write_job_end(id,0,1+(id->trycnt<30)?(id->trycnt/3):10); } } else { read_inode_ops(id->inode); write_job_end(id,0,0); } } }