Ejemplo n.º 1
0
int mainserv_connect(uint32_t fwdip,uint16_t fwdport,uint32_t timeout) {
	int fwdsock;
	fwdsock = tcpsocket();
	if (fwdsock<0) {
		mfs_errlog(LOG_WARNING,"create socket, error");
		return -1;
	}
	if (tcpnonblock(fwdsock)<0) {
		mfs_errlog(LOG_WARNING,"set nonblock, error");
		tcpclose(fwdsock);
		return -1;
	}
	if (tcpnumtoconnect(fwdsock,fwdip,fwdport,timeout)<0) {
		mfs_arg_errlog(LOG_WARNING,"connect to %u.%u.%u.%u:%u failed, error",(fwdip>>24)&0xFF,(fwdip>>16)&0xFF,(fwdip>>8)&0xFF,fwdip&0xFF,fwdport);
		tcpclose(fwdsock);
		return -1;
	}
Ejemplo n.º 2
0
/* main working thread | glock:UNLOCKED
 *
 * Endless worker loop. Each iteration:
 *   1. takes one inode job from jqueue,
 *   2. asks the master (fs_writechunk) for chunk id, version and the chain of
 *      chunkservers (csdata: consecutive 6-byte entries - 32-bit ip, 16-bit port),
 *   3. connects to the first chunkserver of the chain and sends a CUTOCS_WRITE
 *      packet followed by the rest of the chain,
 *   4. streams queued cache blocks as CUTOCS_WRITE_DATA packets and collects
 *      CSTOCU_WRITE_STATUS answers, releasing every acknowledged block,
 *   5. reports the result to the master (fs_writeend) and finishes or
 *      reschedules the job (write_job_end / write_delayed_enqueue).
 *
 * arg is not used for the work itself (only printed when WORKER_DEBUG is set).
 * The function takes glock on its own, so it has to be entered with glock
 * unlocked. It never leaves the for(;;) loop, so it never returns.
 */
void* write_worker(void *arg) {
	uint32_t z1,z2,z3;
	uint8_t *data;
	int fd;
	int i;
	struct pollfd pfd[2];
	uint32_t sent,rcvd;
	uint8_t recvbuff[21];
	uint8_t sendbuff[32];
#ifdef HAVE_WRITEV
	struct iovec siov[2];
#endif
	uint8_t pipebuff[1024];
	uint8_t *wptr;
	const uint8_t *rptr;

	uint32_t reccmd;
	uint32_t recleng;
	uint64_t recchunkid;
	uint32_t recwriteid;
	uint8_t recstatus;

#ifdef WORKER_DEBUG
	uint32_t partialblocks;
	uint32_t bytessent;
	char debugchain[200];
	uint32_t cl;
#endif

	const uint8_t *cp,*cpe;
	uint32_t chainip[10];
	uint16_t chainport[10];
	uint16_t chainelements;

	uint16_t chindx;
	uint32_t ip;
	uint16_t port;
	uint32_t srcip;
	uint64_t mfleng;
	uint64_t maxwroffset;
	uint64_t chunkid;
	uint32_t version;
	uint32_t nextwriteid;
	const uint8_t *chain;
	uint32_t chainsize;
	const uint8_t *csdata;
	uint32_t csdatasize;
	uint8_t westatus;
	uint8_t wrstatus;
	int status;
	uint8_t waitforstatus;
	uint8_t havedata;
	struct timeval start,now,lastrcvd,lrdiff;

	uint8_t cnt;

	inodedata *id;
	cblock *cb,*rcb;

	chainelements = 0;

	(void)arg;
	for (;;) {
		// undo csdb_writeinc calls made for the chain used by the previous job
		for (cnt=0 ; cnt<chainelements ; cnt++) {
			csdb_writedec(chainip[cnt],chainport[cnt]);
		}
		chainelements=0;

		// get next job
		queue_get(jqueue,&z1,&z2,&data,&z3);
		id = (inodedata*)data;

		pthread_mutex_lock(&glock);
		status = id->status;
		if (id->datachainhead) {
			chindx = id->datachainhead->chindx;
		} else {
			syslog(LOG_WARNING,"writeworker got inode with no data to write !!!");
			chindx = 0xFFFF;
			// this should never happen, so the exact code is not important;
			// it only has to be non-zero (and must not be overwritten) so the job ends below
			if (status==0) {
				status = EINVAL;
			}
		}
		pthread_mutex_unlock(&glock);

		if (status) {
			write_job_end(id,status,0);
			continue;
		}

		// syslog(LOG_NOTICE,"file: %"PRIu32", index: %"PRIu16" - debug1",id->inode,chindx);
		// get chunk data from master
		wrstatus = fs_writechunk(id->inode,chindx,&mfleng,&chunkid,&version,&csdata,&csdatasize);
		if (wrstatus!=STATUS_OK) {
			syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16" - fs_writechunk returns status %d",id->inode,chindx,wrstatus);
			if (wrstatus!=ERROR_LOCKED) {
				if (wrstatus==ERROR_ENOENT) {
					write_job_end(id,EBADF,0);
				} else if (wrstatus==ERROR_QUOTA) {
					write_job_end(id,EDQUOT,0);
				} else if (wrstatus==ERROR_NOSPACE) {
					write_job_end(id,ENOSPC,0);
				} else {
					id->trycnt++;
					if (id->trycnt>=maxretries) {
						if (wrstatus==ERROR_NOCHUNKSERVERS) {
							write_job_end(id,ENOSPC,0);
						} else {
							write_job_end(id,EIO,0);
						}
					} else {
						// retry delay: 1 + trycnt/3, capped at 1 + 10
						// (the conditional has to be parenthesized - '+' binds tighter than '?:')
						write_delayed_enqueue(id,1+((id->trycnt<30)?(id->trycnt/3):10));
					}
				}
			} else {
				// chunk locked - try again later without increasing the try counter
				write_delayed_enqueue(id,1+((id->trycnt<30)?(id->trycnt/3):10));
			}
			continue;	// get next job
		}
		// at least one complete (ip,port) entry is needed; anything shorter would make
		// the reads below run past the buffer and 'csdatasize-6' wrap around
		if (csdata==NULL || csdatasize<6) {
			syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - there are no valid copies",id->inode,chindx,chunkid,version);
			id->trycnt+=6;
			if (id->trycnt>=maxretries) {
				write_job_end(id,ENXIO,0);
			} else {
				write_delayed_enqueue(id,60);
			}
			continue;
		}
		// remember the chain (max 10 elements) and mark every element as "being written to";
		// only complete 6-byte entries are consumed
		cp = csdata;
		cpe = csdata+csdatasize;
		while (cpe-cp>=6 && chainelements<10) {
			chainip[chainelements] = get32bit(&cp);
			chainport[chainelements] = get16bit(&cp);
			csdb_writeinc(chainip[chainelements],chainport[chainelements]);
			chainelements++;
		}

		// first entry is the server we connect to; the remainder is forwarded in the CUTOCS_WRITE packet
		chain = csdata;
		ip = get32bit(&chain);
		port = get16bit(&chain);
		chainsize = csdatasize-6;
		gettimeofday(&start,NULL);

		// make connection to cs (up to 5 connect attempts)
		srcip = fs_getsrcip();
		cnt=5;
		while (cnt>0) {
			fd = tcpsocket();
			if (fd<0) {
				syslog(LOG_WARNING,"can't create tcp socket: %m");
				break;	// no descriptor - nothing to bind/connect/close; handled by 'fd<0' below
			}
			if (srcip) {
				if (tcpnumbind(fd,srcip,0)<0) {
					syslog(LOG_WARNING,"can't bind socket to given ip: %m");
					tcpclose(fd);
					fd=-1;
					break;
				}
			}
			if (tcpnumtoconnect(fd,ip,port,200)<0) {
				cnt--;
				if (cnt==0) {
					syslog(LOG_WARNING,"can't connect to (%08"PRIX32":%"PRIu16"): %m",ip,port);
				}
				tcpclose(fd);
				fd=-1;
			} else {
				cnt=0;
			}
		}
		if (fd<0) {
			fs_writeend(chunkid,id->inode,0);
			id->trycnt++;
			if (id->trycnt>=maxretries) {
				write_job_end(id,EIO,0);
			} else {
				write_delayed_enqueue(id,1+((id->trycnt<30)?(id->trycnt/3):10));
			}
			continue;
		}
		if (tcpnodelay(fd)<0) {
			syslog(LOG_WARNING,"can't set TCP_NODELAY: %m");
		}

#ifdef WORKER_DEBUG
		partialblocks=0;
		bytessent=0;
#endif
		nextwriteid=1;

		pfd[0].fd = fd;
		pfd[1].fd = id->pipe[0];
		rcvd = 0;
		sent = 0;
		waitforstatus=1;	// one status is expected for the initial CUTOCS_WRITE packet
		havedata=1;
		// initial packet header: 8 bytes (cmd,length) + 12 bytes (chunkid,version) = 20 bytes, chain follows
		wptr = sendbuff;
		put32bit(&wptr,CUTOCS_WRITE);
		put32bit(&wptr,12+chainsize);
		put64bit(&wptr,chunkid);
		put32bit(&wptr,version);
// debug:	syslog(LOG_NOTICE,"writeworker: init packet prepared");
		cb = NULL;

		status = 0;
		wrstatus = STATUS_OK;

		lastrcvd.tv_sec = 0;

		do {
			gettimeofday(&now,NULL);

			// give up when nothing has been received from the chunkserver for 2 seconds
			if (lastrcvd.tv_sec==0) {
				lastrcvd = now;
			} else {
				lrdiff = now;
				if (lrdiff.tv_usec<lastrcvd.tv_usec) {
					lrdiff.tv_sec--;
					lrdiff.tv_usec+=1000000;
				}
				lrdiff.tv_sec -= lastrcvd.tv_sec;
				lrdiff.tv_usec -= lastrcvd.tv_usec;
				if (lrdiff.tv_sec>=2) {
					syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was timed out (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1);
					break;
				}
			}

			// from here on 'now' holds the time elapsed since 'start'
			if (now.tv_usec<start.tv_usec) {
				now.tv_sec--;
				now.tv_usec+=1000000;
			}
			now.tv_sec -= start.tv_sec;
			now.tv_usec -= start.tv_usec;

			// pick next block to send: only during first 5 seconds and with fewer than 5 unconfirmed packets
			if (havedata==0 && now.tv_sec<5 && waitforstatus<5) {
				pthread_mutex_lock(&glock);
				if (cb==NULL) {
					if (id->datachainhead) {
						// send full blocks at once; partial block only when (almost) nothing is in flight
						if (id->datachainhead->to-id->datachainhead->from==65536 || waitforstatus<=1) {
							cb = id->datachainhead;
							havedata=1;
						}
					}
				} else {
					if (cb->next) {
						if (cb->next->chindx==chindx) {
							if (cb->next->to-cb->next->from==65536 || waitforstatus<=1) {
								cb = cb->next;
								havedata=1;
							}
						}
					} else {
						id->waitingworker=1;
					}
				}
				if (havedata==1) {
					cb->writeid = nextwriteid++;
// debug:				syslog(LOG_NOTICE,"writeworker: data packet prepared (writeid:%"PRIu32",pos:%"PRIu16")",cb->writeid,cb->pos);
					waitforstatus++;
					// data packet header: 8 bytes (cmd,length) + 24 bytes = 32 bytes, block data follows
					wptr = sendbuff;
					put32bit(&wptr,CUTOCS_WRITE_DATA);
					put32bit(&wptr,24+(cb->to-cb->from));
					put64bit(&wptr,chunkid);
					put32bit(&wptr,cb->writeid);
					put16bit(&wptr,cb->pos);
					put16bit(&wptr,cb->from);
					put32bit(&wptr,cb->to-cb->from);
					put32bit(&wptr,mycrc32(0,cb->data+cb->from,cb->to-cb->from));
#ifdef WORKER_DEBUG
					if (cb->to-cb->from<65536) {
						partialblocks++;
					}
					bytessent+=(cb->to-cb->from);
#endif
					sent=0;
				}
				pthread_mutex_unlock(&glock);
			}

			pfd[0].events = POLLIN | (havedata?POLLOUT:0);
			pfd[0].revents = 0;
			pfd[1].events = POLLIN;
			pfd[1].revents = 0;
			if (poll(pfd,2,100)<0) { /* correct timeout - in msec */
				syslog(LOG_WARNING,"writeworker: poll error: %m");
				status=EIO;
				break;
			}
			if (pfd[1].revents&POLLIN) {	// used just to break poll - so just read all data from pipe to empty it
				i = read(id->pipe[0],pipebuff,1024);
			}
			if (pfd[0].revents&POLLIN) {
				i = read(fd,recvbuff+rcvd,21-rcvd);
				if (i==0) { 	// connection reset by peer (failed while reading the status packet)
					syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was reset by peer (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1);
					status=EIO;
					break;
				}
				if (i<0) {
					// a negative result must never be added to the unsigned 'rcvd' counter
					if (errno!=EINTR && errno!=EAGAIN) {
						syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: read from (%08"PRIX32":%"PRIu16") error: %m (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1);
						status=EIO;
						break;
					}
					i = 0;	// transient condition - nothing has been read this time
				} else {
					gettimeofday(&lastrcvd,NULL);
				}
				rcvd+=i;
				if (rcvd==21) {	// complete status packet: 8 bytes header + 13 bytes (chunkid,writeid,status)
					rptr = recvbuff;
					reccmd = get32bit(&rptr);
					recleng = get32bit(&rptr);
					recchunkid = get64bit(&rptr);
					recwriteid = get32bit(&rptr);
					recstatus = get8bit(&rptr);
					if (reccmd!=CSTOCU_WRITE_STATUS ||  recleng!=13) {
						syslog(LOG_WARNING,"writeworker: got unrecognized packet from chunkserver (cmd:%"PRIu32",leng:%"PRIu32")",reccmd,recleng);
						status=EIO;
						break;
					}
					if (recchunkid!=chunkid) {
						syslog(LOG_WARNING,"writeworker: got unexpected packet (expected chunkdid:%"PRIu64",packet chunkid:%"PRIu64")",chunkid,recchunkid);
						status=EIO;
						break;
					}
					if (recstatus!=STATUS_OK) {
						syslog(LOG_WARNING,"writeworker: write error: %"PRIu8,recstatus);
						wrstatus=recstatus;
						break;
					}
// debug:				syslog(LOG_NOTICE,"writeworker: received status ok for writeid:%"PRIu32,recwriteid);
					if (recwriteid>0) {	// writeid 0 is the answer to the initial CUTOCS_WRITE packet
						pthread_mutex_lock(&glock);
						for (rcb = id->datachainhead ; rcb && rcb->writeid!=recwriteid ; rcb=rcb->next) {}
						if (rcb==NULL) {
							syslog(LOG_WARNING,"writeworker: got unexpected status (writeid:%"PRIu32")",recwriteid);
							pthread_mutex_unlock(&glock);
							status=EIO;
							break;
						}
						if (rcb==cb) {	// current block (cb points to the block being sent)
// debug:						syslog(LOG_NOTICE,"writeworker: received status for current block");
							if (havedata) {	// got status ok before all data had been sent - error
								syslog(LOG_WARNING,"writeworker: got status OK before all data have been sent");
								pthread_mutex_unlock(&glock);
								status=EIO;
								break;
							} else {
								cb = NULL;
							}
						}
						// unlink the confirmed block from the inode's data chain
						if (rcb->prev) {
							rcb->prev->next = rcb->next;
						} else {
							id->datachainhead = rcb->next;
						}
						if (rcb->next) {
							rcb->next->prev = rcb->prev;
						} else {
							id->datachaintail = rcb->prev;
						}
						// file offset just after the confirmed data: chunk index * 64MiB + block position * 64KiB + end offset in block
						maxwroffset = (((uint64_t)(chindx))<<26)+(((uint32_t)(rcb->pos))<<16)+rcb->to;
						if (maxwroffset>mfleng) {
							mfleng=maxwroffset;
						}
						write_cb_release(rcb);
						id->cacheblocks--;
						if (id->cachewaiting>0) {
							pthread_cond_broadcast(&(id->cachecond));
						}
						pthread_mutex_unlock(&glock);
					}
					waitforstatus--;
					rcvd=0;
				}
			}
			if (havedata && (pfd[0].revents&POLLOUT)) {
				if (cb==NULL) {	// havedata==1 && cb==NULL means sending first packet (CUTOCS_WRITE)
					if (sent<20) {
#ifdef HAVE_WRITEV	// gather write: header and chain live in two separate buffers, send them with one call
						if (chainsize>0) {
							siov[0].iov_base = sendbuff+sent;
							siov[0].iov_len = 20-sent;
							siov[1].iov_base = (char*)chain;	// discard const (safe - because it's used in writev)
							siov[1].iov_len = chainsize;
							i = writev(fd,siov,2);
						} else {
#endif
							i = write(fd,sendbuff+sent,20-sent);
#ifdef HAVE_WRITEV
						}
#endif
					} else {
						i = write(fd,chain+(sent-20),chainsize-(sent-20));
					}
					if (i<0) {
						syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was reset by peer (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1);
						status=EIO;
						break;
					}
					sent+=i;
					if (sent==20+chainsize) {
						havedata=0;
					}
				} else {
					if (sent<32) {
#ifdef HAVE_WRITEV
						siov[0].iov_base = sendbuff+sent;
						siov[0].iov_len = 32-sent;
						siov[1].iov_base = cb->data+cb->from;
						siov[1].iov_len = cb->to-cb->from;
						i = writev(fd,siov,2);
#else
						i = write(fd,sendbuff+sent,32-sent);
#endif
					} else {
						i = write(fd,cb->data+cb->from+(sent-32),cb->to-cb->from-(sent-32));
					}
					if (i<0) {
						syslog(LOG_WARNING,"file: %"PRIu32", index: %"PRIu16", chunk: %"PRIu64", version: %"PRIu32" - writeworker: connection with (%08"PRIX32":%"PRIu16") was reset by peer (unfinished writes: %"PRIu8"; try counter: %"PRIu32")",id->inode,chindx,chunkid,version,ip,port,waitforstatus,id->trycnt+1);
						status=EIO;
						break;
					}
					sent+=i;
					if (sent==32+cb->to-cb->from) {
						havedata=0;
					}
				}
			}
		} while (waitforstatus>0 && now.tv_sec<10);	// until everything is confirmed, but no longer than 10 seconds


		id->waitingworker=0;

		tcpclose(fd);

#ifdef WORKER_DEBUG
		gettimeofday(&now,NULL);
		if (now.tv_usec<start.tv_usec) {
			now.tv_sec--;
			now.tv_usec+=1000000;
		}
		now.tv_sec -= start.tv_sec;
		now.tv_usec -= start.tv_usec;

		// build "ip:port->ip:port" description of the chain; snprintf returns the length it
		// WANTED to write, so 'cl' has to be checked against the buffer size on every step
		cl=0;
		debugchain[0]='\0';
		for (cnt=0 ; cnt<chainelements && cl<sizeof(debugchain) ; cnt++) {
			i = snprintf(debugchain+cl,sizeof(debugchain)-cl,"%u.%u.%u.%u:%u->",(chainip[cnt]>>24)&255,(chainip[cnt]>>16)&255,(chainip[cnt]>>8)&255,chainip[cnt]&255,chainport[cnt]);
			if (i<0) {
				break;
			}
			cl+=i;
		}
		if (cl>=sizeof(debugchain)) {	// output has been truncated
			cl = sizeof(debugchain)-1;
		}
		if (cl>=2) {	// cut off trailing "->"
			debugchain[cl-2]='\0';
		}
		syslog(LOG_NOTICE,"worker %lu sent %"PRIu32" blocks (%"PRIu32" partial) of chunk %016"PRIX64"_%08"PRIX32", received status for %"PRIu32" blocks (%"PRIu32" lost), bw: %.6lfMB ( %"PRIu32" B / %.0lf us ), chain: %s",(unsigned long)arg,nextwriteid-1,partialblocks,chunkid,version,nextwriteid-1-waitforstatus,waitforstatus,(double)bytessent/((double)(now.tv_sec)*1000000+(double)(now.tv_usec)),bytessent,((double)(now.tv_sec)*1000000+(double)(now.tv_usec)),debugchain);
#endif

		// tell master that writing to this chunk is finished (up to 10 attempts with growing pauses)
		for (cnt=0 ; cnt<10 ; cnt++) {
			westatus = fs_writeend(chunkid,id->inode,mfleng);
			if (westatus!=STATUS_OK) {
				usleep(100000+(10000<<cnt));
			} else {
				break;
			}
		}

		if (westatus!=STATUS_OK) {
			write_job_end(id,ENXIO,0);
		} else if (status!=0 || wrstatus!=STATUS_OK) {
			if (wrstatus!=STATUS_OK) {	// convert MFS status to OS errno
				if (wrstatus==ERROR_NOSPACE) {
					status=ENOSPC;
				} else {
					status=EIO;
				}
			}
			id->trycnt++;
			if (id->trycnt>=maxretries) {
				write_job_end(id,status,0);
			} else {
				write_job_end(id,0,1+((id->trycnt<30)?(id->trycnt/3):10));
			}
		} else {
			read_inode_ops(id->inode);
			write_job_end(id,0,0);
		}
	}
}