示例#1
0
/*!
 * \brief Receive a complete database file over an Rx call and install it
 * as local database file \a file.
 *
 * The incoming bytes are written to "<pathName>.DB[SYS]<n>.TMP" (the "SYS"
 * infix is used when \a file is negative, and <n> is the absolute value of
 * \a file), starting at offset HDRSIZE.  Once \a length bytes have been
 * received, the temporary file is renamed over "<pathName>.DB[SYS]<n>", the
 * database is reopened and labelled with \a avers.
 *
 * \param rxcall  Rx call the database contents are read from
 * \param file    database file number
 * \param length  number of bytes the caller is going to send
 * \param avers   version label to apply once the file is installed
 *
 * \returns 0 on success; otherwise a non-zero code: the result of
 * ubik_CheckAuth(), USYNC, errno from open(), UIOERROR, BULK_ERROR, or the
 * non-zero result of close()/unlink()/rename()/setlabel.
 */
afs_int32
SDISK_SendFile(struct rx_call *rxcall, afs_int32 file,
	       afs_int32 length, struct ubik_version *avers)
{
    afs_int32 code;
    struct ubik_dbase *dbase = NULL;
    char tbuffer[1024];
    afs_int32 syncSite;
    struct ubik_version tversion;
    int tlen;
    struct rx_peer *tpeer;
    struct rx_connection *tconn;
    afs_uint32 otherHost = 0;
    char hoststr[16];
    char pbuffer[1028];
    int fd = -1;
    afs_int32 epoch = 0;
    afs_int32 pass;

    /* The failure path below inspects both of these, and it can be reached
     * before either has been filled in; give them defined contents so we
     * never unlink a garbage path or write a garbage label. */
    pbuffer[0] = '\0';
    memset(&tversion, 0, sizeof(tversion));

    /* receive the database file from the caller */

    dbase = ubik_dbase;

    if ((code = ubik_CheckAuth(rxcall))) {
	/* the common exit path does DBRELE, so take the hold first */
	DBHOLD(dbase);
	goto failed;
    }

    /* next, we do a sanity check to see if the guy sending us the database is
     * the guy we think is the sync site.  It turns out that we might not have
     * decided yet that someone's the sync site, but they could have enough
     * votes from others to be sync site anyway, and could send us the database
     * in advance of getting our votes.  This is fine, what we're really trying
     * to check is that some authenticated bogon isn't sending a random database
     * into another configuration.  This could happen on a bad configuration
     * screwup.  Thus, we only object if we're sure we know who the sync site
     * is, and it ain't the guy talking to us.
     */
    syncSite = uvote_GetSyncSite();
    tconn = rx_ConnectionOf(rxcall);
    tpeer = rx_PeerOf(tconn);
    otherHost = ubikGetPrimaryInterfaceAddr(rx_HostOf(tpeer));
    if (syncSite && syncSite != otherHost) {
	/* we *know* this is the wrong guy */
	code = USYNC;
	DBHOLD(dbase);		/* balanced by DBRELE on the exit path */
	goto failed;
    }

    DBHOLD(dbase);

    /* abort any active trans that may scribble over the database */
    urecovery_AbortAll(dbase);

    ubik_print("Ubik: Synchronize database with server %s\n",
	       afs_inet_ntoa_r(otherHost, hoststr));

    UBIK_VERSION_LOCK;
    epoch = tversion.epoch = 0;		/* start off by labelling in-transit db as invalid */
    (*dbase->setlabel) (dbase, file, &tversion);	/* setlabel does sync */
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
	     ubik_dbase->pathName, (file<0)?"SYS":"",
	     (file<0)?-file:file);
    fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
    if (fd < 0) {
	code = errno;
	goto failed_locked;
    }
    /* leave room for the header; the payload starts at HDRSIZE */
    if (lseek(fd, HDRSIZE, 0) != HDRSIZE) {
	/* report a real error code rather than lseek's raw return value */
	code = UIOERROR;
	close(fd);
	goto failed_locked;
    }
    pass = 0;
    memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version));
    UBIK_VERSION_UNLOCK;

    /* copy the stream into the temporary file, one buffer at a time */
    while (length > 0) {
	tlen = (length > (afs_int32)sizeof(tbuffer)
		? (afs_int32)sizeof(tbuffer) : length);
#if !defined(AFS_PTHREAD_ENV)
	/* without pthreads, poll the I/O manager every fourth pass */
	if (pass % 4 == 0)
	    IOMGR_Poll();
#endif
	code = rx_Read(rxcall, tbuffer, tlen);
	if (code != tlen) {
	    ubik_dprint("Rx-read length error=%d\n", code);
	    code = BULK_ERROR;
	    close(fd);
	    goto failed;
	}
	code = write(fd, tbuffer, tlen);
	pass++;
	if (code != tlen) {
	    ubik_dprint("write failed error=%d\n", code);
	    code = UIOERROR;
	    close(fd);
	    goto failed;
	}
	length -= tlen;
    }
    code = close(fd);
    if (code)
	goto failed;

    /* sync data first, then write label and resync (resync done by setlabel call).
     * This way, good label is only on good database. */
    snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d",
	     ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#ifdef AFS_NT40_ENV
    /* on NT, move the current database aside to ".OLD" before renaming */
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
	     ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    code = unlink(pbuffer);
    if (!code)
	code = rename(tbuffer, pbuffer);
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
	     ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#endif
    if (!code)
	code = rename(pbuffer, tbuffer);
    UBIK_VERSION_LOCK;
    if (!code) {
	(*ubik_dbase->open) (ubik_dbase, file);
	code = (*ubik_dbase->setlabel) (dbase, file, avers);
    }
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
	     ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    unlink(pbuffer);
#endif
    /* NOTE(review): the in-memory version is replaced with avers even when
     * the rename or setlabel above failed -- confirm this is intended. */
    memcpy(&ubik_dbase->version, avers, sizeof(struct ubik_version));
    udisk_Invalidate(dbase, file);	/* new dbase, flush disk buffers */
    /* wake anyone waiting for the database version to change */
#ifdef AFS_PTHREAD_ENV
    assert(pthread_cond_broadcast(&dbase->version_cond) == 0);
#else
    LWP_NoYieldSignal(&dbase->version);
#endif

failed_locked:
    UBIK_VERSION_UNLOCK;

failed:
    if (code) {
	/* only remove a file whose name we actually built */
	if (pbuffer[0] != '\0')
	    unlink(pbuffer);
	/* Failed to sync. Allow reads again for now. */
	/* NOTE(review): epoch is always 0 at this point, so this rewrites an
	 * epoch-0 label -- confirm whether the pre-transfer epoch was meant
	 * to be saved and restored here. */
	if (dbase != NULL) {
	    UBIK_VERSION_LOCK;
	    tversion.epoch = epoch;
	    (*dbase->setlabel) (dbase, file, &tversion);
	    UBIK_VERSION_UNLOCK;
	}
	ubik_print
	    ("Ubik: Synchronize database with server %s failed (error = %d)\n",
	     afs_inet_ntoa_r(otherHost, hoststr), code);
    } else {
	ubik_print("Ubik: Synchronize database completed\n");
    }
    DBRELE(dbase);
    return code;
}
示例#2
0
文件: vote.c 项目: vkamra/openafs
/*!
 * \brief Handle a vote beacon and decide whether to vote for its sender.
 *
 * A NULL \a rxcall means this is a local call, in which case the candidate
 * is this host itself (ubik_host[0]).
 *
 * \param rxcall  Rx call the beacon arrived on, or NULL for a local call
 * \param astate  non-zero if the sender claims to be the sync site
 * \param astart  sender's claim time; remembered on a yes vote for
 *                computing when the sync site expires
 * \param avers   sender's database version; recorded on a yes vote
 * \param atid    sender's active transaction id, if any; recorded on a
 *                yes vote and handed to urecovery_CheckTid()
 *
 * \returns 0 if we are not voting for the sender, otherwise the
 * (approximate) time at which we voted yes.
 */
afs_int32
SVOTE_Beacon(struct rx_call * rxcall, afs_int32 astate,
	     afs_int32 astart, struct ubik_version * avers,
	     struct ubik_tid * atid)
{
    afs_int32 otherHost;
    afs_int32 now;
    struct ubik_server *srv;
    int isClone = 0;
    int voteExpired;
    char hoststr[16];

    if (!rxcall) {
	/* local call: the candidate is this host */
	otherHost = ubik_host[0];
	isClone = amIClone;
    } else {
	struct rx_connection *conn = rx_ConnectionOf(rxcall);
	struct rx_peer *peer = rx_PeerOf(conn);

	/* ubik identifies a host by its primary interface address */
	otherHost = ubikGetPrimaryInterfaceAddr(rx_HostOf(peer));
	if (!otherHost) {
	    ubik_dprint("Received beacon from unknown host %s\n",
			afs_inet_ntoa_r(rx_HostOf(peer), hoststr));
	    return 0;		/* unknown sender: vote no */
	}

	/* look the sender up in the server list */
	srv = ubik_servers;
	while (srv && srv->addr[0] != otherHost)
	    srv = srv->next;

	if (!srv)
	    ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
	else if (srv->isClone)
	    isClone = 1;
    }

    ubik_dprint("Received beacon type %d from host %s\n", astate,
		afs_inet_ntoa_r(otherHost, hoststr));

    /* Track the lowest-addressed server we have heard from; we try to vote
     * only for that host when there is no sync site yet.  An entry that has
     * not been refreshed for BIGTIME seconds is replaced, in case that host
     * has crashed.  The comparison has to be <= rather than < so that
     * lowestTime keeps being refreshed by the current lowest host; otherwise
     * it would look stale and another host could slip in.  Clones are never
     * considered, since they can never become sync site. */
    UBIK_VOTE_LOCK;
    now = FT_ApproxTime();	/* close to current time */

    if (!isClone) {
	if (ntohl((afs_uint32)otherHost) <= ntohl((afs_uint32)vote_globals.lowestHost)
	    || vote_globals.lowestTime + BIGTIME < now) {
	    vote_globals.lowestTime = now;
	    vote_globals.lowestHost = otherHost;
	}
    }

    /* We count ourselves too: nobody higher than us may stay in lowestHost,
     * because we know we are up.  This also avoids two servers each
     * deferring to the other as lowest host indefinitely: a host that knows
     * it is lowest keeps sending beacons asking for votes. */
    if (!amIClone) {
	if (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32)vote_globals.lowestHost)
	    || vote_globals.lowestTime + BIGTIME < now) {
	    vote_globals.lowestTime = now;
	    vote_globals.lowestHost = ubik_host[0];
	}
    }

    /* Remember whether we have heard from a sync site recently (even if we
     * are not voting for it yet), and time it out after BIGTIME seconds. */
    if (astate) {
	/* sender is a sync site */
	vote_globals.syncHost = otherHost;
	vote_globals.syncTime = now;
    } else if (vote_globals.syncTime + BIGTIME < now) {
	if (vote_globals.syncHost) {
	    ubik_dprint
		("Ubik: Lost contact with sync-site %s (NOT in quorum)\n",
		 afs_inet_ntoa_r(vote_globals.syncHost, hoststr));
	}
	vote_globals.syncHost = 0;
    }

    /* Decide how to vote.  If the sender is not a sync site we do not have
     * to vote for it, so some heuristics are applied to avoid odd
     * oscillation states in the voting procedure. */
    if (astate) {
	if (vote_globals.lastYesHost == 0xffffffff && otherHost == ubik_host[0]) {
	    /* fast startup if this is the only non-clone */
	    int otherVoters = 0;

	    for (srv = ubik_servers; srv; srv = srv->next) {
		if (srv->addr[0] != otherHost && !srv->isClone) {
		    otherVoters = 1;
		    break;
		}
	    }
	    if (!otherVoters)
		vote_globals.lastYesHost = otherHost;
	}
    } else {
	/* sender does not claim to be a sync site */

	/* say no unless the sender is our current lowestHost */
	if (ntohl(vote_globals.lowestHost) != ntohl(otherHost))
	    goto done_zero;

	/* someone else *is* a sync site: say no */
	if (vote_globals.syncHost && vote_globals.syncHost != otherHost)
	    goto done_zero;
    }

    /* a clone can never become sync site */
    if (isClone)
	goto done_zero;

    /* Never promise sync site support to more than one host per BIGTIME
     * seconds.  This is the heart of the invariants in this system. */
    voteExpired = (vote_globals.ubik_lastYesTime + BIGTIME < now);
    if (!voteExpired && otherHost != vote_globals.lastYesHost)
	goto done_zero;

    /* From here on either the previous promise has expired or the sender is
     * the host we last voted for, so only log when the promise had expired
     * or the sender's sync-site state changed. */
    if (voteExpired || vote_globals.lastYesState != astate) {
	/* A new vote or a change in the vote or changed quorum */
	ubik_dprint("Ubik: vote 'yes' for %s %s\n",
		    afs_inet_ntoa_r(otherHost, hoststr),
		    (astate ? "(in quorum)" : "(NOT in quorum)"));
    }

    /* record the yes vote */
    vote_globals.ubik_lastYesTime = now;	/* when we voted yes */
    vote_globals.lastYesClaim = astart;	/* for computing when sync site expires */
    vote_globals.lastYesHost = otherHost;	/* who we voted for */
    vote_globals.lastYesState = astate;	/* whether that site is a sync site */
    vote_globals.ubik_dbVersion = *avers;	/* resync value */
    vote_globals.ubik_dbTid = *atid;	/* transaction id, if any, of active trans */
    UBIK_VOTE_UNLOCK;

    /* check whether the current write transaction needs to be aborted */
    DBHOLD(ubik_dbase);
    urecovery_CheckTid(atid, 0);
    DBRELE(ubik_dbase);
    return now;

done_zero:
    UBIK_VOTE_UNLOCK;
    return 0;
}