示例#1
0
void
afs_CheckServerDaemon(void)
{
    afs_int32 now, delay, lastCheck, last10MinCheck;

    afs_CheckServerDaemonStarted = 1;

    while (afs_initState < 101)
	afs_osi_Sleep(&afs_initState);
    afs_osi_Wait(PROBE_WAIT(), &AFS_CSWaitHandler, 0);

    last10MinCheck = lastCheck = osi_Time();
    while (1) {
	if (afs_termState == AFSOP_STOP_CS) {
	    afs_termState = AFSOP_STOP_TRUNCDAEMON;
	    afs_osi_Wakeup(&afs_termState);
	    break;
	}

	now = osi_Time();
	if (afs_probe_interval + lastCheck <= now) {
	    afs_CheckServers(1, NULL);	/* check down servers */
	    lastCheck = now = osi_Time();
	}

	if (afs_probe_all_interval + last10MinCheck <= now) {
	    afs_Trace1(afs_iclSetp, CM_TRACE_PROBEUP, ICL_TYPE_INT32, afs_probe_all_interval);
	    afs_CheckServers(0, NULL);
	    last10MinCheck = now = osi_Time();
	}
	/* shutdown check. */
	if (afs_termState == AFSOP_STOP_CS) {
	    afs_termState = AFSOP_STOP_TRUNCDAEMON;
	    afs_osi_Wakeup(&afs_termState);
	    break;
	}

	/* Compute time to next probe. */
	delay = afs_probe_interval + lastCheck;
	if (delay > afs_probe_all_interval + last10MinCheck)
	    delay = afs_probe_all_interval + last10MinCheck;
	delay -= now;
	if (delay < 1)
	    delay = 1;
	afs_osi_Wait(delay * 1000, &AFS_CSWaitHandler, 0);
    }
    afs_CheckServerDaemonStarted = 0;
}
示例#2
0
/*------------------------------------------------------------------------
 * EXPORTED afs_Analyze
 *
 * Description:
 *	Analyze the outcome of an RPC operation, taking whatever support
 *	actions are necessary.
 *
 * Arguments:
 *	aconn : Ptr to the relevant connection on which the call was made.
 *	acode : The return code experienced by the RPC.
 *	afid  : The FID of the file involved in the action.  This argument
 *		may be null if none was involved.
 *	areq  : The request record associated with this operation.
 *      op    : which RPC we are analyzing.
 *      cellp : pointer to a cell struct.  Must provide either fid or cell.
 *
 * Returns:
 *	Non-zero value if the related RPC operation should be retried,
 *	zero otherwise.
 *
 * Environment:
 *	This routine is typically called in a do-while loop, causing the
 *	embedded RPC operation to be called repeatedly if appropriate
 *	until whatever error condition (if any) is intolerable.
 *
 * Side Effects:
 *	As advertised.
 *
 * NOTE:
 *	The retry return value is used by afs_StoreAllSegments to determine
 *	if this is a temporary or permanent error.
 *------------------------------------------------------------------------*/
int
afs_Analyze(register struct afs_conn *aconn, afs_int32 acode,
	    struct VenusFid *afid, register struct vrequest *areq, int op,
	    afs_int32 locktype, struct cell *cellp)
{
    afs_int32 i;
    struct srvAddr *sa;
    struct server *tsp;
    struct volume *tvp;
    afs_int32 shouldRetry = 0;
    afs_int32 serversleft = 1;
    struct afs_stats_RPCErrors *aerrP;
    afs_int32 markeddown;

 
 
    if (AFS_IS_DISCONNECTED && !AFS_IN_SYNC) {
	/* On reconnection, act as connected. XXX: for now.... */
        /* SXW - This may get very tired after a while. We should try and
	 *       intercept all RPCs before they get here ... */
	/*printf("afs_Analyze: disconnected\n");*/
	afs_FinalizeReq(areq);
	if (aconn) {
	    /* SXW - I suspect that this will _never_ happen - we shouldn't
	     *       get a connection because we're disconnected !!!*/
	    afs_PutConn(aconn, locktype);
	}
	return 0;
    }
    
    AFS_STATCNT(afs_Analyze);
    afs_Trace4(afs_iclSetp, CM_TRACE_ANALYZE, ICL_TYPE_INT32, op,
	       ICL_TYPE_POINTER, aconn, ICL_TYPE_INT32, acode, ICL_TYPE_LONG,
	       areq->uid);

    aerrP = (struct afs_stats_RPCErrors *)0;

    if ((op >= 0) && (op < AFS_STATS_NUM_FS_RPC_OPS))
	aerrP = &(afs_stats_cmfullperf.rpc.fsRPCErrors[op]);

    afs_FinalizeReq(areq);
    if (!aconn && areq->busyCount) {	/* one RPC or more got VBUSY/VRESTARTING */

	tvp = afs_FindVolume(afid, READ_LOCK);
	if (tvp) {
	    afs_warnuser("afs: Waiting for busy volume %u (%s) in cell %s\n",
			 (afid ? afid->Fid.Volume : 0),
			 (tvp->name ? tvp->name : ""),
			 ((tvp->serverHost[0]
			   && tvp->serverHost[0]->cell) ? tvp->serverHost[0]->
			  cell->cellName : ""));

	    for (i = 0; i < MAXHOSTS; i++) {
		if (tvp->status[i] != not_busy && tvp->status[i] != offline) {
		    tvp->status[i] = not_busy;
		}
		if (tvp->status[i] == not_busy)
		    shouldRetry = 1;
	    }
	    afs_PutVolume(tvp, READ_LOCK);
	} else {
	    afs_warnuser("afs: Waiting for busy volume %u\n",
			 (afid ? afid->Fid.Volume : 0));
	}

	if (areq->busyCount > 100) {
	    if (aerrP)
		(aerrP->err_Volume)++;
	    areq->volumeError = VOLBUSY;
	    shouldRetry = 0;
	} else {
	    VSleep(afs_BusyWaitPeriod);	/* poll periodically */
	}
	if (shouldRetry != 0)
	    areq->busyCount++;

	return shouldRetry;	/* should retry */
    }

    if (!aconn || !aconn->srvr) {
	if (!areq->volumeError) {
	    if (aerrP)
		(aerrP->err_Network)++;
	    if (hm_retry_int && !(areq->flags & O_NONBLOCK) &&	/* "hard" mount */
		((afid && afs_IsPrimaryCellNum(afid->Cell))
		 || (cellp && afs_IsPrimaryCell(cellp)))) {
		if (!afid) {
		    afs_warnuser
			("afs: hard-mount waiting for a vlserver to return to service\n");
		    VSleep(hm_retry_int);
		    afs_CheckServers(1, cellp);
		    shouldRetry = 1;
		} else {
		    tvp = afs_FindVolume(afid, READ_LOCK);
		    if (!tvp || (tvp->states & VRO)) {
			shouldRetry = hm_retry_RO;
		    } else {
			shouldRetry = hm_retry_RW;
		    }
		    if (tvp)
			afs_PutVolume(tvp, READ_LOCK);
		    if (shouldRetry) {
			afs_warnuser
			    ("afs: hard-mount waiting for volume %u\n",
			     afid->Fid.Volume);
			VSleep(hm_retry_int);
			afs_CheckServers(1, cellp);
		    }
		}
	    } /* if (hm_retry_int ... */
	    else {
		areq->networkError = 1;
	    }
	}
	return shouldRetry;
    }

    /* Find server associated with this connection. */
    sa = aconn->srvr;
    tsp = sa->server;

    /* Before we do anything with acode, make sure we translate it back to
     * a system error */
    if ((acode & ~0xff) == ERROR_TABLE_BASE_uae)
	acode = et_to_sys_error(acode);

    if (acode == 0) {
	/* If we previously took an error, mark this volume not busy */
	if (areq->volumeError) {
	    tvp = afs_FindVolume(afid, READ_LOCK);
	    if (tvp) {
		for (i = 0; i < MAXHOSTS; i++) {
		    if (tvp->serverHost[i] == tsp) {
			tvp->status[i] = not_busy;
		    }
		}
		afs_PutVolume(tvp, READ_LOCK);
	    }
	}

	afs_PutConn(aconn, locktype);
	return 0;
    }

    /* If network troubles, mark server as having bogued out again. */
    /* VRESTARTING is < 0 because of backward compatibility issues 
     * with 3.4 file servers and older cache managers */
#ifdef AFS_64BIT_CLIENT
    if (acode == -455)
	acode = 455;
#endif /* AFS_64BIT_CLIENT */
    if ((acode < 0) && (acode != VRESTARTING)) {
	if (acode == RX_CALL_TIMEOUT) {
	    serversleft = afs_BlackListOnce(areq, afid, tsp);
	    areq->idleError++;
	    if (serversleft) {
		shouldRetry = 1;
	    } else {
		shouldRetry = 0;
	    }
	    /* By doing this, we avoid ever marking a server down
	     * in an idle timeout case. That's because the server is 
	     * still responding and may only be letting a single vnode
	     * time out. We otherwise risk having the server continually
	     * be marked down, then up, then down again... 
	     */
	    goto out;
	} 
	markeddown = afs_ServerDown(sa);
	ForceNewConnections(sa); /**multi homed clients lock:afs_xsrvAddr? */
	if (aerrP)
	    (aerrP->err_Server)++;
#if 0
	/* retry *once* when the server is timed out in case of NAT */
	if (markeddown && acode == RX_CALL_DEAD) {
	    aconn->forceConnectFS = 1;
	    shouldRetry = 1;
	}
#endif
    }

    if (acode == VBUSY || acode == VRESTARTING) {
	if (acode == VBUSY) {
	    areq->busyCount++;
	    if (aerrP)
		(aerrP->err_VolumeBusies)++;
	} else
	    areq->busyCount = 1;

	tvp = afs_FindVolume(afid, READ_LOCK);
	if (tvp) {
	    for (i = 0; i < MAXHOSTS; i++) {
		if (tvp->serverHost[i] == tsp) {
		    tvp->status[i] = rdwr_busy;	/* can't tell which yet */
		    /* to tell which, have to look at the op code. */
		}
	    }
	    afs_PutVolume(tvp, READ_LOCK);
	} else {
	    afs_warnuser("afs: Waiting for busy volume %u in cell %s\n",
			 (afid ? afid->Fid.Volume : 0), tsp->cell->cellName);
	    VSleep(afs_BusyWaitPeriod);	/* poll periodically */
	}
	shouldRetry = 1;
	acode = 0;
    } else if (acode == VICETOKENDEAD
	       || (acode & ~0xff) == ERROR_TABLE_BASE_RXK) {
	/* any rxkad error is treated as token expiration */
	struct unixuser *tu;
	/*
	 * I'm calling these errors protection errors, since they involve
	 * faulty authentication.
	 */
	if (aerrP)
	    (aerrP->err_Protection)++;

	tu = afs_FindUser(areq->uid, tsp->cell->cellNum, READ_LOCK);
	if (tu) {
	    if (acode == VICETOKENDEAD) {
		aconn->forceConnectFS = 1;
	    } else if (acode == RXKADEXPIRED) {
		aconn->forceConnectFS = 0;	/* don't check until new tokens set */
		aconn->user->states |= UTokensBad;
		afs_warnuser
		    ("afs: Tokens for user of AFS id %d for cell %s have expired\n",
		     tu->vid, aconn->srvr->server->cell->cellName);
	    } else {
		serversleft = afs_BlackListOnce(areq, afid, tsp);
		areq->tokenError++;

		if (serversleft) {
		    afs_warnuser
			("afs: Tokens for user of AFS id %d for cell %s: rxkad error=%d\n",
			 tu->vid, aconn->srvr->server->cell->cellName, acode);
		    shouldRetry = 1;
		} else {
		    areq->tokenError = 0;
		    aconn->forceConnectFS = 0;	/* don't check until new tokens set */
		    aconn->user->states |= UTokensBad;
		    afs_warnuser
			("afs: Tokens for user of AFS id %d for cell %s are discarded (rxkad error=%d)\n",
			 tu->vid, aconn->srvr->server->cell->cellName, acode);
		}
	    }
	    afs_PutUser(tu, READ_LOCK);
	} else {
	    /* The else case shouldn't be possible and should probably be replaced by a panic? */
	    if (acode == VICETOKENDEAD) {
		aconn->forceConnectFS = 1;
	    } else if (acode == RXKADEXPIRED) {
		aconn->forceConnectFS = 0;	/* don't check until new tokens set */
		aconn->user->states |= UTokensBad;
		afs_warnuser
		    ("afs: Tokens for user %d for cell %s have expired\n",
		     areq->uid, aconn->srvr->server->cell->cellName);
	    } else {
		aconn->forceConnectFS = 0;	/* don't check until new tokens set */
		aconn->user->states |= UTokensBad;
		afs_warnuser
		    ("afs: Tokens for user %d for cell %s are discarded (rxkad error = %d)\n",
		     areq->uid, aconn->srvr->server->cell->cellName, acode);
	    }
	}
	shouldRetry = 1;	/* Try again (as root). */
    }
    /* Check for access violation. */
    else if (acode == EACCES) {
	/* should mark access error in non-existent per-user global structure */
	if (aerrP)
	    (aerrP->err_Protection)++;
	areq->accessError = 1;
	if (op == AFS_STATS_FS_RPCIDX_STOREDATA)
	    areq->permWriteError = 1;
	shouldRetry = 0;
    }
    /* check for ubik errors; treat them like crashed servers */
    else if (acode >= ERROR_TABLE_BASE_U && acode < ERROR_TABLE_BASE_U + 255) {
	afs_ServerDown(sa);
	if (aerrP)
	    (aerrP->err_Server)++;
	shouldRetry = 1;	/* retryable (maybe one is working) */
	VSleep(1);		/* just in case */
    }
    /* Check for bad volume data base / missing volume. */
    else if (acode == VSALVAGE || acode == VOFFLINE || acode == VNOVOL
	     || acode == VNOSERVICE || acode == VMOVED) {
	struct cell *tcell;
	int same;

	shouldRetry = 1;
	areq->volumeError = VOLMISSING;
	if (aerrP)
	    (aerrP->err_Volume)++;
	if (afid && (tcell = afs_GetCell(afid->Cell, 0))) {
	    same = VLDB_Same(afid, areq);
	    tvp = afs_FindVolume(afid, READ_LOCK);
	    if (tvp) {
		for (i = 0; i < MAXHOSTS && tvp->serverHost[i]; i++) {
		    if (tvp->serverHost[i] == tsp) {
			if (tvp->status[i] == end_not_busy)
			    tvp->status[i] = offline;
			else
			    tvp->status[i]++;
		    } else if (!same) {
			tvp->status[i] = not_busy;	/* reset the others */
		    }
		}
		afs_PutVolume(tvp, READ_LOCK);
	    }
	}
    } else if (acode >= ERROR_TABLE_BASE_VL && acode <= ERROR_TABLE_BASE_VL + 255) {	/* vlserver errors */
	shouldRetry = 0;
	areq->volumeError = VOLMISSING;
    } else if (acode >= 0) {
	if (aerrP)
	    (aerrP->err_Other)++;
	if (op == AFS_STATS_FS_RPCIDX_STOREDATA)
	    areq->permWriteError = 1;
	shouldRetry = 0;	/* Other random Vice error. */
    } else if (acode == RX_MSGSIZE) {	/* same meaning as EMSGSIZE... */
	VSleep(1);		/* Just a hack for desperate times. */
	if (aerrP)
	    (aerrP->err_Other)++;
	shouldRetry = 1;	/* packet was too big, please retry call */
    }

    if (acode < 0 && acode != RX_MSGSIZE && acode != VRESTARTING) {
	/* If we get here, code < 0 and we have network/Server troubles.
	 * areq->networkError is not set here, since we always
	 * retry in case there is another server.  However, if we find
	 * no connection (aconn == 0) we set the networkError flag.
	 */
	afs_MarkServerUpOrDown(sa, SRVR_ISDOWN);
	if (aerrP)
	    (aerrP->err_Server)++;
	VSleep(1);		/* Just a hack for desperate times. */
	shouldRetry = 1;
    }
out:
    /* now unlock the connection and return */
    afs_PutConn(aconn, locktype);
    return (shouldRetry);
}				/*afs_Analyze */
示例#3
0
/* This function always holds the GLOCK whilst it is running. The caller
 * gets the GLOCK before invoking it, and afs_osi_Sleep drops the GLOCK
 * whilst we are sleeping, and regains it when we're woken up.
 */
void
afs_Daemon(void)
{
    afs_int32 code;
    struct afs_exporter *exporter;
    afs_int32 now;
    afs_int32 last3MinCheck, last10MinCheck, last60MinCheck, lastNMinCheck;
    afs_int32 last1MinCheck, last5MinCheck;
    afs_uint32 lastCBSlotBump;
    char cs_warned = 0;

    AFS_STATCNT(afs_Daemon);

    afs_rootFid.Fid.Volume = 0;
    while (afs_initState < 101)
	afs_osi_Sleep(&afs_initState);

#ifdef AFS_DARWIN80_ENV
    if (afs_osi_ctxtp_initialized)
        osi_Panic("vfs context already initialized");
    while (afs_osi_ctxtp && vfs_context_ref)
        afs_osi_Sleep(&afs_osi_ctxtp);
    if (afs_osi_ctxtp && !vfs_context_ref)
       vfs_context_rele(afs_osi_ctxtp);
    afs_osi_ctxtp = vfs_context_create(NULL);
    afs_osi_ctxtp_initialized = 1;
#endif
    now = osi_Time();
    lastCBSlotBump = now;

    /* when a lot of clients are booted simultaneously, they develop
     * annoying synchronous VL server bashing behaviors.  So we stagger them.
     */
    last1MinCheck = now + ((afs_random() & 0x7fffffff) % 60);	/* an extra 30 */
    last3MinCheck = now - 90 + ((afs_random() & 0x7fffffff) % 180);
    last60MinCheck = now - 1800 + ((afs_random() & 0x7fffffff) % 3600);
    last10MinCheck = now - 300 + ((afs_random() & 0x7fffffff) % 600);
    last5MinCheck = now - 150 + ((afs_random() & 0x7fffffff) % 300);
    lastNMinCheck = now - 90 + ((afs_random() & 0x7fffffff) % 180);

    /* start off with afs_initState >= 101 (basic init done) */
    while (1) {
	afs_CheckCallbacks(20);	/* unstat anything which will expire soon */

	/* things to do every 20 seconds or less - required by protocol spec */
	if (afs_nfsexporter)
	    afs_FlushActiveVcaches(0);	/* flush NFS writes */
	afs_FlushVCBs(1);	/* flush queued callbacks */

	afs_MaybeWakeupTruncateDaemon();	/* free cache space if have too */
	rx_CheckPackets();	/* Does RX need more packets? */

	now = osi_Time();
	if (lastCBSlotBump + CBHTSLOTLEN < now) {	/* pretty time-dependant */
	    lastCBSlotBump = now;
	    if (afs_BumpBase()) {
		afs_CheckCallbacks(20);	/* unstat anything which will expire soon */
	    }
	}

	if (last1MinCheck + 60 < now) {
	    /* things to do every minute */
	    DFlush();		/* write out dir buffers */
	    afs_WriteThroughDSlots();	/* write through cacheinfo entries */
	    ObtainWriteLock(&afs_xvcache, 736);
	    afs_FlushReclaimedVcaches();
	    ReleaseWriteLock(&afs_xvcache);
	    afs_FlushActiveVcaches(1);	/* keep flocks held & flush nfs writes */
#if 0
	    afs_StoreDirtyVcaches();
#endif
	    afs_CheckRXEpoch();
	    last1MinCheck = now;
	}

	if (last3MinCheck + 180 < now) {
	    afs_CheckTokenCache();	/* check for access cache resets due to expired
					 * tickets */
	    last3MinCheck = now;
	}

        if (afsd_dynamic_vcaches && (last5MinCheck + 300 < now)) {
            /* start with trying to drop us back to our base usage */
            int anumber = VCACHE_FREE + (afs_vcount - afs_cacheStats);

	    if (anumber > 0) {
		ObtainWriteLock(&afs_xvcache, 734);
		afs_ShakeLooseVCaches(anumber);
		ReleaseWriteLock(&afs_xvcache);
	    }
            last5MinCheck = now;
        }

	if (!afs_CheckServerDaemonStarted) {
	    /* Do the check here if the correct afsd is not installed. */
	    if (!cs_warned) {
		cs_warned = 1;
		afs_warn("Please install afsd with check server daemon.\n");
	    }
	    if (lastNMinCheck + afs_probe_interval < now) {
		/* only check down servers */
		afs_CheckServers(1, NULL);
		lastNMinCheck = now;
	    }
	}
	if (last10MinCheck + 600 < now) {
#ifdef AFS_USERSPACE_IP_ADDR
	    extern int rxi_GetcbiInfo(void);
#endif
	    afs_Trace1(afs_iclSetp, CM_TRACE_PROBEUP, ICL_TYPE_INT32, 600);
#ifdef AFS_USERSPACE_IP_ADDR
	    if (rxi_GetcbiInfo()) {	/* addresses changed from last time */
		afs_FlushCBs();
	    }
#else /* AFS_USERSPACE_IP_ADDR */
	    if (rxi_GetIFInfo()) {	/* addresses changed from last time */
		afs_FlushCBs();
	    }
#endif /* else AFS_USERSPACE_IP_ADDR */
	    if (!afs_CheckServerDaemonStarted)
		afs_CheckServers(0, NULL);
	    afs_GCUserData(0);	/* gc old conns */
	    /* This is probably the wrong way of doing GC for the various exporters but it will suffice for a while */
	    for (exporter = root_exported; exporter;
		 exporter = exporter->exp_next) {
		(void)EXP_GC(exporter, 0);	/* Generalize params */
	    }
	    {
		static int cnt = 0;
		if (++cnt < 12) {
		    afs_CheckVolumeNames(AFS_VOLCHECK_EXPIRED |
					 AFS_VOLCHECK_BUSY);
		} else {
		    cnt = 0;
		    afs_CheckVolumeNames(AFS_VOLCHECK_EXPIRED |
					 AFS_VOLCHECK_BUSY |
					 AFS_VOLCHECK_MTPTS);
		}
	    }
	    last10MinCheck = now;
	}
	if (last60MinCheck + 3600 < now) {
	    afs_Trace1(afs_iclSetp, CM_TRACE_PROBEVOLUME, ICL_TYPE_INT32,
		       3600);
	    afs_CheckRootVolume();
#if AFS_GCPAGS
	    if (afs_gcpags == AFS_GCPAGS_OK) {
		afs_int32 didany;
		afs_GCPAGs(&didany);
	    }
#endif
	    last60MinCheck = now;
	}
	if (afs_initState < 300) {	/* while things ain't rosy */
	    code = afs_CheckRootVolume();
	    if (code == 0)
		afs_initState = 300;	/* succeeded */
	    if (afs_initState < 200)
		afs_initState = 200;	/* tried once */
	    afs_osi_Wakeup(&afs_initState);
	}

	/* 18285 is because we're trying to divide evenly into 128, that is,
	 * CBSlotLen, while staying just under 20 seconds.  If CBSlotLen
	 * changes, should probably change this interval, too.
	 * Some of the preceding actions may take quite some time, so we
	 * might not want to wait the entire interval */
	now = 18285 - (osi_Time() - now);
	if (now > 0) {
	    afs_osi_Wait(now, &AFS_WaitHandler, 0);
	}

	if (afs_termState == AFSOP_STOP_AFS) {
	    if (afs_CheckServerDaemonStarted)
		afs_termState = AFSOP_STOP_CS;
	    else
		afs_termState = AFSOP_STOP_TRUNCDAEMON;
	    afs_osi_Wakeup(&afs_termState);
	    return;
	}
    }
}