/*
 * Watchdog servicing routine.
 *
 * Arms the watchdog, switches it to persistent mode, and then restarts the
 * countdown every few seconds. The loop has no exit, so this function never
 * returns.
 */
static void * watchdog_handler() {
  /* Seconds to wait between two consecutive watchdog kicks. */
  enum { KICK_PERIOD_SECS = 5 };

  /* Start watchdog in manual mode */
  start_watchdog(0);

  /*
   * Persistent mode: the timer keeps running (and can expire) regardless of
   * whether this process is still alive.
   */
  set_persistent_watchdog(WATCHDOG_SET_PERSISTENT);

  for (;;) {
    sleep(KICK_PERIOD_SECS);

    /*
     * Restart the watchdog countdown. Should this process be terminated,
     * the persistent setting causes the system to reboot once the watchdog
     * timeout elapses.
     */
    kick_watchdog();
  }
}
int main(int argc, char **argv) { /* Sensor values */ #if defined(CONFIG_WEDGE) int intake_temp; int exhaust_temp; int switch_temp; int userver_temp; #else float intake_temp; float exhaust_temp; float userver_temp; #endif int fan_speed = fan_high; int bad_reads = 0; int fan_failure = 0; int fan_speed_changes = 0; int old_speed; int fan_bad[FANS]; int fan; unsigned log_count = 0; // How many times have we logged our temps? int opt; int prev_fans_bad = 0; struct sigaction sa; sa.sa_handler = fand_interrupt; sa.sa_flags = 0; sigemptyset(&sa.sa_mask); sigaction(SIGTERM, &sa, NULL); sigaction(SIGINT, &sa, NULL); sigaction(SIGUSR1, &sa, NULL); // Start writing to syslog as early as possible for diag purposes. openlog("fand", LOG_CONS, LOG_DAEMON); #if defined(CONFIG_WEDGE) && !defined(CONFIG_WEDGE100) if (is_two_fan_board(false)) { /* Alternate, two fan configuration */ total_fans = 2; fan_offset = 2; /* fan 3 is the first */ fan_low = SIXPACK_FAN_LOW; fan_medium = SIXPACK_FAN_MEDIUM; fan_high = SIXPACK_FAN_HIGH; fan_max = SIXPACK_FAN_MAX; fan_speed = fan_high; } #endif while ((opt = getopt(argc, argv, "l:m:h:b:t:r:v")) != -1) { switch (opt) { case 'l': fan_low = atoi(optarg); break; case 'm': fan_medium = atoi(optarg); break; case 'h': fan_high = atoi(optarg); break; case 'b': temp_bottom = INTERNAL_TEMPS(atoi(optarg)); break; case 't': temp_top = INTERNAL_TEMPS(atoi(optarg)); break; case 'r': report_temp = atoi(optarg); break; case 'v': verbose = true; break; default: usage(); break; } } if (optind > argc) { usage(); } if (temp_bottom > temp_top) { fprintf(stderr, "Should temp-bottom (%d) be higher than " "temp-top (%d)? 
Starting anyway.\n", EXTERNAL_TEMPS(temp_bottom), EXTERNAL_TEMPS(temp_top)); } if (fan_low > fan_medium || fan_low > fan_high || fan_medium > fan_high) { fprintf(stderr, "fan RPMs not strictly increasing " "-- %d, %d, %d, starting anyway\n", fan_low, fan_medium, fan_high); } daemon(1, 0); if (verbose) { syslog(LOG_DEBUG, "Starting up; system should have %d fans.", total_fans); } for (fan = 0; fan < total_fans; fan++) { fan_bad[fan] = 0; write_fan_speed(fan + fan_offset, fan_speed); write_fan_led(fan + fan_offset, FAN_LED_BLUE); } #if defined(CONFIG_YOSEMITE) /* Ensure that we can read from sensors before proceeding. */ int found = 0; userver_temp = 100; while (!found) { for (int node = 1; node <= TOTAL_1S_SERVERS && !found; node++) { if (!yosemite_sensor_read(node, BIC_SENSOR_SOC_THERM_MARGIN, &userver_temp) && userver_temp < 0) { syslog(LOG_DEBUG, "SOC_THERM_MARGIN first valid read of %f.", userver_temp); found = 1; } sleep(5); } // XXX: Will it ever be a problem that we don't exit this until // we see a valid value? } #endif /* Start watchdog in manual mode */ start_watchdog(0); /* Set watchdog to persistent mode so timer expiry will happen independent * of this process's liveliness. */ set_persistent_watchdog(WATCHDOG_SET_PERSISTENT); sleep(5); /* Give the fans time to come up to speed */ while (1) { int max_temp; old_speed = fan_speed; /* Read sensors */ #if defined(CONFIG_WEDGE) || defined(CONFIG_WEDGE100) read_temp(INTAKE_TEMP_DEVICE, &intake_temp); read_temp(EXHAUST_TEMP_DEVICE, &exhaust_temp); read_temp(CHIP_TEMP_DEVICE, &switch_temp); read_temp(USERVER_TEMP_DEVICE, &userver_temp); /* * uServer can be powered down, but all of the rest of the sensors * should be readable at any time. 
*/ if ((intake_temp == BAD_TEMP || exhaust_temp == BAD_TEMP || switch_temp == BAD_TEMP)) { bad_reads++; } #else intake_temp = exhaust_temp = userver_temp = BAD_TEMP; if (yosemite_sensor_read(FRU_SPB, SP_SENSOR_INLET_TEMP, &intake_temp) || yosemite_sensor_read(FRU_SPB, SP_SENSOR_OUTLET_TEMP, &exhaust_temp)) bad_reads++; /* * There are a number of 1S servers; any or all of them * could be powered off and returning no values. Ignore these * invalid values. */ for (int node = 1; node <= TOTAL_1S_SERVERS; node++) { float new_temp; if (!yosemite_sensor_read(node, BIC_SENSOR_SOC_THERM_MARGIN, &new_temp)) { if (userver_temp < new_temp) { userver_temp = new_temp; } } // Since the yosemite_sensor_read() times out after 8secs, keep WDT from expiring kick_watchdog(); } #endif if (bad_reads > BAD_READ_THRESHOLD) { server_shutdown("Some sensors couldn't be read"); } if (log_count++ % report_temp == 0) { syslog(LOG_DEBUG, #if defined(CONFIG_WEDGE) || defined(CONFIG_WEDGE100) "Temp intake %d, t2 %d, " " userver %d, exhaust %d, " "fan speed %d, speed changes %d", #else "Temp intake %f, max server %f, exhaust %f, " "fan speed %d, speed changes %d", #endif intake_temp, #if defined(CONFIG_WEDGE) || defined(CONFIG_WEDGE100) switch_temp, #endif userver_temp, exhaust_temp, fan_speed, fan_speed_changes); } /* Protection heuristics */ if (intake_temp > INTAKE_LIMIT) { server_shutdown("Intake temp limit reached"); } #if defined(CONFIG_WEDGE) || defined(CONFIG_WEDGE100) if (switch_temp > SWITCH_LIMIT) { server_shutdown("T2 temp limit reached"); } #endif if (userver_temp + USERVER_TEMP_FUDGE > USERVER_LIMIT) { server_shutdown("uServer temp limit reached"); } /* * Calculate change needed -- we should eventually * do something more sophisticated, like PID. * * We should use the intake temperature to adjust this * as well. */ #if defined(CONFIG_YOSEMITE) /* Use tables to lookup the new fan speed for Yosemite. 
*/ int intake_speed = temp_to_fan_speed(intake_temp, intake_map, INTAKE_MAP_SIZE); int cpu_speed = temp_to_fan_speed(userver_temp, cpu_map, CPU_MAP_SIZE); if (fan_speed == fan_max && fan_failure != 0) { /* Don't change a thing */ } else if (intake_speed > cpu_speed) { fan_speed = intake_speed; } else { fan_speed = cpu_speed; } #else /* Other systems use a simpler built-in table to determine fan speed. */ if (switch_temp > userver_temp + USERVER_TEMP_FUDGE) { max_temp = switch_temp; } else { max_temp = userver_temp + USERVER_TEMP_FUDGE; } /* * If recovering from a fan problem, spin down fans gradually in case * temperatures are still high. Gradual spin down also reduces wear on * the fans. */ if (fan_speed == fan_max) { if (fan_failure == 0) { fan_speed = fan_high; } } else if (fan_speed == fan_high) { if (max_temp + COOLDOWN_SLOP < temp_top) { fan_speed = fan_medium; } } else if (fan_speed == fan_medium) { if (max_temp > temp_top) { fan_speed = fan_high; } else if (max_temp + COOLDOWN_SLOP < temp_bottom) { fan_speed = fan_low; } } else {/* low */ if (max_temp > temp_bottom) { fan_speed = fan_medium; } } #endif /* * Update fans only if there are no failed ones. If any fans failed * earlier, all remaining fans should continue to run at max speed. */ if (fan_failure == 0 && fan_speed != old_speed) { syslog(LOG_NOTICE, "Fan speed changing from %d to %d", old_speed, fan_speed); fan_speed_changes++; for (fan = 0; fan < total_fans; fan++) { write_fan_speed(fan + fan_offset, fan_speed); } } /* * Wait for some change. Typical I2C temperature sensors * only provide a new value every second and a half, so * checking again more quickly than that is a waste. * * We also have to wait for the fan changes to take effect * before measuring them. */ sleep(5); /* Check fan RPMs */ for (fan = 0; fan < total_fans; fan++) { /* * Make sure that we're within some percentage * of the requested speed. 
*/ if (fan_speed_okay(fan + fan_offset, fan_speed, FAN_FAILURE_OFFSET)) { if (fan_bad[fan] > FAN_FAILURE_THRESHOLD) { write_fan_led(fan + fan_offset, FAN_LED_BLUE); syslog(LOG_CRIT, "Fan %d has recovered", fan); } fan_bad[fan] = 0; } else { fan_bad[fan]++; } } fan_failure = 0; for (fan = 0; fan < total_fans; fan++) { if (fan_bad[fan] > FAN_FAILURE_THRESHOLD) { fan_failure++; write_fan_led(fan + fan_offset, FAN_LED_RED); } } if (fan_failure > 0) { if (prev_fans_bad != fan_failure) { syslog(LOG_CRIT, "%d fans failed", fan_failure); } /* * If fans are bad, we need to blast all of the * fans at 100%; we don't bother to turn off * the bad fans, in case they are all that is left. * * Note that we have a temporary bug with setting fans to * 100% so we only do fan_max = 99%. */ fan_speed = fan_max; for (fan = 0; fan < total_fans; fan++) { write_fan_speed(fan + fan_offset, fan_speed); } #if defined(CONFIG_WEDGE) || defined(CONFIG_WEDGE100) /* * On Wedge, we want to shut down everything if none of the fans * are visible, since there isn't automatic protection to shut * off the server or switch chip. On other platforms, the CPUs * generating the heat will automatically turn off, so this is * unnecessary. */ if (fan_failure == total_fans) { int count = 0; for (fan = 0; fan < total_fans; fan++) { if (fan_bad[fan] > FAN_SHUTDOWN_THRESHOLD) count++; } if (count == total_fans) { server_shutdown("all fans are bad for more than 12 cycles"); } } #endif /* * Fans can be hot swapped and replaced; in which case the fan daemon * will automatically detect the new fan and (assuming the new fan isn't * itself faulty), automatically readjust the speeds for all fans down * to a more suitable rpm. The fan daemon does not need to be restarted. */ } /* Suppress multiple warnings for similar number of fan failures. */ prev_fans_bad = fan_failure; /* if everything is fine, restart the watchdog countdown. 
If this process * is terminated, the persistent watchdog setting will cause the system * to reboot after the watchdog timeout. */ kick_watchdog(); } }