Example no. 1
LinearRegression Table_to_LinearRegression (Table me) {
	try {
		long numberOfIndependentVariables = my numberOfColumns - 1, numberOfParameters = my numberOfColumns;
		long numberOfCells = my rows -> size, icell, ivar;
		if (numberOfParameters < 2)   /* Includes intercept. */
			Melder_throw (U"Not enough columns (has to be more than 1).");
		if (numberOfCells < numberOfParameters) {
			Melder_warning (U"Solution is not unique (more parameters than cases).");
		}
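		/*
		 * Design matrix u (one row per cell, with a last column of ones for the
		 * intercept), dependent vector b, and parameter vector x.
		 */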
		autoNUMmatrix <double> u (1, numberOfCells, 1, numberOfParameters);
		autoNUMvector <double> b (1, numberOfCells);
		autoNUMvector <double> x (1, numberOfParameters);
		autoLinearRegression thee = LinearRegression_create ();
		for (ivar = 1; ivar <= numberOfIndependentVariables; ivar ++) {
			double minimum = Table_getMinimum (me, ivar);
			double maximum = Table_getMaximum (me, ivar);
			Regression_addParameter (thee.peek(), my columnHeaders [ivar]. label, minimum, maximum, 0.0);
		}
		for (icell = 1; icell <= numberOfCells; icell ++) {
			for (ivar = 1; ivar < numberOfParameters; ivar ++) {
				u [icell] [ivar] = Table_getNumericValue_Assert (me, icell, ivar);
			}
			u [icell] [numberOfParameters] = 1.0;   /* For the intercept. */
			b [icell] = Table_getNumericValue_Assert (me, icell, my numberOfColumns);   // the dependent variable
		}
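		/*
		 * Solve the (generally overdetermined) system u . x = b for x in the
		 * least-squares sense; the tolerance NUMeps * numberOfCells guards
		 * against a (nearly) singular design matrix.
		 */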
		NUMsolveEquation (u.peek(), numberOfCells, numberOfParameters, b.peek(), NUMeps * numberOfCells, x.peek());
		thy intercept = x [numberOfParameters];
		for (ivar = 1; ivar <= numberOfIndependentVariables; ivar ++) {
			RegressionParameter parm = static_cast<RegressionParameter> (thy parameters -> item [ivar]);
			parm -> value = x [ivar];
		}
		return thee.transfer();
	} catch (MelderError) {
		Melder_throw (me, U": linear regression not performed.");
	}
}
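/*
 * A minimal, self-contained sketch of the least-squares step that NUMsolveEquation
 * performs above, written out via the normal equations (u'u) x = u'b and
 * Gauss-Jordan elimination. This illustrates the technique only and is not
 * Praat's actual implementation; the name leastSquaresSketch and the use of
 * std::vector are inventions of this sketch.
 */
#include <cstddef>
#include <vector>

static std::vector<double> leastSquaresSketch (
	const std::vector<std::vector<double>> & u,   // numberOfCells x numberOfParameters design matrix
	const std::vector<double> & b)                // dependent variable, one value per cell
{
	const std::size_t n = u.size (), p = u [0].size ();
	/* Accumulate the augmented normal equations [ u'u | u'b ]. */
	std::vector<std::vector<double>> a (p, std::vector<double> (p + 1, 0.0));
	for (std::size_t i = 0; i < n; i ++) {
		for (std::size_t j = 0; j < p; j ++) {
			for (std::size_t k = 0; k < p; k ++)
				a [j] [k] += u [i] [j] * u [i] [k];
			a [j] [p] += u [i] [j] * b [i];
		}
	}
	/* Gauss-Jordan elimination without pivoting: fine for a sketch, not for production. */
	for (std::size_t j = 0; j < p; j ++) {
		const double pivot = a [j] [j];
		for (std::size_t k = 0; k <= p; k ++)
			a [j] [k] /= pivot;
		for (std::size_t i = 0; i < p; i ++) {
			if (i == j) continue;
			const double factor = a [i] [j];
			for (std::size_t k = 0; k <= p; k ++)
				a [i] [k] -= factor * a [j] [k];
		}
	}
	std::vector<double> x (p);
	for (std::size_t j = 0; j < p; j ++)
		x [j] = a [j] [p];   // the swept right-hand side is the solution
	return x;
}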
static LogisticRegression _Table_to_LogisticRegression (Table me, long *factors, long numberOfFactors, long dependent1, long dependent2) {
	long numberOfParameters = numberOfFactors + 1;
	long numberOfCells = my rows -> size, numberOfY0 = 0, numberOfY1 = 0, numberOfData = 0;
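	// Sentinel values: they guarantee that the convergence test below,
	// fabs (logLikelihood - previousLogLikelihood) < 1e-11, cannot succeed on the first iteration.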
	double logLikelihood = 1e300, previousLogLikelihood = 2e300;
	if (numberOfParameters < 2)   // includes intercept
		Melder_throw (U"Not enough columns (has to be more than 1).");
	/*
	 * Divide up the contents of the table into a number of independent variables (x) and two dependent variables (y0 and y1).
	 */
	autoNUMmatrix <double> x (1, numberOfCells, 0, numberOfFactors);   // column 0 is the intercept
	autoNUMvector <double> y0 (1, numberOfCells);
	autoNUMvector <double> y1 (1, numberOfCells);
	autoNUMvector <double> meanX (1, numberOfFactors);
	autoNUMvector <double> stdevX (1, numberOfFactors);
	autoNUMmatrix <double> smallMatrix (0, numberOfFactors, 0, numberOfParameters);
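	/*
	 * smallMatrix will hold the information matrix in its leading columns and the
	 * gradient of the log likelihood in its last column (index numberOfParameters);
	 * after the Gauss-Jordan sweep below, that last column holds the Newton-Raphson step.
	 */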
	autoLogisticRegression thee = LogisticRegression_create (my columnHeaders [dependent1]. label, my columnHeaders [dependent2]. label);
	for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
		double minimum = Table_getMinimum (me, factors [ivar]);
		double maximum = Table_getMaximum (me, factors [ivar]);
		Regression_addParameter (thee.peek(), my columnHeaders [factors [ivar]]. label, minimum, maximum, 0.0);
	}
	for (long icell = 1; icell <= numberOfCells; icell ++) {
		y0 [icell] = Table_getNumericValue_Assert (me, icell, dependent1);
		y1 [icell] = Table_getNumericValue_Assert (me, icell, dependent2);
		numberOfY0 += y0 [icell];
		numberOfY1 += y1 [icell];
		numberOfData += y0 [icell] + y1 [icell];
		x [icell] [0] = 1.0;   /* Intercept. */
		for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
			x [icell] [ivar] = Table_getNumericValue_Assert (me, icell, factors [ivar]);
			meanX [ivar] += x [icell] [ivar] * (y0 [icell] + y1 [icell]);
		}
	}
	if (numberOfY0 == 0 && numberOfY1 == 0)
		Melder_throw ("No data in either class. Cannot determine result.");
	if (numberOfY0 == 0)
		Melder_throw ("No data in class ", my columnHeaders [dependent1]. label, ". Cannot determine result.");
	if (numberOfY1 == 0)
		Melder_throw ("No data in class ", my columnHeaders [dependent2]. label, ". Cannot determine result.");
	/*
	 * Normalize the data.
	 */
	for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
		meanX [ivar] /= numberOfData;
		for (long icell = 1; icell <= numberOfCells; icell ++) {
			x [icell] [ivar] -= meanX [ivar];
		}
	}
	for (long icell = 1; icell <= numberOfCells; icell ++) {
		for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
			stdevX [ivar] += x [icell] [ivar] * x [icell] [ivar] * (y0 [icell] + y1 [icell]);
		}
	}
	for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
		stdevX [ivar] = sqrt (stdevX [ivar] / numberOfData);
		for (long icell = 1; icell <= numberOfCells; icell ++) {
			x [icell] [ivar] /= stdevX [ivar];
		}
	}
	/*
	 * Initial state of the iteration: the null model, whose maximum-likelihood
	 * intercept is the overall log odds ln (numberOfY1 / numberOfY0).
	 */
	thy intercept = log ((double) numberOfY1 / (double) numberOfY0);   // initial state of intercept: best guess for average log odds
	for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
		RegressionParameter parm = static_cast<RegressionParameter> (thy parameters -> item [ivar]);
		parm -> value = 0.0;   // initial state of dependence: none
	}
	long iteration = 1;
	for (; iteration <= 100; iteration ++) {
		previousLogLikelihood = logLikelihood;
		for (long ivar = 0; ivar <= numberOfFactors; ivar ++) {
			for (long jvar = ivar; jvar <= numberOfParameters; jvar ++) {
				smallMatrix [ivar] [jvar] = 0.0;
			}
		}
		/*
		 * Compute the current log likelihood.
		 */
		logLikelihood = 0.0;
		for (long icell = 1; icell <= numberOfCells; icell ++) {
			double fittedLogit = thy intercept, fittedP, fittedQ, fittedLogP, fittedLogQ, fittedPQ, fittedVariance;
			for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
				RegressionParameter parm = static_cast<RegressionParameter> (thy parameters -> item [ivar]);
				fittedLogit += parm -> value * x [icell] [ivar];
			}
			/*
			 * Basically we have fittedP = 1.0 / (1.0 + exp (- fittedLogit)),
			 * but that works neither for fittedP values near 0 nor for values near 1.
			 */
			if (fittedLogit > 15.0) {
				/*
				 * For large fittedLogit, fittedLogP = ln (1/(1+exp(-fittedLogit))) = -ln (1+exp(-fittedLogit)) =~ - exp(-fittedLogit)
				 */
				fittedLogP = - exp (- fittedLogit);
				fittedLogQ = - fittedLogit;
				fittedPQ = exp (- fittedLogit);
				fittedP = exp (fittedLogP);
				fittedQ = 1.0 - fittedP;
			} else if (fittedLogit < -15.0) {
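				/*
				 * For very negative fittedLogit, fittedP =~ exp (fittedLogit), so that
				 * fittedLogP =~ fittedLogit and fittedLogQ = ln (1 - fittedP) =~ - exp (fittedLogit).
				 */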
				fittedLogP = fittedLogit;
				fittedLogQ = - exp (fittedLogit);
				fittedPQ = exp (fittedLogit);
				fittedP = exp (fittedLogP);
				fittedQ = 1 - fittedP;
			} else {
				fittedP = 1.0 / (1.0 + exp (- fittedLogit));
				fittedLogP = log (fittedP);
				fittedQ = 1.0 - fittedP;
				fittedLogQ = log (fittedQ);
				fittedPQ = fittedP * fittedQ;
			}
			logLikelihood += -2 * (y1 [icell] * fittedLogP + y0 [icell] * fittedLogQ);   // accumulates minus two times the log likelihood, i.e. the deviance
			/*
			 * Matrix shifting stuff.
			 * Suppose a + b Sk + c Tk = ln (pk / qk),
			 * where {a, b, c} are the coefficients to be optimized,
			 * Sk and Tk are properties of stimulus k,
			 * and pk and qk are the fitted probabilities for y1 and y0, respectively, given stimulus k.
			 * Then ln pk = - ln (1 + qk / pk) = - ln (1 + exp (- (a + b Sk + c Tk)))
			 * d ln pk / da = 1 / (1 + exp (a + b Sk + c Tk)) = qk
			 * d ln pk / db = qk Sk
			 * d ln pk / dc = qk Tk
			 * d ln qk / da = - pk
			 * Now LL = Sum(k) (y1k ln pk + y0k ln qk)
			 * so that dLL/da = Sum(k) (y1k d ln pk / da + y0k d ln qk / da) = Sum(k) (y1k qk - y0k pk)
			 */
			fittedVariance = fittedPQ * (y0 [icell] + y1 [icell]);
			for (long ivar = 0; ivar <= numberOfFactors; ivar ++) {
				/*
				 * The last column gets the gradient of LL: dLL/da, dLL/db, dLL/dc.
				 */
				smallMatrix [ivar] [numberOfParameters] += x [icell] [ivar] * (y1 [icell] * fittedQ - y0 [icell] * fittedP);
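				/*
				 * The other columns accumulate minus the Hessian of LL (the information matrix):
				 * e.g. -d2LL/db dc = Sum(k) (y0k + y1k) pk qk Sk Tk.
				 */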
				for (long jvar = ivar; jvar <= numberOfFactors; jvar ++) {
					smallMatrix [ivar] [jvar] += x [icell] [ivar] * x [icell] [jvar] * fittedVariance;
				}
			}
		}
		if (fabs (logLikelihood - previousLogLikelihood) < 1e-11) {
			break;
		}
		/*
		 * Make matrix symmetric.
		 */
		for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
			for (long jvar = 0; jvar < ivar; jvar ++) {
				smallMatrix [ivar] [jvar] = smallMatrix [jvar] [ivar];
			}
		}
		/*
		 * Invert the information matrix by Gauss-Jordan elimination (without pivoting),
		 * sweeping the gradient in the last column along with it, so that that column
		 * ends up containing the Newton-Raphson update: information^-1 * gradient.
		 */
		for (long ivar = 0; ivar <= numberOfFactors; ivar ++) {
			double pivot = smallMatrix [ivar] [ivar];   /* Save diagonal. */
			smallMatrix [ivar] [ivar] = 1.0;
			for (long jvar = 0; jvar <= numberOfParameters; jvar ++) {
				smallMatrix [ivar] [jvar] /= pivot;
			}
			for (long jvar = 0; jvar <= numberOfFactors; jvar ++) {
				if (jvar != ivar) {
					double temp = smallMatrix [jvar] [ivar];
					smallMatrix [jvar] [ivar] = 0.0;
					for (long kvar = 0; kvar <= numberOfParameters; kvar ++) {
						smallMatrix [jvar] [kvar] -= temp * smallMatrix [ivar] [kvar];
					}
				}
			}
		}
		/*
		 * Update the parameters from the last column of smallMatrix.
		 */
		thy intercept += smallMatrix [0] [numberOfParameters];
		for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
			RegressionParameter parm = static_cast<RegressionParameter> (thy parameters -> item [ivar]);
			parm -> value += smallMatrix [ivar] [numberOfParameters];
		}
	}
	if (iteration > 100) {
		Melder_warning (U"Logistic regression has not converged in 100 iterations. The results are unreliable.");
	}
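	/*
	 * Undo the normalization: scale each slope back to the original units of its
	 * factor, and shift the intercept accordingly.
	 */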
	for (long ivar = 1; ivar <= numberOfFactors; ivar ++) {
		RegressionParameter parm = static_cast<RegressionParameter> (thy parameters -> item [ivar]);
		parm -> value /= stdevX [ivar];
		thy intercept -= parm -> value * meanX [ivar];
	}
	return thee.transfer();
}
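/*
 * A self-contained sketch of the same Newton-Raphson iteration for a single
 * predictor, outside the Praat framework. Illustrative only: the toy data and
 * all names are invented here, and the 2-by-2 system is solved in closed form
 * instead of by the Gauss-Jordan sweep used above.
 */
#include <cstdio>
#include <cmath>

int main () {
	const double x  [] = { -2.0, -1.0, 0.0, 1.0, 2.0 };      // stimulus values
	const double y1 [] = { 1.0, 3.0, 5.0, 7.0, 9.0 };        // responses in class 1
	const double n  [] = { 10.0, 10.0, 10.0, 10.0, 10.0 };   // total responses per stimulus
	double a = 0.0, b = 0.0;   // intercept and slope
	for (int iteration = 1; iteration <= 100; iteration ++) {
		double g0 = 0.0, g1 = 0.0;                // gradient of the log likelihood
		double h00 = 0.0, h01 = 0.0, h11 = 0.0;   // information matrix (minus the Hessian)
		for (int k = 0; k < 5; k ++) {
			const double p = 1.0 / (1.0 + exp (- (a + b * x [k])));
			const double w = n [k] * p * (1.0 - p);
			g0 += y1 [k] - n [k] * p;             // dLL/da = Sum(k) (y1k qk - y0k pk) = Sum(k) (y1k - nk pk)
			g1 += (y1 [k] - n [k] * p) * x [k];   // dLL/db
			h00 += w;
			h01 += w * x [k];
			h11 += w * x [k] * x [k];
		}
		const double det = h00 * h11 - h01 * h01;
		const double da = (h11 * g0 - h01 * g1) / det;   // Newton step: information^-1 * gradient
		const double db = (h00 * g1 - h01 * g0) / det;
		a += da;
		b += db;
		if (fabs (da) + fabs (db) < 1e-11)
			break;
	}
	printf ("intercept %g, slope %g\n", a, b);
}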