void stat_calcUngapKarlinParameters(double *dbCharProb, double *queryCharProb, int scoringMatrix[16][16]) { int4 scoreProbSize, highest, lowest; double * scoreProb; int i, j; lowest = stat_scoreBlock.mismatch; highest = stat_scoreBlock.match; scoreProbSize = highest-lowest+1; scoreProb = (double *) calloc(scoreProbSize, sizeof(double)); for (i=0; i<16; i++) { for (j=0; j<16; j++) { scoreProb[scoringMatrix[i][j] - lowest] += dbCharProb[i] * queryCharProb[j]; } } // calculate the Karlin parameters, refer to karlin.c BlastKarlinBlkCalc(scoreProb-lowest, lowest, highest); stat_ungapH = BlastKarlin_H; stat_ungapK = BlastKarlin_K; stat_ungapLambda = BlastKarlin_lambda; if (stat_ungapK <= 0.001) { fprintf(stderr,"ERROR: unable to calculate the KARLIN statistic for the given scoring scheme.\n"); exit(-1); } free(scoreProb); }
void statistics_calculateUngappedKarlinParameters(struct PSSMatrix PSSMatrix) { int4 highest, lowest; double* scoreProbabilities; int4 queryPosition; unsigned char subjectCode; double probability, sum = 0; int4 score; int4 numRegularLettersInQuery = 0; static float* alphabetFrequencies; if (encoding_alphabetType == encoding_nucleotide) { alphabetFrequencies = Nucleotide_prob; } else { // Use Robinson&Robinson frequencies for protein alphabet alphabetFrequencies = Robinson_prob; } highest = PSSMatrix.highestValue; lowest = PSSMatrix.lowestValue; // Initialize array to hold probability values for range of possible scores scoreProbabilities = (double*)global_malloc(sizeof(double) * (highest - lowest + 1)); scoreProbabilities -= lowest; score = lowest; while (score <= highest) { scoreProbabilities[score] = 0.0; score++; } // Determine the number of regular letters (ie. non-ambigious codes) in the query queryPosition = 0; while (queryPosition < statistics_querySize) { // if (PSSMatrix.queryCodes[queryPosition] < encoding_numRegularLetters) if (PSSMatrix.queryCodes[queryPosition] != encoding_numRegularLetters) { numRegularLettersInQuery++; } queryPosition++; } // For each position in the query PSSMatrix that does not represent an ambigious code queryPosition = 0; while (queryPosition < statistics_querySize) { // if (PSSMatrix.queryCodes[queryPosition] < encoding_numRegularLetters) if (PSSMatrix.queryCodes[queryPosition] != encoding_numRegularLetters) { // For each regular amino-acid subject code subjectCode = 0; while (subjectCode < encoding_numRegularLetters) { // Calculate probability that, if a residue was randomly chosen from the subject database // and a position was randomly chosen from query, they would be subjectCode and queryPosition // respectively probability = alphabetFrequencies[subjectCode] / (float)numRegularLettersInQuery; // Calculate score of aligning query position to this subject residue score = PSSMatrix.matrix[queryPosition][subjectCode]; // Add to set of probabilities scoreProbabilities[score] += probability; subjectCode++; } } queryPosition++; } // Calculate residue frequency normalizing value subjectCode = 0; while (subjectCode < encoding_numRegularLetters) { sum += alphabetFrequencies[subjectCode]; subjectCode++; } // Normalized probabilities by dividing them by the sum of the robinson frequency values score = lowest; while (score <= highest) { scoreProbabilities[score] /= sum; score++; } // Calculate the Lambda, H and K values for these score probabilities // See karlin.c for implementation BlastKarlinBlkCalc(scoreProbabilities, lowest, highest); statistics_ungappedH = BlastKarlin_H; statistics_ungappedK = BlastKarlin_K; statistics_ungappedLambda = BlastKarlin_lambda; if (statistics_ungappedK <= 0.0001) { fprintf(stderr, "Error: unable to calculate statistical significance for given scoring scheme\n"); fflush(stderr); exit(-1); } scoreProbabilities += lowest; free(scoreProbabilities); }