/*
 * Compute a single row of the MIp matrix.
 *
 * m   : the FASTA sequences concatenated into one array; passed straight
 *       through to calcMI (len(m) must equal l * n).
 * n   : number of sequences.
 * l   : length of each sequence (number of columns).
 * pos : index of the row to compute, 0 <= pos < l.
 *
 * The matrix returned by calcMI is read as l * l doubles in row-major order.
 * For every row the mean is taken as (row sum) / (l - 1); the overall mean is
 * the average of those row means.  The result is
 *
 *     mip[j] = mi[pos][j] - mean[pos] * mean[j] / allmean
 *
 * Returns a malloc'ed array of l doubles that the caller must free, or NULL
 * when l or pos is out of range, when calcMI yields NULL, or when the
 * allocation fails.
 *
 * NOTE(review): the buffer returned by calcMI is never released here.  If
 * calcMI hands over ownership of heap memory this leaks on every call --
 * confirm calcMI's contract and add the matching free().
 */
double* calcMIp_line(char m[],int n,int l, int pos)
{
    double *mi;
    double *mip;
    double allmean = 0;
    double mean_pos;
    size_t width;
    size_t row, col, k;

    /* A non-positive length or an out-of-range row cannot be indexed. */
    if (l <= 0 || pos < 0 || pos >= l)
        return NULL;

    mi = calcMI(m, n, l);
    if (!mi)
        return NULL;

    width = (size_t) l;
    mip = malloc(width * sizeof *mip);
    if (!mip)
        return NULL;

    /*
     * First pass: stage the per-row means in the output buffer.  This avoids
     * a variable-length array on the stack whose size is caller-controlled.
     */
    k = 0;
    for (row = 0; row < width; row++) {
        double sum = 0;

        for (col = 0; col < width; col++)
            sum += mi[k++];
        mip[row] = sum / (l - 1);
        allmean += mip[row];
    }
    allmean = allmean / l;

    /*
     * Second pass: turn each staged mean into the final value.  mean[pos] is
     * saved first because mip[pos] itself is overwritten during this loop.
     */
    mean_pos = mip[pos];
    for (col = 0; col < width; col++)
        mip[col] = mi[(size_t) pos * width + col] - ((mean_pos * mip[col]) / allmean);

    return mip;
}
/*
 * Map one raw MSA byte onto an alphabet index.
 *
 * Bytes above 90 ('Z') have 96 subtracted and all others have 64 subtracted,
 * so ASCII letters of either case land on 1..26.  Anything that falls outside
 * 1..26 after the subtraction is treated as a gap and mapped to 0.
 */
static unsigned char msamutinfo_encode(char c)
{
    unsigned char a = (unsigned char) c;

    if (a > 90)
        a -= 96;
    else
        a -= 64;
    if (a < 1 || a > 26)
        a = 0; /* gap character */
    return a;
}

/*
 * Fill `mutinfo` with the pairwise mutual information of the columns of `msa`.
 *
 * Python arguments:
 *   msa       : character array, read as number x length (rows x columns).
 *   mutinfo   : output array, written in place as a length x length matrix of
 *               doubles (assumed to be preallocated by the caller with that
 *               shape and dtype -- not validated here).
 *   ambiguity : when non-zero (default 1), probability mass of B, J, Z and X
 *               is redistributed and sortJoint is applied to the joint table.
 *   turbo     : when non-zero (default 1), the encoded columns are cached in
 *               a transposed buffer; silently falls back to the slower path
 *               if that buffer cannot be allocated.
 *   norm      : when non-zero, each value is divided by jointEntropy(joint).
 *   debug     : when non-zero, intermediate tables are printed.
 *
 * Returns a new reference to `mutinfo`, or NULL with an exception set.
 *
 * NOTE(review): msa is assumed to be a 2-D array with at least one row and
 * one column; neither its type nor its shape is checked in this function.
 */
static PyObject *msamutinfo(PyObject *self, PyObject *args, PyObject *kwargs)
{
    PyArrayObject *msa, *mutinfo;
    int ambiguity = 1, turbo = 1, debug = 0, norm = 0;
    static char *kwlist[] = {"msa", "mutinfo", "ambiguity", "turbo",
                             "norm", "debug", NULL};

    /* Everything released at `cleanup` starts out NULL. */
    PyObject *result = NULL;
    unsigned char *first = NULL;   /* encoded column 0 / scratch column */
    unsigned char **trans = NULL;  /* turbo: encoded columns, one row each */
    double **probs = NULL;         /* length x NUMCHARS column probabilities */
    double **joint = NULL;         /* NUMCHARS x NUMCHARS joint probabilities */

    unsigned char *iseq, *jseq;
    unsigned char a, b;
    double *prow, *jrow, *row;
    double *mut;
    double p_incr, prb;
    char *seq;
    long number, length;
    long i, j, k, l, diff, offset, ioffset;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|iiii", kwlist,
                                     &msa, &mutinfo, &ambiguity, &turbo,
                                     &norm, &debug))
        return NULL;

    /*
     * Make sure to have a contiguous and well-behaved array.  This yields a
     * new reference, which is released at `cleanup` on every path.
     */
    msa = PyArray_GETCONTIGUOUS(msa);
    if (!msa)
        return NULL;

    /* check dimensions */
    number = PyArray_DIMS(msa)[0];
    length = PyArray_DIMS(msa)[1];

    /* get pointers to data */
    seq = (char *) PyArray_DATA(msa); /* size: number x length */
    mut = (double *) PyArray_DATA(mutinfo);

    /* allocate memory */
    first = malloc(number * sizeof(unsigned char));
    if (!first) {
        result = PyErr_NoMemory();
        goto cleanup;
    }
    iseq = first;
    jseq = first; /* so that we don't get uninitialized warning */

    if (turbo) {
        /* hold transpose of the encoded character array */
        trans = malloc(length * sizeof(unsigned char *));
        if (!trans) {
            turbo = 0;
        } else {
            for (i = 0; i < length; i++)
                trans[i] = NULL;
            /* Column 0 lives in `first`; rows 1.. are owned by `trans`. */
            if (length > 0)
                trans[0] = first;
            for (i = 1; i < length; i++) {
                trans[i] = malloc(number * sizeof(unsigned char));
                if (!trans[i]) {
                    /*
                     * Fall back to the non-turbo path.  Stop allocating and
                     * forget the freed table so it is neither written to
                     * again nor freed a second time.
                     */
                    for (j = 1; j < i; j++)
                        free(trans[j]);
                    free(trans);
                    trans = NULL;
                    turbo = 0;
                    break;
                }
            }
        }
    }

    /* length x NUMCHARS, a row for each column in the MSA */
    probs = malloc(length * sizeof(double *));
    if (!probs) {
        result = PyErr_NoMemory();
        goto cleanup;
    }
    for (i = 0; i < length; i++)
        probs[i] = NULL;
    for (i = 0; i < length; i++) {
        prow = malloc(NUMCHARS * sizeof(double));
        if (!prow) {
            result = PyErr_NoMemory();
            goto cleanup;
        }
        for (j = 0; j < NUMCHARS; j++)
            prow[j] = 0;
        probs[i] = prow;
    }

    /* NUMCHARS x NUMCHARS, alphabet characters and a gap */
    joint = malloc(NUMCHARS * sizeof(double *));
    if (!joint) {
        result = PyErr_NoMemory();
        goto cleanup;
    }
    for (i = 0; i < NUMCHARS; i++)
        joint[i] = NULL;
    for (i = 0; i < NUMCHARS; i++) {
        joint[i] = malloc(NUMCHARS * sizeof(double));
        if (!joint[i]) {
            result = PyErr_NoMemory();
            goto cleanup;
        }
    }

    if (debug)
        printProbs(probs, length);

    p_incr = 1. / number;
    prb = 0;
    prow = probs[0];

    /* START mutinfo calculation */
    /* calculate first row of MI matrix and all column probabilities */
    i = 0;
    mut[0] = 0;
    for (j = 1; j < length; j++) {
        mut[j * length + j] = 0; /* using empty, so needed for diagonal */
        jrow = probs[j];
        zeroJoint(joint);
        diff = j - 1;
        if (turbo) /* in turbo mode, there is a row for refined sequences */
            jseq = trans[j];
        for (k = 0; k < number; k++) {
            offset = k * length;
            if (diff) {
                /* column 0 was already encoded while handling j == 1 */
                a = iseq[k];
            } else {
                a = msamutinfo_encode(seq[offset + i]);
                iseq[k] = a;
                prow[a] += p_incr;
            }

            b = msamutinfo_encode(seq[offset + j]);
            if (turbo) /* we keep the refined chars for all sequences */
                jseq[k] = b;
            joint[a][b] += p_incr;
            jrow[b] += p_incr;
        }

        if (ambiguity) {
            if (debug)
                printProbs(probs, length);
            /* column 0 is only adjusted together with column 1 */
            if (diff)
                k = j;
            else
                k = 0;
            for (; k <= j; k++) {
                row = probs[k];
                prb = row[2];
                if (prb > 0) { /* B -> D, N */
                    prb = prb / 2.;
                    row[4] += prb;
                    row[14] += prb;
                    row[2] = 0;
                }
                prb = row[10];
                if (prb > 0) { /* J -> I, L */
                    prb = prb / 2.;
                    row[9] += prb;
                    row[12] += prb;
                    row[10] = 0;
                }
                prb = row[26];
                if (prb > 0) { /* Z -> E, Q */
                    prb = prb / 2.;
                    row[5] += prb;
                    row[17] += prb;
                    row[26] = 0;
                }
                if (row[24] > 0) { /* X -> 20 AA */
                    prb = row[24] / 20.;
                    for (l = 0; l < 20; l++)
                        row[twenty[l]] += prb;
                    row[24] = 0;
                }
            }

            if (debug)
                printProbs(probs, length);
            if (debug)
                printJoint(joint, i, j);
            sortJoint(joint);
            if (debug)
                printJoint(joint, i, j);
        }

        /* the matrix is symmetric: store both (0, j) and (j, 0) */
        if (norm)
            mut[j] = mut[length * j] =
                calcMI(joint, probs, i, j, debug) / jointEntropy(joint);
        else
            mut[j] = mut[length * j] = calcMI(joint, probs, i, j, debug);
    }
    if (debug)
        printProbs(probs, length);

    /* calculate rest of MI matrix */
    for (i = 1; i < length; i++) {
        ioffset = i * length;
        if (turbo)
            iseq = trans[i];
        for (j = i + 1; j < length; j++) {
            zeroJoint(joint);
            if (turbo) {
                jseq = trans[j];
                for (k = 0; k < number; k++)
                    joint[iseq[k]][jseq[k]] += p_incr;
            } else {
                /* column i is encoded once (at j == i + 1) and then reused */
                diff = j - i - 1;
                for (k = 0; k < number; k++) {
                    offset = k * length;
                    if (diff) {
                        a = iseq[k];
                    } else {
                        a = msamutinfo_encode(seq[offset + i]);
                        iseq[k] = a;
                    }

                    b = msamutinfo_encode(seq[offset + j]);
                    joint[a][b] += p_incr;
                }
            }
            if (ambiguity)
                sortJoint(joint);
            if (norm)
                mut[ioffset + j] = mut[i + length * j] =
                    calcMI(joint, probs, i, j, debug) / jointEntropy(joint);
            else
                mut[ioffset + j] = mut[i + length * j] =
                    calcMI(joint, probs, i, j, debug);
        }
    }

    result = Py_BuildValue("O", mutinfo);

cleanup:
    /* free memory; every table was NULL-filled, so partial states are safe */
    if (joint) {
        for (i = 0; i < NUMCHARS; i++)
            free(joint[i]);
        free(joint);
    }
    if (probs) {
        for (i = 0; i < length; i++)
            free(probs[i]);
        free(probs);
    }
    if (trans) {
        /* row 0 aliases `first`, which is released separately below */
        for (i = 1; i < length; i++)
            free(trans[i]);
        free(trans);
    }
    free(first);
    Py_DECREF(msa);
    return result;
}