/* Function:  esl_hyperexp_Read()
 *
 * Purpose:   Reads hyperexponential parameters from <e>, which is an
 *            <ESL_FILEPARSER> tokenizer for an open stream.
 *
 *            The first token is <K>, the number of mixture components.
 *            The second token is <mu>, the x offset shared by all components.
 *            Then for each mixture component <k=1..K>, it reads
 *            a mixture coefficient <q[k]> and a decay parameter
 *            <lambda[k]>.
 *
 *            The <2K+2> data tokens must occur in this order, but
 *            they can be grouped into any number of lines, because the
 *            parser ignores line breaks.
 *
 *            Anything after a <\#> character on a line is a comment, and
 *            is ignored.
 *
 *            Every numeric token must be consumed in full by the
 *            conversion: trailing garbage ("0.5x"), non-numeric text,
 *            NaN, and (for <mu> and <lambda>) infinities are rejected
 *            as format errors rather than being silently read as 0.
 *            The coefficients must sum to 1 within a tolerance of 0.05,
 *            and are renormalized before being returned.
 *
 * Returns:   <eslOK> on success, and <ret_hxp> points to a new <ESL_HYPEREXP>
 *            object.
 *            <eslEFORMAT> on "normal" parse failure caused by a bad file
 *            format that's likely the user's fault; <*ret_hxp> is NULL.
 *            Any non-OK status from the tokenizer (including running out
 *            of tokens) is also reported as <eslEFORMAT>.
 *
 * Throws:    <eslEMEM> if allocation of the new <ESL_HYPEREXP> fails;
 *            <*ret_hxp> is NULL.
 *
 * FIXME: All our mixture models (esl_dirichlet, for example) should be
 *        reconciled w/ identical interfaces & behaviour.
 */
int
esl_hyperexp_Read(ESL_FILEPARSER *e, ESL_HYPEREXP **ret_hxp)
{
  ESL_HYPEREXP *hxp    = NULL;
  char         *tok;
  char         *end;           /* first unconverted char, from strtol()/strtod() */
  long          lnc;
  int           status = eslOK;
  int           nc;
  int           k;
  double        sum;

  /* Never leave the caller's pointer indeterminate on a failure path. */
  *ret_hxp = NULL;

  esl_fileparser_SetCommentChar(e, '#');

  /* Token 1: K, the number of components. */
  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  lnc = strtol(tok, &end, 10);
  nc  = (int) lnc;
  /* (long) nc != lnc catches values that don't fit in an int. */
  if (end == tok || *end != '\0' || lnc < 1 || (long) nc != lnc)
    {
      sprintf(e->errbuf, "Expected # of components K >= 1 as first token");
      goto ERROR;
    }

  if ((hxp = esl_hyperexp_Create(nc)) == NULL) return eslEMEM; /* percolation */

  /* Token 2: mu, the shared x offset. */
  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  hxp->mu = strtod(tok, &end);
  if (end == tok || *end != '\0' || ! isfinite(hxp->mu))
    {
      sprintf(e->errbuf, "Expected a finite real number for offset mu as second token");
      goto ERROR;
    }

  /* Then K pairs of (q[k], lambda[k]). Each token is converted and checked
   * before the next one is requested, so <tok>/<end> are never used after
   * the tokenizer has moved on.
   */
  for (k = 0; k < hxp->K; k++)
    {
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->q[k] = strtod(tok, &end);
      /* Written as a negated "in range" test so that NaN fails it. */
      if (end == tok || *end != '\0' || ! (hxp->q[k] >= 0. && hxp->q[k] <= 1.))
        {
          sprintf(e->errbuf, "Expected a mixture coefficient q[k], 0<=q[k]<=1");
          goto ERROR;
        }

      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->lambda[k] = strtod(tok, &end);
      if (end == tok || *end != '\0' || ! isfinite(hxp->lambda[k]) || ! (hxp->lambda[k] > 0.))
        {
          sprintf(e->errbuf, "Expected a lambda parameter, lambda>0");
          goto ERROR;
        }
    }

  /* Tolerate small rounding error in the file, then renormalize exactly. */
  sum = esl_vec_DSum(hxp->q, hxp->K);
  if (! (fabs(sum-1.0) <= 0.05))
    {
      sprintf(e->errbuf, "Expected mixture coefficients to sum to 1");
      goto ERROR;
    }
  esl_vec_DNorm(hxp->q, hxp->K);

  *ret_hxp = hxp;
  return eslOK;

 ERROR:
  esl_hyperexp_Destroy(hxp);
  return eslEFORMAT;
}
/* read_mask_file()
 *
 * Open <filename>, treat '#' as the comment character, and read the
 * first token of the file. The token must consist solely of '0' and
 * '1' characters. A newly allocated, NUL-terminated copy of it is
 * handed back in <*ret_mask> and its length in <*ret_mask_len>.
 *
 * Args:    filename     - path of the mask file to read
 *          errbuf       - receives the error message on failure
 *          ret_mask     - RETURN: copy of the mask token
 *          ret_mask_len - RETURN: number of characters in the mask
 *
 * Returns: eslOK on success.
 *          eslFAIL, with a message in <errbuf>, if the file can't be
 *          opened, if no token can be read from it, or if the token
 *          contains a character other than '0' or '1'.
 *          If the allocation fails, whatever status ESL_ALLOC leaves
 *          in <status> is returned.
 *          On any failure <*ret_mask> and <*ret_mask_len> are not written.
 */
int
read_mask_file(char *filename, char *errbuf, char **ret_mask, int *ret_mask_len)
{
  ESL_FILEPARSER *parser = NULL;
  char           *copy   = NULL;
  char           *token;
  int             ntok;
  int             pos;
  int             status;

  if (esl_fileparser_Open(filename, NULL, &parser) != eslOK)
    ESL_XFAIL(eslFAIL, errbuf, "failed to open %s in read_mask_file\n", filename);
  esl_fileparser_SetCommentChar(parser, '#');

  status = esl_fileparser_GetToken(parser, &token, &ntok);
  if (status != eslOK)
    ESL_XFAIL(eslFAIL, errbuf, "failed to read a single token from %s\n", filename);

  /* One extra byte for the terminating NUL. */
  ESL_ALLOC(copy, sizeof(char) * (ntok+1));

  /* Validate and copy in a single pass; bail out on the first bad character. */
  for (pos = 0; pos < ntok; pos++)
    {
      if (token[pos] != '0' && token[pos] != '1')
        ESL_XFAIL(eslFAIL, errbuf, "read a non-0 and non-1 character (%c) in the mask file %s\n", token[pos], filename);
      copy[pos] = token[pos];
    }
  copy[pos] = '\0';

  esl_fileparser_Close(parser);
  *ret_mask_len = pos;
  *ret_mask     = copy;
  return eslOK;

 ERROR:
  if (parser != NULL) esl_fileparser_Close(parser);
  free(copy);  /* free(NULL) is a no-op */
  return status;
}
/* Function:  esl_paml_ReadE()
 * Incept:    SRE, Fri Jul  9 09:27:24 2004 [St. Louis]
 *
 * Purpose:   Read an amino acid rate matrix in PAML format from stream
 *            <fp>. Return it in two pieces: the symmetric E
 *            exchangeability matrix in <E>, and the stationary
 *            probability vector $\pi$ in <pi>.
 *            Caller provides the memory for both <E> and <pi>.  <E>
 *            is a $20 \times 20$ matrix allocated as
 *            <esl_dmatrix_Create(20, 20)>. <pi> is an array with
 *            space for at least 20 doubles.
 *
 *            The <E> matrix is symmetric for off-diagonal elements:
 *            $E_{ij} = E_{ji}$ for $i \neq j$.  The on-diagonal
 *            elements $E_{ii}$ are not valid and should not be
 *            accessed.  (They are set to zero.)
 *            The rate matrix will later be obtained from <E>
 *            and <pi> as
 *               $Q_{ij} = E_{ij} \pi_j$ for $i \neq j$
 *            and
 *               $Q_{ii} = -\sum_{j \neq i} Q_{ij}$
 *            then scaled to units of one
 *            substitution/site; see <esl_ratemx_E2Q()> and
 *            <esl_ratemx_ScaleTo()>.
 *
 *            Data file format: First 190 numbers are a
 *            lower-triangular matrix E of amino acid
 *            exchangeabilities $E_{ij}$. Next 20 numbers are the
 *            amino acid frequencies $\pi_i$. Remainder of the
 *            datafile is ignored.
 *
 *            The alphabet order in the matrix and the frequency
 *            vector is assumed to be "ARNDCQEGHILKMFPSTWYV"
 *            (alphabetical by three-letter code), which appears to be
 *            PAML's default order. This is transformed to Easel's
 *            "ACDEFGHIKLMNPQRSTVWY" (alphabetical by one-letter code)
 *            in the $E_{ij}$ and $\pi_i$ that are returned.
 *
 * Args:      fp   - open datafile for reading.
 *            E    - RETURN: E matrix of amino acid exchangeabilities e_ij,
 *                     symmetric (E_ij = E_ji),
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *            pi   - RETURN: \pi_i vector of amino acid frequencies,
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *
 * Returns:   <eslOK> on success.
 *            Returns <eslEOF> on premature end of file (parse failed), in which
 *            case the contents of <E> and <pi> are undefined.
 *
 * Throws:    <eslEMEM> on internal allocation failure,
 *            and the contents of <E> and <pi> are undefined.
 *
 * Xref:      STL8/p.56.
 */
int
esl_paml_ReadE(FILE *fp, ESL_DMATRIX *E, double *pi)
{
  static const char pamlorder[] = "ARNDCQEGHILKMFPSTWYV";
  static const char eslorder[]  = "ACDEFGHIKLMNPQRSTVWY";
  ESL_FILEPARSER *efp = NULL;
  char           *tok;
  int             perm[20];
  int             i,j;
  int             status;

  if ((status = esl_dmatrix_SetZero(E)) != eslOK) goto ERROR;
  esl_vec_DSet(pi, 20, 0.);

  /* <status> is still eslOK here, so it has to be set explicitly: otherwise
   * a failed parser allocation would be reported to the caller as success.
   */
  if ((efp = esl_fileparser_Create(fp)) == NULL) { status = eslEMEM; goto ERROR; }
  if ((status = esl_fileparser_SetCommentChar(efp, '#')) != eslOK) goto ERROR;

  /* Construct the alphabet permutation we need.
   * perm[i] -> original row/column i goes to row/column perm[i].
   * Both strings contain the same 20 letters, so strchr() always finds a match.
   */
  for (i = 0; i < 20; i++)
    perm[i] = (int) (strchr(eslorder, pamlorder[i]) - eslorder);

  /* Read the s_ij matrix data in, permuting as we go.
   * The file holds the strict lower triangle; mirror each value to keep E symmetric.
   */
  for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
        if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
        E->mx[perm[i]][perm[j]] = atof(tok);
        E->mx[perm[j]][perm[i]] = E->mx[perm[i]][perm[j]];
      }

  /* Read the pi_i vector in, permuting as we read. */
  for (i = 0; i < 20; i++)
    {
      if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
      pi[perm[i]] = atof(tok);
    }

  esl_fileparser_Destroy(efp);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Destroy(efp);
  return status;
}
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the
 *            <P7_BG> object <bg>.
 *
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            File format, as parsed here: the first token is the
 *            alphabet type, which must match <bg>'s alphabet. Each
 *            following data line holds exactly two tokens: a single
 *            canonical residue letter and its probability, which must
 *            lie in the range 0..1. Each canonical residue must appear
 *            exactly once, and the probabilities must sum to 1.0
 *            (tolerance 0.001); they are renormalized before being
 *            copied into <bg>. '#' starts a comment.
 *
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 *
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp = NULL;
  float          *fq  = NULL;   /* staging copy; <bg> is only touched once everything has validated */
  int             n   = 0;      /* number of residue lines accepted so far */
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  float           p;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status = esl_fileparser_Open(bgfile, NULL, &efp);
  if (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if (alphatype == eslUNKNOWN)
    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  if (alphatype != bg->abc->type)
    ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);

  /* -1.0 marks "not seen yet". Accepted probabilities are restricted to 0..1
   * below, so a value read from the file can never collide with this sentinel.
   */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* Field 1: a single canonical residue letter. */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      if (status != eslOK)  goto ERROR;

      if (toklen != 1 || ! esl_abc_CIsCanonical(bg->abc, *tok))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)
        ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* Field 2: its probability. */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* Negated "in range" test so that NaN is rejected as well. */
      p = (float) atof(tok);
      if (! (p >= 0.0f && p <= 1.0f))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability in the range 0..1, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
      fq[x] = p;

      /* Nothing else is allowed on the line. */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  /* Duplicates were rejected above, so n == K means every residue was seen once. */
  if (n != bg->abc->K)
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if (esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK)
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);

  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  free(fq);  /* free(NULL) is a no-op */
  if (efp) esl_fileparser_Close(efp);
  return status;
}