/* Function:  esl_msafile_a2m_GuessAlphabet()
 * Synopsis:  Guess the alphabet of an open A2M MSA file.
 *
 * Purpose:   Guess the alphabet of the sequences in open
 *            A2M format MSA file <afp>.
 *
 *            Letters on sequence lines are tallied case-insensitively
 *            into 26 per-letter counts, which are handed to
 *            <esl_abc_GuessAlphabet()>. Lines that are blank, or whose
 *            first non-whitespace character is '>', are skipped.
 *
 *            On a normal return, <*ret_type> is set to <eslDNA>,
 *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
 *            original position.
 *
 * Args:      afp      - open A2M format MSA file
 *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
 *
 * Returns:   <eslOK> on success.
 *            <eslENOALPHABET> if alphabet type can't be determined.
 *            In either case, <afp> is rewound to the position it
 *            started at.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslESYS> on failures of fread() or other system calls.
 *            <eslEINCONCEIVABLE> if the anchor cannot be set.
 *            On any thrown error, <*ret_type> is set to <eslUNKNOWN>.
 */
int
esl_msafile_a2m_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
{
  int       alphatype    = eslUNKNOWN;
  esl_pos_t anchor       = -1;
  int       threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
  int       nsteps       = 3;
  int       step         = 0;
  int64_t   nres         = 0;   /* 64-bit, like ct[]: we may count all the way to EOF of a huge file */
  int       x;
  int64_t   ct[26];
  char     *p;
  esl_pos_t n, pos;
  int       status;

  for (x = 0; x < 26; x++) ct[x] = 0;

  /* Anchor the buffer at the current offset so we can rewind when done. */
  anchor = esl_buffer_GetOffset(afp->bf);
  if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */

  while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
    {
      /* Skip leading whitespace, then ignore blank lines and '>' lines.
       * <ctype.h> functions require an unsigned char value (or EOF);
       * passing a negative plain char is undefined behavior.
       */
      while (n && isspace((unsigned char) *p)) { p++; n--; }
      if    (!n || *p == '>') continue;

      for (pos = 0; pos < n; pos++)
        {
          unsigned char c = (unsigned char) p[pos];

          if (isalpha(c))
            {
              x = toupper(c) - 'A';
              /* A locale may classify a non-ASCII byte as alphabetic;
               * never index outside ct[0..25].
               */
              if (x < 0 || x >= 26) continue;
              ct[x]++;
              nres++;
            }
        }

      /* try to stop early, checking after 500, 5000, and 50000 residues: */
      if (step < nsteps && nres > threshold[step])
        {
          if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
          step++;
        }
    }
  if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
  status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */

  /* deliberate flowthrough...*/
 DONE:
  esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
  esl_buffer_RaiseAnchor(afp->bf, anchor);
  *ret_type = alphatype;
  return status;

 ERROR:
  /* Only rewind if we got as far as recording an offset. */
  if (anchor != -1) {
    esl_buffer_SetOffset(afp->bf, anchor);
    esl_buffer_RaiseAnchor(afp->bf, anchor);
  }
  *ret_type = eslUNKNOWN;
  return status;
}
/* Function:  esl_msafile_psiblast_GuessAlphabet()
 * Synopsis:  Guess the alphabet of an open PSI-BLAST MSA file.
 *
 * Purpose:   Guess the alphabet of the sequences in open
 *            PSI-BLAST format MSA file <afp>.
 *
 *            On each non-blank line the first whitespace-delimited
 *            token (the name) is skipped; letters in the remainder of
 *            the line are tallied case-insensitively into 26
 *            per-letter counts, which are handed to
 *            <esl_abc_GuessAlphabet()>.
 *
 *            On a normal return, <*ret_type> is set to <eslDNA>,
 *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
 *            original position.
 *
 * Args:      afp      - open PSI-BLAST format MSA file
 *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
 *
 * Returns:   <eslOK> on success.
 *            <eslENOALPHABET> if alphabet type can't be determined.
 *            In either case, <afp> is rewound to the position it
 *            started at.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslESYS> on failures of fread() or other system calls.
 *            <eslEINCONCEIVABLE> if the anchor cannot be set.
 *            On any thrown error, <*ret_type> is set to <eslUNKNOWN>.
 */
int
esl_msafile_psiblast_GuessAlphabet(ESLX_MSAFILE *afp, int *ret_type)
{
  int       alphatype    = eslUNKNOWN;
  esl_pos_t anchor       = -1;
  int       threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
  int       nsteps       = 3;
  int       step         = 0;
  int64_t   nres         = 0;   /* 64-bit, like ct[]: we may count all the way to EOF of a huge file */
  int       x;
  int64_t   ct[26];
  char     *p, *tok;
  esl_pos_t n,  toklen, pos;
  int       status;

  for (x = 0; x < 26; x++) ct[x] = 0;

  /* Anchor the buffer at the current offset so we can rewind when done. */
  anchor = esl_buffer_GetOffset(afp->bf);
  if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */

  while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
    {
      /* <tok>,<toklen> receive the name token; it is not used further. */
      if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
      /* p now points to the rest of the sequence line, after a name */

      /* count characters into ct[] array */
      for (pos = 0; pos < n; pos++)
        {
          /* <ctype.h> functions require an unsigned char value (or EOF);
           * passing a negative plain char is undefined behavior.
           */
          unsigned char c = (unsigned char) p[pos];

          if (isalpha(c))
            {
              x = toupper(c) - 'A';
              /* A locale may classify a non-ASCII byte as alphabetic;
               * never index outside ct[0..25].
               */
              if (x < 0 || x >= 26) continue;
              ct[x]++;
              nres++;
            }
        }

      /* try to stop early, checking after 500, 5000, and 50000 residues: */
      if (step < nsteps && nres > threshold[step])
        {
          if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
          step++;
        }
    }
  if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
  status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */

  /* deliberate flowthrough...*/
 DONE:
  esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
  esl_buffer_RaiseAnchor(afp->bf, anchor);
  *ret_type = alphatype;
  return status;

 ERROR:
  /* Only rewind if we got as far as recording an offset. */
  if (anchor != -1) {
    esl_buffer_SetOffset(afp->bf, anchor);
    esl_buffer_RaiseAnchor(afp->bf, anchor);
  }
  *ret_type = eslUNKNOWN;
  return status;
}