/* Function:  esl_msafile_a2m_Read()
 * Synopsis:  Read a UCSC A2M format alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for UCSC A2M (SAM) format. Create a new MSA,
 *            and return a ptr to it in <*ret_msa>. Caller is responsible
 *            for freeing this <ESL_MSA>.
 *
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'X',
 *            and insert (lowercase) columns are marked '.' in the RF
 *            annotation line.
 *
 *            This input parser can deal both with "dotless" A2M, and
 *            full A2M format with dots.
 *
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
 *            allocated MSA, and <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. <*ret_msa> is
 *            set to <NULL>.
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output:
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *
 *            A record that has a name line but no sequence lines is
 *            reported as <eslEFORMAT>.
 *
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa        = NULL;
  char    **csflag     = NULL;  /* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
  int       csalloc    = 0;     /* # of entries allocated in csflag[]; used for cleanup on any path  */
  int      *nins       = NULL;  /* # of inserted residues before each consensus col [0..ncons-1]     */
  int      *this_nins  = NULL;  /* # of inserted residues before each consensus residue in this seq  */
  int       nseq       = 0;
  int       ncons      = 0;
  int       idx;
  int64_t   thislen;
  int64_t   spos;
  int       this_ncons;
  int       cpos, bpos;
  char     *p, *tok;
  esl_pos_t n, toklen;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );

  afp->errmsg[0] = '\0';

#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }

  ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
  csalloc = msa->sqalloc;
  for (idx = 0; idx < csalloc; idx++) csflag[idx] = NULL;

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if (status != eslOK) goto ERROR; /* includes normal EOF */

  /* tolerate sloppy space at start of name/desc line */
  while (n && isspace((unsigned char) *p)) { p++; n--; }

  /* n == 0 means the whole line was whitespace other than space/tab; don't read *p past its end */
  if (n == 0 || *p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");

  do { /* for each record starting in '>': */
    p++; n--;                   /* advance past > */

    if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");

    if (nseq >= msa->sqalloc) {
      if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
      ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
      for (idx = csalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
      csalloc = msa->sqalloc;   /* only bumped once the realloc has succeeded */
    }

    if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
    if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;

    /* now for each sequence line... */
    thislen    = 0;             /* count of lowercase, uppercase, and '-': w/o dots, on first pass */
    this_ncons = 0;             /* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
    if (nseq) {
      /* <this_nins> was allocated while reading the first sequence (nseq == 0);
       * here we only reset it for the next one.
       */
      for (cpos = 0; cpos <= ncons; cpos++)
        this_nins[cpos] = 0;
    }

    while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
      {
        while (n && isspace((unsigned char) *p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
        if (n  == 0)   continue;                               /* tolerate and skip blank lines */
        if (*p == '>') break;

        ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
        if (nseq == 0) {
          /* Grow the per-column insert counters for the first sequence.
           * Only zero entries that have never been used: this_nins[this_ncons]
           * may already hold inserts counted at the end of the previous line,
           * and an insert run is allowed to span a line break.
           */
          int first_new = (this_nins == NULL) ? 0 : this_ncons + 1;

          ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
          for (cpos = first_new; cpos <= this_ncons+n; cpos++)
            this_nins[cpos] = 0;
        }

        for (spos = thislen, bpos = 0; bpos < n; bpos++)
          {
            if      (p[bpos] == 'O')                    continue;
            else if (isupper((unsigned char) p[bpos])) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
            else if (islower((unsigned char) p[bpos])) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
            else if (p[bpos] == '-')                   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }

            /* Guard on nseq, not ncons: if the first sequence had zero consensus
             * columns, ncons is 0 yet <this_nins> still only has room for
             * ncons+1 counters, so later sequences must be bounded too.
             */
            if (nseq && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
          }
        csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */

#ifdef eslAUGMENT_ALPHABET
        if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); }
#endif
        if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
        if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
        else if (status != eslOK) goto ERROR;
        ESL_DASSERT1( (spos == thislen) );
      }
    if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
    /* status == OK: then *p == '>'. status == eslEOF: we're eof.  status == anything else: error */

    /* A record with no sequence lines leaves csflag[nseq] (and, for the first
     * record, this_nins) unallocated; reject it instead of dereferencing NULL below.
     */
    if (csflag[nseq] == NULL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no sequence found for A2M record");

    /* Finished reading a sequence record. */
    if (nseq == 0)
      {
        ncons = this_ncons;
        ESL_ALLOC(nins, sizeof(int) * (ncons+1));
        for (cpos = 0; cpos <= ncons; cpos++)
          nins[cpos] = this_nins[cpos];
      }
    else
      {
        if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
        for (cpos = 0; cpos <= ncons; cpos++)
          nins[cpos] = ESL_MAX(nins[cpos], this_nins[cpos]);
      }
    nseq++;
  } while (status == eslOK);

  /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
   * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
   * nins[0..ncons] tells us the max number of inserted residues before each consensus column
   * This is sufficient information to reconstruct each aligned sequence.
   */
  msa->nseq = nseq;
#ifdef eslAUGMENT_ALPHABET
  if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
#endif
  if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  *ret_msa = msa;
  free(nins);
  free(this_nins);
  for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);   /* unused slots are NULL; free(NULL) is a no-op */
  free(csflag);
  return eslOK;

 ERROR:
  free(nins);
  free(this_nins);
  if (csflag) {
    /* msa->nseq is not set until parsing succeeds, so walk the allocation
     * size we tracked ourselves to release every per-sequence flag array.
     */
    for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);
    free(csflag);
  }
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}
/* Function:  esl_msafile_psiblast_Read()
 * Synopsis:  Read an alignment in PSI-BLAST's input format.
 *
 * Purpose:   Read an MSA from an open <ESLX_MSAFILE> <afp>, parsing for
 *            PSI-BLAST input format, starting from the current point.
 *            Create a new multiple alignment, and return a ptr to
 *            that alignment via <*ret_msa>. Caller is responsible for
 *            free'ing this <ESL_MSA>.
 *
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'x',
 *            and insert (lowercase) columns are marked '.' in this RF
 *            line.
 *
 * Args:      afp     - open <ESLX_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> contains the newly
 *            allocated MSA. <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF.
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output:
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslESYS> if a system call fails, such as fread().
 *            <eslEINCONCEIVABLE> - "impossible" corruption
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_psiblast_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa      = NULL;
  int       idx      = 0;       /* counter over sequences in a block */
  int       nblocks  = 0;       /* counter over blocks */
  int64_t   alen     = 0;
  int       nseq     = 0;
  int64_t   cur_alen;
  esl_pos_t pos;                /* position on a line */
  esl_pos_t name_start,      name_len;
  esl_pos_t seq_start,       seq_len;
  esl_pos_t block_seq_start = 0;  /* set from the first line of each block (idx == 0) */
  esl_pos_t block_seq_len   = 0;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_PSIBLAST) );

  afp->errmsg[0] = '\0';

  /* allocate a growable MSA. We set msa->{nseq,alen} only when we're done. */
#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }

  /* skip leading blank lines in file */
  while ( (status = eslx_msafile_GetLine(afp, NULL, NULL)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if (status != eslOK) goto ERROR; /* includes normal EOF */

  /* Read the file a line at a time; if a parsing error occurs, detect immediately, with afp->linenumber set correctly */
  do { /* while in the file... */
    idx = 0;
    do { /* while in a block... */
      /* Split the line into <name> <whitespace> <aligned sequence>, located by column. */
      for (pos = 0;     pos < afp->n; pos++) if (! isspace((unsigned char) afp->line[pos])) break;
      name_start = pos;
      for (pos = pos+1; pos < afp->n; pos++) if (  isspace((unsigned char) afp->line[pos])) break;
      name_len   = pos - name_start;
      for (pos = pos+1; pos < afp->n; pos++) if (! isspace((unsigned char) afp->line[pos])) break;
      seq_start  = pos;
      if (pos >= afp->n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
      for (pos = afp->n-1; pos > 0; pos--)   if (! isspace((unsigned char) afp->line[pos])) break;
      seq_len    = pos - seq_start + 1;

      /* Every line of a block must occupy the same columns as the block's first line. */
      if (idx == 0) {
        block_seq_start = seq_start;
        block_seq_len   = seq_len;
      } else {
        if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
        if (seq_len   != block_seq_len)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
      }

      /* Process the consensus #=RF line. */
      if (idx == 0) {
        ESL_REALLOC(msa->rf, sizeof(char) * (alen + seq_len + 1));
        for (pos = 0; pos < seq_len; pos++) msa->rf[alen+pos] = '-'; /* anything neutral other than . or x will do. */
        msa->rf[alen+pos] = '\0';
      }
      for (pos = 0; pos < seq_len; pos++)
        {
          if (afp->line[seq_start+pos] == '-') continue;
          if (isupper((unsigned char) afp->line[seq_start+pos])) {
            if (msa->rf[alen+pos] == '.') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected upper case residue (#%d on line)", (int) pos+1);
            msa->rf[alen+pos] = 'x';
          }
          if (islower((unsigned char) afp->line[seq_start+pos])) {
            if (msa->rf[alen+pos] == 'x') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected lower case residue (#%d on line)", (int) pos+1);
            msa->rf[alen+pos] = '.';
          }
        }

      /* Store the sequence name. */
      if (nblocks == 0)
        {
          /* make sure we have room for another sequence */
          if (idx >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
          if ( (status = esl_msa_SetSeqName(msa, idx, afp->line+name_start, name_len)) != eslOK) goto ERROR;
        }
      else
        {
          /* Later blocks may only revisit the sequences established by the first
           * block; check before indexing sqname[] and aseq[]/ax[] with idx.
           */
          if (idx >= nseq)
            ESL_XFAIL(eslEFORMAT, afp->errmsg, "block contains more seqs than earlier blocks");
          if (! esl_memstrcmp(afp->line+name_start, name_len, msa->sqname[idx]))
            ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, afp->line+name_start);
        }

      /* Append the sequence. */
      cur_alen = alen;
#ifdef eslAUGMENT_ALPHABET
      if (msa->abc)    { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]),   &(cur_alen), afp->line+seq_start, seq_len); }
#endif
      if (! msa->abc)  { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
      if      (status == eslEINVAL)    ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
      else if (status != eslOK)        goto ERROR;
      if (cur_alen - alen != seq_len)  ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");

      /* get next line. if it's blank, or if we're EOF, we're done with the block */
      idx++;
      status = eslx_msafile_GetLine(afp, NULL, NULL);
    } while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* blank line ends a block. */
    if (status != eslOK && status != eslEOF) goto ERROR;

    /* End of one block */
    if      (nblocks == 0) nseq = idx;
    else if (idx != nseq)  ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
    alen += block_seq_len;
    nblocks++;

    /* skip blank lines to start of next block, if any */
    while ( (status = eslx_msafile_GetLine(afp, NULL, NULL)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  } while (status == eslOK);
  if (status != eslEOF) goto ERROR;

  msa->nseq = nseq;
  msa->alen = alen;
  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
  *ret_msa  = msa;
  return eslOK;

 ERROR:
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}