/* Function: p7_Fastmodelmaker() * * Purpose: Heuristic model construction. * Construct an HMM from an alignment by a simple rule, * based on the fractional occupancy of each columns w/ * residues vs gaps. Any column w/ a fractional * occupancy of $\geq$ <symfrac> is assigned as a MATCH column; * for instance, if thresh = 0.5, columns w/ $\geq$ 50\% * residues are assigned to match... roughly speaking. * * "Roughly speaking" because sequences may be weighted * in the input <msa>, and because missing data symbols are * ignored, in order to deal with sequence fragments. * * The <msa> must be in digital mode. * * If the caller wants to designate any sequences as * fragments, it does so by converting all N-terminal and * C-terminal flanking gap symbols to missing data symbols. * * NOTE: p7_Fastmodelmaker() will slightly revise the * alignment if the assignment of columns implies * DI and ID transitions. * * Returns the HMM in counts form (ready for applying Dirichlet * priors as the next step). Also returns fake traceback * for each training sequence. * * Models must have at least one node, so if the <msa> defined * no consensus columns, a <eslENORESULT> error is returned. * * Args: msa - multiple sequence alignment * symfrac - threshold for residue occupancy; >= assigns MATCH * bld - holds information on regions requiring masking, optionally NULL -> no masking * ret_hmm - RETURN: counts-form HMM * opt_tr - optRETURN: array of tracebacks for aseq's * * Return: <eslOK> on success. ret_hmm and opt_tr allocated here, * and must be free'd by the caller (FreeTrace(tr[i]), free(tr), * FreeHMM(hmm)). * * Returns <eslENORESULT> if no consensus columns were annotated; * in this case, <ret_hmm> and <opt_tr> are returned NULL. * * Throws: <eslEMEM> on allocation failure; <eslEINVAL> if the * <msa> isn't in digital mode. */ int p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr) { int status; /* return status flag */ int *matassign = NULL; /* MAT state assignments if 1; 1..alen */ int idx; /* counter over sequences */ int apos; /* counter for aligned columns */ float r; /* weighted residue count */ float totwgt; /* weighted residue+gap count */ if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need digital MSA"); /* Allocations: matassign is 1..alen array of bit flags. */ ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1)); /* Determine weighted sym freq in each column, set matassign[] accordingly. */ for (apos = 1; apos <= msa->alen; apos++) { r = totwgt = 0.; for (idx = 0; idx < msa->nseq; idx++) { if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; } else if (esl_abc_XIsGap(msa->abc, msa->ax[idx][apos])) { totwgt += msa->wgt[idx]; } else if (esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) continue; } if (r > 0. && r / totwgt >= symfrac) matassign[apos] = TRUE; else matassign[apos] = FALSE; } /* Once we have matassign calculated, modelmakers behave * the same; matassign2hmm() does this stuff (traceback construction, * trace counting) and sets up ret_hmm and opt_tr. */ if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) { fprintf (stderr, "hmm construction error during trace counting\n"); goto ERROR; } free(matassign); return eslOK; ERROR: if (matassign != NULL) free(matassign); return status; }
/* Function: p7_Alimask_MakeModel2AliMap() * Synopsis: Compute map of coordinate in the alignment corresponding to each model position. * * Args: msa - The alignment for which the mapped model is to be computed. We assume * the MSA has already been manipulated to account for model building * flags (e.g. weighting). * do_hand - TRUE when the model is to follow a hand-build RF line (which must be * part of the file. * symfraq - if weighted occupancy exceeds this value, include the column in the model. * map - int array into which the map values will be stored. Calling function * must allocate (msa->alen+1) ints. * * Returns: The number of mapped model positions. */ int p7_Alimask_MakeModel2AliMap(ESL_MSA *msa, int do_hand, float symfrac, int *map ) { int i = 0; int apos, idx; float r; /* weighted residue count */ float totwgt; /* weighted residue+gap count */ i = 0; if ( do_hand ) { if (msa->rf == NULL) p7_Fail("Model file does not contain an RF line, required for --hand.\n"); /* Watch for off-by-one. rf is [0..alen-1]*/ for (apos = 1; apos <= msa->alen; apos++) { if (!esl_abc_CIsGap(msa->abc, msa->rf[apos-1]) ) { map[i] = apos; i++; } } } else { for (apos = 1; apos <= msa->alen; apos++) { r = totwgt = 0.; for (idx = 0; idx < msa->nseq; idx++) { if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; } else if (esl_abc_XIsGap(msa->abc, msa->ax[idx][apos])) { totwgt += msa->wgt[idx]; } else if (esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) continue; } if (r > 0. && r / totwgt >= symfrac) { map[i] = apos; i++; } } } return i; }
/* Function: rejustify_insertions_digital() * Synopsis: * Incept: SRE, Thu Oct 23 13:06:12 2008 [Janelia] * * Purpose: * * Args: msa - alignment to rejustify * digital mode: ax[0..nseq-1][1..alen] and abc is valid * text mode: aseq[0..nseq-1][0..alen-1] * inserts - # of inserted columns following node k, for k=0.1..M * inserts[0] is for N state; inserts[M] is for C state * matmap - index of column associated with node k [k=0.1..M; matmap[0] = 0] * this is an alignment column index 1..alen, same offset as <ax> * if applied to text mode aseq or annotation, remember to -1 * if no residues use match state k, matmap[k] is the * index of the last column used before node k's columns * start: thus matmap[k]+1 is always the start of * node k's insertion (if any). * matuse - TRUE if an alignment column is associated with node k: [k=0.1..M; matuse[0] = 0]. * if matuse[k] == 0, every sequence deleted at node k, * and we're collapsing the column rather than showing all * gaps. * * Note: The insertion for node k is of length <inserts[k]> columns, * and in 1..alen coords it runs from * matmap[k]+1 .. matmap[k+1]-matuse[k+1]. * * * Returns: * * Throws: (no abnormal error conditions) * * Xref: */ static int rejustify_insertions_digital(ESL_MSA *msa, const int *inserts, const int *matmap, const int *matuse, int M) { int idx; int k; int apos; int nins; int npos, opos; for (idx = 0; idx < msa->nseq; idx++) { for (k = 0; k < M; k++) if (inserts[k] > 1) { for (nins = 0, apos = matmap[k]+1; apos <= matmap[k+1]-matuse[k+1]; apos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) nins++; if (k == 0) nins = 0; /* N-terminus is right justified */ else nins /= 2; /* split in half; nins now = # of residues left left-justified */ opos = npos = matmap[k+1]-matuse[k+1]; while (opos >= matmap[k]+1+nins) { if (esl_abc_XIsGap(msa->abc, msa->ax[idx][opos])) opos--; else { msa->ax[idx][npos] = msa->ax[idx][opos]; if (msa->pp != NULL && msa->pp[idx] != NULL) msa->pp[idx][npos-1] = msa->pp[idx][opos-1]; npos--; opos--; } } while (npos >= matmap[k]+1+nins) { msa->ax[idx][npos] = esl_abc_XGetGap(msa->abc); if (msa->pp != NULL && msa->pp[idx] != NULL) msa->pp[idx][npos-1] = '.'; npos--; } } } return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go; /* application configuration */ int kstatus, tstatus;/* return code from Easel routine */ int fmt; /* expected format of kfile, tfile */ char *kfile, *tfile; /* known, test structure file */ ESL_MSAFILE *kfp, *tfp; /* open kfile, tfile */ ESL_MSA *ka, *ta; /* known, trusted alignment */ int64_t klen, tlen; /* lengths of dealigned seqs */ int i; /* counter over sequences */ int apos; /* counter over alignment columns */ int rfpos; /* counter over consensus (non-gap RF) columns */ int is_rfpos; /* TRUE if current apos is a consensus pos, FALSE if not */ int uapos; /* counter over unaligned residue positions */ int nali; /* number of alignment we're on in each file */ int **kp; /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */ int **tp; /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */ /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert' * after non-gap RF position (x * -1) */ int *km_pos; /* [0..rflen] = x, in known aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ki_pos; /* [0..rflen] = x, in known aln, number of residues inserted after non-gap RF column x */ int *tm_pos; /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ti_pos; /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */ int *cor_tm_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *cor_ti_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */ int *km_seq; /* [0..i..nseq-1] = x, in known aln, number of residues aligned to non-gap RF columns in seq i; */ int *ki_seq; /* [0..i..nseq-1] = x, in known aln, number of residues inserted in seq i */ int *tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */ int *ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */ int *cor_tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */ int *cor_ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */ int *seqlen; /* [0..i..nseq-1] = x, unaligned seq i has length x */ ESL_ALPHABET *abc = NULL; /* alphabet for all alignments */ int rflen, t_rflen; /* non-gap RF length (consensus lengths) */ int status; char *namedashes; int ni; int namewidth = 8; /* length of 'seq name' */ int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */ char *mask = NULL; int masklen; ESL_DSQ *ks; ESL_DSQ *ts; FILE *dfp = NULL; /* for --c2dfile */ /* variables needed for -p and related options */ int do_post = FALSE; /* TRUE if -p enabled */ int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */ int p; /* counter over integerized posteriors */ int *ptm = NULL; /* [0..p..10] number of total matches with posterior value p (10="*")*/ int *pti = NULL; /* [0..p..10] number of total inserts with posterior value p */ int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */ int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */ int npostvals = 11; /* number of posterior values 0-9, * */ int ppidx; /* index of PP */ char ppchars[11] = "0123456789*"; int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */ // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti; /* total counts of posteriors */ // int tot_incor_ptm,tot_incor_pti; // SRE: commented out; don't seem to be used; need to silence compiler warning char errbuf[eslERRBUFSIZE]; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); exit(EXIT_SUCCESS); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } kfile = esl_opt_GetArg(go, 1); tfile = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the two Stockholm files. ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus); do_post = esl_opt_GetBoolean(go, "-p"); /* read the mask file if --p-mask is enabled */ if(! esl_opt_IsDefault(go, "--p-mask")) { if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf); } /* open the c2dfile for output, if nec */ if (esl_opt_IsOn(go, "--c2dfile")) { if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile")); } /*********************************************** * Do alignment comparisons, one seq at a time; * this means looping over all seqs in all alignments. ***********************************************/ nali = 0; while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF) { if ( kstatus != eslOK) esl_msafile_ReadFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus); nali++; if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); /* Sanity check on alignment */ if (ka->nseq != ta->nseq) esl_fatal("trusted, test alignments don't have same seq #\n"); if (ka->rf == NULL) esl_fatal("trusted alignment has no reference annotation\n"); if (ta->rf == NULL) esl_fatal("test alignment has no reference annotation\n"); /* make sure the sequences are all identical */ ESL_ALLOC(seqlen, sizeof(int) * ka->nseq); for(i = 0; i < ka->nseq; i++) { if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2)); memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen); ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2)); memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen); if (tlen != klen) esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in the trusted alignment.", (i+1), klen, tlen); if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]); seqlen[i] = tlen; free(ks); free(ts); } /* determine non-gap RF length */ rflen = 0; for(apos = 1; apos <= ka->alen; apos++) { if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++; } t_rflen = 0; for(apos = 1; apos <= ta->alen; apos++) { if((! esl_abc_CIsGap (ta->abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (ta->abc, ta->rf[apos-1]))) t_rflen++; } if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen); /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */ if(do_post) { if(! esl_opt_IsDefault(go, "--p-mask")) { if(masklen != rflen) { esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen); } } if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists"); ESL_ALLOC(ptm, sizeof(int) * npostvals); ESL_ALLOC(pti, sizeof(int) * npostvals); ESL_ALLOC(cor_ptm, sizeof(int) * npostvals); ESL_ALLOC(cor_pti, sizeof(int) * npostvals); esl_vec_ISet(ptm, npostvals, 0); esl_vec_ISet(pti, npostvals, 0); esl_vec_ISet(cor_ptm, npostvals, 0); esl_vec_ISet(cor_pti, npostvals, 0); } /* allocate and initialize our counters */ ESL_ALLOC(kp, sizeof(int *) * ka->nseq); ESL_ALLOC(tp, sizeof(int *) * ta->nseq); for(i = 0; i < ka->nseq; i++) { ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1)); ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1)); esl_vec_ISet(kp[i], seqlen[i]+1, -987654321); esl_vec_ISet(tp[i], seqlen[i]+1, -987654321); } ESL_ALLOC(km_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1)); esl_vec_ISet(km_pos, rflen+1, 0); esl_vec_ISet(ki_pos, rflen+1, 0); esl_vec_ISet(tm_pos, rflen+1, 0); esl_vec_ISet(ti_pos, rflen+1, 0); esl_vec_ISet(cor_tm_pos, rflen+1, 0); esl_vec_ISet(cor_ti_pos, rflen+1, 0); ESL_ALLOC(km_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq); ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq); esl_vec_ISet(km_seq, ka->nseq, 0); esl_vec_ISet(ki_seq, ka->nseq, 0); esl_vec_ISet(tm_seq, ka->nseq, 0); esl_vec_ISet(ti_seq, ka->nseq, 0); esl_vec_ISet(cor_tm_seq, ka->nseq, 0); esl_vec_ISet(cor_ti_seq, ka->nseq, 0); /* determine non-gap RF location of each residue in known alignment */ for(i = 0; i < ka->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ka->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing (ka->abc, ka->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; } if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { uapos++; kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; } else { ki_pos[rfpos]++; ki_seq[i]++; } } } } /* determine non-gap RF location of each residue in predicted alignment */ for(i = 0; i < ta->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ta->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (abc, ta->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; if(do_post) { do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE; } } if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { uapos++; tp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(do_post) { if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos); ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]); if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos); } if(is_rfpos) { tm_pos[rfpos]++; tm_seq[i]++; if(do_post_for_this_rfpos) ptm[ppidx]++; } else { ti_pos[rfpos]++; ti_seq[i]++; if(do_post) pti[ppidx]++; } if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */ if(is_rfpos) { cor_tm_seq[i]++; cor_tm_pos[rfpos]++; if(do_post_for_this_rfpos) cor_ptm[ppidx]++; } else { cor_ti_seq[i]++; cor_ti_pos[rfpos]++; if(do_post) cor_pti[ppidx]++; } } } } } if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { /* print per sequence statistics */ /* determine the longest name in msa */ for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni])); ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-'; printf("# %-*s %6s %28s %28s %28s\n", namewidth, "seq name", "len", "match columns", "insert columns", "all columns"); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------"); for(i = 0; i < ta->nseq; i++) { printf(" %-*s %6d %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, ka->sqname[i], seqlen[i], cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. : ((float) cor_ti_seq[i] / (float) ki_seq[i]), (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); } cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq); cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq); km = esl_vec_ISum(km_seq, ka->nseq); ki = esl_vec_ISum(ki_seq, ka->nseq); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------"); printf("# %-*s %6s %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, "*all*", "-", cor_tm, km, ((float) cor_tm / (float) km), cor_ti, ki, ((float) cor_ti / (float) ki), (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); free(namedashes); for(i = 0; i < ka->nseq; i++) { free(kp[i]); free(tp[i]); } } else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */ printf("# %5s %20s %20s %20s\n", "rfpos", "match", "insert", "both"); printf("# %5s %20s %20s %20s\n", "-----", "--------------------", "--------------------", "--------------------"); for(rfpos = 0; rfpos <= rflen; rfpos++) { printf(" %5d %4d / %4d (%.3f) %4d / %4d (%.3f) %4d / %4d (%.3f)\n", rfpos, cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. : ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); } } else if(do_post) { /* do posterior output */ if(mask == NULL) { printf("# %2s %29s %29s\n", "", " match columns ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } else { printf("# %2s %29s %29s\n", "", " match columns within mask ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0; //tot_ptm = esl_vec_ISum(ptm, npostvals); //tot_pti = esl_vec_ISum(pti, npostvals); //tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals); //tot_cor_pti = esl_vec_ISum(cor_pti, npostvals); //tot_incor_ptm = tot_ptm - tot_cor_ptm; //tot_incor_pti = tot_pti - tot_cor_pti; for(p = (npostvals-1); p >= 0; p--) { cm_cor_ptm += cor_ptm[p]; cm_cor_pti += cor_pti[p]; cm_ptm += ptm[p]; cm_pti += pti[p]; cm_incor_ptm += ptm[p] - cor_ptm[p]; cm_incor_pti += pti[p] - cor_pti[p]; printf(" %2c %8d / %8d (%.5f) %8d / %8d (%.5f)\n", ppchars[p], cor_ptm[p], ptm[p], (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], cor_pti[p], pti[p], (pti[p] == 0) ? 0. : (float) cor_pti[p] / (float) pti[p]); } } /* handle --c2dfile */ if (dfp != NULL) { /* match stats, 4 fields, CMYK color values */ for(rfpos = 1; rfpos <= rflen; rfpos++) { if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */ 1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */ 0.); } } fprintf(dfp, "//\n"); /* insert stats, 4 fields, CMYK color values */ rfpos = 0; /* special case, combine insert posn 0 and 1 together */ if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */ 0., 0.); } /* insert stats posn 2..rflen */ for(rfpos = 2; rfpos <= rflen; rfpos++) { if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */ 0., 0.); } } fprintf(dfp, "//\n"); } if(ptm != NULL) free(ptm); if(pti != NULL) free(pti); if(cor_ptm != NULL) free(cor_ptm); if(cor_ptm != NULL) free(cor_pti); free(kp); free(tp); free(km_seq); free(ki_seq); free(tm_seq); free(ti_seq); free(cor_tm_seq); free(cor_ti_seq); free(km_pos); free(ki_pos); free(tm_pos); free(ti_pos); free(cor_tm_pos); free(cor_ti_pos); free(seqlen); esl_msa_Destroy(ka); esl_msa_Destroy(ta); } if(mask != NULL) free(mask); if(dfp != NULL) { fclose(dfp); printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile")); } if(abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msafile_Close(tfp); esl_msafile_Close(kfp); return 0; ERROR: return status; }
/* Function: esl_msafile_a2m_Write() * Synopsis: Write an A2M (UCSC SAM) dotless format alignment to a stream. * * Purpose: Write alignment <msa> in dotless UCSC A2M format to a * stream <fp>. * * The <msa> should have a valid reference line <msa->rf>, * with alphanumeric characters marking consensus (match) * columns, and non-alphanumeric characters marking * nonconsensus (insert) columns. If it does not, * then as a fallback, the first sequence in the alignment is * considered to be the consensus. * * In "dotless" A2M format, gap characters (.) in insert * columns are omitted; therefore sequences can be of * different lengths, but each sequence has the same number * of consensus columns (residue or -). * * A2M format cannot represent missing data symbols * (Easel's ~). Any missing data symbols are converted to * gaps. * * A2M format cannot represent pyrrolysine residues in * amino acid sequences, because it treats 'O' symbols * specially, as indicating a position at which a * free-insertion module (FIM) should be created. Any 'O' * in the <msa> is written instead as an unknown * residue ('X', in protein sequences). * * Args: fp - open output stream * msa - MSA to write * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failure. * <eslEWRITE> on any system write error, such as filled disk. */ int esl_msafile_a2m_Write(FILE *fp, const ESL_MSA *msa) { char *buf = NULL; int cpl = 60; int bpos; int pos; int is_consensus; int is_residue; int do_dotless = TRUE; /* just changing this to FALSE makes it write dots too */ int i; int sym; int status; ESL_ALLOC(buf, sizeof(char) * (cpl+1)); for (i = 0; i < msa->nseq; i++) { /* Construct the name/description line */ if (fprintf(fp, ">%s", msa->sqname[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); if (msa->sqacc != NULL && msa->sqacc[i] != NULL) { if (fprintf(fp, " %s", msa->sqacc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); } if (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) { if (fprintf(fp, " %s", msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); } if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); #ifdef eslAUGMENT_ALPHABET if (msa->abc) { pos = 0; while (pos < msa->alen) { for (bpos = 0; pos < msa->alen && bpos < cpl; pos++) { sym = msa->abc->sym[msa->ax[i][pos+1]]; /* note off-by-one in digitized aseq: 1..alen */ is_residue = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+1]); if (msa->rf) is_consensus = (isalnum(msa->rf[pos]) ? TRUE : FALSE); else is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+1]) ? TRUE : FALSE); if (sym == 'O') sym = esl_abc_XGetUnknown(msa->abc); /* watch out: O means "insert a FIM" in a2m format, not pyrrolysine */ if (is_consensus) { buf[bpos++] = (is_residue ? toupper(sym) : '-'); } else if (is_residue) { buf[bpos++] = tolower(sym); } else if (! do_dotless) { buf[bpos++] = '.'; } } buf[bpos] = '\0'; if (bpos) { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");} } } #endif if (! msa->abc) { pos = 0; while (pos < msa->alen) { for (bpos = 0; pos < msa->alen && bpos < cpl; pos++) { sym = msa->aseq[i][pos]; is_residue = isalpha(msa->aseq[i][pos]); if (msa->rf) is_consensus = (isalnum(msa->rf[pos]) ? TRUE : FALSE); else is_consensus = (isalnum(msa->aseq[0][pos]) ? TRUE : FALSE); if (sym == 'O') sym = 'X'; if (is_consensus) { buf[bpos++] = ( is_residue ? toupper(sym) : '-'); } else if (is_residue) { buf[bpos++] = tolower(sym); } else if (! do_dotless) { buf[bpos++] = '.'; } } buf[bpos] = '\0'; if (bpos) { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); } } } } /* end, loop over sequences in the MSA */ free(buf); return eslOK; ERROR: if (buf) free(buf); return status; }
/* Function: esl_msafile_psiblast_Write() * Synopsis: Write an MSA to a stream in PSI-BLAST format * * Purpose: Write alignment <msa> in NCBI PSI-BLAST format to * stream <fp>. * * The <msa> should have a valid reference line <msa->rf>, * with alphanumeric characters marking consensus (match) * columns, and non-alphanumeric characters marking * nonconsensus (insert) columns. If it does not have RF * annotation, then the first sequence in the <msa> * defines the "consensus". * * PSI-BLAST format allows only one symbol ('-') for gaps, * and cannot represent missing data symbols (Easel's * '~'). Any missing data symbols are converted to gaps. * * Args: fp - open output stream * msa - MSA to write * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failure. * <eslEWRITE> on any system write failure, such as filled disk. */ int esl_msafile_psiblast_Write(FILE *fp, const ESL_MSA *msa) { char *buf = NULL; int cpl = 60; int acpl; int i; int sym; int64_t pos, bpos; int maxnamewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq); int is_consensus; int is_residue; int status; ESL_ALLOC(buf, sizeof(char) * (cpl+1)); for (pos = 0; pos < msa->alen; pos += cpl) { for (i = 0; i < msa->nseq; i++) { acpl = (msa->alen - pos > cpl)? cpl : msa->alen - pos; #ifdef eslAUGMENT_ALPHABET if (msa->abc) { for (bpos = 0; bpos < acpl; bpos++) { sym = msa->abc->sym[msa->ax[i][pos + bpos + 1]]; is_residue = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+bpos+1]); if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE); else is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+bpos+1]) ? TRUE : FALSE); if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); } else { buf[bpos] = (is_residue ? tolower(sym) : '-'); } } } #endif if (! msa->abc) { for (bpos = 0; bpos < acpl; bpos++) { sym = msa->aseq[i][pos + bpos]; is_residue = isalnum(sym); if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE); else is_consensus = (isalnum(msa->aseq[0][pos+bpos]) ? TRUE : FALSE); if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); } else { buf[bpos] = (is_residue ? tolower(sym) : '-'); } } } buf[acpl] = '\0'; if (fprintf(fp, "%-*s %s\n", maxnamewidth, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed"); } /* end loop over sequences */ if (pos + cpl < msa->alen) { if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed"); } } free(buf); return eslOK; ERROR: if (buf) free(buf); return status; }
/* Function: scoredata_GetSSVScoreArrays() * Synopsis: Get compact representation of substitution scores and maximal extensions * * Purpose: Extract 8-bit (MSV-style) substitution scores from optimized * matrix. These scores will be used in both standard MSV diagonal * recovery and FM-MSV diagonal scoring. * * Optionally, for each position in the model, capture the maximum * possible score that can be added to a diagonal's score (in both * directions) by extending lengths 1..10. These extension scores * are used in FM-MSV's pruning step. * * Once a hit passes the SSV filter, and the prefix/suffix * values of P7_SCOREDATA are required (to establish windows * around SSV diagonals), p7_hmm_ScoreDataComputeRest() * must be called. * * * Args: om - P7_OPROFILE containing scores used to produce SCOREDATA contents * data - where scores and will be stored * do_opt_ext - boolean, TRUE if optimal-extension scores are required (for FM-MSV) * scale - used to produce 8-bit extracted scores * bias - used to produce 8-bit extracted scores * * Returns: data->scores and possibly ->opt_ext_(fwd|rev) are filled in; * return eslEMEM on allocation failure, eslOK otherwise. */ static int scoredata_GetSSVScoreArrays(P7_OPROFILE *om, P7_PROFILE *gm, P7_SCOREDATA *data ) { int i, j, status; //gather values from gm->rsc into a succinct 2D array float *max_scores; float sc_fwd, sc_rev; int K = om->abc->Kp; data->M = om->M; if (!gm) { // get values for the standard pipeline data->type = p7_sd_std; ESL_ALLOC(data->ssv_scores, (om->M + 1) * K * sizeof(uint8_t)); p7_oprofile_GetSSVEmissionScoreArray(om, data->ssv_scores); } else {// need float, unscaled scores, and other stuff used in the FMindex-based SSV pipeline, data->type = p7_sd_fm; ESL_ALLOC(data->ssv_scores_f, (om->M + 1) * K * sizeof(float)); ESL_ALLOC(max_scores, (om->M + 1) * sizeof(float)); for (i = 1; i <= om->M; i++) { max_scores[i] = 0; for (j=0; j<K; j++) { if (esl_abc_XIsResidue(om->abc,j)) { data->ssv_scores_f[i*K + j] = gm->rsc[j][(i) * p7P_NR + p7P_MSC]; if (data->ssv_scores_f[i*K + j] > max_scores[i]) max_scores[i] = data->ssv_scores_f[i*K + j]; } } } //for each position in the query, what's the highest possible score achieved by extending X positions, for X=1..10 ESL_ALLOC(data->opt_ext_fwd, (om->M + 1) * sizeof(float*)); ESL_ALLOC(data->opt_ext_rev, (om->M + 1) * sizeof(float*)); for (i=1; i<om->M; i++) { ESL_ALLOC(data->opt_ext_fwd[i], 10 * sizeof(float)); ESL_ALLOC(data->opt_ext_rev[i], 10 * sizeof(float)); } for (i=1; i<om->M; i++) { sc_fwd = 0; sc_rev = 0; for (j=0; j<10 && i+j+1<=om->M; j++) { sc_fwd += max_scores[i+j+1]; data->opt_ext_fwd[i][j] = sc_fwd; sc_rev += max_scores[om->M-i-j]; data->opt_ext_rev[om->M-i][j] = sc_rev; } for ( ; j<10; j++) { //fill in empty values data->opt_ext_fwd[i][j] = data->opt_ext_fwd[i][j-1]; data->opt_ext_rev[om->M-i][j] = data->opt_ext_rev[om->M-i][j-1]; } } free(max_scores); } return eslOK; ERROR: return eslEMEM; }
/* dump_insert_info * * Given an MSA with RF annotation, print out information about how many 'insertions' come * after each non-gap RF column (consensus column). */ static int dump_insert_info(FILE *fp, ESL_MSA *msa, int use_weights, int nali, int *i_am_rf, char *alifile, char *errbuf) { int status; int apos, rfpos; double **ict; double *total_ict; int i; int rflen; double seqwt; /* weight of current sequence */ double nseq; /* contract check */ if(! (msa->flags & eslMSA_DIGITAL)) ESL_XFAIL(eslEINVAL, errbuf, "in dump_insert_info(), msa must be digitized."); if(msa->rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "No #=GC RF markup in alignment, it is needed for --iinfo."); if(i_am_rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "internal error, dump_insert_info() i_am_rf is NULL."); if(use_weights && msa->wgt == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "dump_insert_info(): use_weights==TRUE but msa->wgt == NULL"); ESL_ALLOC(total_ict, sizeof(double) * (msa->alen+2)); esl_vec_DSet(total_ict, (msa->alen+2), 0.); ESL_ALLOC(ict, sizeof(double *) * (msa->alen+2)); for(i = 0; i <= msa->alen; i++) { ESL_ALLOC(ict[i], sizeof(double) * (msa->nseq)); esl_vec_DSet(ict[i], (msa->nseq), 0.); } fprintf(fp, "# Insert information:\n"); fprintf(fp, "# Alignment file: %s\n", alifile); fprintf(fp, "# Alignment idx: %d\n", nali); if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); } fprintf(fp, "# rfpos is the nongap RF position after which insertions occur\n"); fprintf(fp, "# An rfpos of '0' indicates insertions before the first nongap RF position\n"); fprintf(fp, "# Number of sequences: %d\n", msa->nseq); if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); } else { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); } fprintf(fp, "#\n"); fprintf(fp, "# %8s %10s %8s %8s\n", "rfpos", "nseq w/ins", "freq ins", "avg len"); fprintf(fp, "# %8s %10s %8s %8s\n", "--------", "----------", "--------", "--------"); rflen = 0; for(apos = 1; apos <= msa->alen; apos++) if(i_am_rf[apos-1]) rflen++; rfpos = 0; for(apos = 1; apos <= msa->alen; apos++) { if(i_am_rf[apos-1]) rfpos++; else { for(i = 0; i < msa->nseq; i++) { seqwt = use_weights ? msa->wgt[i] : 1.0; if(esl_abc_XIsResidue(msa->abc, msa->ax[i][apos])) { ict[rfpos][i]++; total_ict[rfpos] += seqwt; } } } } rflen = rfpos; for(rfpos = 0; rfpos <= rflen; rfpos++) { nseq = 0.; for(i = 0; i < msa->nseq; i++) { if(ict[rfpos][i] >= 1) { seqwt = use_weights ? msa->wgt[i] : 1.0; nseq += seqwt; } } if(nseq > 0.) fprintf(fp, " %8d %10.1f %8.6f %8.3f\n", rfpos, nseq, nseq / (float) msa->nseq, ((float) total_ict[rfpos] / (float) nseq)); } fprintf(fp, "//\n"); for(i = 0; i <= msa->alen; i++) free(ict[i]); free(ict); free(total_ict); return eslOK; ERROR: return status; }