/* Function: p7_Builder() * Synopsis: Build a new HMM from an MSA. * * Purpose: Take the multiple sequence alignment <msa> and a build configuration <bld>, * and build a new HMM. * * Effective sequence number determination and calibration steps require * additionally providing a null model <bg>. * * Args: bld - build configuration * msa - multiple sequence alignment * bg - null model * opt_hmm - optRETURN: new HMM * opt_trarr - optRETURN: array of faux tracebacks, <0..nseq-1> * opt_gm - optRETURN: profile corresponding to <hmm> * opt_om - optRETURN: optimized profile corresponding to <gm> * opt_postmsa - optRETURN: RF-annotated, possibly modified MSA * * Returns: <eslOK> on success. The new HMM is optionally returned in * <*opt_hmm>, along with optional returns of an array of faux tracebacks * for each sequence in <*opt_trarr>, the annotated MSA used to construct * the model in <*opt_postmsa>, a configured search profile in * <*opt_gm>, and an optimized search profile in <*opt_om>. These are * all optional returns because the caller may, for example, be interested * only in an optimized profile, or may only be interested in the HMM. * * Returns <eslENORESULT> if no consensus columns were annotated. * Returns <eslEFORMAT> on MSA format problems, such as a missing RF annotation * line in hand architecture construction. On any returned error, * <bld->errbuf> contains an informative error message. * * Throws: <eslEMEM> on allocation error. * <eslEINVAL> if relative weights couldn't be calculated from <msa>. * * Xref: J4/30. */ int p7_Builder(P7_BUILDER *bld, ESL_MSA *msa, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE ***opt_trarr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om, ESL_MSA **opt_postmsa) { int i,j; uint32_t checksum = 0; /* checksum calculated for the input MSA. hmmalign --mapali verifies against this. */ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; P7_TRACE ***tr_ptr = (opt_trarr != NULL || opt_postmsa != NULL) ? &tr : NULL; int status; if ((status = validate_msa (bld, msa)) != eslOK) goto ERROR; if ((status = esl_msa_Checksum (msa, &checksum)) != eslOK) ESL_XFAIL(status, bld->errbuf, "Failed to calculate checksum"); if ((status = relative_weights (bld, msa)) != eslOK) goto ERROR; if ((status = esl_msa_MarkFragments(msa, bld->fragthresh)) != eslOK) goto ERROR; if ((status = build_model (bld, msa, &hmm, tr_ptr)) != eslOK) goto ERROR; //Ensures that the weighted-average I->I count <= bld->max_insert_len //(MI currently contains the number of observed insert-starts) if (bld->max_insert_len>0) for (i=1; i<hmm->M; i++ ) hmm->t[i][p7H_II] = ESL_MIN(hmm->t[i][p7H_II], bld->max_insert_len*hmm->t[i][p7H_MI]); if ((status = effective_seqnumber (bld, msa, hmm, bg)) != eslOK) goto ERROR; if ((status = parameterize (bld, hmm)) != eslOK) goto ERROR; if ((status = annotate (bld, msa, hmm)) != eslOK) goto ERROR; if ((status = calibrate (bld, hmm, bg, opt_gm, opt_om)) != eslOK) goto ERROR; if ((status = make_post_msa (bld, msa, hmm, tr, opt_postmsa)) != eslOK) goto ERROR; //force masked positions to background (it'll be close already, so no relevant impact on weighting) if (hmm->mm != NULL) for (i=1; i<hmm->M; i++ ) if (hmm->mm[i] == 'm') for (j=0; j<hmm->abc->K; j++) hmm->mat[i][j] = bg->f[j]; if ( bld->abc->type == eslDNA || bld->abc->type == eslRNA ) { if (bld->w_len > 0) hmm->max_length = bld->w_len; else if (bld->w_beta == 0.0) hmm->max_length = hmm->M *4; else if ( (status = p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR; } hmm->checksum = checksum; hmm->flags |= p7H_CHKSUM; if (opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm); if (opt_trarr != NULL) *opt_trarr = tr; else p7_trace_DestroyArray(tr, msa->nseq); return eslOK; ERROR: p7_hmm_Destroy(hmm); p7_trace_DestroyArray(tr, msa->nseq); if (opt_gm != NULL) p7_profile_Destroy(*opt_gm); if (opt_om != NULL) p7_oprofile_Destroy(*opt_om); return status; }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). * Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. * afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if (esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }