int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; int be_verbose; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { esl_usage(stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ exit(0); } if (esl_opt_ArgNumber(go) != 0) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); exit(1); } be_verbose = esl_opt_GetBoolean(go, "-v"); if (esl_opt_GetBoolean(go, "-r")) { r = esl_randomness_CreateTimeseeded(); if (be_verbose) printf("seed = %ld\n", esl_randomness_GetSeed(r)); } else r = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); utest_LinearRegression(r, TRUE, be_verbose); utest_LinearRegression(r, FALSE, be_verbose); esl_getopts_Destroy(go); esl_randomness_Destroy(r); exit(0); }
/* Function: p7_CreateDefaultApp() * Synopsis: Initialize a small/simple/standard HMMER application * Incept: SRE, Thu Oct 28 15:03:21 2010 [Janelia] * * Purpose: Identical to <esl_getopts_CreateDefaultApp()>, but * specialized for HMMER. See documentation in * <easel/esl_getopts.c>. * * Args: options - array of <ESL_OPTIONS> structures for getopts * nargs - number of cmd line arguments expected (excl. of cmdname) * argc - <argc> from main() * argv - <argv> from main() * banner - optional one-line description of program (or NULL) * usage - optional one-line usage hint (or NULL) * * Returns: ptr to new <ESL_GETOPTS> object. * * On command line errors, this routine prints an error * message to <stderr> then calls <exit(1)> to halt * execution with abnormal (1) status. * * If the standard <-h> option is seen, the routine prints * the help page (using the data in the <options> structure), * then calls <exit(0)> to exit with normal (0) status. * * Xref: J7/3 * * Note: The only difference between this and esl_getopts_CreateDefaultApp() * is to call p7_banner() instead of esl_banner(), to get HMMER * versioning info into the header. There ought to be a better way * (perhaps using PACKAGE_* define's instead of HMMER_* vs. EASEL_* * define's in esl_banner(), thus removing the need for p7_banner). */ ESL_GETOPTS * p7_CreateDefaultApp(ESL_OPTIONS *options, int nargs, int argc, char **argv, char *banner, char *usage) { ESL_GETOPTS *go = NULL; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); if (usage != NULL) esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { if (banner != NULL) p7_banner(stdout, argv[0], banner); if (usage != NULL) esl_usage (stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != nargs) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } return go; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_RANDOMNESS *r = NULL; /* random number generator */ FILE *ofp = NULL; /* data output stream */ int outfmt = eslSQFILE_FASTA; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go); /* Open the output data file, if any */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Initialize */ r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); /* Hand off execution to one of the three modes */ if (esl_opt_GetBoolean(go, "-A")) /* Alignment shuffling */ { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); msa_shuffling(go, r, ofp, outfmt); } else if (esl_opt_GetBoolean(go, "-G")) /* Sequence generation */ { if (esl_opt_ArgNumber(go) != 0) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); seq_generation(go, r, ofp, outfmt); } else if (esl_opt_GetBoolean(go, "-S")) /* Sequence shuffling */ { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); seq_shuffling(go, r, ofp, outfmt); } if (esl_opt_GetString(go, "-o") != NULL) fclose(ofp); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
static int process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_fmfile, char **ret_qfile) { ESL_GETOPTS *go = esl_getopts_Create(options); int status; if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { esl_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 120=textwidth*/ if (puts("\nSpecial options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); /* 2= group; 2 = indentation; 120=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_qfile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <qfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_fmfile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <fmfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* Validate any attempted use of stdin streams */ if (esl_strcmp(*ret_fmfile, "-") == 0 && esl_strcmp(*ret_qfile, "-") == 0) { if (puts("Either <fmfile> or <qfile> may be '-' (to read from stdin), but not both.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } *ret_go = go; return eslOK; FAILURE: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 80=textwidth*/ printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); esl_getopts_Destroy(go); exit(1); ERROR: if (go) esl_getopts_Destroy(go); exit(status); }
static void process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_hmmfile, char **ret_alifile) { ESL_GETOPTS *go = NULL; if ((go = esl_getopts_Create(options)) == NULL) p7_Die("problem with options structure"); if (esl_opt_ProcessEnvironment(go) != eslOK) { printf("Failed to process environment: %s\n", go->errbuf); goto ERROR; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); goto ERROR; } if (esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); goto ERROR; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\nOptions for selecting alphabet rather than guessing it:"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\nAlternative model construction strategies:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); puts("\nAlternative relative sequence weighting strategies:"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); puts("\nAlternate effective sequence weighting strategies:"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); puts("\nControl of E-value calibration:"); esl_opt_DisplayHelp(stdout, go, 6, 2, 80); puts("\nOther options:"); esl_opt_DisplayHelp(stdout, go, 8, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 2) { puts("Incorrect number of command line arguments."); goto ERROR; } if ((*ret_hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to get <hmmfile> argument on command line"); goto ERROR; } if ((*ret_alifile = esl_opt_GetArg(go, 2)) == NULL) { puts("Failed to get <alifile> argument on command line"); goto ERROR; } *ret_go = go; return; ERROR: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); printf("\nTo see more help on other available options, do %s -h\n\n", argv[0]); exit(1); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; char *keyfile = NULL; char *tabfile = NULL; ESL_KEYHASH *kh = esl_keyhash_Create(); int nkeys = 0; ESL_DMATRIX *D = NULL; ESL_TREE *T = NULL; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], go, "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], go, "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], go, "Incorrect number of command line arguments.\n"); keyfile = esl_opt_GetArg(go, 1); tabfile = esl_opt_GetArg(go, 2); read_keyfile(go, keyfile, kh); nkeys = esl_keyhash_GetNumber(kh); D = esl_dmatrix_Create(nkeys, nkeys); read_tabfile(go, tabfile, kh, D); esl_tree_SingleLinkage(D, &T); //esl_tree_WriteNewick(stdout, T); output_clusters(go, T, kh); esl_tree_Destroy(T); esl_dmatrix_Destroy(D); esl_keyhash_Destroy(kh); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go; /* application configuration */ int kstatus, tstatus;/* return code from Easel routine */ int fmt; /* expected format of kfile, tfile */ char *kfile, *tfile; /* known, test structure file */ ESL_MSAFILE *kfp, *tfp; /* open kfile, tfile */ ESL_MSA *ka, *ta; /* known, trusted alignment */ int64_t klen, tlen; /* lengths of dealigned seqs */ int i; /* counter over sequences */ int apos; /* counter over alignment columns */ int rfpos; /* counter over consensus (non-gap RF) columns */ int is_rfpos; /* TRUE if current apos is a consensus pos, FALSE if not */ int uapos; /* counter over unaligned residue positions */ int nali; /* number of alignment we're on in each file */ int **kp; /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */ int **tp; /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */ /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert' * after non-gap RF position (x * -1) */ int *km_pos; /* [0..rflen] = x, in known aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ki_pos; /* [0..rflen] = x, in known aln, number of residues inserted after non-gap RF column x */ int *tm_pos; /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ti_pos; /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */ int *cor_tm_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *cor_ti_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */ int *km_seq; /* [0..i..nseq-1] = x, in known aln, number of residues aligned to non-gap RF columns in seq i; */ int *ki_seq; /* [0..i..nseq-1] = x, in known aln, number of residues inserted in seq i */ int *tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */ int *ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */ int *cor_tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */ int *cor_ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */ int *seqlen; /* [0..i..nseq-1] = x, unaligned seq i has length x */ ESL_ALPHABET *abc = NULL; /* alphabet for all alignments */ int rflen, t_rflen; /* non-gap RF length (consensus lengths) */ int status; char *namedashes; int ni; int namewidth = 8; /* length of 'seq name' */ int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */ char *mask = NULL; int masklen; ESL_DSQ *ks; ESL_DSQ *ts; FILE *dfp = NULL; /* for --c2dfile */ /* variables needed for -p and related options */ int do_post = FALSE; /* TRUE if -p enabled */ int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */ int p; /* counter over integerized posteriors */ int *ptm = NULL; /* [0..p..10] number of total matches with posterior value p (10="*")*/ int *pti = NULL; /* [0..p..10] number of total inserts with posterior value p */ int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */ int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */ int npostvals = 11; /* number of posterior values 0-9, * */ int ppidx; /* index of PP */ char ppchars[11] = "0123456789*"; int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */ // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti; /* total counts of posteriors */ // int tot_incor_ptm,tot_incor_pti; // SRE: commented out; don't seem to be used; need to silence compiler warning char errbuf[eslERRBUFSIZE]; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); exit(EXIT_SUCCESS); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } kfile = esl_opt_GetArg(go, 1); tfile = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the two Stockholm files. ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus); do_post = esl_opt_GetBoolean(go, "-p"); /* read the mask file if --p-mask is enabled */ if(! esl_opt_IsDefault(go, "--p-mask")) { if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf); } /* open the c2dfile for output, if nec */ if (esl_opt_IsOn(go, "--c2dfile")) { if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile")); } /*********************************************** * Do alignment comparisons, one seq at a time; * this means looping over all seqs in all alignments. ***********************************************/ nali = 0; while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF) { if ( kstatus != eslOK) esl_msafile_ReadFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus); nali++; if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); /* Sanity check on alignment */ if (ka->nseq != ta->nseq) esl_fatal("trusted, test alignments don't have same seq #\n"); if (ka->rf == NULL) esl_fatal("trusted alignment has no reference annotation\n"); if (ta->rf == NULL) esl_fatal("test alignment has no reference annotation\n"); /* make sure the sequences are all identical */ ESL_ALLOC(seqlen, sizeof(int) * ka->nseq); for(i = 0; i < ka->nseq; i++) { if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2)); memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen); ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2)); memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen); if (tlen != klen) esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in the trusted alignment.", (i+1), klen, tlen); if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]); seqlen[i] = tlen; free(ks); free(ts); } /* determine non-gap RF length */ rflen = 0; for(apos = 1; apos <= ka->alen; apos++) { if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++; } t_rflen = 0; for(apos = 1; apos <= ta->alen; apos++) { if((! esl_abc_CIsGap (ta->abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (ta->abc, ta->rf[apos-1]))) t_rflen++; } if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen); /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */ if(do_post) { if(! esl_opt_IsDefault(go, "--p-mask")) { if(masklen != rflen) { esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen); } } if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists"); ESL_ALLOC(ptm, sizeof(int) * npostvals); ESL_ALLOC(pti, sizeof(int) * npostvals); ESL_ALLOC(cor_ptm, sizeof(int) * npostvals); ESL_ALLOC(cor_pti, sizeof(int) * npostvals); esl_vec_ISet(ptm, npostvals, 0); esl_vec_ISet(pti, npostvals, 0); esl_vec_ISet(cor_ptm, npostvals, 0); esl_vec_ISet(cor_pti, npostvals, 0); } /* allocate and initialize our counters */ ESL_ALLOC(kp, sizeof(int *) * ka->nseq); ESL_ALLOC(tp, sizeof(int *) * ta->nseq); for(i = 0; i < ka->nseq; i++) { ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1)); ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1)); esl_vec_ISet(kp[i], seqlen[i]+1, -987654321); esl_vec_ISet(tp[i], seqlen[i]+1, -987654321); } ESL_ALLOC(km_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1)); esl_vec_ISet(km_pos, rflen+1, 0); esl_vec_ISet(ki_pos, rflen+1, 0); esl_vec_ISet(tm_pos, rflen+1, 0); esl_vec_ISet(ti_pos, rflen+1, 0); esl_vec_ISet(cor_tm_pos, rflen+1, 0); esl_vec_ISet(cor_ti_pos, rflen+1, 0); ESL_ALLOC(km_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq); ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq); esl_vec_ISet(km_seq, ka->nseq, 0); esl_vec_ISet(ki_seq, ka->nseq, 0); esl_vec_ISet(tm_seq, ka->nseq, 0); esl_vec_ISet(ti_seq, ka->nseq, 0); esl_vec_ISet(cor_tm_seq, ka->nseq, 0); esl_vec_ISet(cor_ti_seq, ka->nseq, 0); /* determine non-gap RF location of each residue in known alignment */ for(i = 0; i < ka->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ka->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing (ka->abc, ka->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; } if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { uapos++; kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; } else { ki_pos[rfpos]++; ki_seq[i]++; } } } } /* determine non-gap RF location of each residue in predicted alignment */ for(i = 0; i < ta->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ta->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (abc, ta->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; if(do_post) { do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE; } } if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { uapos++; tp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(do_post) { if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos); ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]); if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos); } if(is_rfpos) { tm_pos[rfpos]++; tm_seq[i]++; if(do_post_for_this_rfpos) ptm[ppidx]++; } else { ti_pos[rfpos]++; ti_seq[i]++; if(do_post) pti[ppidx]++; } if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */ if(is_rfpos) { cor_tm_seq[i]++; cor_tm_pos[rfpos]++; if(do_post_for_this_rfpos) cor_ptm[ppidx]++; } else { cor_ti_seq[i]++; cor_ti_pos[rfpos]++; if(do_post) cor_pti[ppidx]++; } } } } } if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { /* print per sequence statistics */ /* determine the longest name in msa */ for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni])); ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-'; printf("# %-*s %6s %28s %28s %28s\n", namewidth, "seq name", "len", "match columns", "insert columns", "all columns"); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------"); for(i = 0; i < ta->nseq; i++) { printf(" %-*s %6d %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, ka->sqname[i], seqlen[i], cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. : ((float) cor_ti_seq[i] / (float) ki_seq[i]), (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); } cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq); cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq); km = esl_vec_ISum(km_seq, ka->nseq); ki = esl_vec_ISum(ki_seq, ka->nseq); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------"); printf("# %-*s %6s %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, "*all*", "-", cor_tm, km, ((float) cor_tm / (float) km), cor_ti, ki, ((float) cor_ti / (float) ki), (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); free(namedashes); for(i = 0; i < ka->nseq; i++) { free(kp[i]); free(tp[i]); } } else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */ printf("# %5s %20s %20s %20s\n", "rfpos", "match", "insert", "both"); printf("# %5s %20s %20s %20s\n", "-----", "--------------------", "--------------------", "--------------------"); for(rfpos = 0; rfpos <= rflen; rfpos++) { printf(" %5d %4d / %4d (%.3f) %4d / %4d (%.3f) %4d / %4d (%.3f)\n", rfpos, cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. : ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); } } else if(do_post) { /* do posterior output */ if(mask == NULL) { printf("# %2s %29s %29s\n", "", " match columns ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } else { printf("# %2s %29s %29s\n", "", " match columns within mask ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0; //tot_ptm = esl_vec_ISum(ptm, npostvals); //tot_pti = esl_vec_ISum(pti, npostvals); //tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals); //tot_cor_pti = esl_vec_ISum(cor_pti, npostvals); //tot_incor_ptm = tot_ptm - tot_cor_ptm; //tot_incor_pti = tot_pti - tot_cor_pti; for(p = (npostvals-1); p >= 0; p--) { cm_cor_ptm += cor_ptm[p]; cm_cor_pti += cor_pti[p]; cm_ptm += ptm[p]; cm_pti += pti[p]; cm_incor_ptm += ptm[p] - cor_ptm[p]; cm_incor_pti += pti[p] - cor_pti[p]; printf(" %2c %8d / %8d (%.5f) %8d / %8d (%.5f)\n", ppchars[p], cor_ptm[p], ptm[p], (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], cor_pti[p], pti[p], (pti[p] == 0) ? 0. : (float) cor_pti[p] / (float) pti[p]); } } /* handle --c2dfile */ if (dfp != NULL) { /* match stats, 4 fields, CMYK color values */ for(rfpos = 1; rfpos <= rflen; rfpos++) { if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */ 1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */ 0.); } } fprintf(dfp, "//\n"); /* insert stats, 4 fields, CMYK color values */ rfpos = 0; /* special case, combine insert posn 0 and 1 together */ if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */ 0., 0.); } /* insert stats posn 2..rflen */ for(rfpos = 2; rfpos <= rflen; rfpos++) { if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */ 0., 0.); } } fprintf(dfp, "//\n"); } if(ptm != NULL) free(ptm); if(pti != NULL) free(pti); if(cor_ptm != NULL) free(cor_ptm); if(cor_ptm != NULL) free(cor_pti); free(kp); free(tp); free(km_seq); free(ki_seq); free(tm_seq); free(ti_seq); free(cor_tm_seq); free(cor_ti_seq); free(km_pos); free(ki_pos); free(tm_pos); free(ti_pos); free(cor_tm_pos); free(cor_ti_pos); free(seqlen); esl_msa_Destroy(ka); esl_msa_Destroy(ta); } if(mask != NULL) free(mask); if(dfp != NULL) { fclose(dfp); printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile")); } if(abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msafile_Close(tfp); esl_msafile_Close(kfp); return 0; ERROR: return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *hmmfile = NULL; /* HMM file name */ P7_HMMFILE *hfp = NULL; /* open HMM file */ FILE *ofp = NULL; /* output stream for HMMs */ int status; /* easel/hmmer return code */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) < 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); /* Open the HMM file. */ hmmfile = esl_opt_GetArg(go, 1); status = p7_hmmfile_Open(hmmfile, NULL, &hfp); if (status == eslENOTFOUND) p7_Fail("Failed to open HMM file %s for reading.\n", hmmfile); else if (status == eslEFORMAT) p7_Fail("File %s does not appear to be in a recognized HMM format.\n", hmmfile); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n", status, hmmfile); /* Open the output file, if any */ if (esl_opt_GetBoolean(go, "-O")) { if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetArg(go, 2)); } else if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Hand off to the appropriate routine */ if (esl_opt_GetBoolean(go, "--index")) { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); create_ssi_index(go, hfp); } else if (esl_opt_GetBoolean(go, "-f")) { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); multifetch(go, ofp, esl_opt_GetArg(go, 2), hfp); } else { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); onefetch(go, ofp, esl_opt_GetArg(go, 2), hfp); if (ofp != stdout) printf("\n\nRetrieved HMM %s.\n", esl_opt_GetArg(go, 2)); } if (esl_opt_GetBoolean(go, "-O") || esl_opt_GetString(go, "-o") != NULL) fclose(ofp); p7_hmmfile_Close(hfp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; float nseq; int do_eval2score = 0; int do_score2eval = 0; int z_val; float e_val; float s_val; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read <hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } output_header(stdout, go); if ( esl_opt_IsOn(go, "--eval2score") ) { do_eval2score = TRUE; e_val = esl_opt_GetReal(go, "-E"); } else if ( esl_opt_IsOn(go, "--score2eval") ) { do_score2eval = TRUE; s_val = esl_opt_GetReal(go, "-S"); } else if ( esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1") || esl_opt_IsUsed(go, "-Z") ) { puts("The flags -Z, --baseZ, and --baseZ1 are for use with --eval2score and --score2eval."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_IsUsed(go, "--baseZ") ) { z_val = 1000000 * 2 * (long)(esl_opt_GetInteger(go, "--baseZ")); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { z_val = 1000000 * (long)(esl_opt_GetInteger(go, "--baseZ1")); } else { z_val = esl_opt_GetInteger(go, "-Z"); } /* Initializations: open the HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); if (do_eval2score) printf (" %6s %6.2g", "sc for", e_val); if (do_score2eval) printf (" %6s %6.2f", "E-val for", s_val); printf("\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); if (do_eval2score) printf (" %13s", "-------------"); if (do_score2eval) printf (" %13s", "-------------"); printf("\n"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if ( esl_opt_IsOn(go, "--eval2score") || esl_opt_IsOn(go, "--score2eval") ) { if (esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1" ) ) { if ( hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --baseZ and --baseZ1 can't be used with non-nucleotide models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } else if ( hmm->abc->type != eslAMINO && hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --eval2score and --score2eval can't be used with non-sequence models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } if (esl_opt_IsUsed(go, "--baseZ") ) { nseq = (float)z_val / (float)(hmm->max_length); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { nseq = (float)z_val / (float)(hmm->max_length); } else { nseq = z_val; } if (bg == NULL) bg = p7_bg_Create(abc); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); if ( do_eval2score ) { float sc; sc = esl_exp_invsurv( e_val / nseq , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2f", sc); } else if ( do_score2eval) { float e; e = nseq * esl_exp_surv( s_val , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2g", e); } printf("\n"); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *alifile = NULL; /* alignment file name */ int infmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ int outfmt = eslMSAFILE_UNKNOWN; /* output format for fetched msa's */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ FILE *ofp = NULL; /* output stream for alignments */ int status; /* easel return code */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) < 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input alignment file format for --informat", esl_opt_GetString(go, "--informat")); } outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid output alignment file format for --outformat", esl_opt_GetString(go, "--outformat")); alifile = esl_opt_GetArg(go, 1); /* Open the alignment file. */ if ( (status = eslx_msafile_Open(NULL, alifile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); /* Open the SSI index, if any */ if (! esl_opt_GetBoolean(go, "--index")) { if (afp->bf->mode_is == eslBUFFER_FILE || afp->bf->mode_is == eslBUFFER_ALLFILE || afp->bf->mode_is == eslBUFFER_MMAP) { char *ssifile = NULL; esl_sprintf(&ssifile, "%s.ssi", afp->bf->filename); status = esl_ssi_Open(ssifile, &(afp->ssi)); if (status == eslERANGE ) esl_fatal("SSI index %s has 64-bit offsets; this system doesn't support them", ssifile); else if (status == eslEFORMAT) esl_fatal("SSI index %s has an unrecognized format. Try recreating, w/ esl-afetch --index", ssifile); else if (status == eslENOTFOUND) afp->ssi = NULL; else if (status != eslOK) esl_fatal("SSI index %s: open failed, error code %d\n", ssifile, status); free(ssifile); } } /* Open the output file, if any */ if (esl_opt_GetBoolean(go, "-O")) { if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetArg(go, 2)); } else if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Hand off control flow as appropriate */ if (esl_opt_GetBoolean(go, "--index")) { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); create_ssi_index(go, afp); } else if (esl_opt_GetBoolean(go, "-f")) { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); multifetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); } else { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); onefetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); if (ofp != stdout) printf("\n\nRetrieved alignment %s.\n", esl_opt_GetArg(go, 2)); } eslx_msafile_Close(afp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; char **as = NULL; /* aligned character seqs (random, iid) */ int N,L; /* # of seqs, and their aligned lengths */ int seed; int i,j; int status; double p[4]; /* ACGT probabilities */ #ifdef eslAUGMENT_ALPHABET ESL_DSQ **ax = NULL; /* digitized alignment */ ESL_ALPHABET *abc = NULL; #endif /* Process command line */ go = esl_getopts_Create(options); esl_opt_ProcessCmdline(go, argc, argv); esl_opt_VerifyConfig(go); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } L = esl_opt_GetInteger(go, "-L"); N = esl_opt_GetInteger(go, "-N"); seed = esl_opt_GetInteger(go, "--seed"); if (esl_opt_ArgNumber(go) != 0) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } esl_getopts_Destroy(go); /* Create a random DNA alignment; * force it to obey the conventions of the unit tests: * 0,1 are identical * 0,2 are completely dissimilar */ r = esl_randomness_Create(seed); for (i = 0; i < 4; i++) p[i] = 0.25; ESL_ALLOC(as, sizeof(char *) * N); for (i = 0; i < N; i++) ESL_ALLOC(as[i], sizeof(char) * (L+1)); esl_rsq_IID(r, "ACGT", p, 4, L, as[0]); strcpy(as[1], as[0]); esl_rsq_IID(r, "ACGT", p, 4, L, as[2]); for (j = 0; j < L; j++) while (as[2][j] == as[0][j]) as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)]; for (i = 3; i < N; i++) esl_rsq_IID(r, "ACGT", p, 4, L, as[i]); #ifdef eslAUGMENT_ALPHABET abc = esl_alphabet_Create(eslDNA); ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N); for (i = 0; i < N; i++) esl_abc_CreateDsq(abc, as[i], &(ax[i])); #endif /*eslAUGMENT_ALPHABET*/ /* Unit tests */ if (utest_CPairId(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantor(4, as, N) != eslOK) return eslFAIL; #ifdef eslAUGMENT_ALPHABET if (utest_XPairId(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL; #endif /*eslAUGMENT_ALPHABET*/ #ifdef eslAUGMENT_DMATRIX if (utest_CPairIdMx(as, N) != eslOK) return eslFAIL; if (utest_CDiffMx(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantorMx(4, as, N) != eslOK) return eslFAIL; #endif /* eslAUGMENT_DMATRIX*/ #if defined (eslAUGMENT_ALPHABET) && defined (eslAUGMENT_DMATRIX) if (utest_XPairIdMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XDiffMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantorMx(abc, as, ax, N) != eslOK) return eslFAIL; #endif esl_randomness_Destroy(r); esl_Free2D((void **) as, N); #ifdef eslAUGMENT_ALPHABET esl_alphabet_Destroy(abc); esl_Free2D((void **) ax, N); #endif return eslOK; ERROR: return eslFAIL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt; /* format code for alifiles */ ESL_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *msa = NULL; /* multiple sequence alignment */ int status; /* easel return code */ int do_info = TRUE; /* TRUE if -i */ int do_max = FALSE; /* TRUE if -x */ int do_ffreq = FALSE; /* TRUE if --ffreq */ int do_fmin = FALSE; /* TRUE if --fmin */ float fthresh = 0.; /* <x> from -f <x> */ int do_remove_bps = FALSE; /* TRUE if -r */ int do_consistent = FALSE; /* TRUE if -c */ int do_indi2cons = FALSE; /* TRUE if --indi <x> */ int have_cons; /* TRUE if first alignment has consensus sequence */ int do_newcons = FALSE; /* TRUE if we're creating a new consensus structure * and outputing a new alignment (if -x -f -c or --indi) */ int do_a = FALSE; /* TRUE if -a */ char *indi; /* for <x> from --indi <x> */ int nindi_read; /* number of individual sequence SS lines we've read for current alignment */ int a; /* counter over seqs */ int i, i2; /* counter over residues */ int j, j2; /* counter over residues */ int nali; /* counter over alignments */ int **bp = NULL; /* bp[i][j] is number of individual bps exist between aln cols i and j */ int *cur_ct = NULL; /* ct array of basepairs for current sequence */ int *cons_ct = NULL; /* ct array of basepairs for SS_cons being created */ int *xcons_ct = NULL; /* ct array of basepairs for existing SS_cons */ int *ngaps = NULL; /* number of gaps in each alignment position */ FILE *ofp; /* output file (default is stdout) */ int be_verbose = FALSE; /* TRUE to print extra info */ int seqthresh; /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/ char *sscons = NULL; /* the new SS_cons line */ FILE *lfp = NULL; /* file to list sequences with conflicting bps to */ int nlist = 0; /* number of sequences listed to list file */ int *nconflictsA; /* number of conflicting bps in seq a's individual structure annotation */ int nconflicts_total = 0; /* total number of conflicts */ int nconflicts_list = 0; /* total number of conflicts in sequences listed to file <x> from -l <x> */ int noverlaps_total = 0; /* total number of overlaps */ int nconsistent_total = 0; /* total number of consistent bps */ int nbps_total = 0; /* total number of bps */ int *nconsistentA; /* number of consistent bps in seq a's individual structure annotation */ int *noverlapsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int *nbpsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int ncons_bps = 0; /* number of bps in consensus structure */ int max_noverlaps_aidx; int max_nconsistent_aidx; int max_nbps_aidx; int *removebp; /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */ int *has_conflict; int *nmates_l2r; /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */ int *nmates_r2l; /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left mates i for j */ int lmax; /* with -l, maximum number of conflicts to allow */ int namewidth = 18; /* length of 'SS_cons(consensus)' */ char *namedashes = NULL; /* to store underline for seq name */ /* --fmin related variables */ int nbps = 0; int prev_nbps = -1; float fmin; int inconsistent_flag; int pknot_flag; int k,l; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\noptions for defining a new consensus structure (all of these require -o):"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\noptions for listing sequences based on structure:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (status = esl_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); /* open output file */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = NULL; if (esl_opt_GetString(go, "-l") != NULL) { if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l")); } /* determine if we're creating a structure */ do_max = esl_opt_GetBoolean(go, "-x"); if(!(esl_opt_IsDefault(go, "--ffreq"))) { do_ffreq = TRUE; fthresh = esl_opt_GetReal(go, "--ffreq"); } if(!(esl_opt_IsDefault(go, "--fmin"))) { do_fmin = TRUE; } do_remove_bps = esl_opt_GetBoolean(go, "-r"); do_consistent = esl_opt_GetBoolean(go, "-c"); if(!(esl_opt_IsDefault(go, "--indi"))) { do_indi2cons = TRUE; } if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_newcons = TRUE; } do_a = esl_opt_GetBoolean(go, "-a"); if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_info = FALSE; } /*********************************************** * Read MSAs one at a time. ***********************************************/ nali = 0; have_cons = FALSE; lmax = esl_opt_GetInteger(go, "--lmax"); if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; while ((status = esl_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) esl_msafile_ReadFailure(afp, status); nali++; /* determine max length name */ namewidth = 18; /* length of 'SS_cons(consensus)' */ for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i])); if(namedashes != NULL) { free(namedashes); } ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(i = 0; i < namewidth; i++) namedashes[i] = '-'; ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1)); ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1)); ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1)); ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1)); esl_vec_ISet(cur_ct, (msa->alen+1), 0); esl_vec_ISet(cons_ct, (msa->alen+1), 0); esl_vec_ISet(xcons_ct, (msa->alen+1), 0); esl_vec_ISet(removebp, (msa->alen+1), FALSE); esl_vec_ISet(has_conflict, (msa->alen+1), FALSE); esl_vec_ISet(nmates_l2r, (msa->alen+1), 0); esl_vec_ISet(nmates_r2l, (msa->alen+1), 0); ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq); ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq); ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq); ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq); esl_vec_ISet(nconflictsA, msa->nseq, 0); esl_vec_ISet(noverlapsA, msa->nseq, 0); esl_vec_ISet(nconsistentA, msa->nseq, 0); esl_vec_ISet(nbpsA, msa->nseq, 0); max_noverlaps_aidx = max_nconsistent_aidx = max_nbps_aidx = 0; nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0; for(i = 1; i <= msa->alen; i++) { ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1)); esl_vec_ISet(bp[i], (msa->alen+1), 0); } /* make sure we have ss_cons and indi ss if we need it */ if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_max) esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_consistent) esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_indi2cons) esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_ffreq) esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_fmin) esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss_cons != NULL) { if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { esl_fatal("Existing SS_cons for alignment %d is invalid.", nali); } ncons_bps = 0; for(i = 1; i <= msa->alen; i++) if(xcons_ct[i] != 0 && i < xcons_ct[i]) ncons_bps++; if(nali > 1 && !have_cons) esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); if(nali == 1) have_cons = TRUE; } else if (lfp != NULL) { esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_remove_bps) { esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_consistent) { esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); } else { if(nali > 1 && have_cons) esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, they all must.", nali); } if(do_info) { printf("# Per-sequence basepair information:\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); } if(have_cons) { printf("#\n"); printf("# indibp: number of basepairs in the individual sequence SS annotation\n"); printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n"); printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n"); printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n"); printf("#\n"); printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n"); printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n"); printf("#\n"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, namedashes, "------", "------", "-----", "------"); } else { printf("# %-*s %6s\n", namewidth, "seqname", "nbp"); printf("# %-*s %6s\n", namewidth, namedashes, "------"); } } nindi_read = 0; for (a = 0; a < msa->nseq; a++) { if(msa->ss != NULL && msa->ss[a] != NULL) { if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { esl_fatal("SS annotation for sequence %d, aln %d is invalid.\n", (a+1), nali); } nindi_read++; for(i = 1; i <= msa->alen; i++) { if(i < cur_ct[i]) { bp[i][cur_ct[i]]++; if(bp[i][cur_ct[i]] == 1) { nmates_l2r[i]++; nmates_r2l[cur_ct[i]]++; } } } for(i = 1; i <= msa->alen; i++) { if(cur_ct[i] != 0 && i < cur_ct[i]) { if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++; if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with i:xcons_ct[i] */ removebp[i] = TRUE; removebp[xcons_ct[i]] = TRUE; } else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */ removebp[cur_ct[i]] = TRUE; removebp[xcons_ct[cur_ct[i]]] = TRUE; } else nconsistentA[a]++; } } if(nconflictsA[a] > lmax) { if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); nconflicts_list += nconflictsA[a]; nlist++; } nbpsA[a] = nconflictsA[a] + nconsistentA[a]; nconflicts_total += nconflictsA[a]; nconsistent_total += nconsistentA[a]; noverlaps_total += noverlapsA[a]; nbps_total += nbpsA[a]; if(do_info && have_cons) printf(" %-*s %6d %6d %6d %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); if(do_info && !have_cons) printf(" %-*s %6d\n", namewidth, msa->sqname[a], nbpsA[a]); if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a; if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a; if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a; } else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); } } if(do_info && have_cons) { if(nindi_read > 0) printf("\n"); printf(" %-*s %6d %6d %6d %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); if(nindi_read > 0) { printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? (float) noverlaps_total / (float) nbps_total : 0.); printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.); printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.); } else { printf("# No sequences in the alignment have GR SS annotation.\n"); } } if(lfp != NULL) { printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); } /* determine number of gaps per alignment column */ if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR; /* -x: determine max bp structure OR * -a: list all conflicts in individual structures */ if(do_max || do_a) { for(i = 1; i <= msa->alen; i++) { if(nmates_l2r[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[i][j] > 0) { if(do_a) printf("More than 1 right mates for left mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { if(nmates_r2l[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[j][i] > 0) { if(do_a) printf("More than 1 left mates for right mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/ if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { j = i+1; while(bp[i][j] == 0) j++; cons_ct[i] = j; cons_ct[j] = i; } } /* remove pseudoknotted bps greedily */ for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { for(i2 = i+1; i2 <= msa->alen; i2++) { j2 = cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/ /* note: remove both if they have equal number of sequences */ if(bp[i][j] <= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i, j);*/ cons_ct[cons_ct[i]] = 0; cons_ct[i] = 0; } if(bp[i][j] >= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i2, j2);*/ cons_ct[cons_ct[i2]] = 0; cons_ct[i2] = 0; } } } } } } } /***************************************/ /*PARANOID, second check for knots for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { printf("BP: %4d:%4d\n", i, j); for(i2 = 1; i2 <= msa->alen; i2++) { j2 = cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { if((i < i2)) { printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]); } } } } } } ******************************************/ /***************************************/ /*PARANOID, check cons_ct for consistency for(i = 1; i <= msa->alen; i++) { if(cons_ct[i] != 0) { if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); } } } */ /*PARANOID, write out SS_cons for(i = 1; i <= msa->alen; i++) { if(i < cons_ct[i]) printf("<"); else if(cons_ct[i] != 0) { printf(">"); } else printf("."); } printf("\n"); */ /***************************************/ /* textize alignment */ if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); /* --fmin */ if(do_fmin) { /* define ss_cons */ prev_nbps = -1; fthresh = 0.99; inconsistent_flag = pknot_flag = FALSE; printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > <x> individual SS contain i:j as a pair\n"); printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n"); printf("# A consistent structure has each position involved in 0 or 1 basepairs.\n"); printf("#\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); printf("# Number of seqs: %d\n", msa->nseq); printf("#\n"); printf("# %5s %23s %6s\n", "<x>", "nseq-required-with-bp", "numbps"); printf("# %5s %23s %6s\n", "-----", "-----------------------", "------"); while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { nbps = 0; seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ esl_vec_ISet(cons_ct, msa->alen+1, 0); for(i = 1; i <= msa->alen; i++) { for(j = i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0 || cons_ct[j] != 0) { inconsistent_flag = TRUE; } /* check for pseudoknots */ for(k = i+1; k < j; k++) { l = cons_ct[k]; if((k < l) && (l > j)) { pknot_flag = TRUE; } if((k > l) && (l != 0) && (l < i)) { pknot_flag = TRUE; } } cons_ct[i] = j; cons_ct[j] = i; nbps++; } } } if(inconsistent_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "inconsistent"); else if(pknot_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "pseudoknotted"); else { if(nbps != prev_nbps) { printf(" %.3f %23d %6d\n", fthresh, seqthresh+1, nbps); } fmin = fthresh; } fthresh -= 0.01; prev_nbps = nbps; } fthresh = fmin; esl_vec_ISet(cons_ct, msa->alen+1, 0); } /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */ if(do_ffreq || do_fmin) { if(do_fmin) { printf("#\n# <x> determined to be %.3f\n", fthresh); } if(do_ffreq) { printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > %f individual SS contain i:j as a pair\n", fthresh); } esl_vec_ISet(cons_ct, msa->alen+1, 0); /* define ss_cons */ seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ for(i = 1; i <= msa->alen; i++) { for(j = i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j); } if(cons_ct[j] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j); } cons_ct[i] = j; cons_ct[j] = i; } } } } /* -r: redefine consensus struct by removing any bps that conflict with individual structures */ if(do_remove_bps) { for(i = 1; i <= msa->alen; i++) { if(!(removebp[i])) { cons_ct[i] = xcons_ct[i]; cons_ct[cons_ct[i]] = i; } else { printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]); cons_ct[xcons_ct[i]] = 0; cons_ct[i] = 0; } } } /* -c: define consensus structure as indi sequence with highest number of consistent bps with structure OR */ /* --indi: define consensus structure as indi sequence <x> from --indi <x> */ if(do_consistent || do_indi2cons) { if(do_indi2cons) { indi = esl_opt_GetString(go, "--indi"); for(a = 0; a < msa->nseq; a++) { if(strcmp(indi, msa->sqname[a]) == 0) break; } if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi); } else { /* do_consistent */ a = max_nconsistent_aidx; } if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]); if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { esl_fatal("Second pass... SS annotation for sequence %d, aln %d is invalid.\n", (a), nali); } printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]); if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) { if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; } if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR; printf("# Defined new RF as %s sequence\n", msa->sqname[a]); } } /* write out alignment with new SS_cons */ if(do_newcons) { if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR; if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; } if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR; status = esl_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM)); if (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n"); else if (status != eslOK) esl_fatal("Writing alignment file failed with error %d\n", status); } free(sscons); free(cur_ct); free(cons_ct); free(xcons_ct); for(i = 1; i <= msa->alen; i++) free(bp[i]); free(bp); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); /* Cleanup, normal return */ if(lfp != NULL) fclose(lfp); if(ofp != NULL) { printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o")); fclose(ofp); } esl_msafile_Close(afp); esl_getopts_Destroy(go); return 0; ERROR: if(afp) esl_msafile_Close(afp); if(go) esl_getopts_Destroy(go); if(msa) esl_msa_Destroy(msa); if(lfp) fclose(lfp); if(ofp) fclose(ofp); esl_fatal("ERROR\n"); return 1; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; float *sqd; int status; int nbad; int nali = 0; int nbadali = 0; int nwgt = 0; int nbadwgt = 0; int i; int be_quiet; int do_gsc; int do_pb; int do_blosum; double maxid; double tol; int maxN; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } be_quiet = esl_opt_GetBoolean(go, "-q"); do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); tol = esl_opt_GetReal (go, "--tol"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } msafile = esl_opt_GetArg(go, 1); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } nali++; nwgt += msa->nseq; ESL_ALLOC(sqd, sizeof(float) * msa->nseq); if (do_gsc) { esl_msaweight_GSC(msa); GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_pb) { esl_msaweight_PB(msa); PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_blosum) { esl_msaweight_BLOSUM(msa, maxid); BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd); /* workaround SQUID bug: BLOSUM weights weren't renormalized to sum to nseq. */ esl_vec_FNorm (sqd, msa->nseq); esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq); } if (! be_quiet) { for (i = 0; i < msa->nseq; i++) fprintf(stdout, "%-20s %.3f %.3f\n", msa->sqname[i], msa->wgt[i], sqd[i]); } nbad = 0; for (i = 0; i < msa->nseq; i++) if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) nbad++; if (nbad > 0) nbadali++; nbadwgt += nbad; if (nbad > 0) printf("%-20s :: alignment shows %d weights that differ (out of %d) \n", msa->name, nbad, msa->nseq); esl_msa_Destroy(msa); free(sqd); } eslx_msafile_Close(afp); if (nbadali == 0) printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali); else { printf("%d of %d weights mismatched at (> %f fractional difference)\n", nbadwgt, nwgt, tol); printf("involving %d of %d total alignments\n", nbadali, nali); } return eslOK; ERROR: return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESL_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); /* Open the MSA file; determine alphabet */ status = esl_msafile_Open(alifile, alifmt, NULL, &afp); if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile); else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of alignment %s\n", alifile); else if (status != eslOK) esl_fatal("Alignment file open failed with error %d\n", status); if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); else { int type; status = esl_msafile_GuessAlphabet(afp, &type); if (status == eslEAMBIGUOUS) esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile); else if (status == eslEFORMAT) esl_fatal("Alignment file parse failed: %s\n", afp->errbuf); else if (status == eslENODATA) esl_fatal("Alignment file %s is empty\n", alifile); else if (status != eslOK) esl_fatal("Failed to read alignment file %s\n", alifile); cfg.abc = esl_alphabet_Create(type); } esl_msafile_SetDigital(afp, cfg.abc); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = esl_msa_Read(afp, &origmsa)) == eslOK) { remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); ntestdom = esl_stack_ObjectCount(teststack); if (ntestdom >= 2) { esl_stack_Shuffle(cfg.r, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); esl_msa_MinimGaps(trainmsa, NULL, NULL); esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", afp->linenumber, afp->fname, afp->errbuf, afp->buf); else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status); else if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); if (nali > 0) synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); esl_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_ALPHABET *abc = NULL; /* sequence alphabet */ ESL_GETOPTS *go = NULL; /* command line processing */ ESL_RANDOMNESS *r = NULL; /* source of randomness */ P7_HMM *hmm = NULL; /* sampled HMM to emit from */ P7_HMM *core = NULL; /* safe copy of the HMM, before config */ P7_BG *bg = NULL; /* null model */ ESL_SQ *sq = NULL; /* sampled sequence */ P7_TRACE *tr = NULL; /* sampled trace */ P7_PROFILE *gm = NULL; /* profile */ int i,j; int i1,i2; int k1,k2; int iseq; FILE *fp = NULL; double expected; int do_ilocal; char *hmmfile = NULL; int nseq; int do_swlike; int do_ungapped; int L; int M; int do_h2; char *ipsfile = NULL; char *kpsfile = NULL; ESL_DMATRIX *imx = NULL; ESL_DMATRIX *kmx = NULL; ESL_DMATRIX *iref = NULL; /* reference matrix: expected i distribution under ideality */ int Lbins; int status; char errbuf[eslERRBUFSIZE]; /***************************************************************** * Parse the command line *****************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:\n"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2 = indentation; 80=textwidth*/ return eslOK; } do_ilocal = esl_opt_GetBoolean(go, "-i"); hmmfile = esl_opt_GetString (go, "-m"); nseq = esl_opt_GetInteger(go, "-n"); do_swlike = esl_opt_GetBoolean(go, "-s"); do_ungapped = esl_opt_GetBoolean(go, "-u"); L = esl_opt_GetInteger(go, "-L"); M = esl_opt_GetInteger(go, "-M"); do_h2 = esl_opt_GetBoolean(go, "-2"); ipsfile = esl_opt_GetString (go, "--ips"); kpsfile = esl_opt_GetString (go, "--kps"); if (esl_opt_ArgNumber(go) != 0) { puts("Incorrect number of command line arguments."); printf("Usage: %s [options]\n", argv[0]); return eslFAIL; } r = esl_randomness_CreateFast(0); if (hmmfile != NULL) { /* Read the HMM (and get alphabet from it) */ P7_HMMFILE *hfp = NULL; status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); if ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslOK) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else esl_fatal("Unexpected error in reading HMMs"); } M = hmm->M; p7_hmmfile_Close(hfp); } else { /* Or sample the HMM (create alphabet first) */ abc = esl_alphabet_Create(eslAMINO); if (do_ungapped) p7_hmm_SampleUngapped(r, M, abc, &hmm); else if (do_swlike) p7_hmm_SampleUniform (r, M, abc, 0.05, 0.5, 0.05, 0.2, &hmm); /* tmi, tii, tmd, tdd */ else p7_hmm_Sample (r, M, abc, &hmm); } Lbins = M; imx = esl_dmatrix_Create(Lbins, Lbins); iref = esl_dmatrix_Create(Lbins, Lbins); kmx = esl_dmatrix_Create(M, M); esl_dmatrix_SetZero(imx); esl_dmatrix_SetZero(iref); esl_dmatrix_SetZero(kmx); tr = p7_trace_Create(); sq = esl_sq_CreateDigital(abc); bg = p7_bg_Create(abc); core = p7_hmm_Clone(hmm); if (do_h2) { gm = p7_profile_Create(hmm->M, abc); p7_H2_ProfileConfig(hmm, bg, gm, p7_UNILOCAL); } else { gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); if (p7_hmm_Validate (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!"); if (p7_profile_Validate(gm, NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!"); } /* Sample endpoints. * Also sample an ideal reference distribution for i endpoints. i * endpoints are prone to discretization artifacts, when emitted * sequences have varying lengths. Taking log odds w.r.t. an ideal * reference that is subject to the same discretization artifacts * cancels out the effect. */ for (iseq = 0; iseq < nseq; iseq++) { if (do_ilocal) ideal_local_endpoints (r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2); else profile_local_endpoints(r, core, gm, sq, tr, Lbins, &i1, &i2, &k1, &k2); imx->mx[i1-1][i2-1] += 1.; kmx->mx[k1-1][k2-1] += 1.; /* reference distribution for i */ ideal_local_endpoints (r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2); iref->mx[i1-1][i2-1] += 1.; } /* Adjust both mx's to log_2(obs/exp) ratio */ printf("Before normalization/log-odds:\n"); printf(" i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx)); printf(" k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx)); printf("iref matrix values range from %f to %f\n", dmx_upper_min(iref), dmx_upper_max(iref)); expected = (double) nseq * 2. / (double) (M*(M+1)); for (i = 0; i < kmx->m; i++) for (j = i; j < kmx->n; j++) kmx->mx[i][j] = log(kmx->mx[i][j] / expected) / log(2.0); for (i = 0; i < imx->m; i++) for (j = i; j < imx->m; j++) if (iref->mx[i][j] == 0. && imx->mx[i][j] == 0.) imx->mx[i][j] = 0.; else if (iref->mx[i][j] == 0.) imx->mx[i][j] = eslINFINITY; else if (imx->mx[i][j] == 0.) imx->mx[i][j] = -eslINFINITY; else imx->mx[i][j] = log(imx->mx[i][j] / iref->mx[i][j]) / log(2.0); /* Print ps files */ if (kpsfile != NULL) { if ((fp = fopen(kpsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", kpsfile); dmx_Visualize(fp, kmx, -4., 5.); fclose(fp); } if (ipsfile != NULL) { if ((fp = fopen(ipsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", ipsfile); dmx_Visualize(fp, imx, -4., 5.); /* dmx_Visualize(fp, imx, dmx_upper_min(imx), dmx_upper_max(imx)); */ fclose(fp); } printf("After normalization/log-odds:\n"); printf("i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx)); printf("k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx)); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(core); p7_hmm_Destroy(hmm); p7_trace_Destroy(tr); esl_sq_Destroy(sq); esl_dmatrix_Destroy(imx); esl_dmatrix_Destroy(kmx); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return eslOK; }
static int process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_alifile, char **ret_postalifile) { ESL_GETOPTS *go = esl_getopts_Create(options); int status; if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); if (puts("\nMask range options (format: --xxx 10-20,30-40 ) :") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); if (puts("\nOptions for selecting alphabet rather than guessing it:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); if (puts("\nAlternative model construction strategies:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); if (puts("\nAlternative relative sequence weighting strategies:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); if (puts("\nOther options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 8, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) > 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_alifile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <msafile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { if ((*ret_postalifile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <postmsafile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } } if (strcmp(*ret_alifile, "-") == 0 && ! esl_opt_IsOn(go, "--informat")) { if (puts("Must specify --informat to read <alifile> from stdin ('-')") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } *ret_go = go; return eslOK; FAILURE: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); printf("\nTo see more help on other available options, do:\n %s -h\n\n", argv[0]); esl_getopts_Destroy(go); exit(1); ERROR: if (go) esl_getopts_Destroy(go); exit(status); }
/** * int main(int argc, char **argv) * Main driver */ int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; char *outhmmfile = NULL; P7_HMMFILE *hfp = NULL; FILE *outhmmfp; /* HMM output file handle */ P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; float average_internal_transitions[ p7H_NTRANSITIONS ]; int k; char errmsg[eslERRBUFSIZE]; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { profillic_p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 2) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read <input hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL) { puts("Failed to read <output hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } profillic_p7_banner(stdout, argv[0], banner); /* Initializations: open the input HMM file for reading */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Initializations: open the output HMM file for writing */ if ((outhmmfp = fopen(outhmmfile, "w")) == NULL) ESL_FAIL(status, errmsg, "Failed to open HMM file %s for writing", outhmmfile); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if (bg == NULL) bg = p7_bg_Create(abc); esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.); for( k = 1; k < hmm->M; k++ ) { esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS); } // Match transitions esl_vec_FNorm(average_internal_transitions, 3); // Insert transitions esl_vec_FNorm(average_internal_transitions + 3, 2); // Delete transitions esl_vec_FNorm(average_internal_transitions + 5, 2); // Ok now set them. for( k = 1; k < hmm->M; k++ ) { esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] ); } if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001)) != eslOK) return status; if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK) ESL_FAIL(status, errmsg, "HMM save failed"); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); if (outhmmfp != NULL) fclose(outhmmfp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_STOPWATCH *w = esl_stopwatch_Create(); struct cfg_s cfg; p7_Init(); /* Process command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nCommon options:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup, 2 = indentation; 80=textwidth*/ puts("\nChoice of score type :"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\nChoice of alignment mode :"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); puts("\nChoice of multi vs. unihit config :"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); puts("\nChoice of generic vs. vector DP implementation :"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); puts("\nOutput options (use only in serial mode, for single HMM input):"); esl_opt_DisplayHelp(stdout, go, 6, 2, 80); puts("\nControlling range of fitted tail masses :"); esl_opt_DisplayHelp(stdout, go, 7, 2, 80); puts("\nRecalibrating E-values, and replacing HMM's existing parameters :"); esl_opt_DisplayHelp(stdout, go, 8, 2, 80); puts("\nDebugging :"); esl_opt_DisplayHelp(stdout, go, 9, 2, 80); puts("\nExperiments :"); esl_opt_DisplayHelp(stdout, go, 10, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); puts("\nwhere general options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup, 2 = indentation; 80=textwidth*/ printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } /* Validate combinations of score/config/implementation (4x3x2x2, score x mode x hit x DPimpl: 20 of 48 possible combos valid */ if (esl_opt_GetBoolean(go, "--vector") && ! esl_opt_GetBoolean(go, "--local")) p7_Fail("SIMD vector implementations only work for local alignment."); /* -16/48 */ if (esl_opt_GetBoolean(go, "--msv") && ! esl_opt_GetBoolean(go, "--local")) p7_Fail("MSV scoring is local by definition: use --local."); /* -4/48 */ if (esl_opt_GetBoolean(go, "--vit") && ! esl_opt_GetBoolean(go, "--local")) p7_Fail("no p7_GViterbiDual for new dual-mode profile implemented yet"); /* -4/48 */ /* Initialize configuration shared across all kinds of masters * and workers in this .c file. */ cfg.hmmfile = esl_opt_GetArg(go, 1); cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); // cfg.abc = esl_alphabet_Create(eslAMINO); cfg.my_rank = 0; /* MPI init will change this soon, if --mpi was set */ cfg.nproc = 0; /* MPI init will change this soon, if --mpi was set */ cfg.do_mpi = FALSE; /* --mpi will change this soon (below) if necessary */ cfg.do_stall = esl_opt_GetBoolean(go, "--stall"); cfg.N = esl_opt_GetInteger(go, "-N"); cfg.L = esl_opt_GetInteger(go, "-L"); cfg.hfp = NULL; cfg.ofp = NULL; cfg.survfp = NULL; cfg.efp = NULL; cfg.ffp = NULL; cfg.xfp = NULL; cfg.alfp = NULL; cfg.bg = NULL; /* This is our stall point, if we need to wait until we get a * debugger attached to this process for debugging (especially * useful for MPI): */ while (cfg.do_stall); /* Start timing. */ esl_stopwatch_Start(w); /* Main body: * Handed off to serial version or MPI masters and workers as appropriate. */ #ifdef HAVE_MPI if (esl_opt_GetBoolean(go, "--mpi")) { /* Initialize MPI, figure out who we are, and whether we're running * this show (proc 0) or working in it (procs >0). */ cfg.do_mpi = TRUE; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &(cfg.my_rank)); MPI_Comm_size(MPI_COMM_WORLD, &(cfg.nproc)); if (cfg.my_rank == 0 && cfg.nproc < 2) p7_Fail("Need at least 2 MPI processes to run --mpi mode."); if (cfg.my_rank > 0) mpi_worker(go, &cfg); else mpi_master(go, &cfg); esl_stopwatch_Stop(w); esl_stopwatch_MPIReduce(w, 0, MPI_COMM_WORLD); MPI_Finalize(); /* both workers and masters reach this line */ } else #endif /*HAVE_MPI*/ { /* No MPI? Then we're just the serial master. */ serial_master(go, &cfg); esl_stopwatch_Stop(w); } /* Stop timing. */ if (cfg.my_rank == 0) esl_stopwatch_Display(stdout, w, "# CPU time: "); /* Clean up and exit. */ if (cfg.my_rank == 0) { if (cfg.hfp != NULL) p7_hmmfile_Close(cfg.hfp); if (esl_opt_IsOn(go, "-o")) fclose(cfg.ofp); if (cfg.survfp != NULL) fclose(cfg.survfp); if (cfg.efp != NULL) fclose(cfg.efp); if (cfg.ffp != NULL) fclose(cfg.ffp); if (cfg.xfp != NULL) fclose(cfg.xfp); if (cfg.alfp != NULL) fclose(cfg.alfp); } p7_bg_Destroy(cfg.bg); esl_alphabet_Destroy(cfg.abc); esl_randomness_Destroy(cfg.r); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; int nsmall, nbig; int i; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE){ puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) { if (msa->wgt[i] < 0.2) nsmall++; if (msa->wgt[i] > 5.0) nbig++; } printf("%-20s %5d %5d %8.4f %8.4f %5d %5d\n", msa->name, msa->nseq, msa->alen, esl_vec_DMin(msa->wgt, msa->nseq), esl_vec_DMax(msa->wgt, msa->nseq), nsmall, nbig); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); return eslOK; }
int main(int argc, char **argv) { ESL_STOPWATCH *w; ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; double cpu; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("failed to parse cmd line: %s", go->errbuf); esl_getopts_Destroy(go); w = esl_stopwatch_Create(); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } esl_stopwatch_Start(w); if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); esl_stopwatch_Stop(w); cpu = w->user; printf("%-20s %6d %6d %.3f\n", msa->name, msa->alen, msa->nseq, cpu); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); esl_stopwatch_Destroy(w); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *seqfile = NULL; /* sequence file name */ char *maskfile = NULL; /* mask coordinate file name */ int infmt = eslSQFILE_UNKNOWN; /* format code for seqfile */ int outfmt = eslSQFILE_FASTA; /* format code for output seqs */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ ESL_FILEPARSER *maskefp = NULL; /* open mask coord file */ FILE *ofp = NULL; /* output stream for masked seqs */ char *source = NULL; /* name of current seq to mask */ char *p1, *p2; /* pointers used in parsing */ int64_t start, end; /* start, end coord for masking */ int64_t i, j, pos; /* coords in a sequence */ int64_t overmask; /* # of extra residues to mask */ ESL_SQ *sq = esl_sq_Create(); /* current sequence */ int do_fetching; int do_lowercase; int maskchar; int status; /* easel return code */ /**************************************************************************** * Parse command line ****************************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); do_fetching = esl_opt_GetBoolean(go, "-R"); do_lowercase = esl_opt_GetBoolean(go, "-l"); overmask = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0); maskchar = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m") : 'X'); seqfile = esl_opt_GetArg(go, 1); maskfile = esl_opt_GetArg(go, 2); /* Open the <seqfile>: text mode, not digital */ if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n", seqfile); else if (status == eslEFORMAT) cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile); else if (status == eslEINVAL) cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n"); else if (status != eslOK) cmdline_failure(argv[0], "Open failed, code %d.\n", status); if(do_fetching) { status = esl_sqfile_OpenSSI(sqfp, NULL); if (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n"); else if (status == eslERANGE) cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n"); else if (status != eslOK) cmdline_failure(argv[0], "Failed to open SSI index\n"); } /* Open the <maskfile> */ if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK) cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile); esl_fileparser_SetCommentChar(maskefp, '#'); /* Open the output file, if any */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /**************************************************************************** * Main loop over lines in <maskfile> ****************************************************************************/ /* Read one data line at a time from the <maskfile>; * parse into data fields <seqname> <start> <end> */ while (esl_fileparser_NextLine(maskefp) == eslOK) { /* First field is sequence name */ if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK) esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile); /* Get the sequence */ if (do_fetching) { /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */ status = esl_sqio_Fetch(sqfp, source, sq); if (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename); else if (status == eslEINVAL) esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename); } else { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */ status = esl_sqio_Read(sqfp, sq); if (status == eslEOF) esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename); if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0)) esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R"); } /* If we're masking by lowercase, first make sure everything's uppercase */ if (do_lowercase) for (pos = 0; pos < sq->n; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = toupper(sq->seq[pos]); /* Next two fields are <start>, <end> for the masking */ /* possible future extension: wrap loop around this, enable multiple masked regions */ if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK) esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile); start = strtoll(p1, &p2, 0) - 1; if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK) esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile); end = strtoll(p2, &p1, 0) - 1; /* Do the masking */ if (esl_opt_GetBoolean(go, "-r")) /* Reverse masking */ { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */ i = 0; j = ESL_MIN(sq->n-1, start - 1 + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); i = ESL_MAX(0, end + 1 - overmask); j = sq->n-1; for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } else { /* normal: mask start..end */ i = ESL_MAX(0, start - overmask); j = ESL_MIN(sq->n-1, end + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } esl_sqio_Write(ofp, sq, outfmt, FALSE); esl_sq_Reuse(sq); } esl_sq_Destroy(sq); esl_fileparser_Close(maskefp); esl_sqfile_Close(sqfp); esl_getopts_Destroy(go); if (ofp != stdout) fclose(ofp); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; char *seqfile = NULL; ESL_SQFILE *sqfp = NULL; int infmt = eslSQFILE_UNKNOWN; int alphatype = eslUNKNOWN; ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; int64_t nseq = 0; int64_t nres = 0; int64_t small = 0; int64_t large = 0; double *monoc = NULL; /* monoresidue composition per sequence */ double *monoc_all = NULL; /* monoresidue composition over all seqs */ int do_comp = FALSE; int status = eslOK; int wstatus; int i; int x; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); seqfile = esl_opt_GetArg(go, 1); do_comp = esl_opt_GetBoolean(go, "-c"); if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_FormatCode(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA; else if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA; else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO; else { status = esl_sqfile_GuessAlphabet(sqfp, &alphatype); if (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Sequence file parse error, line %d of file %s:\n%s\n", sqfp->linenumber, seqfile, sqfp->errbuf); else if (status == eslENODATA) esl_fatal("Sequence file %s contains no data?", seqfile); else if (status != eslOK) esl_fatal("Failed to guess alphabet (error code %d)\n", status); } abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); esl_sqfile_SetDigital(sqfp, abc); if (do_comp) { ESL_ALLOC(monoc, (abc->Kp) * sizeof(double)); ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double)); esl_vec_DSet(monoc_all, abc->Kp, 0.0); esl_vec_DSet(monoc, abc->Kp, 0.0); } while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF) { if (wstatus == eslOK) { if (do_comp) for (i = 1; i <= sq->n; i++) monoc[sq->dsq[i]]++; } else if (wstatus == eslEOD) { if (nseq == 0) { small = large = sq->L; } else { small = ESL_MIN(small, sq->L); large = ESL_MAX(large, sq->L); } if (esl_opt_GetBoolean(go, "-a")) { printf("= %-20s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : ""); } nres += sq->L; nseq++; esl_sq_Reuse(sq); if (do_comp) { esl_vec_DAdd(monoc_all, monoc, abc->Kp); esl_vec_DSet(monoc, abc->Kp, 0.0); } } else if (wstatus == eslEFORMAT) { esl_fatal("Failed to parse sequence at line %ld, file %s:\n%s", (long) sqfp->linenumber, sqfp->filename, sqfp->errbuf); } else esl_fatal("Failed in reading sequence:\n%s\n", sqfp->errbuf); } printf("Format: %s\n", esl_sqio_DescribeFormat(sqfp->format)); printf("Alphabet type: %s\n", esl_abc_DescribeType(abc->type)); printf("Number of sequences: %" PRId64 "\n", nseq); printf("Total # residues: %" PRId64 "\n", nres); printf("Smallest: %" PRId64 "\n", small); printf("Largest: %" PRId64 "\n", large); printf("Average length: %.1f\n", (float) nres / (float) nseq); if (do_comp) { printf("\nResidue composition:\n"); for (x = 0; x < abc->Kp; x++) if (x < abc->K || monoc_all[x] > 0) printf("residue: %c %10.0f %.4f\n", abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres); free(monoc); free(monoc_all); } esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); esl_getopts_Destroy(go); return 0; ERROR: return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile1= NULL; /* alignment 1 file name */ char *alifile2= NULL; /* alignment 2 file name */ int fmt; /* format code for alifiles */ ESLX_MSAFILE *afp1 = NULL; /* open alignment file 1 */ ESLX_MSAFILE *afp2 = NULL; /* open alignment file 2 */ ESL_MSA *msa1 = NULL; /* multiple sequence alignment 1 */ ESL_MSA *msa2 = NULL; /* multiple sequence alignment 2 */ int status; /* easel return code */ char errbuf[eslERRBUFSIZE*4]; int *msa1_to_msa2_map; /* map from <msafile1> to <msafile2> */ char *sub_msa1_to_msa2_mask; /* with --sub the map from <msafile1> to <msafile2> in mask form */ FILE *subfp = NULL; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile1 = esl_opt_GetArg(go, 1); alifile2 = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA files ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (status = eslx_msafile_Open(&abc, alifile1, NULL, fmt, NULL, &afp1)) != eslOK) eslx_msafile_OpenFailure(afp1, status); if ( (status = eslx_msafile_Open(&abc, alifile2, NULL, fmt, NULL, &afp2)) != eslOK) eslx_msafile_OpenFailure(afp2, status); /****************************************************************** * Read first alignment from each file, we only use the first one ******************************************************************/ if ((status = eslx_msafile_Read(afp1, &msa1)) != eslOK) eslx_msafile_ReadFailure(afp1, status); if ((status = eslx_msafile_Read(afp2, &msa2)) != eslOK) eslx_msafile_ReadFailure(afp2, status); /* map the alignments in msa1 and msa2 */ if(! esl_opt_IsOn(go, "--submap")) { if((status = map_msas(go, errbuf, msa1, msa2, &msa1_to_msa2_map)) != eslOK) goto ERROR; free(msa1_to_msa2_map); } /* --submap: if nec, map <msafile1> to a subset of it's own columns in <msafile2> */ else { /* --submap was enabled */ if ((subfp = fopen(esl_opt_GetString(go, "--submap"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --submap output file %s\n", esl_opt_GetString(go, "--submap")); if((status = map_sub_msas(go, errbuf, msa1, msa2, &sub_msa1_to_msa2_mask)) != eslOK) goto ERROR; fprintf(subfp, "%s\n", sub_msa1_to_msa2_mask); fclose(subfp); subfp = NULL; printf("# Mask of 1/0s with 1 indicating aln column in %s maps to a column in %s saved to file %s.\n", alifile1, alifile2, esl_opt_GetString(go, "--submap")); free(sub_msa1_to_msa2_mask); } /* Cleanup, normal return */ eslx_msafile_Close(afp1); eslx_msafile_Close(afp2); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); return 0; ERROR: if (afp1) eslx_msafile_Close(afp1); if (afp2) eslx_msafile_Close(afp2); if (go) esl_getopts_Destroy(go); if (msa1) esl_msa_Destroy(msa1); if (msa2) esl_msa_Destroy(msa2); if (subfp) fclose(subfp); esl_fatal(errbuf); return 1; /* never reached */ }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; int nselect = 0; char *filename = NULL; FILE *fp = NULL; char **larr = NULL; char *buf = NULL; int buflen = 0; char *tmp = NULL; int i,j; int n; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); nselect = atoi(esl_opt_GetArg(go, 1)); filename = esl_opt_GetArg(go, 2); r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); if ((larr = malloc(sizeof(char *) * nselect)) == NULL) esl_fatal("allocation failed"); if (strcmp(filename, "-") == 0) fp = stdin; else { if ((fp = fopen(filename, "r")) == NULL) esl_fatal("Failed to open file %s\n", filename); } n = 0; while (esl_fgets(&buf, &buflen, fp) == eslOK) { n++; i = esl_rnd_Roll(r, n); if (i < nselect) { for (j = i; j < nselect && j < n; j++) { tmp = larr[j]; larr[j] = buf; buf = tmp; } free(buf); buf = NULL; buflen = 0; } } for (i = 0; i < nselect; i++) printf("%s", larr[i]); if (fp != stdin) fclose(fp); for (i = 0; i < nselect; i++) free(larr[i]); free(larr); free(buf); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_ALPHABET *nt_abc = esl_alphabet_Create(eslDNA); ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO); ESL_GENCODE *gcode = NULL; ESL_GENCODE_WORKSTATE *wrk = NULL; char *seqfile = NULL; int informat = eslSQFILE_UNKNOWN; ESL_SQFILE *sqfp = NULL; int status; /***************************************************************** * command line parsing *****************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, /*docgroup=*/0, /*indent=*/2, /*textwidth=*/80); puts("\nAvailable NCBI genetic code tables (for -c <id>):"); esl_gencode_DumpAltCodeTable(stdout); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } seqfile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--informat") && (informat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); status = esl_sqfile_OpenDigital(nt_abc, seqfile, informat, /*env=*/NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("Failed to find (or open) sequence file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Failed to recognize format of sequence file %s", seqfile); else if (status != eslOK) esl_fatal("Failure in opening sequence file %s; code %d", seqfile, status); /* A limitation. The esl_sqio_ReadWindow() interface needs to use SSI positioning * to read reverse complement, and that doesn't work on nonrewindable streams. */ if ( esl_opt_GetBoolean(go, "-W") && ! esl_sqfile_IsRewindable(sqfp) && ! esl_opt_GetBoolean(go, "--watson")) esl_fatal("esl-translate can't read reverse complement from a nonrewindable stream (stdin pipe, .gz file, etc)."); /* Set up the genetic code. Default = NCBI 1, the standard code; allow ORFs to start at any aa */ gcode = esl_gencode_Create(nt_abc, aa_abc); esl_gencode_Set(gcode, esl_opt_GetInteger(go, "-c")); // default = 1, the standard genetic code if (esl_opt_GetBoolean(go, "-m")) esl_gencode_SetInitiatorOnlyAUG(gcode); else if (! esl_opt_GetBoolean(go, "-M")) esl_gencode_SetInitiatorAny(gcode); // note this is the default, if neither -m or -M are set /* Set up the workstate structure, which contains both stateful * info about our position in <sqfp> and the DNA <sq>, as well as * one-time config info from options */ wrk = esl_gencode_WorkstateCreate(go, gcode); /* The two styles of main processing loop: */ if (esl_opt_GetBoolean(go, "-W")) do_by_windows(gcode, wrk, sqfp); else do_by_sequences(gcode, wrk, sqfp); esl_gencode_WorkstateDestroy(wrk); esl_sqfile_Close(sqfp); esl_gencode_Destroy(gcode); esl_alphabet_Destroy(aa_abc); esl_alphabet_Destroy(nt_abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ ESL_RANDOMNESS *r = NULL; /* source of randomness */ char *hmmfile = NULL; /* file to read HMM(s) from */ P7_HMMFILE *hfp = NULL; /* open hmmfile */ P7_HMM *hmm = NULL; /* HMM to emit from */ FILE *ofp = NULL; /* output stream */ int outfmt = 0; int nhmms = 0; int status; char errbuf[eslERRBUFSIZE]; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) cmdline_failure(argv[0], "Failed to get <hmmfile> on cmdline: %s\n", go->errbuf); if ( esl_opt_IsOn(go, "-o") ) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s", esl_opt_GetString(go, "-o")); } else ofp = stdout; if (esl_opt_GetBoolean(go, "-a")) outfmt = eslMSAFILE_STOCKHOLM; else outfmt = eslSQFILE_FASTA; r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "--seed")); status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEFORMAT) esl_fatal("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) esl_fatal("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s\n", hfp->fname); nhmms++; if (esl_opt_GetBoolean(go, "-c")) emit_consensus(go, ofp, outfmt, hmm); else if (esl_opt_GetBoolean(go, "-C")) emit_fancycons(go, ofp, outfmt, hmm); else if (esl_opt_GetBoolean(go, "-a")) emit_alignment(go, ofp, outfmt, r, hmm); else emit_sequences(go, ofp, outfmt, r, hmm); p7_hmm_Destroy(hmm); } if (nhmms == 0) esl_fatal("Empty HMM file %s? No HMM data found.\n"); if (esl_opt_IsOn(go, "-o")) { fclose(ofp); } esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); p7_hmmfile_Close(hfp); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; int i, j; int status = eslOK; P7_HMMFILE *hfp = NULL; /* open input HMM file */ P7_HMM *hmm = NULL; /* one HMM query */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ P7_BG *bg = NULL; char errbuf[eslERRBUFSIZE]; char* hmmfile; float *rel_ents = NULL; float **heights = NULL; float **probs = NULL; float *ins_P = NULL; float *ins_expL = NULL; float *occupancy = NULL; int mode = HMMLOGO_RELENT_ALL; //default go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) { p7_banner (stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 100); exit(0); } if (esl_opt_ArgNumber(go) != 1) esl_fatal(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--height_relent_all")) mode = HMMLOGO_RELENT_ALL; else if (esl_opt_IsOn(go, "--height_relent_abovebg")) mode = HMMLOGO_RELENT_ABOVEBG; else if (esl_opt_IsOn(go, "--height_score")) mode = HMMLOGO_SCORE; else mode = HMMLOGO_RELENT_ALL; //default /* Open the query profile HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); bg = p7_bg_Create(abc); ESL_ALLOC(rel_ents, (hmm->M+1) * sizeof(float)); ESL_ALLOC(heights, (hmm->M+1) * sizeof(float*)); ESL_ALLOC(probs, (hmm->M+1) * sizeof(float*)); for (i = 1; i <= hmm->M; i++) { ESL_ALLOC(heights[i], abc->K * sizeof(float)); ESL_ALLOC(probs[i], abc->K * sizeof(float)); } /* residue heights */ if (mode == HMMLOGO_RELENT_ALL) { printf ("max expected height = %.2f\n", hmmlogo_maxHeight(bg) ); hmmlogo_RelativeEntropy_all(hmm, bg, rel_ents, probs, heights); } else if (mode == HMMLOGO_RELENT_ABOVEBG) { printf ("max expected height = %.2f\n", hmmlogo_maxHeight(bg) ); hmmlogo_RelativeEntropy_above_bg(hmm, bg, rel_ents, probs, heights); } else if (mode == HMMLOGO_SCORE) { hmmlogo_ScoreHeights(hmm, bg, heights ); } printf ("Residue heights\n"); for (i = 1; i <= hmm->M; i++) { printf("%d: ", i); for (j=0; j<abc->K; j++) printf("%6.3f ", heights[i][j] ); if (mode != HMMLOGO_SCORE) printf(" (%6.3f)", rel_ents[i]); printf("\n"); } if (rel_ents != NULL) free(rel_ents); if (heights != NULL) { for (i = 1; i <= hmm->M; i++) if (heights[i] != NULL) free(heights[i]); free(heights); } /* indel values */ if (! esl_opt_IsOn(go, "--no_indel")) { ESL_ALLOC(ins_P, (hmm->M+1) * sizeof(float)); ESL_ALLOC(ins_expL, (hmm->M+1) * sizeof(float)); ESL_ALLOC(occupancy, (hmm->M+1) * sizeof(float)); hmmlogo_IndelValues(hmm, ins_P, ins_expL, occupancy); printf ("Indel values\n"); for (i = 1; i <= hmm->M; i++) printf("%d: %6.3f %6.3f %6.3f\n", i, ins_P[i], ins_expL[i], occupancy[i] ); free(ins_P); free(ins_expL); free(occupancy); } p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); p7_bg_Destroy(bg); exit(0); ERROR: if (rel_ents != NULL) free(rel_ents); if (heights != NULL) { for (i = 1; i <= hmm->M; i++) if (heights[i] != NULL) free(heights[i]); free(heights); } if (hfp != NULL) p7_hmmfile_Close(hfp); if (abc != NULL) esl_alphabet_Destroy(abc); if (ins_P != NULL) free(ins_P); if (ins_expL != NULL) free(ins_expL); if (occupancy != NULL) free(occupancy); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *hmmfile = NULL; /* HMM file name */ char *seqfile = NULL; /* sequence file name */ char *mapfile = NULL; /* optional mapped MSA file name */ int infmt = eslSQFILE_UNKNOWN; int outfmt = eslMSAFILE_STOCKHOLM; P7_HMMFILE *hfp = NULL; /* open HMM file */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ char *outfile = NULL; /* output filename */ FILE *ofp = stdout; /* output stream */ ESL_SQ **sq = NULL; /* array of sequences */ void *p = NULL; /* tmp ptr for reallocation */ int nseq = 0; /* # of sequences in <seqfile> */ int mapseq = 0; /* # of sequences in mapped MSA */ int totseq = 0; /* # of seqs in all sources */ ESL_ALPHABET *abc = NULL; /* alphabet (set from the HMM file)*/ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; /* array of tracebacks */ ESL_MSA *msa = NULL; /* resulting multiple alignment */ int msaopts = 0; /* flags to p7_tracealign_Seqs() */ int idx; /* counter over seqs, traces */ int status; /* easel/hmmer return code */ char errbuf[eslERRBUFSIZE]; /* Parse the command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); seqfile = esl_opt_GetArg(go, 2); if (strcmp(hmmfile, "-") == 0 && strcmp(seqfile, "-") == 0) cmdline_failure(argv[0], "Either <hmmfile> or <seqfile> may be '-' (to read from stdin), but not both.\n"); msaopts |= p7_ALL_CONSENSUS_COLS; /* default as of 3.1 */ if (esl_opt_GetBoolean(go, "--trim")) msaopts |= p7_TRIM; /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--informat")) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Open output stream */ if ( (outfile = esl_opt_GetString(go, "-o")) != NULL) { if ((ofp = fopen(outfile, "w")) == NULL) cmdline_failure(argv[0], "failed to open -o output file %s for writing\n", outfile); } /* If caller forced an alphabet on us, create the one the caller wants */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* Read one HMM, and make sure there's only one. */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", hfp->fname); status = p7_hmmfile_Read(hfp, &abc, NULL); if (status != eslEOF) p7_Fail("HMM file %s does not contain just one HMM\n", hfp->fname); p7_hmmfile_Close(hfp); /* We're going to build up two arrays: sequences and traces. * If --mapali option is chosen, the first set of sequences/traces is from the provided alignment */ if ( (mapfile = esl_opt_GetString(go, "--mapali")) != NULL) { map_alignment(mapfile, hmm, &sq, &tr, &mapseq); } totseq = mapseq; /* Read digital sequences into an array (possibly concat'ed onto mapped seqs) */ status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", seqfile); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, seqfile); ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq + 1)); sq[totseq] = esl_sq_CreateDigital(abc); nseq = 0; while ((status = esl_sqio_Read(sqfp, sq[totseq+nseq])) == eslOK) { nseq++; ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq+nseq+1)); sq[totseq+nseq] = esl_sq_CreateDigital(abc); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); totseq += nseq; /* Remaining initializations, including trace array allocation */ ESL_RALLOC(tr, p, sizeof(P7_TRACE *) * totseq); for (idx = mapseq; idx < totseq; idx++) tr[idx] = p7_trace_CreateWithPP(); p7_tracealign_computeTraces(hmm, sq, mapseq, totseq - mapseq, tr); p7_tracealign_Seqs(sq, tr, totseq, hmm->M, msaopts, hmm, &msa); eslx_msafile_Write(ofp, msa, outfmt); for (idx = 0; idx <= totseq; idx++) esl_sq_Destroy(sq[idx]); /* including sq[nseq] because we overallocated */ for (idx = 0; idx < totseq; idx++) p7_trace_Destroy(tr[idx]); free(sq); free(tr); esl_msa_Destroy(msa); p7_hmm_Destroy(hmm); if (ofp != stdout) fclose(ofp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return eslOK; ERROR: return status; }
/* process_commandline() * * Processes the commandline, filling in fields in <cfg> and creating and returning * an <ESL_GETOPTS> options structure. The help page (hmmsearch -h) is formatted * here. */ static int process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_hmmfile, char **ret_seqfile) { ESL_GETOPTS *go = esl_getopts_Create(options); int status; if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 80=textwidth*/ if (puts("\nOptions controlling output:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); if (puts("\nOptions controlling reporting thresholds:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); if (puts("\nOptions controlling inclusion (significance) thresholds:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); if (puts("\nOptions for model-specific thresholding:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 6, 2, 80); if (puts("\nOptions controlling acceleration heuristics:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 7, 2, 80); if (puts("\nOther expert options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 12, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_hmmfile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <hmmdb> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_seqfile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <seqfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* Validate any attempted use of stdin streams */ if (strcmp(*ret_hmmfile, "-") == 0) { if (puts("nhmmscan cannot read <hmm database> from stdin stream, because it must have hmmpress'ed auxfiles") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } *ret_go = go; return eslOK; FAILURE: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); if (puts("\nwhere most common options are:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 80=textwidth*/ if (printf("\nTo see more help on available options, do %s -h\n\n", argv[0]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_getopts_Destroy(go); exit(1); ERROR: if (go) esl_getopts_Destroy(go); exit(status); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0); cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); if (esl_opt_GetBoolean(go, "--pid")) { if (snprintf(outfile, 256, "%s.pid", basename) >= 256) esl_fatal("Failed to construct %%id table file name"); if ((cfg.pidfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile); } else cfg.pidfp = NULL; /* Open the MSA file, digital mode; determine alphabet */ if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); esl_msa_ConvertDegen2X(origmsa); esl_msa_Hash(origmsa); remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); if ( esl_stack_ObjectCount(teststack) >= 2) { /* randomize test domain order, and apply size limit if any */ esl_stack_Shuffle(cfg.r, teststack); if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest); ntestdom = esl_stack_ObjectCount(teststack); /* randomize training set alignment order, and apply size limit if any */ esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa); if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain); esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE); if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); if (cfg.pidfp) fclose(cfg.pidfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); eslx_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }