static void utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen) { ESL_ALPHABET *abc = NULL; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa1 = NULL; ESL_MSA *msa2 = NULL; char tmpfile1[32] = "esltmpXXXXXX"; char tmpfile2[32] = "esltmpXXXXXX"; FILE *ofp = NULL; int status; /* guessing both the format and the alphabet should work: this is a digital open */ /* PSIBLAST format is autodetected as SELEX, which is fine - selex parser is more general */ if ( (status = eslx_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: digital open", testnumber); if (afp->format != eslMSAFILE_SELEX) esl_fatal("psiblast good file test %d failed: format autodetection", testnumber); if (abc->type != expected_alphatype) esl_fatal("psiblast good file test %d failed: alphabet autodetection", testnumber); afp->format = eslMSAFILE_PSIBLAST; /* This is a digital read, using <abc>. */ if ( (status = esl_msafile_psiblast_Read(afp, &msa1)) != eslOK) esl_fatal("psiblast good file test %d failed: msa read, digital", testnumber); if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("psiblast good file test %d failed: nseq/alen", testnumber); if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa1 invalid", testnumber); eslx_msafile_Close(afp); /* write it back out to a new tmpfile (digital write) */ if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber); if ( (status = esl_msafile_psiblast_Write(ofp, msa1)) != eslOK) esl_fatal("psiblast good file test %d failed: msa write, digital", testnumber); fclose(ofp); /* now open and read it as text mode, in known format. 
(We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */ if ( (status = eslx_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: text mode open", testnumber); if ( (status = esl_msafile_psiblast_Read(afp, &msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: msa read, text", testnumber); if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("psiblast good file test %d failed: nseq/alen", testnumber); if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa2 invalid", testnumber); eslx_msafile_Close(afp); /* write it back out to a new tmpfile (text write) */ if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber); if ( (status = esl_msafile_psiblast_Write(ofp, msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: msa write, text", testnumber); fclose(ofp); esl_msa_Destroy(msa2); /* open and read it in digital mode */ if ( (status = eslx_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital mode open", testnumber); if ( (status = esl_msafile_psiblast_Read(afp, &msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital msa read", testnumber); if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa2 invalid", testnumber); eslx_msafile_Close(afp); /* this msa <msa2> should be identical to <msa1> */ if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("psiblast good file test %d failed: msa compare", testnumber); remove(tmpfile1); remove(tmpfile2); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); esl_alphabet_Destroy(abc); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(0); char *msafile = esl_opt_GetArg(go, 1); int fmt = eslMSAFILE_UNKNOWN; ESL_ALPHABET *abc = NULL; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int textmode = esl_opt_GetBoolean(go, "--text"); int nali = 0; int status; /* If you know the alphabet you want, create it - you'll pass it to eslx_msafile_Open() */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* Open in text or digital mode. * To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>. * To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>. * To open in text mode instead of digital, you pass <NULL> for the alphabet argument. * eslx_msafile_OpenFailure() is a convenience, printing various diagnostics of any * open failure to <stderr>. You can of course handle your own diagnostics instead. */ if (textmode) status = eslx_msafile_Open(NULL, msafile, NULL, fmt, NULL, &afp); else status = eslx_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); fmt = afp->format; while ((status = eslx_msafile_Read(afp, &msa)) == eslOK) { /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data; * if text MSA: msa->aseq[idx=0..nseq-1][acol=0..alen-1] */ nali++; /* permute it */ esl_msashuffle_PermuteSequenceOrder(rng, msa); eslx_msafile_Write(stdout, msa, fmt); esl_msa_Destroy(msa); } if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); /* a convenience, like eslx_msafile_OpenFailure() */ esl_alphabet_Destroy(abc); eslx_msafile_Close(afp); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); exit(0); }
static void read_test_msas_digital(char *pbfile, char *stkfile) { char msg[] = "PSIBLAST msa digital read unit test failed"; ESL_ALPHABET *abc = NULL; ESLX_MSAFILE *afp1 = NULL; ESLX_MSAFILE *afp2 = NULL; ESL_MSA *msa1, *msa2, *msa3, *msa4; FILE *pbfp, *stkfp; char pbfile2[32] = "esltmppb2XXXXXX"; char stkfile2[32] = "esltmpstk2XXXXXX"; if ( eslx_msafile_Open(&abc, pbfile, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg); if ( !abc || abc->type != eslAMINO) esl_fatal(msg); if ( eslx_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Read (afp1, &msa1) != eslOK) esl_fatal(msg); if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg); if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg); if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg); eslx_msafile_Close(afp2); eslx_msafile_Close(afp1); /* Now write stk to psiblast file, and vice versa; then retest */ if ( esl_tmpfile_named(pbfile2, &pbfp) != eslOK) esl_fatal(msg); if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Write (pbfp, msa2) != eslOK) esl_fatal(msg); if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg); fclose(pbfp); fclose(stkfp); if ( eslx_msafile_Open(&abc, pbfile2, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg); if ( eslx_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslOK) esl_fatal(msg); if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg); if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg); remove(pbfile2); remove(stkfile2); eslx_msafile_Close(afp2); eslx_msafile_Close(afp1); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); esl_msa_Destroy(msa3); esl_msa_Destroy(msa4); 
esl_alphabet_Destroy(abc); }
static void read_test_msas_text(char *pbfile, char *stkfile) { char msg[] = "PSIBLAST msa text-mode read unit test failed"; ESLX_MSAFILE *afp1 = NULL; ESLX_MSAFILE *afp2 = NULL; ESL_MSA *msa1, *msa2, *msa3, *msa4; FILE *pbfp, *stkfp; char pbfile2[32] = "esltmppb2XXXXXX"; char stkfile2[32] = "esltmpstk2XXXXXX"; /* vvvv-- everything's the same as the digital utest except these NULLs */ if ( eslx_msafile_Open(NULL, pbfile, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg); if ( eslx_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Read (afp1, &msa1) != eslOK) esl_fatal(msg); if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg); if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg); if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg); eslx_msafile_Close(afp2); eslx_msafile_Close(afp1); if ( esl_tmpfile_named(pbfile2, &pbfp) != eslOK) esl_fatal(msg); if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Write (pbfp, msa2) != eslOK) esl_fatal(msg); if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg); fclose(pbfp); fclose(stkfp); if ( eslx_msafile_Open(NULL, pbfile2, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg); if ( eslx_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg); if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslOK) esl_fatal(msg); if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg); if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg); remove(pbfile2); remove(stkfile2); eslx_msafile_Close(afp2); eslx_msafile_Close(afp1); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); esl_msa_Destroy(msa3); esl_msa_Destroy(msa4); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *filename = esl_opt_GetArg(go, 1); int infmt = eslMSAFILE_UNKNOWN; ESL_ALPHABET *abc = NULL; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int status; if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_PSIBLAST; /* override format autodetection */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* Text mode: pass NULL for alphabet. * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed */ if (esl_opt_GetBoolean(go, "-t")) status = eslx_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp); else status = eslx_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if ((status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)")); printf("# of seqs: %d\n", msa->nseq); printf("# of cols: %d\n", (int) msa->alen); printf("\n"); if (! esl_opt_GetBoolean(go, "-q")) esl_msafile_psiblast_Write(stdout, msa); esl_msa_Destroy(msa); eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *msafile = esl_opt_GetArg(go, 1); ESL_ALPHABET *abc = NULL; int infmt = eslMSAFILE_UNKNOWN; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; FILE *ofp = stdout; int nali = 0; int namewidth; double pid; int nid, n; int i,j; int status; /* allow user to assert the input MSA alphabet */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* allow user to assert the input MSA format */ if (esl_opt_IsOn(go, "--informat") && (infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat")); /* digital open */ if ( ( status = eslx_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ((status = eslx_msafile_Read(afp, &msa)) == eslOK) { nali++; namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq); for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { esl_dst_XPairId(abc, msa->ax[i], msa->ax[j], &pid, &nid, &n); fprintf(ofp, "%-*s %-*s %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n); } esl_msa_Destroy(msa); } if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); eslx_msafile_Close(afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESLX_MSAFILE *afp; ESL_MSA *msa; int i; int status; if ( (status = eslx_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); if ( (status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); eslx_msafile_Close(afp); esl_msaweight_GSC(msa); for (i = 0; i < msa->nseq; i++) printf("%20s %f\n", msa->sqname[i], msa->wgt[i]); return 0; }
int main(int argc, char **argv) { char *filename = argv[1]; int fmt = eslMSAFILE_PSIBLAST; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int status; if ( (status = eslx_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); if ( (status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen); esl_msafile_psiblast_Write(stdout, msa); esl_msa_Destroy(msa); eslx_msafile_Close(afp); exit(0); }
int main(int argc, char **argv) { ESLX_MSAFILE *afp; ESL_MSA *msa; ESL_DMATRIX *P; int i,j; double min, avg, max; int status; if ((status = eslx_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P); min = 1.0; max = 0.0; avg = 0.0; for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { avg += P->mx[i][j]; if (P->mx[i][j] < min) min = P->mx[i][j]; if (P->mx[i][j] > max) max = P->mx[i][j]; } avg /= (double) (msa->nseq * (msa->nseq-1) / 2); printf("Average pairwise %% id: %.1f%%\n", avg * 100.); printf("Minimum pairwise %% id: %.1f%%\n", min * 100.); printf("Maximum pairwise %% id: %.1f%%\n", max * 100.); esl_dmatrix_Destroy(P); esl_msa_Destroy(msa); eslx_msafile_Close(afp); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt; /* format code for alifiles */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *msa = NULL; /* multiple sequence alignment */ int status; /* easel return code */ int do_info = TRUE; /* TRUE if -i */ int do_max = FALSE; /* TRUE if -x */ int do_ffreq = FALSE; /* TRUE if --ffreq */ int do_fmin = FALSE; /* TRUE if --fmin */ float fthresh = 0.; /* <x> from -f <x> */ int do_remove_bps = FALSE; /* TRUE if -r */ int do_consistent = FALSE; /* TRUE if -c */ int do_indi2cons = FALSE; /* TRUE if --indi <x> */ int have_cons; /* TRUE if first alignment has consensus sequence */ int do_newcons = FALSE; /* TRUE if we're creating a new consensus structure * and outputing a new alignment (if -x -f -c or --indi) */ int do_a = FALSE; /* TRUE if -a */ char *indi; /* for <x> from --indi <x> */ int nindi_read; /* number of individual sequence SS lines we've read for current alignment */ int a; /* counter over seqs */ int i, i2; /* counter over residues */ int j, j2; /* counter over residues */ int nali; /* counter over alignments */ int **bp = NULL; /* bp[i][j] is number of individual bps exist between aln cols i and j */ int *cur_ct = NULL; /* ct array of basepairs for current sequence */ int *cons_ct = NULL; /* ct array of basepairs for SS_cons being created */ int *xcons_ct = NULL; /* ct array of basepairs for existing SS_cons */ int *ngaps = NULL; /* number of gaps in each alignment position */ FILE *ofp; /* output file (default is stdout) */ int be_verbose = FALSE; /* TRUE to print extra info */ int seqthresh; /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/ char *sscons = NULL; /* the new SS_cons line */ FILE *lfp = NULL; /* file to list sequences with conflicting bps to */ int nlist = 0; /* number of sequences listed to list file 
*/ int *nconflictsA; /* number of conflicting bps in seq a's individual structure annotation */ int nconflicts_total = 0; /* total number of conflicts */ int nconflicts_list = 0; /* total number of conflicts in sequences listed to file <x> from -l <x> */ int noverlaps_total = 0; /* total number of overlaps */ int nconsistent_total = 0; /* total number of consistent bps */ int nbps_total = 0; /* total number of bps */ int *nconsistentA; /* number of consistent bps in seq a's individual structure annotation */ int *noverlapsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int *nbpsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int ncons_bps = 0; /* number of bps in consensus structure */ int max_noverlaps_aidx; int max_nconsistent_aidx; int max_nbps_aidx; int *removebp; /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */ int *has_conflict; int *nmates_l2r; /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */ int *nmates_r2l; /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left mates i for j */ int lmax; /* with -l, maximum number of conflicts to allow */ int namewidth = 18; /* length of 'SS_cons(consensus)' */ char *namedashes = NULL; /* to store underline for seq name */ /* --fmin related variables */ int nbps = 0; int prev_nbps = -1; float fmin; int inconsistent_flag; int pknot_flag; int k,l; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, 
argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\noptions for defining a new consensus structure (all of these require -o):"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\noptions for listing sequences based on structure:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); /* open output file */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = NULL; if (esl_opt_GetString(go, "-l") != NULL) { if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l")); } /* determine if we're creating a structure */ do_max = esl_opt_GetBoolean(go, "-x"); if(!(esl_opt_IsDefault(go, "--ffreq"))) { do_ffreq = TRUE; fthresh = esl_opt_GetReal(go, "--ffreq"); } if(!(esl_opt_IsDefault(go, "--fmin"))) { do_fmin = TRUE; } do_remove_bps = esl_opt_GetBoolean(go, "-r"); do_consistent = esl_opt_GetBoolean(go, "-c"); if(!(esl_opt_IsDefault(go, "--indi"))) { do_indi2cons = TRUE; } if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_newcons = TRUE; } do_a = esl_opt_GetBoolean(go, "-a"); 
if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_info = FALSE; } /*********************************************** * Read MSAs one at a time. ***********************************************/ nali = 0; have_cons = FALSE; lmax = esl_opt_GetInteger(go, "--lmax"); if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); nali++; /* determine max length name */ namewidth = 18; /* length of 'SS_cons(consensus)' */ for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i])); if(namedashes != NULL) { free(namedashes); } ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(i = 0; i < namewidth; i++) namedashes[i] = '-'; ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1)); ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1)); ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1)); ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1)); esl_vec_ISet(cur_ct, (msa->alen+1), 0); esl_vec_ISet(cons_ct, (msa->alen+1), 0); esl_vec_ISet(xcons_ct, (msa->alen+1), 0); esl_vec_ISet(removebp, (msa->alen+1), FALSE); esl_vec_ISet(has_conflict, (msa->alen+1), FALSE); esl_vec_ISet(nmates_l2r, (msa->alen+1), 0); esl_vec_ISet(nmates_r2l, (msa->alen+1), 0); ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq); ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq); ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq); ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq); esl_vec_ISet(nconflictsA, msa->nseq, 0); esl_vec_ISet(noverlapsA, msa->nseq, 0); esl_vec_ISet(nconsistentA, msa->nseq, 0); esl_vec_ISet(nbpsA, msa->nseq, 0); max_noverlaps_aidx = max_nconsistent_aidx = 
max_nbps_aidx = 0; nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0; for(i = 1; i <= msa->alen; i++) { ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1)); esl_vec_ISet(bp[i], (msa->alen+1), 0); } /* make sure we have ss_cons and indi ss if we need it */ if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_max) esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_consistent) esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_indi2cons) esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_ffreq) esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_fmin) esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss_cons != NULL) { if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { esl_fatal("Existing SS_cons for alignment %d is invalid.", nali); } ncons_bps = 0; for(i = 1; i <= msa->alen; i++) if(xcons_ct[i] != 0 && i < xcons_ct[i]) ncons_bps++; if(nali > 1 && !have_cons) esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); if(nali == 1) have_cons = TRUE; } else if (lfp != NULL) { esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_remove_bps) { esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_consistent) { esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); } else { if(nali > 1 && have_cons) esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, 
they all must.", nali); } if(do_info) { printf("# Per-sequence basepair information:\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); } if(have_cons) { printf("#\n"); printf("# indibp: number of basepairs in the individual sequence SS annotation\n"); printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n"); printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n"); printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n"); printf("#\n"); printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n"); printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n"); printf("#\n"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, namedashes, "------", "------", "-----", "------"); } else { printf("# %-*s %6s\n", namewidth, "seqname", "nbp"); printf("# %-*s %6s\n", namewidth, namedashes, "------"); } } nindi_read = 0; for (a = 0; a < msa->nseq; a++) { if(msa->ss != NULL && msa->ss[a] != NULL) { if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { esl_fatal("SS annotation for sequence %d, aln %d is invalid.\n", (a+1), nali); } nindi_read++; for(i = 1; i <= msa->alen; i++) { if(i < cur_ct[i]) { bp[i][cur_ct[i]]++; if(bp[i][cur_ct[i]] == 1) { nmates_l2r[i]++; nmates_r2l[cur_ct[i]]++; } } } for(i = 1; i <= msa->alen; i++) { if(cur_ct[i] != 0 && i < cur_ct[i]) { if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++; if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with 
i:xcons_ct[i] */ removebp[i] = TRUE; removebp[xcons_ct[i]] = TRUE; } else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */ removebp[cur_ct[i]] = TRUE; removebp[xcons_ct[cur_ct[i]]] = TRUE; } else nconsistentA[a]++; } } if(nconflictsA[a] > lmax) { if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); nconflicts_list += nconflictsA[a]; nlist++; } nbpsA[a] = nconflictsA[a] + nconsistentA[a]; nconflicts_total += nconflictsA[a]; nconsistent_total += nconsistentA[a]; noverlaps_total += noverlapsA[a]; nbps_total += nbpsA[a]; if(do_info && have_cons) printf(" %-*s %6d %6d %6d %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); if(do_info && !have_cons) printf(" %-*s %6d\n", namewidth, msa->sqname[a], nbpsA[a]); if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a; if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a; if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a; } else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); } } if(do_info && have_cons) { if(nindi_read > 0) printf("\n"); printf(" %-*s %6d %6d %6d %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); if(nindi_read > 0) { printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? 
(float) noverlaps_total / (float) nbps_total : 0.); printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.); printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.); } else { printf("# No sequences in the alignment have GR SS annotation.\n"); } } if(lfp != NULL) { printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); } /* determine number of gaps per alignment column */ if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR; /* -x: determine max bp structure OR * -a: list all conflicts in individual structures */ if(do_max || do_a) { for(i = 1; i <= msa->alen; i++) { if(nmates_l2r[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[i][j] > 0) { if(do_a) printf("More than 1 right mates for left mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { if(nmates_r2l[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[j][i] > 0) { if(do_a) printf("More than 1 left mates for right mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/ if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { j = i+1; while(bp[i][j] == 0) j++; cons_ct[i] = j; cons_ct[j] = i; } } /* remove pseudoknotted bps greedily */ for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { for(i2 = i+1; i2 <= msa->alen; i2++) { j2 = 
cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/ /* note: remove both if they have equal number of sequences */ if(bp[i][j] <= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i, j);*/ cons_ct[cons_ct[i]] = 0; cons_ct[i] = 0; } if(bp[i][j] >= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i2, j2);*/ cons_ct[cons_ct[i2]] = 0; cons_ct[i2] = 0; } } } } } } } /***************************************/ /*PARANOID, second check for knots for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { printf("BP: %4d:%4d\n", i, j); for(i2 = 1; i2 <= msa->alen; i2++) { j2 = cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { if((i < i2)) { printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]); } } } } } } ******************************************/ /***************************************/ /*PARANOID, check cons_ct for consistency for(i = 1; i <= msa->alen; i++) { if(cons_ct[i] != 0) { if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); } } } */ /*PARANOID, write out SS_cons for(i = 1; i <= msa->alen; i++) { if(i < cons_ct[i]) printf("<"); else if(cons_ct[i] != 0) { printf(">"); } else printf("."); } printf("\n"); */ /***************************************/ /* textize alignment */ if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); /* --fmin */ if(do_fmin) { /* define ss_cons */ prev_nbps = -1; fthresh = 0.99; inconsistent_flag = pknot_flag = FALSE; printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > <x> individual SS contain i:j as a pair\n"); printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n"); printf("# A consistent structure has each position involved in 0 or 1 
basepairs.\n"); printf("#\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); printf("# Number of seqs: %d\n", msa->nseq); printf("#\n"); printf("# %5s %23s %6s\n", "<x>", "nseq-required-with-bp", "numbps"); printf("# %5s %23s %6s\n", "-----", "-----------------------", "------"); while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { nbps = 0; seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ esl_vec_ISet(cons_ct, msa->alen+1, 0); for(i = 1; i <= msa->alen; i++) { for(j = i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0 || cons_ct[j] != 0) { inconsistent_flag = TRUE; } /* check for pseudoknots */ for(k = i+1; k < j; k++) { l = cons_ct[k]; if((k < l) && (l > j)) { pknot_flag = TRUE; } if((k > l) && (l != 0) && (l < i)) { pknot_flag = TRUE; } } cons_ct[i] = j; cons_ct[j] = i; nbps++; } } } if(inconsistent_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "inconsistent"); else if(pknot_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "pseudoknotted"); else { if(nbps != prev_nbps) { printf(" %.3f %23d %6d\n", fthresh, seqthresh+1, nbps); } fmin = fthresh; } fthresh -= 0.01; prev_nbps = nbps; } fthresh = fmin; esl_vec_ISet(cons_ct, msa->alen+1, 0); } /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */ if(do_ffreq || do_fmin) { if(do_fmin) { printf("#\n# <x> determined to be %.3f\n", fthresh); } if(do_ffreq) { printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > %f individual SS contain i:j as a pair\n", fthresh); } esl_vec_ISet(cons_ct, msa->alen+1, 0); /* define ss_cons */ seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ for(i = 1; i <= msa->alen; i++) { for(j = 
i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j); } if(cons_ct[j] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j); } cons_ct[i] = j; cons_ct[j] = i; } } } } /* -r: redefine consensus struct by removing any bps that conflict with individual structures */ if(do_remove_bps) { for(i = 1; i <= msa->alen; i++) { if(!(removebp[i])) { cons_ct[i] = xcons_ct[i]; cons_ct[cons_ct[i]] = i; } else { printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]); cons_ct[xcons_ct[i]] = 0; cons_ct[i] = 0; } } } /* -c: define consensus structure as indi sequence with highest number of consistent bps with structure OR */ /* --indi: define consensus structure as indi sequence <x> from --indi <x> */ if(do_consistent || do_indi2cons) { if(do_indi2cons) { indi = esl_opt_GetString(go, "--indi"); for(a = 0; a < msa->nseq; a++) { if(strcmp(indi, msa->sqname[a]) == 0) break; } if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi); } else { /* do_consistent */ a = max_nconsistent_aidx; } if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]); if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { esl_fatal("Second pass... 
SS annotation for sequence %d, aln %d is invalid.\n", (a), nali); } printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]); if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) { if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; } if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR; printf("# Defined new RF as %s sequence\n", msa->sqname[a]); } } /* write out alignment with new SS_cons */ if(do_newcons) { if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR; if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; } if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR; status = eslx_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM)); if (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n"); else if (status != eslOK) esl_fatal("Writing alignment file failed with error %d\n", status); } free(sscons); free(cur_ct); free(cons_ct); free(xcons_ct); for(i = 1; i <= msa->alen; i++) free(bp[i]); free(bp); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); /* Cleanup, normal return */ if(lfp != NULL) fclose(lfp); if(ofp != NULL) { printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o")); fclose(ofp); } eslx_msafile_Close(afp); esl_getopts_Destroy(go); return 0; ERROR: if(afp) eslx_msafile_Close(afp); if(go) esl_getopts_Destroy(go); if(msa) esl_msa_Destroy(msa); if(lfp) fclose(lfp); if(ofp) fclose(ofp); esl_fatal("ERROR\n"); return 1; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *alifile = NULL; /* alignment file name */ int infmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ int outfmt = eslMSAFILE_UNKNOWN; /* output format for fetched msa's */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ FILE *ofp = NULL; /* output stream for alignments */ int status; /* easel return code */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) < 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input alignment file format for --informat", esl_opt_GetString(go, "--informat")); } outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid output alignment file format for --outformat", esl_opt_GetString(go, "--outformat")); alifile = esl_opt_GetArg(go, 1); /* Open the alignment file. */ if ( (status = eslx_msafile_Open(NULL, alifile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); /* Open the SSI index, if any */ if (! 
esl_opt_GetBoolean(go, "--index")) { if (afp->bf->mode_is == eslBUFFER_FILE || afp->bf->mode_is == eslBUFFER_ALLFILE || afp->bf->mode_is == eslBUFFER_MMAP) { char *ssifile = NULL; esl_sprintf(&ssifile, "%s.ssi", afp->bf->filename); status = esl_ssi_Open(ssifile, &(afp->ssi)); if (status == eslERANGE ) esl_fatal("SSI index %s has 64-bit offsets; this system doesn't support them", ssifile); else if (status == eslEFORMAT) esl_fatal("SSI index %s has an unrecognized format. Try recreating, w/ esl-afetch --index", ssifile); else if (status == eslENOTFOUND) afp->ssi = NULL; else if (status != eslOK) esl_fatal("SSI index %s: open failed, error code %d\n", ssifile, status); free(ssifile); } } /* Open the output file, if any */ if (esl_opt_GetBoolean(go, "-O")) { if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetArg(go, 2)); } else if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Hand off control flow as appropriate */ if (esl_opt_GetBoolean(go, "--index")) { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); create_ssi_index(go, afp); } else if (esl_opt_GetBoolean(go, "-f")) { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); multifetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); } else { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); onefetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); if (ofp != stdout) printf("\n\nRetrieved alignment %s.\n", esl_opt_GetArg(go, 2)); } eslx_msafile_Close(afp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). 
* Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. 
* afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if 
(esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' 
with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
/* regurgitate_pfam_as_afa() * * Given an open Pfam formatted msafile, read the next alignment and * regurgitate it in aligned FASTA (AFA) format without storing * it in a esl_msa data structure. * * We need to do two passes through the file because in Pfam * sequence accessions (#=GS <seqname> AC) and sequence descriptions * (#=GS <seqname> DE) appear altogether before any aligned sequence * data, while in AFA they appear on the same line as the sequence * name (accession, then description). * * Example: * # STOCKHOLM 1.0 * #=GS tRNA1 AC RF00005-1 * #=GS tRNA2 AC RF00005-2 * #=GS tRNA1 DE first tRNA * #=GS tRNA2 DE second tRNA * * tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA * tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGACUCCCCGUAUCGGAG * * converts to AFA: * >tRNA1 RF00005-1 first tRNA * GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAU * CCACAGAAUUCGCA * >tRNA2 RF00005-2 second tRNA * UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGAC * UCCCCGUAUCGGAG * * In the first pass, output the sequence names and accessions we find * as '#=GS <seqname> AC' lines in the Pfam alignment to an accession * tmpfile, and output sequence names and descriptions we find as * as '#=GS <seqname> DE' lines in the Pfam alignment to a description * tmpfile. * * In the second pass, rewind all (up to 3) files: <ac_tmpfile>, * <de_tmpfile> and the Pfam alignment file and start reading them * again. As we're reading them, output the accessions, descriptions * and aligned sequence data in the proper order to an aligned FASTA * file. * * Set <ret_reached_eof> as TRUE if the alignment read and reformatted * appears to be the only one remaining in afp. Set <ret_reached_eof> * as FALSE if afp appears to include at least one more alignment. * * Returns void. Dies upon any input error. 
*/ static void regurgitate_pfam_as_afa(ESLX_MSAFILE *afp, FILE *ofp, char *alifile, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, char *rename, char *rfrom, char *rto, int *ret_reached_eof) { char *p = NULL; esl_pos_t n = 0; esl_pos_t gslen, seqnamelen, taglen; char *seqname = NULL; char *first_seqname = NULL; char *tag = NULL; char *gs = NULL; int nseq_read = 0; int reached_eof; /* variables related to reading accessions */ char ac_tmpfile[16] = "esltmpXXXXXX"; FILE *ac_fp = NULL; /* file ptr for accession tmpfile */ char *ac_buf = NULL; /* buffer for line input w/ sre_fgets() */ int ac_buflen = 0; /* current allocated length for buf */ char *ac_s = NULL; char *ac_seqname = NULL; char *ac = NULL; int have_ac = FALSE; /* variables related to reading descriptions */ char de_tmpfile[16] = "esltmpXXXXXX"; FILE *de_fp = NULL; /* file ptr for description tmpfile */ char *de_buf = NULL; /* buffer for line input w/ sre_fgets() */ int de_buflen = 0; /* current allocated length for buf */ char *de_s = NULL; char *de_seqname = NULL; char *de = NULL; int have_de = FALSE; /* variables related to printing out sequences */ char *aseq = NULL; esl_pos_t aseqlen = 0; int64_t apos; char aseqbuf[61]; int cpl = 60; /* number of residues per afa seq line */ int acpl; /* actual number of character per line */ int status; afp->errmsg[0] = '\0'; /************************************************************************************************** * First pass, go through each line of the Pfam file and output all GS DE and AC annotation to tmpfiles **************************************************************************************************/ /* Check the magic Stockholm header line, allowing blank lines */ do { status = eslx_msafile_GetLine(afp, &p, &n); if (status == eslEOF) return; else if (status != eslOK) esl_fatal("small mem parse error. 
problem reading line %d of msafile", (int) afp->linenumber); } while (esl_memspn(afp->line, afp->n, " \t") == afp->n || /* skip blank lines */ (esl_memstrpfx(afp->line, afp->n, "#") /* and skip comment lines */ && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */ if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber); while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) { while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */ if (esl_memstrpfx(p, n, "#=GS")) { /* only lines we need to check are AC and DE lines, we don't even check other lines for validity */ if (esl_memtok(&p, &n, " \t", &gs, &gslen) != eslOK) esl_fatal("small mem parse failed (line %d) in a way that can't happen", (int) afp->linenumber); if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <seqname>, <tag>, annotation", (int) afp->linenumber); if (esl_memtok(&p, &n, " \t", &tag, &taglen) != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <tag>, annotation", (int) afp->linenumber); if (! esl_memstrcmp(gs, gslen, "#=GS")) esl_fatal("small mem parse failed (line %d): faux #=GS line?", (int) afp->linenumber); if (esl_memstrcmp(tag, taglen, "AC")) { if (! ac_fp && esl_tmpfile(ac_tmpfile, &ac_fp) != eslOK) esl_fatal("small mem parse failed, unable to open accession tmpfile"); fprintf(ac_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p); } if (esl_memstrcmp(tag, taglen, "DE")) { if (! 
de_fp && esl_tmpfile(de_tmpfile, &de_fp) != eslOK) esl_fatal("small mem parse failed, unable to open description tmpfile"); fprintf(de_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p); } } else if (esl_memstrpfx(p, n, "//")) break; } if (status == eslEOF) esl_fatal("small mem parse failed (line %d): missing // terminator", (int) afp->linenumber); else if (status != eslOK) esl_fatal("small mem parse failed (line %d) with code %d", (int) afp->linenumber, status); /* The regurgitate_*() functions are limited, and only deal with single-record Pfam files. * If there appears to be more data in the file, drop the reached_eof flag. */ while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) { while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */ if (esl_memstrpfx(p, n, "# STOCKHOLM 1.")) break; if (n && ! esl_memstrpfx(p, n, "#")) esl_fatal("small mem parse failed (line %d): unexpected data", (int) afp->linenumber); } if (status == eslOK) reached_eof = FALSE; else if (status == eslEOF) reached_eof = TRUE; else esl_fatal("--small parse error. 
problem reading line %d of msafile", (int) afp->linenumber); /***************************************************************** * Pass 1 complete; rewind (close/reopen) all files *****************************************************************/ eslx_msafile_Close(afp); if ((status = eslx_msafile_Open(NULL, alifile, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK) esl_fatal("--small, second pass, unable to open file %s for reading", alifile); if (ac_fp) { /* open the tmpfile with the seq accessions */ rewind(ac_fp); if((status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp)) != eslOK) esl_fatal("--small accession tmpfile parse failed"); ac_s = ac_buf; if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed"); if (esl_strtok_adv(&ac_s, "\n\r", &ac, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed"); } if (de_fp) { /* open the tmpfile with the seq descriptions */ rewind(de_fp); if((status = esl_fgets(&(de_buf), &(de_buflen), de_fp)) != eslOK) esl_fatal("--small description tmpfile parse failed"); de_s = de_buf; if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed"); if (esl_strtok_adv(&de_s, "\n\r", &de, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed"); } /****************************************************************************************** * Pass 2, step through files, outputting appropriately ******************************************************************************************/ do { status = eslx_msafile_GetLine(afp, &p, &n); if (status == eslEOF) return; else if (status != eslOK) esl_fatal("small mem parse pass 2 error. problem reading line %d of msafile", (int) afp->linenumber); } while (esl_memspn(afp->line, afp->n, " \t") == afp->n || /* skip blank lines */ (esl_memstrpfx(afp->line, afp->n, "#") /* and skip comment lines */ && ! 
esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */ if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse pass 2 failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber); while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) { while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */ if (!n || *p == '#') continue; /* skip blank lines, comments */ else if (esl_memstrpfx(p, n, "//")) break; /* end of alignment: end of record */ else { /* sequence line. parse line into temporary strings */ if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no seq name", (int) afp->linenumber); if (esl_memtok(&p, &n, " \t", &aseq, &aseqlen) != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no aseq", (int) afp->linenumber); /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */ if (nseq_read == 0) { if ((status = esl_memstrdup(seqname, seqnamelen, &(first_seqname))) != eslOK) esl_fatal("small mem parse failed: unable to copy seqname"); } else if (esl_memstrcmp(seqname, seqnamelen, first_seqname)) esl_fatal("--small parse pass 2 failed (line %d): two seqs named %s. Alignment appears to be in interleaved Stockholm (not Pfam) format.", (int) afp->linenumber, seqname); nseq_read++; /* determine if we have an accession and/or description for this sequence */ have_de = have_ac = FALSE; if (ac_seqname && (esl_memstrcmp(seqname, seqnamelen, ac_seqname))) have_ac = TRUE; if (de_seqname && (esl_memstrcmp(seqname, seqnamelen, de_seqname))) have_de = TRUE; if (rename) fprintf(ofp, ">%s.%d%s%s%s%s\n", rename, nseq_read, (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : "")); else fprintf(ofp, ">%.*s%s%s%s%s\n", (int) seqnamelen, seqname, (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? 
de : "")); /* load next ac, de */ if (have_ac) { status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp); if (status == eslEOF) ac_seqname = NULL; else if (status == eslOK) { ac_s = ac_buf; if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed"); if (esl_strtok_adv(&ac_s, "\n\r", &ac, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed"); } } if (have_de) { status = esl_fgets(&(de_buf), &(de_buflen), de_fp); if(status == eslEOF) de_seqname = NULL; else if (status == eslOK) { de_s = de_buf; if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed"); if (esl_strtok_adv(&de_s, "\n\r", &de, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed"); } } /* now print sequence, after converting symbols as nec */ /* remember, aseq itself is part of an ESL_BUFFER and you can't write to it, so symconverts have to be on the copy */ for (apos = 0; apos < aseqlen; apos += cpl) { acpl = (aseqlen - apos > cpl ? cpl : aseqlen - apos); strncpy(aseqbuf, aseq + apos, acpl); aseqbuf[acpl] = '\0'; if (rfrom) symconvert(aseqbuf, rfrom, rto); if (gapsym) symconvert(aseqbuf, "-_.", gapsym); if (force_lower) symconvert(aseqbuf, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"); if (force_upper) symconvert(aseqbuf, "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"); if (force_rna) symconvert(aseqbuf, "Tt", "Uu"); if (force_dna) symconvert(aseqbuf, "Uu", "Tt"); if (iupac_to_n) symconvert(aseqbuf, "RYMKSWHBVDrymkswhbvd", "NNNNNNNNNNnnnnnnnnnn"); if (x_is_bad) symconvert(aseqbuf, "Xx", "Nn"); fprintf(ofp, "%s\n", aseqbuf); } } } /* If we saw a normal // end, we would've successfully read a line, * so when we get here, status (from the line read) should be eslOK. 
*/ if (status != eslOK) esl_fatal("--small parse pass 2 failed (line %d): didn't find // at end of alignment", (int) afp->linenumber); if (ac_seqname) esl_fatal("--small parse pass 2 failed, sequence %s with #=GS AC line does not exist in alignment or is in different order.", ac_seqname); if (de_seqname) esl_fatal("--small parse pass 2 failed, sequence %s with #=GS DE line does not exist in alignment or is in different order.", de_seqname); if (ac_fp) fclose(ac_fp); if (de_fp) fclose(de_fp); eslx_msafile_Close(afp); if (first_seqname) free(first_seqname); if (ac_buf) free(ac_buf); if (de_buf) free(de_buf); *ret_reached_eof = reached_eof; return; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; int nsmall, nbig; int i; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE){ puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) { if (msa->wgt[i] < 0.2) nsmall++; if (msa->wgt[i] > 5.0) nbig++; } printf("%-20s %5d %5d %8.4f %8.4f %5d %5d\n", msa->name, msa->nseq, msa->alen, esl_vec_DMin(msa->wgt, msa->nseq), esl_vec_DMax(msa->wgt, msa->nseq), nsmall, nbig); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); return eslOK; }
int main(int argc, char **argv) { ESL_STOPWATCH *w; ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; double cpu; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("failed to parse cmd line: %s", go->errbuf); esl_getopts_Destroy(go); w = esl_stopwatch_Create(); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } esl_stopwatch_Start(w); if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); esl_stopwatch_Stop(w); cpu = w->user; printf("%-20s %6d %6d %.3f\n", msa->name, msa->alen, msa->nseq, cpu); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); esl_stopwatch_Destroy(w); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; float *sqd; int status; int nbad; int nali = 0; int nbadali = 0; int nwgt = 0; int nbadwgt = 0; int i; int be_quiet; int do_gsc; int do_pb; int do_blosum; double maxid; double tol; int maxN; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } be_quiet = esl_opt_GetBoolean(go, "-q"); do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); tol = esl_opt_GetReal (go, "--tol"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } msafile = esl_opt_GetArg(go, 1); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } nali++; nwgt += msa->nseq; ESL_ALLOC(sqd, sizeof(float) * msa->nseq); if (do_gsc) { esl_msaweight_GSC(msa); GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_pb) { esl_msaweight_PB(msa); PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_blosum) { esl_msaweight_BLOSUM(msa, maxid); BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd); /* workaround SQUID bug: BLOSUM 
weights weren't renormalized to sum to nseq. */ esl_vec_FNorm (sqd, msa->nseq); esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq); } if (! be_quiet) { for (i = 0; i < msa->nseq; i++) fprintf(stdout, "%-20s %.3f %.3f\n", msa->sqname[i], msa->wgt[i], sqd[i]); } nbad = 0; for (i = 0; i < msa->nseq; i++) if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) nbad++; if (nbad > 0) nbadali++; nbadwgt += nbad; if (nbad > 0) printf("%-20s :: alignment shows %d weights that differ (out of %d) \n", msa->name, nbad, msa->nseq); esl_msa_Destroy(msa); free(sqd); } eslx_msafile_Close(afp); if (nbadali == 0) printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali); else { printf("%d of %d weights mismatched at (> %f fractional difference)\n", nbadwgt, nwgt, tol); printf("involving %d of %d total alignments\n", nbadali, nali); } return eslOK; ERROR: return status; }
/* Alignment-statistics driver.
 *
 * Opens the alignment file named on the command line (through the legacy
 * small-memory Pfam reader when --small is set, otherwise through the
 * eslx_msafile interface), and for each alignment prints summary
 * statistics: number of sequences, alignment length, total residues,
 * and, when not in --small mode, smallest/largest sequence and average
 * identity. With -1 the summary is one table row per alignment.
 *
 * Optional per-alignment data is written to the files named by --list,
 * --icinfo, --rinfo, --pcinfo, --psinfo, --iinfo, --cinfo and --bpinfo.
 *
 * Exits through esl_fatal() or exit() on any error; returns 0 on success.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;                 /* application configuration           */
  ESL_ALPHABET *abc     = NULL;                 /* biological alphabet                 */
  char         *alifile = NULL;                 /* alignment file name                 */
  int           fmt     = eslMSAFILE_UNKNOWN;   /* format code for alifile             */
  ESLX_MSAFILE *afp     = NULL;                 /* open msa file                       */
  ESL_MSAFILE2 *old_afp = NULL;                 /* open msa file, legacy (--small)     */
  ESL_MSA      *msa     = NULL;                 /* one multiple sequence alignment     */
  int           nali;                           /* number of alignments read           */
  int           i;                              /* counter over seqs                   */
  int64_t       alen;                           /* alignment length                    */
  int           nseq;                           /* number of sequences in the msa      */
  int64_t       rlen;                           /* a raw (unaligned) seq length        */
  int64_t       small, large;                   /* smallest, largest sequence          */
  int64_t       nres;                           /* total # of residues in msa          */
  double        avgid;                          /* average fractional pair id          */
  int           max_comparisons;                /* maximum # comparisons for avg id    */
  int           do_stall;                       /* used to stall when debugging        */
  double      **abc_ct  = NULL;                 /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */
  double     ***bp_ct   = NULL;                 /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair
                                                 * count of each possible basepair over all seqs; basepairs are indexed by 'i', the
                                                 * minimum of 'i:j' for a pair between i and j, where i < j. */
  double      **pp_ct   = NULL;                 /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */
  int          *i_am_rf = NULL;                 /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */
  int          *rf2a_map = NULL;                /* [0..rfpos..rflen-1] = apos,
                                                 * apos is the alignment position (0..msa->alen-1) that
                                                 * is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */
  int           rflen   = -1;                   /* nongap RF length                    */
  char          errbuf[eslERRBUFSIZE];
  int           status;                         /* easel return code                   */

  /* optional output files */
  FILE *iinfofp  = NULL;  /* output file for --iinfo  */
  FILE *pcinfofp = NULL;  /* output file for --pcinfo */
  FILE *psinfofp = NULL;  /* output file for --psinfo */
  FILE *rinfofp  = NULL;  /* output file for --rinfo  */
  FILE *icinfofp = NULL;  /* output file for --icinfo */
  FILE *listfp   = NULL;  /* output file for --list   */
  FILE *cinfofp  = NULL;  /* output file for --cinfo  */
  FILE *bpinfofp = NULL;  /* output file for --bpinfo */
  int   use_weights;      /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */
  int   weights_exist;    /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\n optional output files:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1)
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile = esl_opt_GetArg(go, 1);

  if (esl_opt_IsOn(go, "--informat") &&
      (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));

  if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM) esl_fatal("--small requires --informat pfam\n");

  max_comparisons = 1000;

  do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
  while (do_stall);                             /* deliberate spin: cleared from the debugger */

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);

  /* We'd like to get rid of the legacy msafile interface, but it
   * includes small memory functionality for Pfam format which we have
   * to replace first. For now, use both interfaces, new and legacy
   */
  if ( esl_opt_GetBoolean(go, "--small") )
    {
      if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified.");

      status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp);
      if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
      else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
      else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);
    }
  else
    {
      if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
        eslx_msafile_OpenFailure(afp, status);
    }

  /**************************************
   * Open optional output files, as nec *
   **************************************/
  if (esl_opt_IsOn(go, "--list"))
    {
      if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL)
        esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list"));
    }
  if (esl_opt_IsOn(go, "--icinfo"))
    {
      if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL)
        esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo"));
    }
  if (esl_opt_IsOn(go, "--rinfo"))
    {
      if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL)
        esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo"));
    }
  if (esl_opt_IsOn(go, "--pcinfo"))
    {
      if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL)
        esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo"));
    }
  if (esl_opt_IsOn(go, "--psinfo"))
    {
      if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL)
        esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo"));
    }
  if (esl_opt_IsOn(go, "--iinfo"))
    {
      if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL)
        esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo"));
    }
  if (esl_opt_IsOn(go, "--cinfo"))
    {
      if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL)
        esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo"));
    }
  if (esl_opt_IsOn(go, "--bpinfo"))
    {
      if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL)
        esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo"));
    }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/

  /* Column headers for the one-line-per-alignment table. */
  if (esl_opt_GetBoolean(go, "-1"))
    {
      puts("#");
      if (! esl_opt_GetBoolean(go, "--small"))
        {
          printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id");
          printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---");
        }
      else
        {
          printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen");
          printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------");
        }
    }

  nali = 0;
  fmt  = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format);

  while ( (status = ( esl_opt_GetBoolean(go, "--small") ?
                      esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL) :
                      eslx_msafile_Read        (afp, &msa))) == eslOK)
    {
      nali++;
      nres = 0;

      if (! esl_opt_GetBoolean(go, "--small"))
        {
          nseq  = msa->nseq;
          alen  = msa->alen;
          small = large = -1;
          for (i = 0; i < msa->nseq; i++)
            {
              rlen  = esl_abc_dsqrlen(msa->abc, msa->ax[i]);
              nres += rlen;
              if (small == -1 || rlen < small) small = rlen;
              if (large == -1 || rlen > large) large = rlen;
            }
          esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid);
        }
      else
        { /* --small invoked: residues are totalled from the per-column counts */
          for (i = 0; i < alen; i++)
            nres += (int) esl_vec_DSum(abc_ct[i], abc->K);
        }

      if (esl_opt_GetBoolean(go, "-1"))
        {
          printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64, nali, msa->name, eslx_msafile_DecodeFormat(fmt), nseq, alen, nres);
          if (! esl_opt_GetBoolean(go, "--small"))
            {
              printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n", small, large, (double) nres / (double) msa->nseq, 100.*avgid);
            }
          else
            {
              printf(" %10.1f\n", (double) nres / (double) nseq);
            }
        }
      else
        {
          printf("Alignment number: %d\n", nali);
          if (msa->name != NULL)
            printf("Alignment name: %s\n", msa->name);
          printf("Format: %s\n", eslx_msafile_DecodeFormat(fmt));
          printf("Number of sequences: %d\n", nseq);
          printf("Alignment length: %" PRId64 "\n", alen);
          printf("Total # residues: %" PRId64 "\n", nres);
          if (! esl_opt_GetBoolean(go, "--small"))
            {
              printf("Smallest: %" PRId64 "\n", small);
              printf("Largest: %" PRId64 "\n", large);
            }
          printf("Average length: %.1f\n", (double) nres / (double) nseq);
          if (! esl_opt_GetBoolean(go, "--small"))
            {
              printf("Average identity: %.0f%%\n", 100.*avgid);
            }
          printf("//\n");
        }

      /* Dump data to optional output files, if nec */
      if (esl_opt_IsOn(go, "--list"))
        {
          if (! esl_opt_GetBoolean(go, "--small"))
            {
              /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */
              for (i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]);
            }
        }

      /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions
       * and rf2a_map, a map of non-gap RF positions to overall alignment positions */
      if (msa->rf != NULL)
        {
          if ((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK)
            esl_fatal("%s", errbuf);   /* errbuf is data, never a format string */
        }
      else i_am_rf = NULL;

      weights_exist = check_msa_weights(msa);
      use_weights   = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE;

      if ( (! esl_opt_GetBoolean(go, "--small")) &&
           (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo") || esl_opt_IsOn(go, "--pcinfo") || esl_opt_IsOn(go, "--cinfo") || esl_opt_IsOn(go, "--bpinfo")))
        {
          /* collect counts of each residue and PPs (if they exist) from the msa */
          if (esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL)
            esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
          if ((status = count_msa(msa, errbuf, nali,
                                  esl_opt_GetBoolean(go, "--noambig"),                          /* ignore ambiguous residues?      */
                                  esl_opt_GetBoolean(go, "--weight"),                           /* use msa->wgt sequence weights?  */
                                  &abc_ct,
                                  ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL), /* get basepair counts?            */
                                  (msa->pp != NULL ? &pp_ct : NULL)))                           /* get PP counts?                  */
              != eslOK)
            esl_fatal("%s", errbuf);
        }

      /* In each call below the closing parenthesis of the assignment sits
       * before the comparison, so <status> holds the callee's return code
       * rather than the truth value of the test.
       */
      if (esl_opt_IsOn(go, "--icinfo"))
        {
          if ((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }
      if (esl_opt_IsOn(go, "--rinfo"))
        {
          if ((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }
      if (esl_opt_IsOn(go, "--pcinfo"))
        {
          if (pp_ct == NULL)
            esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
          if ((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }
      if (esl_opt_IsOn(go, "--psinfo"))
        {
          if (msa->pp == NULL)
            esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
          if ((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }
      if (esl_opt_IsOn(go, "--iinfo"))
        {
          if (msa->rf == NULL)
            esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali);
          if (esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL)
            esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
          if ((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }
      if (esl_opt_IsOn(go, "--cinfo"))
        {
          if ((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }
      if (esl_opt_IsOn(go, "--bpinfo"))
        {
          if (msa->ss_cons == NULL)
            esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali);
          if ((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf)) != eslOK)
            esl_fatal("%s", errbuf);
        }

      /* Release everything that belongs to this alignment before reading the next. */
      esl_msa_Destroy(msa);
      if (abc_ct   != NULL) { esl_Free2D((void **)  abc_ct, alen);          abc_ct   = NULL; }
      if (bp_ct    != NULL) { esl_Free3D((void ***) bp_ct,  alen, abc->Kp); bp_ct    = NULL; }
      if (pp_ct    != NULL) { esl_Free2D((void **)  pp_ct,  alen);          pp_ct    = NULL; }
      if (i_am_rf  != NULL) { free(i_am_rf);                                i_am_rf  = NULL; }
      if (rf2a_map != NULL) { free(rf2a_map);                               rf2a_map = NULL; }
    }

  /* If an msa read failed, we've dropped out to here with an informative status code.
   * we have to handle failures from new vs. legacy msa parsing differently
   */
  if (esl_opt_GetBoolean(go, "--small"))
    {
      if      (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n",
                                               old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf);
      else if (status != eslEOF)     esl_fatal("Alignment file read failed with error code %d\n", status);
      else if (nali   == 0)          esl_fatal("No alignments found in file %s\n", alifile);
    }
  else
    {
      if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status);
    }

  /* Cleanup, normal return */
  if (listfp != NULL)
    {
      fclose(listfp);
      printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list"));
    }
  if (icinfofp != NULL)
    {
      fclose(icinfofp);
      printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo"));
    }
  if (rinfofp != NULL)
    {
      fclose(rinfofp);
      printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo"));
    }
  if (pcinfofp != NULL)
    {
      fclose(pcinfofp);
      printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo"));
    }
  if (psinfofp != NULL)
    {
      fclose(psinfofp);
      printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo"));
    }
  if (iinfofp != NULL)
    {
      printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo"));
      fclose(iinfofp);
    }
  if (cinfofp != NULL)
    {
      printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo"));
      fclose(cinfofp);
    }
  if (bpinfofp != NULL)
    {
      printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo"));
      fclose(bpinfofp);
    }

  if (afp)     eslx_msafile_Close(afp);
  if (old_afp) esl_msafile2_Close(old_afp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile1= NULL; /* alignment 1 file name */ char *alifile2= NULL; /* alignment 2 file name */ int fmt; /* format code for alifiles */ ESLX_MSAFILE *afp1 = NULL; /* open alignment file 1 */ ESLX_MSAFILE *afp2 = NULL; /* open alignment file 2 */ ESL_MSA *msa1 = NULL; /* multiple sequence alignment 1 */ ESL_MSA *msa2 = NULL; /* multiple sequence alignment 2 */ int status; /* easel return code */ char errbuf[eslERRBUFSIZE*4]; int *msa1_to_msa2_map; /* map from <msafile1> to <msafile2> */ char *sub_msa1_to_msa2_mask; /* with --sub the map from <msafile1> to <msafile2> in mask form */ FILE *subfp = NULL; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile1 = esl_opt_GetArg(go, 1); alifile2 = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA files ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) 
abc = esl_alphabet_Create(eslRNA); if ( (status = eslx_msafile_Open(&abc, alifile1, NULL, fmt, NULL, &afp1)) != eslOK) eslx_msafile_OpenFailure(afp1, status); if ( (status = eslx_msafile_Open(&abc, alifile2, NULL, fmt, NULL, &afp2)) != eslOK) eslx_msafile_OpenFailure(afp2, status); /****************************************************************** * Read first alignment from each file, we only use the first one ******************************************************************/ if ((status = eslx_msafile_Read(afp1, &msa1)) != eslOK) eslx_msafile_ReadFailure(afp1, status); if ((status = eslx_msafile_Read(afp2, &msa2)) != eslOK) eslx_msafile_ReadFailure(afp2, status); /* map the alignments in msa1 and msa2 */ if(! esl_opt_IsOn(go, "--submap")) { if((status = map_msas(go, errbuf, msa1, msa2, &msa1_to_msa2_map)) != eslOK) goto ERROR; free(msa1_to_msa2_map); } /* --submap: if nec, map <msafile1> to a subset of it's own columns in <msafile2> */ else { /* --submap was enabled */ if ((subfp = fopen(esl_opt_GetString(go, "--submap"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --submap output file %s\n", esl_opt_GetString(go, "--submap")); if((status = map_sub_msas(go, errbuf, msa1, msa2, &sub_msa1_to_msa2_mask)) != eslOK) goto ERROR; fprintf(subfp, "%s\n", sub_msa1_to_msa2_mask); fclose(subfp); subfp = NULL; printf("# Mask of 1/0s with 1 indicating aln column in %s maps to a column in %s saved to file %s.\n", alifile1, alifile2, esl_opt_GetString(go, "--submap")); free(sub_msa1_to_msa2_mask); } /* Cleanup, normal return */ eslx_msafile_Close(afp1); eslx_msafile_Close(afp2); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); return 0; ERROR: if (afp1) eslx_msafile_Close(afp1); if (afp2) eslx_msafile_Close(afp2); if (go) esl_getopts_Destroy(go); if (msa1) esl_msa_Destroy(msa1); if (msa2) esl_msa_Destroy(msa2); if (subfp) fclose(subfp); esl_fatal(errbuf); return 1; /* never reached */ }
/* map_alignment()
 *
 * Reads the first alignment from <msafile> (format autodetected, digital
 * mode in the HMM's alphabet) and converts it into per-sequence objects
 * and "faux" traces relative to <hmm>, using the HMM's stored map of
 * match states to alignment columns.
 *
 * The HMM must carry both a checksum and a map (p7H_CHKSUM, p7H_MAP),
 * and its checksum must equal that of the alignment read; any of these
 * failing is fatal, as is a failure to open or read the file.
 *
 * Args:    msafile  - name of the alignment file
 *          hmm      - profile the alignment is mapped against
 *          ret_sq   - RETURN: array of msa->nseq sequences
 *          ret_tr   - RETURN: array of msa->nseq traces
 *          ret_ntot - RETURN: number of entries in <ret_sq> and <ret_tr>
 *
 * Returns: eslOK on success; the caller takes over <*ret_sq> and
 *          <*ret_tr>. On allocation failure returns the failing status
 *          with <*ret_sq> and <*ret_tr> NULL and <*ret_ntot> 0.
 */
static int
map_alignment(const char *msafile, const P7_HMM *hmm, ESL_SQ ***ret_sq, P7_TRACE ***ret_tr, int *ret_ntot)
{
  ESL_SQ       **sq        = NULL;
  P7_TRACE     **tr        = NULL;
  ESLX_MSAFILE  *afp       = NULL;
  ESL_MSA       *msa       = NULL;
  ESL_ALPHABET  *abc       = (ESL_ALPHABET *) hmm->abc; /* removing const'ness to make compiler happy. Safe. */
  int           *matassign = NULL;
  uint32_t       chksum    = 0;
  int            i,k;
  int            status;

  status = eslx_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  status = eslx_msafile_Read(afp, &msa);
  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

  if (! (hmm->flags & p7H_CHKSUM)  )  esl_fatal("HMM has no checksum. --mapali unreliable without it.");
  if (! (hmm->flags & p7H_MAP)  )     esl_fatal("HMM has no map. --mapali can't work without it.");
  esl_msa_Checksum(msa, &chksum);
  if (hmm->checksum != chksum)        esl_fatal("--mapali MSA %s isn't same as the one HMM came from (checksum mismatch)", msafile);

  ESL_ALLOC(sq,        sizeof(ESL_SQ *)   * msa->nseq);
  ESL_ALLOC(tr,        sizeof(P7_TRACE *) * msa->nseq);
  ESL_ALLOC(matassign, sizeof(int)        * (msa->alen + 1));

  /* Flag the alignment columns that the HMM's map assigns to match states. */
  esl_vec_ISet(matassign, msa->alen+1, 0);
  for (k = 1; k <= hmm->M; k++) matassign[hmm->map[k]] = 1;

  p7_trace_FauxFromMSA(msa, matassign, p7_DEFAULT, tr);

  /* The 'faux' core traces constructed by FauxFromMSA() may contain
   * D->I and I->D transitions. They may *only* now be passed to
   * p7_tracealign_Seqs(), which can deal with these 'illegal'
   * transitions, in order to exactly reproduce the input --mapali
   * alignment.
   */

  for (i = 0; i < msa->nseq; i++)
    esl_sq_FetchFromMSA(msa, i, &(sq[i]));

  *ret_ntot = msa->nseq;
  *ret_tr   = tr;
  *ret_sq   = sq;

  eslx_msafile_Close(afp);
  esl_msa_Destroy(msa);
  free(matassign);
  return eslOK;

 ERROR:
  /* Reached from a failed ESL_ALLOC(), i.e. before any element of <sq>
   * or <tr> has been filled in, so only the arrays themselves need to
   * be released. The original code dropped them here.
   */
  *ret_ntot = 0;
  *ret_tr   = NULL;
  *ret_sq   = NULL;
  free(sq);
  free(tr);
  if (afp       != NULL) eslx_msafile_Close(afp);
  if (msa       != NULL) esl_msa_Destroy(msa);
  if (matassign != NULL) free(matassign);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; 
cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0); cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); if (esl_opt_GetBoolean(go, "--pid")) { if (snprintf(outfile, 256, "%s.pid", basename) >= 256) esl_fatal("Failed to construct %%id table file name"); if ((cfg.pidfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile); } else cfg.pidfp = NULL; /* Open the MSA file, digital mode; determine alphabet */ if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, 
alifmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); esl_msa_ConvertDegen2X(origmsa); esl_msa_Hash(origmsa); remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); if ( esl_stack_ObjectCount(teststack) >= 2) { /* randomize test domain order, and apply size limit if any */ esl_stack_Shuffle(cfg.r, teststack); if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest); ntestdom = esl_stack_ObjectCount(teststack); /* randomize training set alignment order, and apply size limit if any */ esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa); if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain); esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE); if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); if (cfg.pidfp) 
fclose(cfg.pidfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); eslx_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }