int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *msafile = esl_opt_GetArg(go, 1); ESL_ALPHABET *abc = NULL; int infmt = eslMSAFILE_UNKNOWN; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; FILE *ofp = stdout; int nali = 0; int namewidth; double pid; int nid, n; int i,j; int status; /* allow user to assert the input MSA alphabet */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* allow user to assert the input MSA format */ if (esl_opt_IsOn(go, "--informat") && (infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat")); /* digital open */ if ( ( status = eslx_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ((status = eslx_msafile_Read(afp, &msa)) == eslOK) { nali++; namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq); for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { esl_dst_XPairId(abc, msa->ax[i], msa->ax[j], &pid, &nid, &n); fprintf(ofp, "%-*s %-*s %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n); } esl_msa_Destroy(msa); } if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); eslx_msafile_Close(afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *alifile = NULL; /* alignment file name */ int infmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ int outfmt = eslMSAFILE_UNKNOWN; /* output format for fetched msa's */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ FILE *ofp = NULL; /* output stream for alignments */ int status; /* easel return code */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) < 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input alignment file format for --informat", esl_opt_GetString(go, "--informat")); } outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid output alignment file format for --outformat", esl_opt_GetString(go, "--outformat")); alifile = esl_opt_GetArg(go, 1); /* Open the alignment file. */ if ( (status = eslx_msafile_Open(NULL, alifile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); /* Open the SSI index, if any */ if (! esl_opt_GetBoolean(go, "--index")) { if (afp->bf->mode_is == eslBUFFER_FILE || afp->bf->mode_is == eslBUFFER_ALLFILE || afp->bf->mode_is == eslBUFFER_MMAP) { char *ssifile = NULL; esl_sprintf(&ssifile, "%s.ssi", afp->bf->filename); status = esl_ssi_Open(ssifile, &(afp->ssi)); if (status == eslERANGE ) esl_fatal("SSI index %s has 64-bit offsets; this system doesn't support them", ssifile); else if (status == eslEFORMAT) esl_fatal("SSI index %s has an unrecognized format. Try recreating, w/ esl-afetch --index", ssifile); else if (status == eslENOTFOUND) afp->ssi = NULL; else if (status != eslOK) esl_fatal("SSI index %s: open failed, error code %d\n", ssifile, status); free(ssifile); } } /* Open the output file, if any */ if (esl_opt_GetBoolean(go, "-O")) { if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetArg(go, 2)); } else if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Hand off control flow as appropriate */ if (esl_opt_GetBoolean(go, "--index")) { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); create_ssi_index(go, afp); } else if (esl_opt_GetBoolean(go, "-f")) { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); multifetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); } else { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); onefetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); if (ofp != stdout) printf("\n\nRetrieved alignment %s.\n", esl_opt_GetArg(go, 2)); } eslx_msafile_Close(afp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). * Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. * afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if (esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *hmmfile = NULL; /* HMM file name */ char *seqfile = NULL; /* sequence file name */ char *mapfile = NULL; /* optional mapped MSA file name */ int infmt = eslSQFILE_UNKNOWN; int outfmt = eslMSAFILE_STOCKHOLM; P7_HMMFILE *hfp = NULL; /* open HMM file */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ char *outfile = NULL; /* output filename */ FILE *ofp = stdout; /* output stream */ ESL_SQ **sq = NULL; /* array of sequences */ void *p = NULL; /* tmp ptr for reallocation */ int nseq = 0; /* # of sequences in <seqfile> */ int mapseq = 0; /* # of sequences in mapped MSA */ int totseq = 0; /* # of seqs in all sources */ ESL_ALPHABET *abc = NULL; /* alphabet (set from the HMM file)*/ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; /* array of tracebacks */ ESL_MSA *msa = NULL; /* resulting multiple alignment */ int msaopts = 0; /* flags to p7_tracealign_Seqs() */ int idx; /* counter over seqs, traces */ int status; /* easel/hmmer return code */ char errbuf[eslERRBUFSIZE]; /* Parse the command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); seqfile = esl_opt_GetArg(go, 2); if (strcmp(hmmfile, "-") == 0 && strcmp(seqfile, "-") == 0) cmdline_failure(argv[0], "Either <hmmfile> or <seqfile> may be '-' (to read from stdin), but not both.\n"); msaopts |= p7_ALL_CONSENSUS_COLS; /* default as of 3.1 */ if (esl_opt_GetBoolean(go, "--trim")) msaopts |= p7_TRIM; /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--informat")) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Open output stream */ if ( (outfile = esl_opt_GetString(go, "-o")) != NULL) { if ((ofp = fopen(outfile, "w")) == NULL) cmdline_failure(argv[0], "failed to open -o output file %s for writing\n", outfile); } /* If caller forced an alphabet on us, create the one the caller wants */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* Read one HMM, and make sure there's only one. */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", hfp->fname); status = p7_hmmfile_Read(hfp, &abc, NULL); if (status != eslEOF) p7_Fail("HMM file %s does not contain just one HMM\n", hfp->fname); p7_hmmfile_Close(hfp); /* We're going to build up two arrays: sequences and traces. * If --mapali option is chosen, the first set of sequences/traces is from the provided alignment */ if ( (mapfile = esl_opt_GetString(go, "--mapali")) != NULL) { map_alignment(mapfile, hmm, &sq, &tr, &mapseq); } totseq = mapseq; /* Read digital sequences into an array (possibly concat'ed onto mapped seqs) */ status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", seqfile); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, seqfile); ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq + 1)); sq[totseq] = esl_sq_CreateDigital(abc); nseq = 0; while ((status = esl_sqio_Read(sqfp, sq[totseq+nseq])) == eslOK) { nseq++; ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq+nseq+1)); sq[totseq+nseq] = esl_sq_CreateDigital(abc); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); totseq += nseq; /* Remaining initializations, including trace array allocation */ ESL_RALLOC(tr, p, sizeof(P7_TRACE *) * totseq); for (idx = mapseq; idx < totseq; idx++) tr[idx] = p7_trace_CreateWithPP(); p7_tracealign_computeTraces(hmm, sq, mapseq, totseq - mapseq, tr); p7_tracealign_Seqs(sq, tr, totseq, hmm->M, msaopts, hmm, &msa); eslx_msafile_Write(ofp, msa, outfmt); for (idx = 0; idx <= totseq; idx++) esl_sq_Destroy(sq[idx]); /* including sq[nseq] because we overallocated */ for (idx = 0; idx < totseq; idx++) p7_trace_Destroy(tr[idx]); free(sq); free(tr); esl_msa_Destroy(msa); p7_hmm_Destroy(hmm); if (ofp != stdout) fclose(ofp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return eslOK; ERROR: return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ ESLX_MSAFILE *afp = NULL; /* open msa file */ ESL_MSAFILE2 *old_afp = NULL; /* open msa file, legacy (--small) */ ESL_MSA *msa = NULL; /* one multiple sequence alignment */ int nali; /* number of alignments read */ int i; /* counter over seqs */ int64_t alen; /* alignment length */ int nseq; /* number of sequences in the msa */ int64_t rlen; /* a raw (unaligned) seq length */ int64_t small, large; /* smallest, largest sequence */ int64_t nres; /* total # of residues in msa */ double avgid; /* average fractional pair id */ int max_comparisons; /* maximum # comparisons for avg id */ int do_stall; /* used to stall when debugging */ double **abc_ct = NULL; /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */ double ***bp_ct = NULL; /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair * * count of each possible basepair over all seqs basepairs are indexed by 'i' the minimum * * of 'i:j' for a pair between i and j, where i < j. */ double **pp_ct = NULL; /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */ int *i_am_rf = NULL; /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */ int *rf2a_map = NULL; /* [0..rfpos..rflen-1] = apos, * apos is the alignment position (0..msa->alen-1) that * is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */ int rflen = -1; /* nongap RF length */ char errbuf[eslERRBUFSIZE]; int status; /* easel return code */ /* optional output files */ FILE *iinfofp = NULL; /* output file for --iinfo */ FILE *pcinfofp = NULL; /* output file for --pcinfo */ FILE *psinfofp = NULL; /* output file for --psinfo */ FILE *rinfofp = NULL; /* output file for --rinfo */ FILE *icinfofp = NULL; /* output file for --icinfo */ FILE *listfp = NULL; /* output file for --list */ FILE *cinfofp = NULL; /* output file for --cinfo */ FILE *bpinfofp = NULL; /* output file for --bpinfo */ int use_weights; /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */ int weights_exist; /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\n optional output files:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--informat") && (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM) esl_fatal("--small requires --informat pfam\n"); max_comparisons = 1000; do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */ while (do_stall); /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* We'd like to get rid of the legacy msafile interface, but it * includes small memory functionality for Pfam format which we have * to replace first. For now, use both interfaces, new and legacy */ if ( esl_opt_GetBoolean(go, "--small") ) { if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified."); status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp); if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile); else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of alignment %s\n", alifile); else if (status != eslOK) esl_fatal("Alignment file open failed with error %d\n", status); } else { if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); } /************************************** * Open optional output files, as nec * **************************************/ /* determine name for first list file, if nec */ if( esl_opt_IsOn(go, "--list")) { if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL) esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list")); } if( esl_opt_IsOn(go, "--icinfo")) { if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL) esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo")); } if( esl_opt_IsOn(go, "--rinfo")) { if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL) esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo")); } if( esl_opt_IsOn(go, "--pcinfo")) { if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL) esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo")); } if( esl_opt_IsOn(go, "--psinfo")) { if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL) esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo")); } if( esl_opt_IsOn(go, "--iinfo")) { if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL) esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo")); } if( esl_opt_IsOn(go, "--cinfo")) { if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL) esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo")); } if( esl_opt_IsOn(go, "--bpinfo")) { if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL) esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo")); } /*********************************************** * Read MSAs one at a time. ***********************************************/ if (esl_opt_GetBoolean(go, "-1")) { puts("#"); if(! esl_opt_GetBoolean(go, "--small")) { printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id"); printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---"); } else { printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen"); printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------"); } } nali = 0; fmt = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format); while ( (status = ( esl_opt_GetBoolean(go, "--small") ? esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL) : eslx_msafile_Read (afp, &msa))) == eslOK) { nali++; nres = 0; if (! esl_opt_GetBoolean(go, "--small")) { nseq = msa->nseq; alen = msa->alen; small = large = -1; for (i = 0; i < msa->nseq; i++) { rlen = esl_abc_dsqrlen(msa->abc, msa->ax[i]); nres += rlen; if (small == -1 || rlen < small) small = rlen; if (large == -1 || rlen > large) large = rlen; } esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid); } else { /* --small invoked */ for(i = 0; i < alen; i++) nres += (int) esl_vec_DSum(abc_ct[i], abc->K); } if (esl_opt_GetBoolean(go, "-1")) { printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64, nali, msa->name, eslx_msafile_DecodeFormat(fmt), nseq, alen, nres); if (! esl_opt_GetBoolean(go, "--small")) { printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n", small, large, (double) nres / (double) msa->nseq, 100.*avgid); } else { printf(" %10.1f\n", (double) nres / (double) nseq); } } else { printf("Alignment number: %d\n", nali); if (msa->name != NULL) printf("Alignment name: %s\n", msa->name); printf("Format: %s\n", eslx_msafile_DecodeFormat(fmt)); printf("Number of sequences: %d\n", nseq); printf("Alignment length: %" PRId64 "\n", alen); printf("Total # residues: %" PRId64 "\n", nres); if(! esl_opt_GetBoolean(go, "--small")) { printf("Smallest: %" PRId64 "\n", small); printf("Largest: %" PRId64 "\n", large); } printf("Average length: %.1f\n", (double) nres / (double) nseq); if(! esl_opt_GetBoolean(go, "--small")) { printf("Average identity: %.0f%%\n", 100.*avgid); } printf("//\n"); } /* Dump data to optional output files, if nec */ if(esl_opt_IsOn(go, "--list")) { if(! esl_opt_GetBoolean(go, "--small")) { /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */ for(i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]); } } /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions * and rf2a_map, a map of non-gap RF positions to overall alignment positions */ if(msa->rf != NULL) { if((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK) esl_fatal(errbuf); } else i_am_rf = NULL; weights_exist = check_msa_weights(msa); use_weights = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE; if( (! esl_opt_GetBoolean(go, "--small")) && (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo") || esl_opt_IsOn(go, "--pcinfo") || esl_opt_IsOn(go, "--cinfo") || esl_opt_IsOn(go, "--bpinfo"))) { /* collect counts of each residue and PPs (if they exist) from the msa */ if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali); if((status = count_msa(msa, errbuf, nali, esl_opt_GetBoolean(go, "--noambig"), /* ignore ambiguous residues? */ esl_opt_GetBoolean(go, "--weight"), /* use msa->wgt sequence weights? */ &abc_ct, ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL), /* get basepair counts? */ (msa->pp != NULL ? &pp_ct : NULL))) /* get PP counts? */ != eslOK) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--icinfo")) { if((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--rinfo")) { if((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if(esl_opt_IsOn(go, "--pcinfo")) { if(pp_ct == NULL) esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali); if((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if(esl_opt_IsOn(go, "--psinfo")) { if(msa->pp == NULL) esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali); if((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--iinfo")) { if(msa->rf == NULL) esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali); if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali); if((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--cinfo")) { if((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--bpinfo")) { if(msa->ss_cons == NULL) esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali); if((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } esl_msa_Destroy(msa); if(abc_ct != NULL) { esl_Free2D((void **) abc_ct, alen); abc_ct = NULL; } if(bp_ct != NULL) { esl_Free3D((void ***) bp_ct, alen, abc->Kp); bp_ct = NULL; } if(pp_ct != NULL) { esl_Free2D((void **) pp_ct, alen); pp_ct = NULL; } if(i_am_rf != NULL) { free(i_am_rf); i_am_rf = NULL; } if(rf2a_map != NULL) { free(rf2a_map); rf2a_map = NULL; } } /* If an msa read failed, we've dropped out to here with an informative status code. * we have to handle failures from new vs. legacy msa parsing differently */ if (esl_opt_GetBoolean(go, "--small")) { if (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf); else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status); else if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); } else { if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); } /* Cleanup, normal return */ if(listfp != NULL) { fclose(listfp); printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list")); } if(icinfofp != NULL) { fclose(icinfofp); printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo")); } if(rinfofp != NULL) { fclose(rinfofp); printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo")); } if(pcinfofp != NULL) { fclose(pcinfofp); printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo")); } if(psinfofp != NULL) { fclose(psinfofp); printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo")); } if(iinfofp != NULL) { printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo")); fclose(iinfofp); } if(cinfofp != NULL) { printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo")); fclose(cinfofp); } if(bpinfofp != NULL) { printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo")); fclose(bpinfofp); } if (afp) eslx_msafile_Close(afp); if (old_afp) esl_msafile2_Close(old_afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }