/* Function: esl_wuss2ct() * Incept: SRE, Tue Feb 15 08:44:54 2005 [St. Louis] * * Purpose: Given a secondary structure string <ss>, <0..len-1>, * in WUSS notation, convert it to a CT array, <1..len>, * in <ct>. Caller provides a <ct> allocated for at least * <len+1> ints. <ct[i]> is the position that residue i * base pairs to, or 0 if i is unpaired. <ct[0]> is undefined * (but if you care: it is set to 0). * * WUSS notation is interpreted loosely here, as input * WUSS. Any matching bracket pair or upper/lower case * alphabetic pair is interpreted as a base pair; any other * WUSS annotation is interpreted as unpaired. * * Returns: <eslOK> on success. Returns <eslESYNTAX> if the WUSS * string isn't valid. * * Throws: <eslEMEM> on allocation failure. */ int esl_wuss2ct(char *ss, int len, int *ct) { ESL_STACK *pda[27]; /* 1 secondary structure + up to 26 levels of pk's */ int i; int pos, pair; int status; /* success or failure return status */ /* Initialization: always initialize the main pda (0); * we'll init the pk pda's on demand. 
*/ if ((pda[0] = esl_stack_ICreate()) == NULL) goto FINISH; for (i = 1; i <= 26; i++) pda[i] = NULL; for (pos = 0; pos <= len; pos++) ct[pos] = 0; for (pos = 1; pos <= len; pos++) { if (!isprint((int) ss[pos-1])) /* armor against garbage */ { status = eslESYNTAX; goto FINISH; } /* left side of a pair: push position onto stack 0 (pos = 1..L) */ else if (ss[pos-1] == '<' || ss[pos-1] == '(' || ss[pos-1] == '[' || ss[pos-1] == '{') { if ((status = esl_stack_IPush(pda[0], pos)) != eslOK) goto FINISH; } /* right side of a pair; resolve pair; check for agreement */ else if (ss[pos-1] == '>' || ss[pos-1] == ')' || ss[pos-1] == ']' || ss[pos-1] == '}') { if (esl_stack_IPop(pda[0], &pair) == eslEOD) { status = eslESYNTAX; goto FINISH;} /* no closing bracket */ else if ((ss[pair-1] == '<' && ss[pos-1] != '>') || (ss[pair-1] == '(' && ss[pos-1] != ')') || (ss[pair-1] == '[' && ss[pos-1] != ']') || (ss[pair-1] == '{' && ss[pos-1] != '}')) { status = eslESYNTAX; goto FINISH; } /* brackets don't match */ else { ct[pos] = pair; ct[pair] = pos; } } /* same stuff for pseudoknots */ else if (isupper((int) ss[pos-1])) { /* Create the PK stacks on demand. */ i = ss[pos-1] - 'A' + 1; if (pda[i] == NULL) if ((pda[i] = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto FINISH; } if ((status = esl_stack_IPush(pda[i], pos)) != eslOK) goto FINISH; } else if (islower((int) ss[pos-1])) { i = ss[pos-1] - 'a' + 1; if (pda[i] == NULL || esl_stack_IPop(pda[i], &pair) == eslEOD) { status = eslESYNTAX; goto FINISH;} else { ct[pos] = pair; ct[pair] = pos; } } else if (strchr(":,_-.~", ss[pos-1]) == NULL) { status = eslESYNTAX; goto FINISH; } /* bogus character */ } status = eslOK; FINISH: for (i = 0; i <= 26; i++) if (pda[i] != NULL) { /* nothing should be left on stacks */ if (esl_stack_ObjectCount(pda[i]) != 0) status = eslESYNTAX; esl_stack_Destroy(pda[i]); } return status; }
/* Function: esl_ct2simplewuss() * Incept: ER, Wed Aug 22 13:31:54 EDT 2012 [Janelia] * * Purpose: Convert a CT array <ct> for <n> residues (1..n) to a simple WUSS * format string <ss>. <ss> must be allocated for at least * n+1 chars (+1 for the terminal NUL). * * This function can be used with the <ct> of a secondary * structure including arbitrary pseudoknots, or for the * <ct> or a tertiary structure (say cWH, tWH, cSS,... H bonds). * * The string <ss> has basepairs annotated as <>, Aa, Bb, ..., Zz; * unpaired bases are annotated as '.'. * * Attemting to convert a <ct> that requires more letters * than [A-Z] will return an <eslEINVAL> error. * * Attempting to convert a <ct> that involves triplet interactions * will return an <eslEINVAL> error. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failure. * <eslEINCONCEIVABLE> on internal failure. */ int esl_ct2simplewuss(int *ct, int n, char *ss) { int rb[26]; /* array that delimits the right bound of a pseudoknot character */ ESL_STACK *pda = NULL; /* stack for "main" secondary structure */ ESL_STACK *auxpk = NULL; /* aux stack for pseudoknot */ int *cct = NULL; /* copy of ct vector */ int leftbound, rightbound; /* left and right bound to find basepairs belonging to a given pseudoknot */ int xpk = 0; /* number of pseudoknot chararactes used */ int npk = 0; /* number of pseudoknots */ int npairs = 0; /* total number of basepairs */ int npairs_reached = 0; /* number of basepairs found so far */ int found_partner; /* true if we've found left partner of a given base in stack pda */ int i,j,k; /* sequence indices */ int x; /* index for pseudoknot characters */ int status = eslEMEM; /* exit status 'til proven otherwise */ /* total number of basepairs */ for (j = 1; j <= n; j ++) { if (ct[j] > 0 && j < ct[j]) npairs ++; } /* Copy of ct; if a pseudoknotted structure, cct will be modified later. 
*/ ESL_ALLOC(cct, sizeof(int)*(n+1)); esl_vec_ICopy(ct, (n+1), cct); /* Initialize rightbounds for all 26 pseudoknot indices */ for (x = 0; x < 26; x ++) rb[x] = -1; /* init ss[] to single stranded */ for (j = 0; j < n; j ++) { ss[j] = '.'; } ss[n] = '\0'; /* Initialization*/ if ((pda = esl_stack_ICreate()) == NULL) goto FINISH; if ((auxpk = esl_stack_ICreate()) == NULL) goto FINISH; for (j = 1; j <= n; j++) { if (cct[j] == 0) /* unpaired: push j. */ { if (esl_stack_IPush(pda, j) != eslOK) goto FINISH; } else if (cct[j] > j) /* left side of a bp: push j. */ { if (esl_stack_IPush(pda, j) != eslOK) goto FINISH; } else /* right side of a bp; main routine: fingh the left partner */ { found_partner = FALSE; /* Pop back until we find the left partner of j; * In case this is not a nested structure, finding * the left partner of j will require to put bases * aside into stack auxpk. */ while (esl_stack_ObjectCount(pda)) { if (esl_stack_IPop(pda, &i) != eslOK) goto FINISH; if (cct[i] == j) /* we found the i,j pair. */ { found_partner = TRUE; npairs_reached ++; ss[i-1] = '<'; ss[j-1] = '>'; break; } else if (cct[i] == 0) { if (ct[i] == 0) ss[i-1] = '.'; } else /* cct[i]>0, != j: i is paired, but not to j: pseudoknot! */ { /* i is in the way to find j's left partner. * Move i to stack auxpk; resolve pseudoknot(s) after we've found partern for j. */ if (esl_stack_IPush(auxpk, i) != eslOK) goto FINISH; } } if (!found_partner) { esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); free(cct); ESL_EXCEPTION(eslEINVAL, "Cannot find left partner (%d) of base %d. Likely a triplet", ct[j], j); } } /* finished finding the left partner of j */ /* After we've found the left partner of j, resolve pks found along the way. * Then, remove the pseudoknotted based from cct so we can find the rest of the structure. 
*/ if (esl_stack_ObjectCount(auxpk)) { /* init for first pseudoknot */ leftbound = cct[j]; rightbound = leftbound + 1; xpk = -1; /* start with 'A' if possible again */ while (esl_stack_IPop(auxpk, &i) == eslOK) { for (k = rightbound-1; k > leftbound; k --) { if (cct[k] == 0) { continue; } else if (cct[k] > rightbound) { continue; } else if (cct[k] == i) { break; } /* i continues the given pseudoknot */ else { k = leftbound; break; } /* a new pseudoknot */ } if (k == leftbound) /* a new pseudoknot */ { npk ++; xpk ++; /* figure out if we can use this alphabet index, or bump it up if necessary */ while (i < rb[xpk]) { xpk ++; } leftbound = (rightbound < cct[i])? rightbound : cct[j]; rightbound = cct[i]; } npairs_reached ++; if (xpk+(int)('a') <= (int)('z')) { /* update the rightbound of this pk index if necessary */ if (cct[i] > rb[xpk]) rb[xpk] = cct[i]; /* Add pk indices for this basepair */ ss[i-1] = (char)(xpk+(int)('A')); ss[cct[i]-1] = (char)(xpk+(int)('a')); /* remove pseudoknotted pair from cct */ cct[i] = 0; cct[ct[i]] = 0; } else ESL_EXCEPTION(eslEINVAL, "Don't have enough letters to describe all different pseudoknots."); } } /* while there is something in auxpk stack */ } /* finished loop over j: end position on seq, 1..n*/ status = eslOK; ERROR: FINISH: if (npairs != npairs_reached) ESL_EXCEPTION(eslFAIL, "found %d out of %d pairs.", npairs_reached, npairs); if (pda != NULL) esl_stack_Destroy(pda); if (auxpk != NULL) esl_stack_Destroy(auxpk); if (cct != NULL) free(cct); return status; }
/* Function: esl_ct2wuss() * Incept: SRE, Wed Feb 16 11:22:53 2005 [St. Louis] * * Purpose: Convert a CT array <ct> for <n> residues (1..n) to a WUSS * format string <ss>. <ss> must be allocated for at least * n+1 chars (+1 for the terminal NUL). * * ER, Sat Aug 18 13:22:03 EDT 2012 * esl\_ct2wuss() extended to deal with pseudoknots structures. * Pseudoknots are annotated as AA...aa, BB...bb,..., ZZ..zz. * Attemting to convert a <ct> that requires more letters * than [A-Z] will return an <eslEINVAL> error. * * Attempting to convert a <ct> that involves triplet interactions * will return an <eslEINVAL> error. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failure. * <eslEINCONCEIVABLE> on internal failure. */ int esl_ct2wuss(int *ct, int n, char *ss) { int rb[26]; /* array that delimits the right bound of a pseudoknot character */ ESL_STACK *pda = NULL; /* stack for "main" secondary structure */ ESL_STACK *auxpk = NULL; /* aux stack for pseudoknot */ ESL_STACK *auxss = NULL; /* aux stack for single stranded */ int *cct = NULL; /* copy of ct vector */ int nfaces; /* number of faces in a cWW structure */ int minface; /* max depth of faces in a cWW structure */ int leftbound, rightbound; /* left and right bound to find basepairs belonging to a given pseudoknot */ int xpk = 0; /* number of pseudoknot chararactes used */ int npk = 0; /* number of pseudoknots */ int npairs = 0; /* total number of basepairs */ int npairs_reached = 0; /* number of basepairs found so far */ int found_partner; /* true if we've found left partner of a given base in stack pda */ int i,j,k; /* sequence indices */ int x; /* index for pseudoknot characters */ int status = eslEMEM; /* exit status 'til proven otherwise */ /* total number of basepairs */ for (j = 1; j <= n; j ++) { if (ct[j] > 0 && j < ct[j]) npairs ++; } /* Copy of ct; if a pseudoknotted structure, cct will be modified later. 
*/ ESL_ALLOC(cct, sizeof(int)*(n+1)); esl_vec_ICopy(ct, (n+1), cct); /* Initialize rightbounds for all 26 pseudoknot indices */ for (x = 0; x < 26; x ++) rb[x] = -1; /* init ss[] to single stranded */ for (j = 0; j < n; j ++) { ss[j] = ':'; } ss[n] = '\0'; /* Initialization*/ if ((pda = esl_stack_ICreate()) == NULL) goto FINISH; if ((auxpk = esl_stack_ICreate()) == NULL) goto FINISH; if ((auxss = esl_stack_ICreate()) == NULL) goto FINISH; for (j = 1; j <= n; j++) { if (cct[j] == 0) /* unpaired: push j. */ { if (esl_stack_IPush(pda, j) != eslOK) goto FINISH; } else if (cct[j] > j) /* left side of a bp: push j. */ { if (esl_stack_IPush(pda, j) != eslOK) goto FINISH; } else /* right side of a bp; main routine: fingh the left partner */ { found_partner = FALSE; /* Pop back until we find the left partner of j; * In case this is not a nested structure, finding * the left partner of j will require to put bases * aside into stack auxpk. * * After we find the left partner of j, * store single stranded residues in auxss; * keep track of #faces and the maximum face depth. */ nfaces = 0; minface = -1; while (esl_stack_ObjectCount(pda)) { if (esl_stack_IPop(pda, &i) != eslOK) goto FINISH; if (i < 0) /* a face counter */ { nfaces++; if (i < minface) minface = i; } else if (cct[i] == j) /* we found the i,j pair. */ { found_partner = TRUE; npairs_reached ++; /* Now we know i,j pair; and we know how many faces are * above them; and we know the max depth of those faces. * That's enough to label the pair in WUSS notation. * if nfaces == 0, minface is -1; <> a closing bp of a hairpin. * if nfaces == 1, inherit minface, we're continuing a stem. * if nfaces > 1, bump minface in depth; we're closing a bifurc. 
*/ if (nfaces > 1 && minface > -4) minface--; switch (minface) { case -1: ss[i-1] = '<'; ss[j-1] = '>'; break; case -2: ss[i-1] = '('; ss[j-1] = ')'; break; case -3: ss[i-1] = '['; ss[j-1] = ']'; break; case -4: ss[i-1] = '{'; ss[j-1] = '}'; break; default: esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); esl_stack_Destroy(auxss); free(cct); ESL_EXCEPTION(eslEINCONCEIVABLE, "no such face code"); } if (esl_stack_IPush(pda, minface) != eslOK) goto FINISH; /* Now, aux contains all the unpaired residues we need to label, * according to the # of faces "above" them: * nfaces = 0: hairpin loop * nfaces = 1: bulge or interior loop * nfaces > 1: multifurc */ while (esl_stack_IPop(auxss, &i) == eslOK) { switch (nfaces) { case 0: ss[i-1] = '_'; break; case 1: ss[i-1] = '-'; break; default: ss[i-1] = ','; break; /* nfaces > 1 */ } } break; } else if (cct[i] == 0) { /* add to auxss only if originally sigle stranded */ if (ct[i] == 0) { if (esl_stack_IPush(auxss, i) != eslOK) goto FINISH; } } else /* cct[i]>0, != j: i is paired, but not to j: pseudoknot! */ { /* i is in the way to find j's left partner. * Move i to stack auxpk; resolve pseudoknot(s) after we've found partern for j. */ if (esl_stack_IPush(auxpk, i) != eslOK) goto FINISH; } } if (!found_partner) { esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); esl_stack_Destroy(auxss); free(cct); ESL_EXCEPTION(eslEINVAL, "Cannot find left partner (%d) of base %d. Likely a triplet", ct[j], j); } } /* finished finding the left partner of j */ /* After we've found the left partner of j, resolve pks found along the way. * Then, remove the pseudoknotted based from cct so we can find the rest of the structure. 
*/ if (esl_stack_ObjectCount(auxpk)) { /* init for first pseudoknot */ leftbound = cct[j]; rightbound = leftbound + 1; xpk = -1; /* start with 'A' if possible again */ while (esl_stack_IPop(auxpk, &i) == eslOK) { for (k = rightbound-1; k > leftbound; k --) { if (cct[k] == 0) { continue; } else if (cct[k] > rightbound) { continue; } else if (cct[k] == i) { break; } /* i continues the given pseudoknot */ else { k = leftbound; break; } /* a new pseudoknot */ } if (k == leftbound) /* a new pseudoknot */ { npk ++; xpk ++; /* figure out if we can use this alphabet index, or bump it up if necessary */ while (i < rb[xpk]) { xpk ++; } leftbound = (rightbound < cct[i])? rightbound : cct[j]; rightbound = cct[i]; } npairs_reached ++; if (xpk+(int)('a') <= (int)('z')) { /* update the rightbound of this pk index if necessary */ if (cct[i] > rb[xpk]) rb[xpk] = cct[i]; /* Add pk indices for this basepair */ ss[i-1] = (char)(xpk+(int)('A')); ss[cct[i]-1] = (char)(xpk+(int)('a')); /* remove pseudoknotted pair from cct */ cct[i] = 0; cct[ct[i]] = 0; } else ESL_EXCEPTION(eslEINVAL, "Don't have enough letters to describe all different pseudoknots."); } } /* while there is something in auxpk stack */ } /* finished loop over j: end position on seq, 1..n*/ status = eslOK; ERROR: FINISH: if (npairs != npairs_reached) ESL_EXCEPTION(eslFAIL, "found %d out of %d pairs.", npairs_reached, npairs); if (pda != NULL) esl_stack_Destroy(pda); if (auxpk != NULL) esl_stack_Destroy(auxpk); if (auxss != NULL) esl_stack_Destroy(auxss); if (cct != NULL) free(cct); return status; }
/* synthesize_positives()
 *
 * Drain <teststack> into synthetic positive test sequences. Each test
 * sequence embeds one domain (when --single is set) or two domains in
 * random flanking sequence, sized like a randomly chosen database
 * sequence that is long enough to hold the domain(s).
 *
 * For every sequence made: one line goes to cfg->possummfp, one record
 * is appended to cfg->test_lens (cfg->ntest is bumped), and the sequence
 * is written as FASTA to cfg->out_seqfp. The popped domains are
 * destroyed here. <*ret_ntest> gets the number of sequences made by
 * this call. Returns <eslOK>; failures are fatal.
 */
static int
synthesize_positives(ESL_GETOPTS *go, struct cfg_s *cfg, char *testname, ESL_STACK *teststack, int *ret_ntest)
{
  const int         two_domains = (esl_opt_GetBoolean(go, "--single") != TRUE);
  const int         per_seq     = (two_domains ? 2 : 1); /* domains consumed per test seq            */
  ESL_SQ           *dom_a;                               /* first (or only) embedded domain          */
  ESL_SQ           *dom_b;                               /* second embedded domain, or NULL          */
  ESL_SQ           *synth;                               /* the test sequence under construction     */
  struct testseq_s *rec;                                 /* bookkeeping slot for this test sequence  */
  void             *tmp;                                 /* realloc / stack-pop scratch pointer      */
  int64_t           total;                               /* total length of synthetic test seq       */
  int64_t           spare;                               /* total minus domain lengths: nonhomologous residues */
  int               len_a, len_b;                        /* lengths of the two domains               */
  int               need;                                /* len_a + len_b                            */
  int               flank1, flank2, flank3;              /* lengths of the three random regions      */
  int               at_a;                                /* # random residues preceding domain 1     */
  int               at_b = 0;                            /* # random residues preceding domain 2     */
  int               made = 0;                            /* test sequences created in this call      */
  int               status;

  while (esl_stack_ObjectCount(teststack) >= per_seq)
    {
      ESL_RALLOC(cfg->test_lens, tmp, (cfg->ntest+1) * sizeof(struct testseq_s));

      /* Pop our one or two test domains off the stack */
      esl_stack_PPop(teststack, &tmp);
      dom_a = tmp;
      len_a = dom_a->n;
      if (two_domains)
        {
          esl_stack_PPop(teststack, &tmp);
          dom_b = tmp;
          len_b = dom_b->n;
        }
      else
        {
          dom_b = NULL;
          len_b = 0;
        }
      need = len_a + len_b;

      /* Select a random total sequence length: keep drawing db
       * sequences until one is long enough for the domain(s). */
      if (need > cfg->db_maxL)
        esl_fatal("can't construct test seq; no db seq >= %d residues\n", need);
      for (;;)
        {
          if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &total, NULL) != eslOK)
            esl_fatal("failed to look up a random seq");
          if (total >= need) break;
        }
      spare = total - len_a - len_b;

      /* Now figure out the embedding */
      if (two_domains)
        {
          /* Pick two "insert after" points in 0..spare, redrawing both
           * until they come out ordered (at_a <= at_b). Then:
           *   1           .. at_a              = random region 1 (none if at_a == 0)
           *   at_a+1      .. at_a+len_a        = domain 1
           *   at_a+len_a+1.. at_b+len_a        = random region 2 (none if at_a == at_b)
           *   at_b+len_a+1.. at_b+len_a+len_b  = domain 2
           *   the remainder  .. total          = random region 3 (none if at_b == spare)
           */
          for (;;)
            {
              at_a = esl_rnd_Roll(cfg->r, spare + 1);
              at_b = esl_rnd_Roll(cfg->r, spare + 1);
              if (at_a <= at_b) break;
            }
          flank1 = at_a;
          flank2 = at_b - at_a;
          flank3 = spare - at_b;
        }
      else
        {
          /* One domain: a single insert point in 0..spare.
           *   1           .. at_a        = random region 1 (none if at_a == 0)
           *   at_a+1      .. at_a+len_a  = domain 1
           *   at_a+len_a+1.. total       = random region 2 (none if at_a == spare)
           */
          at_a   = esl_rnd_Roll(cfg->r, spare + 1);
          flank1 = at_a;
          flank2 = spare - flank1;
          flank3 = 0;
        }

      synth = esl_sq_CreateDigital(cfg->abc);
      esl_sq_GrowTo(synth, total);
      synth->n = total;

      if (two_domains)
        {
          esl_sq_FormatName(synth, "%s/%d/%d-%d/%d-%d", testname, cfg->ntest, at_a+1, at_a+len_a, at_b+len_a+1, at_b+len_a+len_b);
          esl_sq_FormatDesc(synth, "domains: %s %s", dom_a->name, dom_b->name);
        }
      else
        {
          esl_sq_FormatName(synth, "%s/%d/%d-%d", testname, cfg->ntest, at_a+1, at_a+len_a);
          esl_sq_FormatDesc(synth, "domain: %s", dom_a->name);
        }

      fprintf(cfg->possummfp, "%-35s %5d %5d %5d %5d %5d %5d",
              synth->name, (int) synth->n, flank1, len_a, flank2, len_b, flank3);

      /* Fill the digital sequence left to right: flank, domain, flank, ... */
      synth->dsq[0] = synth->dsq[total+1] = eslDSQ_SENTINEL;

      set_random_segment(go, cfg, cfg->possummfp, synth->dsq+1, flank1);
      memcpy(synth->dsq+at_a+1, dom_a->dsq+1, sizeof(ESL_DSQ) * len_a);
      fprintf(cfg->possummfp, " %-24s %5d %5d", dom_a->name, 1, len_a);
      set_random_segment(go, cfg, cfg->possummfp, synth->dsq+at_a+len_a+1, flank2);

      if (two_domains)
        {
          memcpy(synth->dsq+at_b+len_a+1, dom_b->dsq+1, sizeof(ESL_DSQ) * len_b);
          fprintf(cfg->possummfp, " %-24s %5d %5d", dom_b->name, 1, len_b);
          set_random_segment(go, cfg, cfg->possummfp, synth->dsq+at_b+len_a+len_b+1, flank3);
        }
      fprintf(cfg->possummfp, "\n");

      /* Record the layout of this test sequence */
      rec      = &cfg->test_lens[cfg->ntest];
      rec->L   = total;
      rec->L1  = flank1;
      rec->d1n = len_a;
      rec->L2  = flank2;
      rec->d2n = len_b;
      rec->L3  = flank3;
      cfg->ntest++;
      made++;

      esl_sqio_Write(cfg->out_seqfp, synth, eslSQFILE_FASTA, FALSE);

      esl_sq_Destroy(dom_a);
      if (two_domains) esl_sq_Destroy(dom_b);
      esl_sq_Destroy(synth);
    }

  *ret_ntest = made;
  return eslOK;

 ERROR:
  esl_fatal("Failure in synthesize_positives");
  return status;
}
/* main()
 *
 * Usage takes three arguments: <basename> <alifile> <dbfile>.
 * Opens five output files named <basename>.{msa,fa,pos,neg,tbl}, then
 * for each alignment in <alifile>: removes fragments, separates it into
 * a training alignment and a stack of test domains, and -- when at
 * least two test domains result -- synthesizes positive test sequences
 * and writes the training alignment plus one summary-table line.
 * Negatives are synthesized once at the end. Any failure is fatal.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = NULL;   /* command line configuration          */
  struct cfg_s  cfg;                /* application configuration           */
  char         *outbase   = NULL;   /* base of the output file names       */
  char         *msapath   = NULL;   /* alignment file name                 */
  char         *dbpath    = NULL;   /* name of seq db file                 */
  char          fname[256];         /* name of an output file              */
  int           msafmt;             /* format code for the alignment file  */
  int           seqfmt;             /* format code for the seq db file     */
  ESL_MSAFILE  *afp       = NULL;   /* open alignment file                 */
  ESL_MSA      *origmsa   = NULL;   /* one multiple sequence alignment     */
  ESL_MSA      *msa       = NULL;   /* MSA after frags are removed         */
  ESL_MSA      *trainmsa  = NULL;   /* training set, aligned               */
  ESL_STACK    *teststack = NULL;   /* test set: stack of ESL_SQ ptrs      */
  int           status;             /* easel return code                   */
  int           nfrags;             /* # of fragments removed              */
  int           ntestdom;           /* # of test domains                   */
  int           ntest;              /* # of test sequences created         */
  int           nali      = 0;      /* number of alignments used           */
  double        avgid;              /* average pairwise id of training set */

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK)
    cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go) != eslOK)
    cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))
    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 3)
    cmdline_failure(argv[0], "Incorrect number of command line arguments\n");

  outbase = esl_opt_GetArg(go, 1);
  msapath = esl_opt_GetArg(go, 2);
  dbpath  = esl_opt_GetArg(go, 3);
  msafmt  = eslMSAFILE_STOCKHOLM;
  seqfmt  = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  cfg.r = (esl_opt_IsDefault(go, "--seed")
           ? esl_randomness_CreateTimeseeded()
           : esl_randomness_Create(esl_opt_GetInteger(go, "--seed")));
  cfg.abc       = NULL;             /* until we open the MSA file, below */
  cfg.fragfrac  = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest     = 0;

  /* Open the output files; a name that would not fit in fname[] is fatal */
  if (snprintf(fname, sizeof fname, "%s.msa", outbase) >= (int) sizeof fname)
    esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(fname, "w")) == NULL)
    esl_fatal("Failed to open MSA output file %s\n", fname);

  if (snprintf(fname, sizeof fname, "%s.fa", outbase) >= (int) sizeof fname)
    esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(fname, "w")) == NULL)
    esl_fatal("Failed to open FASTA output file %s\n", fname);

  if (snprintf(fname, sizeof fname, "%s.pos", outbase) >= (int) sizeof fname)
    esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(fname, "w")) == NULL)
    esl_fatal("Failed to open pos test set summary file %s\n", fname);

  if (snprintf(fname, sizeof fname, "%s.neg", outbase) >= (int) sizeof fname)
    esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(fname, "w")) == NULL)
    esl_fatal("Failed to open neg test set summary file %s\n", fname);

  if (snprintf(fname, sizeof fname, "%s.tbl", outbase) >= (int) sizeof fname)
    esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp = fopen(fname, "w")) == NULL)
    esl_fatal("Failed to open benchmark table file %s\n", fname);

  /* Open the MSA file; determine alphabet */
  status = esl_msafile_Open(msapath, msafmt, NULL, &afp);
  if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", msapath);
  else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", msapath);
  else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);

  if      (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))   cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))   cfg.abc = esl_alphabet_Create(eslRNA);
  else
    {
      /* no alphabet forced on the command line: guess it from the file */
      int guessed;

      status = esl_msafile_GuessAlphabet(afp, &guessed);
      if      (status == eslEAMBIGUOUS) esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", msapath);
      else if (status == eslEFORMAT)    esl_fatal("Alignment file parse failed: %s\n", afp->errbuf);
      else if (status == eslENODATA)    esl_fatal("Alignment file %s is empty\n", msapath);
      else if (status != eslOK)         esl_fatal("Failed to read alignment file %s\n", msapath);
      cfg.abc = esl_alphabet_Create(guessed);
    }
  esl_msafile_SetDigital(afp, cfg.abc);

  /* Residue frequencies in cfg.fq: SW34 composition for amino, uniform otherwise */
  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbpath, seqfmt);

  /* Read and process MSAs one at a time */
  for (;;)
    {
      status = esl_msa_Read(afp, &origmsa);
      if (status != eslOK) break;

      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);
      ntestdom = esl_stack_ObjectCount(teststack);

      /* alignments yielding fewer than two test domains are skipped */
      if (ntestdom >= 2)
        {
          esl_stack_Shuffle(cfg.r, teststack);
          synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

          esl_msa_MinimGaps(trainmsa, NULL, NULL);
          esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

          esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
          fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n",
                  msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
          nali++;
        }

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if      (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n",
                                           afp->linenumber, afp->fname, afp->errbuf, afp->buf);
  else if (status != eslEOF)     esl_fatal("Alignment file read failed with error code %d\n", status);
  else if (nali   == 0)          esl_fatal("No alignments found in file %s\n", msapath);

  if (nali > 0)
    synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  /* Close outputs and release resources */
  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; 
cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0); cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); if (esl_opt_GetBoolean(go, "--pid")) { if (snprintf(outfile, 256, "%s.pid", basename) >= 256) esl_fatal("Failed to construct %%id table file name"); if ((cfg.pidfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile); } else cfg.pidfp = NULL; /* Open the MSA file, digital mode; determine alphabet */ if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, 
alifmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); esl_msa_ConvertDegen2X(origmsa); esl_msa_Hash(origmsa); remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); if ( esl_stack_ObjectCount(teststack) >= 2) { /* randomize test domain order, and apply size limit if any */ esl_stack_Shuffle(cfg.r, teststack); if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest); ntestdom = esl_stack_ObjectCount(teststack); /* randomize training set alignment order, and apply size limit if any */ esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa); if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain); esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE); if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); if (cfg.pidfp) 
fclose(cfg.pidfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); eslx_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }