/* this is an obsolete derivative of old p7_tophits_Add(), which the * test driver happens to use; we should refactor. * The unit test only needs name and sort key; * we add the acc and desc too; and everything else is * at a default unused value. */ static int tophits_Add(P7_TOPHITS *h, char *name, char *acc, char *desc, double sortkey) { P7_HIT *hit = NULL; p7_tophits_CreateNextHit(h, &hit); esl_strdup(name, -1, &(hit->name)); esl_strdup(acc, -1, &(hit->acc)); esl_strdup(desc, -1, &(hit->desc)); hit->sortkey = sortkey; return eslOK; }
/* Function: p7_profile_Copy() * Synopsis: Copy a profile. * * Purpose: Copies profile <src> to profile <dst>, where <dst> * has already been allocated to be of sufficient size, * and has the same alphabet. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation error; <eslEINVAL> if <dst> is too small * to fit <src> or is for a different alphabet. */ int p7_profile_Copy(const P7_PROFILE *src, P7_PROFILE *dst) { int x,z; int status; if (src->M > dst->allocM) ESL_EXCEPTION(eslEINVAL, "destination profile is too small to hold a copy of source profile"); if (src->abc->type != dst->abc->type) ESL_EXCEPTION(eslEINVAL, "destination profile has different alphabet than source"); dst->M = src->M; esl_vec_FCopy(src->tsc, (src->M+1)*p7P_NTRANS, dst->tsc); for (x = 0; x < src->abc->Kp; x++) esl_vec_FCopy(src->rsc[x], (src->M+1)*p7P_NR, dst->rsc[x]); for (x = 0; x < p7P_NXSTATES; x++) esl_vec_FCopy(src->xsc[x], p7P_NXTRANS, dst->xsc[x]); dst->L = src->L; dst->nj = src->nj; dst->pglocal = src->pglocal; if (dst->name) free(dst->name); if (dst->acc) free(dst->acc); if (dst->desc) free(dst->desc); if ((status = esl_strdup(src->name, -1, &(dst->name))) != eslOK) return status; if ((status = esl_strdup(src->acc, -1, &(dst->acc))) != eslOK) return status; if ((status = esl_strdup(src->desc, -1, &(dst->desc))) != eslOK) return status; strcpy(dst->rf, src->rf); /* RF is optional: if it's not set, *rf=0, and strcpy still works fine */ strcpy(dst->mm, src->mm); /* MM is also optional annotation */ strcpy(dst->cs, src->cs); /* CS is also optional annotation */ strcpy(dst->consensus, src->consensus); /* consensus though is always present on a valid profile */ for (z = 0; z < p7_NEVPARAM; z++) dst->evparam[z] = src->evparam[z]; for (z = 0; z < p7_NCUTOFFS; z++) dst->cutoff[z] = src->cutoff[z]; for (z = 0; z < p7_MAXABET; z++) dst->compo[z] = src->compo[z]; for (x = 0; x < p7_NOFFSETS; ++x) dst->offs[x] = src->offs[x]; dst->roff = src->roff; dst->eoff = src->eoff; dst->max_length = src->max_length; return eslOK; }
/* Function: hmmpgmd_GetRanges() * Synopsis: Parse command flag into range(s) * * Purpose: Given a command flag string <rangestr> of the form * <start1>..<end1>,<start2>..<end2>... * parse the string into a RANGE_LIST <list> * * Returns: <eslOK> on success <TRUE>, <eslEMEM> on memory allocation failure, * otherwise <eslESYNTAX> or <eslFAIL> on parsing errors. */ int hmmpgmd_GetRanges (RANGE_LIST *list, char *rangestr) { char *range; char *rangestr_cpy; char *rangestr_cpy_ptr; int status; list->N = 0; list->starts = NULL; list->ends = NULL; //first pass to figure out how much to allocate esl_strdup(rangestr, -1, &rangestr_cpy); // do this because esl_strtok modifies the string, and we shouldn't change the opts value rangestr_cpy_ptr = rangestr_cpy; // do this because esl_strtok advances the pointer on the target string, but we need to free it while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK) list->N++; ESL_ALLOC(list->starts, list->N * sizeof(int)); ESL_ALLOC(list->ends, list->N * sizeof(int)); free(rangestr_cpy_ptr); //2nd pass to get the values list->N = 0; esl_strdup(rangestr, -1, &rangestr_cpy); rangestr_cpy_ptr = rangestr_cpy; while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, list->starts + list->N, list->ends + list->N); if (status == eslESYNTAX) esl_fatal("--seqdb_ranges takes coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); list->N++; } free(rangestr_cpy_ptr); return eslOK; ERROR: return eslEMEM; }
/* Function: p7_ProfileConfig() * Synopsis: Configure a search profile. * * Purpose: Given a model <hmm> with core probabilities, the null1 * model <bg>, a desired search <mode> (one of <p7_LOCAL>, * <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an * expected target sequence length <L>; configure the * search model in <gm> with lod scores relative to the * background frequencies in <bg>. * * Returns: <eslOK> on success; the profile <gm> now contains * scores and is ready for searching target sequences. * * Throws: <eslEMEM> on allocation error. */ int p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode) { int k, x, z; /* counters over states, residues, annotation */ int status; float *occ = NULL; float *tp, *rp; float sc[p7_MAXCODE]; float Z; /* Contract checks */ if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match"); if (hmm->M > gm->allocM) ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM"); if (! (hmm->flags & p7H_CONS)) ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile"); /* Copy some pointer references and other info across from HMM */ gm->M = hmm->M; gm->max_length = hmm->max_length; gm->mode = mode; gm->roff = -1; gm->eoff = -1; gm->offs[p7_MOFFSET] = -1; gm->offs[p7_FOFFSET] = -1; gm->offs[p7_POFFSET] = -1; if (gm->name != NULL) free(gm->name); if (gm->acc != NULL) free(gm->acc); if (gm->desc != NULL) free(gm->desc); if ((status = esl_strdup(hmm->name, -1, &(gm->name))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->acc, -1, &(gm->acc))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->desc, -1, &(gm->desc))) != eslOK) goto ERROR; if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf); if (hmm->flags & p7H_MMASK) strcpy(gm->mm, hmm->mm); if (hmm->flags & p7H_CONS) strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */ if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs); for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z]; for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z] = hmm->cutoff[z]; for (z = 0; z < p7_MAXABET; z++) gm->compo[z] = hmm->compo[z]; /* Entry scores. */ if (p7_profile_IsLocal(gm)) { /* Local mode entry: occ[k] /( \sum_i occ[i] * (M-i+1)) * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0) */ Z = 0.; ESL_ALLOC(occ, sizeof(float) * (hmm->M+1)); if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR; for (k = 1; k <= hmm->M; k++) Z += occ[k] * (float) (hmm->M-k+1); for (k = 1; k <= hmm->M; k++) p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */ free(occ); } else /* glocal modes: left wing retraction; must be in log space for precision */ { Z = log(hmm->t[0][p7H_MD]); p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]); for (k = 1; k < hmm->M; k++) { p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]); Z += log(hmm->t[k][p7H_DD]); } } /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits * N,C,J transitions are set later by length config */ if (p7_profile_IsMultihit(gm)) { gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2; gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2; gm->nj = 1.0f; } else { gm->xsc[p7P_E][p7P_MOVE] = 0.0f; gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY; gm->nj = 0.0f; } /* Transition scores. */ for (k = 1; k < gm->M; k++) { tp = gm->tsc + k * p7P_NTRANS; tp[p7P_MM] = log(hmm->t[k][p7H_MM]); tp[p7P_MI] = log(hmm->t[k][p7H_MI]); tp[p7P_MD] = log(hmm->t[k][p7H_MD]); tp[p7P_IM] = log(hmm->t[k][p7H_IM]); tp[p7P_II] = log(hmm->t[k][p7H_II]); tp[p7P_DM] = log(hmm->t[k][p7H_DM]); tp[p7P_DD] = log(hmm->t[k][p7H_DD]); } /* Match emission scores. */ sc[hmm->abc->K] = -eslINFINITY; /* gap character */ sc[hmm->abc->Kp-2] = -eslINFINITY; /* nonresidue character */ sc[hmm->abc->Kp-1] = -eslINFINITY; /* missing data character */ for (k = 1; k <= hmm->M; k++) { for (x = 0; x < hmm->abc->K; x++) sc[x] = log((double)hmm->mat[k][x] / bg->f[x]); esl_abc_FExpectScVec(hmm->abc, sc, bg->f); for (x = 0; x < hmm->abc->Kp; x++) { rp = gm->rsc[x] + k * p7P_NR; rp[p7P_MSC] = sc[x]; } } /* Insert emission scores */ /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores * to 0, i.e. corresponding to the insertion emission probabilities * being equal to the background probabilities. Benchmarking shows * that setting inserts to informative emission distributions causes * more problems than it's worth: polar biased composition hits * driven by stretches of "insertion" occur, and are difficult to * correct for. */ for (x = 0; x < gm->abc->Kp; x++) { for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f; p7P_ISC(gm, hmm->M, x) = -eslINFINITY; /* init I_M to impossible. */ } for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K) = -eslINFINITY; /* gap symbol */ for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */ for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */ #if 0 /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */ for (k = 1; k < hmm->M; k++) { for (x = 0; x < hmm->abc->K; x++) sc[x] = log(hmm->ins[k][x] / bg->f[x]); esl_abc_FExpectScVec(hmm->abc, sc, bg->f); for (x = 0; x < hmm->abc->Kp; x++) { rp = gm->rsc[x] + k*p7P_NR; rp[p7P_ISC] = sc[x]; } } for (x = 0; x < hmm->abc->Kp; x++) p7P_ISC(gm, hmm->M, x) = -eslINFINITY; /* init I_M to impossible. */ #endif /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength() */ gm->L = 0; /* force ReconfigLength to reconfig */ if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR; return eslOK; ERROR: if (occ != NULL) free(occ); return status; }
static int profillic_esl_msafile_profile_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa, ProfileType * profile_ptr ) { /// \note Right now this isn't actually using the open file pointer; for convenience I just use the profile.fromFile( <filename> ) method. /// \todo Use convenience fns in esl_buffer.h; see eg hmmer-3.1/easel/esl_msafile_stockholm.c for examples... ESL_MSA *msa = NULL; string profile_string; char *buf; long len; int seqidx; int status; char errmsg2[eslERRBUFSIZE]; ESL_DASSERT1((afp->format == eslMSAFILE_PROFILLIC)); const char * const seqname = "Galosh Profile Consensus"; const char * const msaname = "Galosh Profile"; uint32_t profile_length; galosh::Sequence<typename ProfileType::ProfileResidueType> consensus_sequence; stringstream tmp_consensus_output_stream; uint32_t pos_i; if (profile_ptr == NULL) { ESL_EXCEPTION(eslEINCONCEIVABLE, "profile_ptr is NULL in profillic_esl_msafile_profile_Read(..)!"); } //if (feof(afp->bf->fp)) { status = eslEOF; goto ERROR; } afp->errmsg[0] = '\0'; // Read in the galosh profile (from profillic) //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end //len = afp->bf->ftell( afp->bf->fp ); // get the position at the end (length) //fseek( afp->bf->fp, 0, SEEK_SET ); // go to the beginning again. //ESL_ALLOC_CPP( char, buf, sizeof( char ) * len ); //malloc buffer //fread( buf, len, 1, afp->bf->fp ); //read into buffer //profile_string = buf; //profile_ptr->fromString( profile_string ); profile_ptr->fromFile( afp->bf->filename ); //if (buf) free(buf); // \todo WHY WON'T THIS WORK? See HACKs in profillic-hmmbuild.cpp to work around it. //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end (to signal there's no more profiles in the file, the next time we come to this function) // Calculate the consensus sequence. profile_length = profile_ptr->length(); consensus_sequence.reinitialize( profile_length ); for( pos_i = 0; pos_i < profile_length; pos_i++ ) { consensus_sequence[ pos_i ] = ( *profile_ptr )[ pos_i ][ galosh::Emission::Match ].maximumValueType(); } tmp_consensus_output_stream << consensus_sequence; /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */ #ifdef eslAUGMENT_ALPHABET if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; } #endif if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; } // Set first-and-only seq to the consensus. This should set sqlen[0] to the profile's length and set ax to have length 1 and ax[0] to be the sequence itself. Also msa->sqname[0] to the "name" of that consensus sequence. /* if nec, make room for the new seq */ if (msa->nseq >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) return status; seqidx = msa->nseq; // 0 msa->nseq++; // = 1 status = esl_strdup(seqname, -1, &(msa->sqname[seqidx])); // NOTE: Could add description of this "sequence" here, using esl_msa_SetSeqDescription(msa, seqidx, desc). #ifdef eslAUGMENT_ALPHABET if (msa->flags & eslMSA_DIGITAL) { // NOTE (profillic): There was a bug in this; it had said .."esl_abc_dsqcat(msa->abc, " where it should have said .."esl_abc_dsqcat(msa->abc->inmap, " if((status = esl_abc_dsqcat(msa->abc->inmap, &(msa->ax[seqidx]), &(msa->sqlen[seqidx]), tmp_consensus_output_stream.str().c_str(), profile_length)) != eslOK) { /* invalid char(s), get informative error message */ if (esl_abc_ValidateSeq(msa->abc, tmp_consensus_output_stream.str().c_str(), profile_length, afp->errmsg) != eslOK) ESL_XFAIL(eslEFORMAT, errmsg2, "%s (line %d): %s", msa->sqname[0], afp->linenumber, afp->errmsg); } } #endif if (! (msa->flags & eslMSA_DIGITAL)) { status = esl_strcat(&(msa->aseq[seqidx]), 0, tmp_consensus_output_stream.str().c_str(), profile_length); msa->sqlen[seqidx] = profile_length; } msa->alen = profile_length; /// \todo OR read in a fasta file of sequences too. /// \todo (Optional?) Set msa->name to the name of the profile (file?) esl_strdup(msaname, -1, &(msa->name)); /// \todo make sure eslMSA_HASWGTS is FALSE .. OR set it to TRUE and set msa->wgt[idx] to 1.0. /// \note Could have secondary structure (per sequence) too. msa->ss[0]. msa->sslen[0] should be the same as msa->sqlen[0]. /// \todo Investigate what msa->sa and msa->pp are for. /* Give the newly parsed MSA a good * going-over, and finalize the fields of the MSA data structure. * verify_parse will fill in errmsg if it sees a problem. */ //if (verify_parse(msa, afp->errmsg) != eslOK) { status = eslEFORMAT; goto ERROR; } if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR; if (ret_msa != NULL) *ret_msa = msa; else esl_msa_Destroy(msa); return eslOK; ERROR: if (msa != NULL) esl_msa_Destroy(msa); if (ret_msa != NULL) *ret_msa = NULL; return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); int N = esl_opt_GetInteger(go, "-N"); int M = esl_opt_GetInteger(go, "-M"); P7_TOPHITS **h = NULL; P7_HIT *hit = NULL; double *sortkeys = NULL; char name[] = "not_unique_name"; char acc[] = "not_unique_acc"; char desc[] = "Test description for the purposes of making the benchmark allocate space"; int i,j; int status; /* prep work: generate our sort keys before starting to time anything */ ESL_ALLOC(h, sizeof(P7_TOPHITS *) * M); /* allocate pointers for M lists */ ESL_ALLOC(sortkeys, sizeof(double) * N * M); for (i = 0; i < N*M; i++) sortkeys[i] = esl_random(r); esl_stopwatch_Start(w); /* generate M "random" lists and sort them */ for (j = 0; j < M; j++) { h[j] = p7_tophits_Create(p7_TOPHITS_DEFAULT_INIT_ALLOC); for (i = 0; i < N; i++) { p7_tophits_CreateNextHit(h[j], &hit); esl_strdup(name, -1, &(hit->name)); esl_strdup(acc, -1, &(hit->acc)); esl_strdup(desc, -1, &(hit->desc)); hit->sortkey = sortkeys[j*N + i]; hit->score = (float) sortkeys[j*N+i]; hit->pre_score = 0.0; hit->sum_score = 0.0; hit->lnP = sortkeys[j*N+i]; hit->pre_lnP = 0.0; hit->sum_lnP = 0.0; hit->ndom = N; hit->noverlaps = 0; hit->nexpected = 0; hit->flags = 0; hit->nreported = 0; hit->nincluded = 0; hit->best_domain = 0; hit->dcl = NULL; } p7_tophits_SortBySortkey(h[j]); } /* then merge them into one big list in h[0] */ for (j = 1; j < M; j++) { p7_tophits_Merge(h[0], h[j]); p7_tophits_Destroy(h[j]); } esl_stopwatch_Stop(w); p7_tophits_Destroy(h[0]); status = eslOK; ERROR: esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); if (sortkeys != NULL) free(sortkeys); if (h != NULL) free(h); return status; }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). * Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. * afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if (esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
/* Function: p7_hmmcache_Open() * Synopsis: Cache a profile database. * * Purpose: Open <hmmfile> and read all of its contents, creating * a cached profile database in memory. Return a ptr to the * cached profile database in <*ret_cache>. * * Caller may optionally provide an <errbuf> ptr to * at least <eslERRBUFSIZE> bytes, to capture an * informative error message on failure. * * Args: hmmfile - (base) name of profile file to open * ret_cache - RETURN: cached profile database * errbuf - optRETURN: error message for a failure * * Returns: <eslOK> on success. <*ret_cache> points to the * cached db. <errbuf> is unchanged. * * Failure codes: * <eslENOTFOUND> : <hmmfile> couldn't be opened for reading * <eslEFORMAT> : <hmmfile> isn't in recognized HMMER file format * <eslEINCOMPAT> : profiles in <hmmfile> have different alphabets * * On any failure, <*ret_cache> is <NULL> and <errbuf> contains * an informative error message for the user. * * Throws: <eslEMEM> : memory allocation error. */ int p7_hmmcache_Open(char *hmmfile, P7_HMMCACHE **ret_cache, char *errbuf) { P7_HMMCACHE *cache = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; int status; if (errbuf) errbuf[0] = '\0'; ESL_ALLOC(cache, sizeof(P7_HMMCACHE)); cache->name = NULL; cache->abc = NULL; cache->omlist = NULL; cache->gmlist = NULL; cache->lalloc = 4096; /* allocation chunk size for <list> of ptrs */ cache->n = 0; if ( ( status = esl_strdup(hmmfile, -1, &cache->name) != eslOK)) goto ERROR; ESL_ALLOC(cache->omlist, sizeof(P7_OPROFILE *) * cache->lalloc); ESL_ALLOC(cache->gmlist, sizeof(P7_PROFILE *) * cache->lalloc); if ( (status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf)) != eslOK) goto ERROR; // eslENOTFOUND | eslEFORMAT; <errbuf> while ((status = p7_hmmfile_Read(hfp, &(cache->abc), &hmm)) != eslEOF) // eslEFORMAT | eslEINCOMPAT; <errbuf> { if (status != eslOK) ESL_XFAIL(status, errbuf, "%s", hfp->errbuf); if (!bg && (bg = p7_bg_Create(cache->abc)) == NULL) { status = eslEMEM; goto ERROR; } if ( ( gm = p7_profile_Create(hmm->M, cache->abc)) == NULL) { status = eslEMEM; goto ERROR; } if ( (status = p7_profile_Config(gm, hmm, bg)) != eslOK) goto ERROR; if ( (status = p7_oprofile_ReadMSV (hfp, &(cache->abc), &om)) != eslOK || /* eslEFORMAT: hfp->errbuf | eslEINCOMPAT | eslEOF */ (status = p7_oprofile_ReadRest(hfp, om)) != eslOK) /* eslEFORMAT: hfp->errbuf */ { if (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "Premature EOF in vectorized profile files"); else goto ERROR; } ESL_DASSERT1(( strcmp(gm->name, om->name) == 0 )); if (cache->n >= cache->lalloc) { ESL_REALLOC(cache->gmlist, sizeof(P7_PROFILE *) * cache->lalloc * 2); ESL_REALLOC(cache->omlist, sizeof(P7_OPROFILE *) * cache->lalloc * 2); cache->lalloc *= 2; } cache->omlist[cache->n] = om; cache->gmlist[cache->n] = gm; cache->n++; om = NULL; gm = NULL; p7_hmm_Destroy(hmm); } //printf("\nfinal:: %d memory %" PRId64 "\n", inx, total_mem); p7_hmmfile_Close(hfp); p7_bg_Destroy(bg); *ret_cache = cache; return eslOK; ERROR: if (cache) p7_hmmcache_Close(cache); if (om) p7_oprofile_Destroy(om); if (gm) p7_profile_Destroy(gm); if (hmm) p7_hmm_Destroy(hmm); if (bg) p7_bg_Destroy(bg); if (hfp) p7_hmmfile_Close(hfp); return status; }
/* Function: p7_ProfileConfig() * Synopsis: Configure a search profile. * Incept: SRE, Sun Sep 25 12:21:25 2005 [St. Louis] * * Purpose: Given a model <hmm> with core probabilities, the null1 * model <bg>, a desired search <mode> (one of <p7_LOCAL>, * <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an * expected target sequence length <L>; configure the * search model in <gm> with lod scores relative to the * background frequencies in <bg>. * * Returns: <eslOK> on success; the profile <gm> now contains * scores and is ready for searching target sequences. * * Throws: <eslEMEM> on allocation error. */ int p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode) { int k, x, z; /* counters over states, residues, annotation */ int status; float *occ = NULL; float *tp, *rp; float sc[p7_MAXCODE]; float mthresh; float Z; /* Contract checks */ if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match"); if (hmm->M > gm->allocM) ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM"); /* Copy some pointer references and other info across from HMM */ gm->M = hmm->M; gm->mode = mode; gm->roff = -1; gm->eoff = -1; gm->offs[p7_MOFFSET] = -1; gm->offs[p7_FOFFSET] = -1; gm->offs[p7_POFFSET] = -1; if (gm->name != NULL) free(gm->name); if (gm->acc != NULL) free(gm->acc); if (gm->desc != NULL) free(gm->desc); if ((status = esl_strdup(hmm->name, -1, &(gm->name))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->acc, -1, &(gm->acc))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->desc, -1, &(gm->desc))) != eslOK) goto ERROR; if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf); if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs); for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z]; for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z] = hmm->cutoff[z]; for (z = 0; z < p7_MAXABET; z++) gm->compo[z] = hmm->compo[z]; /* Determine the "consensus" residue for each match position. * This is only used for alignment displays, not in any calculations. */ if (hmm->abc->type == eslAMINO) mthresh = 0.5; else if (hmm->abc->type == eslDNA) mthresh = 0.9; else if (hmm->abc->type == eslRNA) mthresh = 0.9; else mthresh = 0.5; gm->consensus[0] = ' '; for (k = 1; k <= hmm->M; k++) { x = esl_vec_FArgMax(hmm->mat[k], hmm->abc->K); gm->consensus[k] = ((hmm->mat[k][x] > mthresh) ? toupper(hmm->abc->sym[x]) : tolower(hmm->abc->sym[x])); } gm->consensus[hmm->M+1] = '\0'; /* Entry scores. */ if (p7_profile_IsLocal(gm)) { /* Local mode entry: occ[k] /( \sum_i occ[i] * (M-i+1)) * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0) */ Z = 0.; ESL_ALLOC_WITH_TYPE(occ, float*, sizeof(float) * (hmm->M+1)); if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR; for (k = 1; k <= hmm->M; k++) Z += occ[k] * (float) (hmm->M-k+1); for (k = 1; k <= hmm->M; k++) p7P_TSC(gm, k-1, p7P_BM) = log((double)(occ[k] / Z)); /* note off-by-one: entry at Mk stored as [k-1][BM] */ free(occ); } else /* glocal modes: left wing retraction; must be in log space for precision */ {
int p7_seqcache_Open(char *seqfile, P7_SEQCACHE **ret_cache, char *errbuf) { int i; int inx; int val; int status; int32_t seq_cnt; int32_t db_cnt; int32_t db_inx[32]; uint32_t db_key; uint64_t res_cnt; uint64_t res_size; uint64_t hdr_size; char *hdr_ptr; char *res_ptr; char *desc_ptr; char *ptr; char buffer[512]; off_t offset; uint64_t total_mem; SEQ_DB *db = NULL; P7_SEQCACHE *cache = NULL; ESL_RANDOMNESS *rnd = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = NULL; ESL_ALPHABET *abc = NULL; ESL_SQASCII_DATA *ascii = NULL; if (errbuf) errbuf[0] = '\0'; /* CURRENTLY UNUSED. FIXME */ /* Open the target sequence database */ if ((status = esl_sqfile_Open(seqfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) return status; /* This is a bit of a hack. The first line contains database information. * * #<res_count> <seq_count> <db_count> <db_sequences_1> <db_sequences_before_removing_duplicates_1> <db_sequences_2> <db_sequences_before_removing_duplicates_2> ... <date_stamp> * * The rest of the file is a fasta format. The fasta header is just * sequence number followed by a binary number indicating which * database this sequence occurs in. * * The header line will be read in, parsed and saved. Then the * parser will be repositioned after the line and used normally. */ ascii = &sqfp->data.ascii; fseek(ascii->fp, 0L, SEEK_SET); if (fgets(buffer, sizeof(buffer), ascii->fp) == NULL) return eslEFORMAT; if (buffer[0] != '#') return eslEFORMAT; ptr = buffer + 1; res_cnt = strtoll(ptr, &ptr, 10); seq_cnt = strtol(ptr, &ptr, 10); db_cnt = strtol(ptr, &ptr, 10); if (db_cnt > (sizeof(db_inx)/sizeof(db_inx[0]))) return eslEFORMAT; total_mem = sizeof(P7_SEQCACHE); ESL_ALLOC(cache, sizeof(P7_SEQCACHE)); memset(cache, 0, sizeof(P7_SEQCACHE)); if (esl_strdup(seqfile, -1, &cache->name) != eslOK) goto ERROR; total_mem += (sizeof(HMMER_SEQ) * seq_cnt); ESL_ALLOC(cache->list, sizeof(HMMER_SEQ) * seq_cnt); memset(cache->list, 0, sizeof(HMMER_SEQ) * seq_cnt); total_mem += (sizeof(SEQ_DB) * db_cnt); ESL_ALLOC(db, sizeof(SEQ_DB) * db_cnt); for (i = 0; i < db_cnt; ++i) { db[i].count = strtol(ptr, &ptr, 10); db[i].K = strtol(ptr, &ptr, 10); total_mem += (sizeof(HMMER_SEQ *) * db[i].count); ESL_ALLOC(db[i].list, sizeof(HMMER_SEQ *) * db[i].count); memset(db[i].list, 0, sizeof(HMMER_SEQ *) * db[i].count); } /* grab the unique identifier */ while (*ptr && isspace(*ptr)) ++ptr; i = strlen(ptr); ESL_ALLOC(cache->id, i+1); strcpy(cache->id, ptr); while (--i > 0 && isspace(cache->id[i])) cache->id[i] = 0; res_size = res_cnt + seq_cnt + 1; hdr_size = seq_cnt * 10; total_mem += res_size + hdr_size; ESL_ALLOC(cache->residue_mem, res_size); ESL_ALLOC(cache->header_mem, hdr_size); /* position the sequence file to the start of the first sequence. * this will force any buffers associated with the file to be reset. */ offset = ftell(ascii->fp); if ((status = esl_sqfile_Position(sqfp, offset)) != eslOK) goto ERROR; abc = esl_alphabet_Create(eslAMINO); sq = esl_sq_CreateDigital(abc); cache->db_cnt = db_cnt; cache->db = db; cache->abc = abc; cache->res_size = res_size; cache->hdr_size = hdr_size; cache->count = seq_cnt; hdr_ptr = cache->header_mem; res_ptr = cache->residue_mem; for (i = 0; i < db_cnt; ++i) db_inx[i] = 0; strcpy(buffer, "000000001"); inx = 0; while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { /* sanity checks */ if (inx >= seq_cnt) { printf("inx: %d\n", inx); return eslEFORMAT; } if (sq->n + 1 > res_size) { printf("inx: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; } if (hdr_size <= 0) { printf("inx: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; } /* generate the database key - modified to take the first word in the desc line. * The remaining part of the desc is then cached as the description. */ ptr = sq->desc;; desc_ptr = strchr(sq->desc, ' '); if(desc_ptr != NULL) { *desc_ptr= '\0'; ++desc_ptr; } val = 1; db_key = 0; while (*ptr) { if (*ptr == '1') db_key += val; val <<= 1; ++ptr; } if (db_key >= (1 << (db_cnt + 1))) { printf("inx: %d db %d %s\n", inx, db_key, sq->desc); return eslEFORMAT; } cache->list[inx].name = hdr_ptr; cache->list[inx].dsq = (ESL_DSQ *)res_ptr; cache->list[inx].n = sq->n; cache->list[inx].idx = inx; cache->list[inx].db_key = db_key; if(desc_ptr != NULL) esl_strdup(desc_ptr, -1, &(cache->list[inx].desc)); /* copy the digitized sequence */ memcpy(res_ptr, sq->dsq, sq->n + 1); res_ptr += (sq->n + 1); res_size -= (sq->n + 1); /* copy the index to the header */ strcpy(hdr_ptr, buffer); hdr_ptr += 10; hdr_size -= 10; /* increment the buffer string */ ++buffer[8]; for (i = 8; i > 0; --i) { if (buffer[i] > '9') { buffer[i] = '0'; buffer[i-1]++; } } esl_sq_Reuse(sq); ++inx; } if (status != eslEOF) { printf("Unexpected error %d at %d\n", status, inx); return status; } if (inx != seq_cnt) { printf("inx:: %d %d\n", inx, seq_cnt); return eslEFORMAT; } if (hdr_size != 0) { printf("inx:: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; } if (res_size != 1) { printf("inx:: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; } /* copy the final sentinel character */ *res_ptr++ = eslDSQ_SENTINEL; --res_size; /* sort the order of the database sequences */ rnd = esl_randomness_CreateFast(seq_cnt); for (i = 0 ; i < seq_cnt; ++i) { rnd->x = rnd->x * 69069 + 1; cache->list[i].idx = rnd->x; } esl_randomness_Destroy(rnd); qsort(cache->list, seq_cnt, sizeof(HMMER_SEQ), sort_seq); /* fill in the different databases and fix the index */ for (i = 0 ; i < seq_cnt; ++i) { inx = 0; db_key = cache->list[i].db_key; while (db_key) { if (db_key & 1) { SEQ_DB *db = cache->db + inx; if (db_inx[inx] >= db->count) { printf("sort:: %d %d\n", db_inx[inx], db->count); return eslEFORMAT; } db->list[db_inx[inx]] = &cache->list[i]; ++db_inx[inx]; } db_key >>= 1; ++inx; } cache->list[i].idx = (cache->list[i].name - cache->header_mem) / 10 + 1; } for (i = 0; i < cache->db_cnt; ++i) { printf("sequence database (%d):: %d %d\n", i, cache->db[i].count, db_inx[i]); } printf("\nLoaded sequence db file %s; total memory %" PRId64 "\n", seqfile, total_mem); esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); *ret_cache = cache; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (abc != NULL) esl_alphabet_Destroy(abc); if (cache != NULL) { if (cache->header_mem != NULL) free(cache->header_mem); if (cache->residue_mem != NULL) free(cache->residue_mem); if (cache->name != NULL) free(cache->name); if (cache->id != NULL) free(cache->id); free(cache); } for (i = 0; i < db_cnt; ++i) { if (db[i].list != NULL) free(db[i].list); } return eslEMEM; }
/* Create an SSI index file for open sequence file <sqfp>. * Both name and accession of sequences are stored as keys. */ static void create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp) { ESL_NEWSSI *ns = NULL; ESL_SQ *sq = esl_sq_Create(); int nseq = 0; char *ssifile = NULL; uint16_t fh; int status; esl_strdup(sqfp->filename, -1, &ssifile); esl_strcat(&ssifile, -1, ".ssi", 4); status = esl_newssi_Open(ssifile, TRUE, &ns); /* TRUE is for allowing overwrite. */ if (status == eslENOTFOUND) esl_fatal("failed to open SSI index %s", ssifile); else if (status == eslEOVERWRITE) esl_fatal("SSI index %s already exists; delete or rename it", ssifile); /* won't happen, see TRUE above... */ else if (status != eslOK) esl_fatal("failed to create a new SSI index"); if (esl_newssi_AddFile(ns, sqfp->filename, sqfp->format, &fh) != eslOK) esl_fatal("Failed to add sequence file %s to new SSI index\n", sqfp->filename); printf("Creating SSI index for %s... ", sqfp->filename); fflush(stdout); while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK) { nseq++; if (sq->name == NULL) esl_fatal("Every sequence must have a name to be indexed. Failed to find name of seq #%d\n", nseq); if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK) esl_fatal("Failed to add key %s to SSI index", sq->name); if (sq->acc[0] != '\0') { if (esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK) esl_fatal("Failed to add secondary key %s to SSI index", sq->acc); } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Determine if the file was suitable for fast subseq lookup. */ if (sqfp->data.ascii.bpl > 0 && sqfp->data.ascii.rpl > 0) { if ((status = esl_newssi_SetSubseq(ns, fh, sqfp->data.ascii.bpl, sqfp->data.ascii.rpl)) != eslOK) esl_fatal("Failed to set %s for fast subseq lookup."); } /* Save the SSI file to disk */ if (esl_newssi_Write(ns) != eslOK) esl_fatal("Failed to write keys to ssi file %s\n", ssifile); /* Done - output and exit. */ printf("done.\n"); if (ns->nsecondary > 0) printf("Indexed %d sequences (%ld names and %ld accessions).\n", nseq, (long) ns->nprimary, (long) ns->nsecondary); else printf("Indexed %d sequences (%ld names).\n", nseq, (long) ns->nprimary); printf("SSI index written to file %s\n", ssifile); free(ssifile); esl_sq_Destroy(sq); esl_newssi_Close(ns); return; }