/* Function:  esl_keyhash_Store()
 * Synopsis:  Add a key to the table and obtain its integer index.
 *
 * Purpose:   Look up <key> (length <n>) in the key index hash table
 *            <kh>. If it is not there yet, append it and give it the
 *            next unused index; indices count up from 0 in order of
 *            insertion, so they can be used to address ordinary
 *            integer-indexed C arrays. The index is optionally handed
 *            back through <opt_index>.
 *
 *            <key>, <n> follow the standard idiom for strings and
 *            unterminated buffers: <n> of -1 means <key> is a
 *            NUL-terminated string whose length is taken with
 *            strlen(). Note that the hash bucket is computed from
 *            <n> exactly as the caller passed it, before any -1 is
 *            replaced by the real length.
 *
 * Returns:   <eslOK> on success; <key> is now stored in <kh>, and
 *            <opt_index> (if non-NULL) is set to its new index.
 *            Returns <eslEDUP> if <key> was already stored in the
 *            table; <opt_index> is set to the existing index for
 *            <key>.
 *
 * Throws:    <eslEMEM> on allocation failure, and sets <opt_index>
 *            to -1. A non-OK status from key_upsize() is passed back
 *            the same way.
 */
int
esl_keyhash_Store(ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
{
  uint32_t bucket = jenkins_hash(key, n, kh->hashsize);
  int      cur;       /* walks the chain hanging off <bucket>       */
  int      newidx;    /* index handed to a key we have not seen yet */
  int      status;    /* written by the ESL_REALLOC failure path    */

  if (n == -1) n = strlen(key);

  /* Scan the bucket's chain; an identical key means nothing to store. */
  cur = kh->hashtable[bucket];
  while (cur != -1)
    {
      if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[cur]))
        {
          if (opt_index) *opt_index = cur;
          return eslEDUP;
        }
      cur = kh->nxt[cur];
    }

  /* Per-key arrays are full: double both of them together. */
  if (kh->nkeys == kh->kalloc)
    {
      ESL_REALLOC(kh->key_offset, sizeof(int)*kh->kalloc*2);
      ESL_REALLOC(kh->nxt,        sizeof(int)*kh->kalloc*2);
      kh->kalloc *= 2;
    }

  /* Keep doubling the string pool until key + terminator fits. */
  while (kh->sn + n + 1 > kh->salloc)
    {
      ESL_REALLOC(kh->smem, sizeof(char) * kh->salloc * 2);
      kh->salloc *= 2;
    }

  /* Append the key text at the current end of the pool. */
  newidx                 = kh->nkeys;
  kh->key_offset[newidx] = kh->sn;
  esl_memstrcpy(key, n, kh->smem + kh->key_offset[newidx]);
  kh->sn                += n+1;
  kh->nkeys++;

  /* Push the new entry onto the front of its bucket's chain. */
  kh->nxt[newidx]       = kh->hashtable[bucket];
  kh->hashtable[bucket] = newidx;

  /* Grow the hash table once it holds more than 3 keys per slot. */
  if (kh->nkeys > 3*kh->hashsize && (status = key_upsize(kh)) != eslOK)
    goto ERROR;

  if (opt_index) *opt_index = newidx;
  return eslOK;

 ERROR:
  if (opt_index) *opt_index = -1;
  return status;
}
/* regurgitate_pfam_as_pfam()
 *
 * Given an open Pfam formatted msafile <afp>, read the next alignment
 * one line at a time and write it to <ofp> in Pfam format, editing
 * lines as requested:
 *
 *   - sequence lines: residues are passed through symconvert() for
 *     each enabled option, in this order: <rfrom>-><rto>, gap
 *     characters "-_." -> <gapsym>, <force_lower>, <force_upper>,
 *     <force_rna> (Tt->Uu), <force_dna> (Uu->Tt), <iupac_to_n>
 *     (degenerate nucleotide codes -> N/n), <x_is_bad> (Xx->Nn).
 *     <rfrom> and <gapsym> may be NULL to disable those conversions.
 *   - "#=GC SS_cons" and "#=GR <seqname> SS" lines: converted with
 *     <wussify>, <dewuss> or <fullwuss> (first one set wins). These
 *     annotation lines are only parsed at all when one of the three
 *     options is set; otherwise they are echoed like any other '#'
 *     line.
 *   - all other lines are echoed unchanged.
 *
 * Every aligned text field that is parsed must have the same length
 * as the first one seen. A second sequence line carrying the name of
 * the first sequence is taken to mean the file is interleaved
 * Stockholm rather than one-line-per-sequence Pfam.
 *
 * Returns <eslOK> on success.
 * Returns <eslEOF> if there are no more alignments in <afp>.
 * Returns <eslEFORMAT> if parse fails because of a file format
 * problem, in which case afp->errmsg is set to contain a formatted
 * message that indicates the cause of the problem.
 * A non-OK status raised by the allocation helpers (ESL_REALLOC,
 * esl_memstrdup) is returned as is. Temporary buffers are released
 * on every return path.
 *
 * A missing Stockholm header, a missing "//" terminator, a read error
 * on the first lines, and a failed full-WUSS conversion are all
 * reported through esl_fatal().
 */
static int
regurgitate_pfam_as_pfam(ESLX_MSAFILE *afp, FILE *ofp, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, int wussify, int dewuss, int fullwuss, char *rfrom, char *rto)
{
  char      *p;
  esl_pos_t  n;
  char      *first_seqname = NULL;   /* NUL-terminated copy of the first sequence's name     */
  char      *gx            = NULL;   /* the "#=GC" / "#=GR" token itself                     */
  char      *seqname       = NULL;   /* points into afp->line; NOT NUL-terminated            */
  char      *tag           = NULL;   /* points into afp->line; NOT NUL-terminated            */
  char      *text          = NULL;   /* points into afp->line; NOT NUL-terminated            */
  esl_pos_t  gxlen, namelen, taglen, textlen;
  int        nseq_read     = 0;
  int        parse_gc_and_gr;
  int        flushpoint    = 10000;  /* flush <ofp> whenever the line number is a multiple   */
  int        exp_alen      = -1;     /* expected aligned length; -1 until the first is seen  */
  char      *buf           = NULL;   /* writable, NUL-terminated copy of the text field      */
  esl_pos_t  pos, pos2;
  int        status;

  /* should we parse out GR/GC lines and check if they're SS lines? */
  parse_gc_and_gr = (wussify || dewuss || fullwuss) ? TRUE : FALSE;
  afp->errmsg[0] = '\0';

  /* Check the magic Stockholm header line, echoing everything read.
   * We have to skip blank lines (and comment lines) here, else we
   * perceive trailing blank lines in a file as a format error when
   * reading in multi-record mode.
   */
  do {
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return eslEOF;
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||       /* skip blank lines             */
           (esl_memstrpfx(afp->line, afp->n, "#")                   /* and skip comment lines       */
            && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1."))
    esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  /* Read the alignment file one line at a time. */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      if ((int) afp->linenumber % flushpoint == 0) fflush(ofp);
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if (!n) fprintf(ofp, "\n");
      else if (esl_memstrpfx(p, n, "//")) { fprintf(ofp, "//\n"); break; } /* normal way out */
      else if (*p == '#')
        {
          if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GC"))
            {
              /* parse line into temporary strings */
              if (esl_memtok(&p, &n, " \t", &gx,   &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t", &tag,  &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t", &text, &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
              pos = text - afp->line; /* pos: position of first aligned char on line; total width of annotation tag w/spaces */

              /* verify alignment length */
              if      (exp_alen == -1)      exp_alen = textlen;
              else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);

              /* we need to make a writable string copy of the annotation, to edit it */
              ESL_REALLOC(buf, sizeof(char) * (textlen+1));
              esl_memstrcpy(text, textlen, buf);

              if (esl_memstrcmp(tag, taglen, "SS_cons"))
                {
                  if      (wussify) esl_kh2wuss(buf, buf);
                  else if (dewuss)  esl_wuss2kh(buf, buf);
                  else if (fullwuss)
                    {
                      status = esl_wuss_full(buf, buf);
                      if      (status == eslESYNTAX) esl_fatal("Bad SS_cons line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
                      else if (status != eslOK)      esl_fatal("Conversion of SS_cons line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
                    }
                }
              /* "#=GC " is 5 chars wide; pad after the tag so the text starts at column <pos> again */
              fprintf(ofp, "#=GC %.*s%*s%s\n", (int) taglen, tag, (int) (pos-taglen-5), "", buf);
            }
          else if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GR"))
            {
              /* esl_memstrpfx() is true on a match (see its other uses
               * above), so this branch must be taken when the line DOES
               * start with "#=GR"; testing for == 0 sent every other '#'
               * line through the #=GR parser and skipped real #=GR lines.
               */
              /* parse line into temporary strings */
              if (esl_memtok(&p, &n, " \t", &gx,      &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t", &tag,     &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              pos = tag - afp->line;    /* column where the tag starts          */
              if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              pos2 = text - afp->line;  /* column where the aligned text starts */

              /* we need to make a writable string copy of the annotation, to edit it */
              ESL_REALLOC(buf, sizeof(char) * (textlen+1));
              esl_memstrcpy(text, textlen, buf);

              /* verify alignment length */
              if      (exp_alen == -1)      exp_alen = textlen;
              else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);

              /* Only the per-sequence secondary structure annotation is
               * converted; esl_memstrcmp() is true when the tag IS "SS".
               */
              if (esl_memstrcmp(tag, taglen, "SS"))
                {
                  if      (wussify) esl_kh2wuss(buf, buf);
                  else if (dewuss)  esl_wuss2kh(buf, buf);
                  else if (fullwuss)
                    {
                      status = esl_wuss_full(buf, buf);
                      if      (status == eslESYNTAX) esl_fatal("Bad SS line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
                      else if (status != eslOK)      esl_fatal("Conversion of SS line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
                    }
                }

              /* re-emit name, tag and text at their original columns */
              fprintf(ofp, "#=GR %.*s%*s%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen-5), "", (int) taglen, tag, (int) (pos2-pos-taglen), "", buf);
            }
          else
            { /* any other '#' line (including #=GC/#=GR when wussify, dewuss, fullwuss are all FALSE) */
              fprintf(ofp, "%.*s\n", (int) afp->n, afp->line); /* print the line */
            }
        } /* end of 'if (*p == '#')' */
      else
        { /* sequence line */
          if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
          if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
          pos = text - afp->line;

          /* verify alignment length */
          if      (exp_alen == -1)      exp_alen = textlen;
          else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);

          /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
          if (nseq_read == 0)
            {
              if ((status = esl_memstrdup(seqname, namelen, &(first_seqname))) != eslOK) goto ERROR;
            }
          else if (esl_memstrcmp(seqname, namelen, first_seqname))
            {
              /* <seqname> is an unterminated slice of the input line, so
               * report the (identical) NUL-terminated copy instead.
               */
              ESL_XFAIL(eslEFORMAT, afp->errmsg, "parse failed (line %d): two seqs named %s. Alignment appears to be in Stockholm format. Reformat to Pfam with esl-reformat.", (int) afp->linenumber, first_seqname);
            }
          nseq_read++;

          /* we need to make a writable string copy of the sequence, to edit it */
          ESL_REALLOC(buf, sizeof(char) * (textlen+1));
          esl_memstrcpy(text, textlen, buf);

          /* make adjustments as necessary */
          if (rfrom)       symconvert(buf, rfrom, rto);
          if (gapsym)      symconvert(buf, "-_.", gapsym);
          if (force_lower) symconvert(buf, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz");
          if (force_upper) symconvert(buf, "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
          if (force_rna)   symconvert(buf, "Tt", "Uu");
          if (force_dna)   symconvert(buf, "Uu", "Tt");
          if (iupac_to_n)  symconvert(buf, "RYMKSWHBVDrymkswhbvd", "NNNNNNNNNNnnnnnnnnnn");
          if (x_is_bad)    symconvert(buf, "Xx", "Nn");

          /* print it out, padding after the name so the residues keep their column */
          fprintf(ofp, "%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen), "", buf);
        }
    }

  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */
  if (status != eslOK) esl_fatal("--small parse failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);

  free(first_seqname);
  free(buf);
  return eslOK;

 ERROR:
  /* release the temporaries on failure too; free(NULL) is a no-op */
  free(first_seqname);
  free(buf);
  return status;
}
/* Function:  fm_alphabetCreate()
 *
 * Synopsis:  Produce an alphabet for FMindex.
 *
 * Purpose:   Fill in the alphabet tables of <meta> according to
 *            <meta->alph_type>. This may end up being replaced with
 *            easel alphabet functions, but the easel requirement of
 *            having a gap-character between canonical and degenerate
 *            symbols poses a problem from a bit-packing perspective.
 *
 *            Three things are built:
 *              alph        symbol for each code, NUL-terminated
 *              inv_alph    code for each of the 256 byte values, -1
 *                          where the byte is not in the alphabet;
 *                          upper and lower case map to the same code,
 *                          and for the DNA alphabets 'U'/'u' share
 *                          the code of 'T'
 *              compl_alph  code of the complementary symbol; only
 *                          allocated and filled for the DNA alphabets
 *
 * Args:      meta      - metadata object already initialized with the
 *                        alphabet type. This will hold the alphabet
 *                        (and corresponding reverse alphabet) created
 *                        here.
 *            alph_bits - optional (may be NULL) pointer to a uint8_t
 *                        that is set to the number of bits required
 *                        to store one symbol: 2 for fm_DNA, 4 for
 *                        fm_DNA_full, 5 for fm_AMINO.
 *
 * Returns:   <eslOK> on success.
 *
 * Note:      An unknown alphabet type or an allocation failure is
 *            reported through esl_fatal(); the <eslFAIL> return after
 *            it only matters if esl_fatal() were to return.
 */
int
fm_alphabetCreate (FM_METADATA *meta, uint8_t *alph_bits)
{
  /* complement of each fm_DNA_full code, in "ACGTRYMKSWHBVDN" order */
  static const int dna_full_complement[] = {
     3,  /* A->T */
     2,  /* C->G */
     1,  /* G->C */
     0,  /* T->A */
     5,  /* R->Y */
     4,  /* Y->R */
     7,  /* M->K */
     6,  /* K->M */
     8,  /* S  S */
     9,  /* W  W */
    13,  /* H->D */
    12,  /* B->V */
    11,  /* V->B */
    10,  /* D->H */
    14   /* N  N */
  };
  int is_nucleic = (meta->alph_type == fm_DNA || meta->alph_type == fm_DNA_full);
  int k;
  int status;   /* used by the ESL_ALLOC failure path */

  /* Alphabet size and bits per symbol. */
  if (meta->alph_type == fm_DNA)
    {
      meta->alph_size = 4;
      if (alph_bits) *alph_bits = 2;
    }
  else if (meta->alph_type == fm_DNA_full)
    {
      meta->alph_size = 15;
      if (alph_bits) *alph_bits = 4;
    }
  else if (meta->alph_type == fm_AMINO)
    {
      meta->alph_size = 26;
      if (alph_bits) *alph_bits = 5;
    }
  else
    {
      esl_fatal("Unknown alphabet type\n%s", "");
    }

  /* Storage: one extra slot in alph for the terminating NUL. */
  ESL_ALLOC(meta->alph,     (1+meta->alph_size)*sizeof(char));
  ESL_ALLOC(meta->inv_alph, 256*sizeof(char));
  if (is_nucleic)
    ESL_ALLOC(meta->compl_alph, (1+meta->alph_size)*sizeof(int));

  /* Symbols, plus complements where they exist. */
  if (meta->alph_type == fm_DNA)
    {
      esl_memstrcpy("ACGT", 4, meta->alph);
      for (k = 0; k < 4; k++)
        meta->compl_alph[k] = 3-k;   /* A<->T, C<->G */
    }
  else if (meta->alph_type == fm_DNA_full)
    {
      esl_memstrcpy("ACGTRYMKSWHBVDN", 15, meta->alph);
      for (k = 0; k < (int) (sizeof dna_full_complement / sizeof dna_full_complement[0]); k++)
        meta->compl_alph[k] = dna_full_complement[k];
    }
  else if (meta->alph_type == fm_AMINO)
    {
      esl_memstrcpy("ACDEFGHIKLMNPQRSTVWYBJZOUX", meta->alph_size, meta->alph);
    }

  /* Reverse map: start with "not in alphabet" everywhere. */
  for (k = 0; k < 256; k++)
    meta->inv_alph[k] = -1;

  for (k = 0; k < meta->alph_size; k++)
    {
      meta->inv_alph[toupper(meta->alph[k])] = k;
      meta->inv_alph[tolower(meta->alph[k])] = meta->inv_alph[toupper(meta->alph[k])];

      /* special case for RNA, equate U to T: */
      if (is_nucleic && toupper(meta->alph[k]) == 'T')
        {
          meta->inv_alph['U'] = k;
          meta->inv_alph['u'] = meta->inv_alph['U'];
        }
    }

  return eslOK;

 ERROR:
  esl_fatal("error allocating space for alphabet\n");
  return eslFAIL;
}