/* onefetch_subseq():
 * Fetch one subsequence of the sequence named <key> from <sqfp> and write
 * it to <ofp> in FASTA format.
 *
 * <sqfp> must have an SSI index; if it has none, esl_fatal() is called.
 *
 * Coordinates:
 *   - <given_end> == 0 is passed through unchanged to the fetch; in the
 *     default output name it is replaced by sq->L of the fetched sequence.
 *   - <given_start> > <given_end> (with <given_end> != 0) means "reverse
 *     complement": the two are swapped for the fetch and the result is
 *     reverse complemented afterwards.
 *
 * Naming: the output is named <newname> when that is non-NULL, otherwise
 * "<key>/<given_start>-<end>".
 *
 * The -r option in <go> requests an additional reverse complement.
 * Every failure is reported through esl_fatal(); nothing is returned.
 */
static void
onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end)
{
  int     start, end;
  int     do_revcomp;
  ESL_SQ *sq = esl_sq_Create();

  if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index");

  /* reverse complement indicated by coords. */
  /* -c 52: would be 52,0, so watch out for given_end = 0 case */
  if (given_end != 0 && given_start > given_end)
    { start = given_end;   end = given_start; do_revcomp = TRUE;  }
  else
    { start = given_start; end = given_end;   do_revcomp = FALSE; }

  /* The error buffer is data, not a format: print it through "%s" so that
   * any '%' it contains is not interpreted as a conversion specifier.
   */
  if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK)
    esl_fatal("%s", esl_sqfile_GetErrorBuf(sqfp));

  /* Both coordinates are cast to int so that the arguments match the %d
   * conversions regardless of the integer types of given_start, given_end
   * and sq->L. (start/end above are already int, so no range is lost that
   * the fetch itself could have used.)
   */
  if (newname != NULL) esl_sq_SetName(sq, newname);
  else                 esl_sq_FormatName(sq, "%s/%d-%d", key,
                                         (int) given_start,
                                         (given_end == 0) ? (int) sq->L : (int) given_end);

  /* Two ways we might have been asked to revcomp: by coord, or by -r option */
  /* (If both happen, they'll cancel each other out) */
  if (do_revcomp)
    if (esl_sq_ReverseComplement(sq) != eslOK)
      esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);

  if (esl_opt_GetBoolean(go, "-r"))
    if (esl_sq_ReverseComplement(sq) != eslOK)
      esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);

  esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
  esl_sq_Destroy(sq);
}
/* do_by_sequences():
 * Read every sequence from <sqfp> into a digital sequence object built on
 * gcode->nt_abc and hand it to the esl_gencode_Process{Start,Piece,End}()
 * calls:
 *   - once as read, if wrk->do_watson is set;
 *   - once after reverse complementing it in place, if wrk->do_crick is set.
 *
 * Sequences with fewer than 3 residues are skipped.
 *
 * Returns eslOK once the file has been read to eslEOF. A parse failure
 * (eslEFORMAT) or any other read status is reported through esl_fatal().
 */
static int
do_by_sequences(ESL_GENCODE *gcode, ESL_GENCODE_WORKSTATE *wrk, ESL_SQFILE *sqfp)
{
  ESL_SQ *sq = esl_sq_CreateDigital(gcode->nt_abc);
  int     status;

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      /* Too-short sequences are skipped, but they still fall through to
       * esl_sq_Reuse() below: every esl_sqio_Read() must be handed a reset
       * object, exactly as it is after a sequence that was processed.
       */
      if (sq->n >= 3)
        {
          if (wrk->do_watson)
            {
              esl_gencode_ProcessStart(gcode, wrk, sq);
              esl_gencode_ProcessPiece(gcode, wrk, sq);
              esl_gencode_ProcessEnd(wrk, sq);
            }

          if (wrk->do_crick)
            {
              /* NOTE(review): the return status of the reverse complement
               * is not checked here; confirm it cannot fail for nt_abc.
               */
              esl_sq_ReverseComplement(sq);
              esl_gencode_ProcessStart(gcode, wrk, sq);
              esl_gencode_ProcessPiece(gcode, wrk, sq);
              esl_gencode_ProcessEnd(wrk, sq);
            }
        }

      esl_sq_Reuse(sq);
    }

  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s)\n%s\n", sqfp->filename, sqfp->get_error(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

  esl_sq_Destroy(sq);
  return eslOK;
}
/* onefetch():
 * Given one <key> (a seq name or accession), retrieve the corresponding
 * sequence from <sqfp> and write it to <ofp>.
 *
 * With an SSI index, the file is positioned directly at the record for
 * <key> and that one record is read. Without an SSI index, the file is
 * read sequentially until a record whose name or accession equals <key>
 * is found.
 *
 * Output:
 *   - If neither -r (reverse complement) nor -n (new name) was given and
 *     the input is not an alignment format, the record is echoed with
 *     esl_sqio_Echo().
 *   - Otherwise the parsed sequence is (optionally) reverse complemented
 *     and renamed, then written in FASTA format.
 *
 * Every failure (key not found, index or parse error, unexpected EOF,
 * reverse complement failure) is reported through esl_fatal().
 */
static void
onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp)
{
  ESL_SQ *sq         = esl_sq_Create();
  int     do_revcomp = esl_opt_GetBoolean(go, "-r");
  char   *newname    = esl_opt_GetString(go, "-n");
  int     status;

  if (sqfp->data.ascii.ssi != NULL)
    {
      /* Try to position the file at the desired sequence with SSI. */
      status = esl_sqfile_PositionByKey(sqfp, key);
      if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", key, sqfp->filename);
      else if (status == eslEFORMAT)   esl_fatal("Failed to parse SSI index for %s\n", sqfp->filename);
      else if (status != eslOK)        esl_fatal("Failed to look up location of seq %s in SSI index of file %s\n", key, sqfp->filename);

      status = esl_sqio_Read(sqfp, sq);
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      /* The EOF message has a single %s: pass only the filename. */
      else if (status == eslEOF)     esl_fatal("Unexpected EOF reading sequence file %s", sqfp->filename);
      else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      /* Sanity check: the record the index pointed at must carry <key>. */
      if (strcmp(key, sq->name) != 0 && strcmp(key, sq->acc) != 0)
        esl_fatal("whoa, internal error; found the wrong sequence %s, not %s", sq->name, key);
    }
  else
    {
      /* Else, we have to read the whole damn file sequentially until we find the seq */
      while ((status = esl_sqio_Read(sqfp, sq)) != eslEOF)
        {
          if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

          if (strcmp(key, sq->name) == 0 || strcmp(key, sq->acc) == 0) break;
          esl_sq_Reuse(sq);
        }
      if (status == eslEOF) esl_fatal("Failed to find sequence %s in file %s\n", key, sqfp->filename);
    }

  if (do_revcomp == FALSE && newname == NULL && ! esl_sqio_IsAlignment(sqfp->format))
    {
      /* If we're not manipulating the sequence in any way, and it's not
       * from an alignment file, we can Echo() it.
       */
      if (esl_sqio_Echo(sqfp, sq, ofp) != eslOK)
        esl_fatal("Echo failed: %s\n", esl_sqfile_GetErrorBuf(sqfp));
    }
  else
    {
      /* Otherwise we Write() the parsed version. */
      if (do_revcomp && esl_sq_ReverseComplement(sq) != eslOK)
        esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
      if (newname != NULL) esl_sq_SetName(sq, newname);
      esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
    }

  esl_sq_Destroy(sq);
}
/* esl_trans_s2p():
 * Translate the nucleotide sequence <in> into a protein sequence, returned
 * as a newly created sequence in <*out>.
 *
 * <frameshift> is the number of leading residues to skip before the first
 * codon; it must satisfy 0 <= frameshift < in->n.
 * If <rcFlag> is nonzero, <in> is reverse complemented in place before
 * translation and reverse complemented again before returning, so the
 * caller gets it back in its original orientation.
 *
 * <in> may be a text sequence (in->seq) or a digital one (in->dsq); a
 * digital sequence must use an eslDNA or eslRNA alphabet. Translation
 * proceeds codon by codon and stops at the first codon that is incomplete
 * or that contains a residue other than A, C, G, U/T.
 *
 * The new sequence is named "<in->name>_s<frameshift>", with "_rc"
 * appended when <rcFlag> is set; the name is truncated if it does not fit
 * the local 256-byte buffer. desc, acc and ss are passed through from <in>.
 *
 * Returns: eslOK on success, with the result in <*out>.
 *          eslFAIL if <frameshift> is out of range.
 *          eslEMEM on any other failure (kept as the single error code
 *          callers have always seen). <*out> is NULL on every failure.
 */
int
esl_trans_s2p(ESL_SQ *in, ESL_SQ **out, int frameshift, int rcFlag)
{
  /* The encoding for this is taken from squid: A=0, C=1, G=2, U/T=3;
   * code[0] corresponds to AAA, code[1] is AAC... code[4] is ACA...
   * and so on up to code[63] being UUU. Regular 20 amino codes and '*'
   * for stop. The table has exactly 64 entries; there is no sentinel.
   */
  static const char code[] = {
    'K','N','K','N','T','T','T','T','R','S','R','S',
    'I','I','M','I','Q','H','Q','H','P','P','P','P',
    'R','R','R','R','L','L','L','L','E','D','E','D',
    'A','A','A','A','G','G','G','G','V','V','V','V',
    '*','Y','*','Y','L','F','L','F','*','C','W','C',
    'L','F','L','F'
  };
  ESL_ALPHABET *abc     = NULL;
  char         *aaseq   = NULL;  /* protein sequence being built            */
  char         *aaptr;           /* write position in aaseq                 */
  char         *readseq;         /* read position in a text sequence        */
  int           read_dg;         /* read position in a digital sequence     */
  int           n1, n2, n3;      /* residue codes of the current codon      */
  int           flipped = 0;     /* nonzero while <in> is reverse complemented */
  int           status;          /* needed by ESL_ALLOC (presumably set on failure) */
  char          errbuf[256];     /* esl_abc_ValidateSeq() demands this      */
  char          namestring[256];

  (*out) = NULL;

  /* Reject bad frames before anything is allocated, so there is nothing
   * to release on this path.
   */
  if (frameshift < 0 || frameshift >= in->n) return eslFAIL;

  if ((abc = esl_alphabet_Create(eslDNA)) == NULL) goto ERROR;

  /* Make sure we have a nucleotide sequence. */
  if (in->seq)
    {
      if (esl_abc_ValidateSeq(abc, in->seq, in->n, errbuf) != eslOK) goto ERROR;
    }
  else if (in->dsq)
    {
      if (in->abc->type != eslRNA && in->abc->type != eslDNA) goto ERROR;
    }
  else goto ERROR;

  /* Apply the reverse complement. */
  if (rcFlag)
    {
      if (esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
      flipped = 1;
    }

  /* ESL_ALLOC is expected to jump to ERROR if the allocation fails. */
  ESL_ALLOC(aaseq, (in->n+1) * sizeof(char));
  aaptr = aaseq;

  if (in->seq)
    {
      /* Text sequence: an ordinary text sequence has no in->abc, so the
       * locally created DNA alphabet provides the symbol lookup.
       * As long as at least 3 nucleotides are left, translate one codon.
       */
      for (readseq = in->seq + frameshift;
           readseq[0] != '\0' && readseq[1] != '\0' && readseq[2] != '\0';
           readseq += 3)
        {
          n1 = abc->inmap[(int) readseq[0]];
          n2 = abc->inmap[(int) readseq[1]];
          n3 = abc->inmap[(int) readseq[2]];

          /* Each position is checked on its own: a code above 3 in the
           * second or third position could otherwise still produce an
           * index <= 63 and pick up an unrelated table entry.
           */
          if (n1 > 3 || n2 > 3 || n3 > 3) break;
          *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
        }
    }
  else
    {
      /* Digital sequence (one of seq/dsq was verified non-NULL above).
       * Start at 1+frameshift because digital index 0 is a sentinel; the
       * value 255 marks the end of the sequence.
       */
      for (read_dg = 1 + frameshift;
           in->dsq[read_dg] != 255 && in->dsq[read_dg+1] != 255 && in->dsq[read_dg+2] != 255;
           read_dg += 3)
        {
          n1 = in->dsq[read_dg];
          n2 = in->dsq[read_dg+1];
          n3 = in->dsq[read_dg+2];

          if (n1 > 3 || n2 > 3 || n3 > 3) break;
          *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
        }
    }
  *aaptr = '\0';

  /* Modify the name to record any reading frame adjustments. snprintf
   * bounds the write to the buffer; an overlong name is truncated.
   */
  snprintf(namestring, sizeof namestring, "%s_s%d%s", in->name, frameshift, rcFlag ? "_rc" : "");

  /* Return the input to its original state before the output object is
   * built: that way nothing that can fail comes after the creation of
   * <*out>, and no half-finished result has to be torn down.
   * NOTE(review): assumes in->desc, in->acc and in->ss are the same after
   * the second reverse complement as after the first - confirm.
   */
  if (flipped)
    {
      flipped = 0;   /* do not retry the restore in the error path */
      if (esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
    }

  *out = esl_sq_CreateFrom(namestring, aaseq, in->desc, in->acc, in->ss);
  if (*out == NULL) goto ERROR;

  free(aaseq);
  esl_alphabet_Destroy(abc);
  return eslOK;

 ERROR:
  /* Best effort: hand <in> back in its original orientation. */
  if (flipped) esl_sq_ReverseComplement(in);
  free(aaseq);
  if (abc) esl_alphabet_Destroy(abc);
  (*out) = NULL;
  return eslEMEM;
}
static void pipeline_thread(void *arg) { int i, j; int status; int workeridx; WORKER_INFO *info; ESL_THREADS *obj; P7_OM_BLOCK *block; void *newBlock; P7_OPROFILE *om = NULL; P7_SCOREDATA *scoredata = NULL; /* hmm-specific data used by nhmmer */ P7_DOMAIN *dcl; int seq_len = 0; int prev_hit_cnt = 0; #ifdef eslAUGMENT_ALPHABET ESL_SQ *sq_revcmp = NULL; #endif /*eslAUGMENT_ALPHABET*/ impl_ThreadInit(); obj = (ESL_THREADS *) arg; esl_threads_Started(obj, &workeridx); info = (WORKER_INFO *) esl_threads_GetData(obj, workeridx); status = esl_workqueue_WorkerUpdate(info->queue, NULL, &newBlock); if (status != eslOK) esl_fatal("Work queue worker failed"); #ifdef eslAUGMENT_ALPHABET //reverse complement if (info->pli->strand != p7_STRAND_TOPONLY && info->qsq->abc->complement != NULL ) { sq_revcmp = esl_sq_CreateDigital(info->qsq->abc); esl_sq_Copy(info->qsq,sq_revcmp); esl_sq_ReverseComplement(sq_revcmp); info->pli->nres += info->qsq->n; } #endif /*eslAUGMENT_ALPHABET*/ /* loop until all blocks have been processed */ block = (P7_OM_BLOCK *) newBlock; while (block->count > 0) { /* Main loop: */ for (i = 0; i < block->count; ++i) { om = block->list[i]; seq_len = 0; p7_pli_NewModel(info->pli, om, info->bg); p7_bg_SetLength(info->bg, info->qsq->n); p7_oprofile_ReconfigLength(om, info->qsq->n); scoredata = p7_hmm_ScoreDataCreate(om, FALSE); #ifdef eslAUGMENT_ALPHABET //reverse complement if (info->pli->strand != p7_STRAND_TOPONLY && info->qsq->abc->complement != NULL ) { p7_Pipeline_LongTarget(info->pli, om, scoredata, info->bg, sq_revcmp, info->th, 0); p7_pipeline_Reuse(info->pli); // prepare for next search seq_len = info->qsq->n; for (j = prev_hit_cnt; j < info->th->N ; j++) { dcl = info->th->unsrt[j].dcl; // modify hit positions to account for the position of the window in the full sequence dcl->ienv = seq_len - dcl->ienv + 1; dcl->jenv = seq_len - dcl->jenv + 1; dcl->iali = seq_len - dcl->iali + 1; dcl->jali = seq_len - dcl->jali + 1; dcl->ad->sqfrom = seq_len - 
dcl->ad->sqfrom + 1; dcl->ad->sqto = seq_len - dcl->ad->sqto + 1; } } #endif if (info->pli->strand != p7_STRAND_BOTTOMONLY) { p7_Pipeline_LongTarget(info->pli, om, scoredata, info->bg, info->qsq, info->th, 0); p7_pipeline_Reuse(info->pli); seq_len += info->qsq->n; } for (j = prev_hit_cnt; j < info->th->N ; j++) { info->th->unsrt[j].lnP += log((float)seq_len / (float)om->max_length); info->th->unsrt[j].dcl[0].lnP = info->th->unsrt[j].lnP; info->th->unsrt[j].sortkey = -1.0 * info->th->unsrt[j].lnP; info->th->unsrt[j].dcl[0].ad->L = om->M; } prev_hit_cnt = info->th->N; p7_hmm_ScoreDataDestroy(scoredata); p7_oprofile_Destroy(om); block->list[i] = NULL; } status = esl_workqueue_WorkerUpdate(info->queue, block, &newBlock); if (status != eslOK) esl_fatal("Work queue worker failed"); block = (P7_OM_BLOCK *) newBlock; } #ifdef eslAUGMENT_ALPHABET esl_sq_Destroy(sq_revcmp); #endif status = esl_workqueue_WorkerUpdate(info->queue, block, NULL); if (status != eslOK) esl_fatal("Work queue worker failed"); esl_threads_Finished(obj, workeridx); return; }
/* serial_loop():
 * Single-threaded search loop. Profiles are read one at a time from <hfp>
 * with p7_oprofile_ReadMSV(); each is run through
 * p7_Pipeline_LongTarget() against the query info->qsq and, when compiled
 * with eslAUGMENT_ALPHABET and the strand setting and alphabet allow it,
 * against a reverse-complemented copy of the query.
 *
 * After each profile, the hits added for it (indices prev_hit_cnt..N-1 of
 * info->th->unsrt) get their lnP adjusted by log(seq_len / max_length),
 * and their first domain's lnP, the sort key, and the alignment display
 * length L are updated accordingly.
 *
 * Returns: the first status from p7_oprofile_ReadMSV() that is not eslOK,
 * i.e. whatever ended the loop.
 */
static int
serial_loop(WORKER_INFO *info, P7_HMMFILE *hfp)
{
  P7_OPROFILE  *om           = NULL;
  P7_SCOREDATA *scoredata    = NULL;  /* hmm-specific data used by nhmmer */
  ESL_ALPHABET *abc          = NULL;
  P7_DOMAIN    *dcl;
  int           status;
  int           h;                    /* index of a hit within info->th->unsrt */
  int           seq_len      = 0;
  int           prev_hit_cnt = 0;

#ifdef eslAUGMENT_ALPHABET
  ESL_SQ       *sq_revcmp    = NULL;

  /* Build the reverse-complemented query once, up front. */
  if (info->pli->strand != p7_STRAND_TOPONLY && info->qsq->abc->complement != NULL)
    {
      sq_revcmp = esl_sq_CreateDigital(info->qsq->abc);
      esl_sq_Copy(info->qsq, sq_revcmp);
      esl_sq_ReverseComplement(sq_revcmp);
      info->pli->nres += info->qsq->n;
    }
#endif /*eslAUGMENT_ALPHABET*/

  for (;;)
    {
      status = p7_oprofile_ReadMSV(hfp, &abc, &om);
      if (status != eslOK) break;

      seq_len = 0;

      p7_pli_NewModel(info->pli, om, info->bg);
      p7_bg_SetLength(info->bg, info->qsq->n);
      p7_oprofile_ReconfigLength(om, info->qsq->n);

      scoredata = p7_hmm_ScoreDataCreate(om, FALSE);

#ifdef eslAUGMENT_ALPHABET
      /* Reverse strand first. */
      if (info->pli->strand != p7_STRAND_TOPONLY && info->qsq->abc->complement != NULL)
        {
          p7_Pipeline_LongTarget(info->pli, om, scoredata, info->bg, sq_revcmp, info->th, 0);
          p7_pipeline_Reuse(info->pli); /* prepare for next search */

          seq_len = info->qsq->n;

          /* The hits just added came from the reverse-complemented copy:
           * mirror each coordinate c to seq_len - c + 1.
           */
          h = prev_hit_cnt;
          while (h < info->th->N)
            {
              dcl = info->th->unsrt[h].dcl;

              dcl->ienv       = seq_len - dcl->ienv       + 1;
              dcl->jenv       = seq_len - dcl->jenv       + 1;
              dcl->iali       = seq_len - dcl->iali       + 1;
              dcl->jali       = seq_len - dcl->jali       + 1;
              dcl->ad->sqfrom = seq_len - dcl->ad->sqfrom + 1;
              dcl->ad->sqto   = seq_len - dcl->ad->sqto   + 1;

              h++;
            }
        }
#endif

      /* Then the query as given. */
      if (info->pli->strand != p7_STRAND_BOTTOMONLY)
        {
          p7_Pipeline_LongTarget(info->pli, om, scoredata, info->bg, info->qsq, info->th, 0);
          p7_pipeline_Reuse(info->pli);
          seq_len += info->qsq->n;
        }

      /* Rescale every hit found for this profile; seq_len is the total
       * number of residues searched for it (one or both strands).
       */
      h = prev_hit_cnt;
      while (h < info->th->N)
        {
          info->th->unsrt[h].lnP         += log((float)seq_len / (float)om->max_length);
          info->th->unsrt[h].dcl[0].lnP   = info->th->unsrt[h].lnP;
          info->th->unsrt[h].sortkey      = -1.0 * info->th->unsrt[h].lnP;
          info->th->unsrt[h].dcl[0].ad->L = om->M;
          h++;
        }
      prev_hit_cnt = info->th->N;

      p7_oprofile_Destroy(om);
      p7_hmm_ScoreDataDestroy(scoredata);
    }

  esl_alphabet_Destroy(abc);
#ifdef eslAUGMENT_ALPHABET
  esl_sq_Destroy(sq_revcmp);
#endif

  return status;
}
/* multifetch():
 * Fetch every sequence whose name or accession is listed in <keyfile>
 * (one key per line, first token on the line; '#' starts a comment) and
 * write it to <ofp>.
 *
 * With an SSI index on <sqfp>, each key is fetched with onefetch() as soon
 * as its line is parsed, so output follows the order of <keyfile>.
 * Without an SSI index, all keys are first collected in a hash; the
 * sequence file is then read once from top to bottom and every sequence
 * whose name or accession is in the hash is written in FASTA format, so
 * output follows the order of the sequence file.
 *
 * A key that occurs twice in <keyfile>, an unreadable key file or line, a
 * parse error in the sequence file, or a mismatch between the number of
 * keys and the number of sequences retrieved is reported through
 * esl_fatal().
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys     = esl_keyhash_Create();
  ESL_FILEPARSER *efp      = NULL;
  int             nfetched = 0;   /* sequences written so far   */
  int             nwanted  = 0;   /* keys read from <keyfile>   */
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  /* Pass 1: collect the keys (and, with SSI, fetch on the fly). */
  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);

      if (esl_keyhash_Store(keys, key, keylen, &keyidx) == eslEDUP)
        esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);

      /* If we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL)
        {
          onefetch(go, ofp, key, sqfp);
          nfetched++;
        }
      nwanted++;
    }

  /* Pass 2: without an SSI index nothing has been fetched yet, so scan
   * the whole sequence file now.
   */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();
      int     wanted;

      for (;;)
        {
          status = esl_sqio_Read(sqfp, sq);
          if (status != eslOK) break;

          /* Name is tried first; accession only if the name did not match. */
          wanted = (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK);
          if (! wanted)
            wanted = (sq->acc[0] != '\0' && esl_keyhash_Lookup(keys, sq->acc, -1, NULL) == eslOK);

          if (wanted)
            {
              if (esl_opt_GetBoolean(go, "-r") && esl_sq_ReverseComplement(sq) != eslOK)
                esl_fatal("Failed to reverse complement %s\n", sq->name);

              esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
              nfetched++;
            }
          esl_sq_Reuse(sq);
        }

      if (status == eslEFORMAT)
        esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)
        esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      esl_sq_Destroy(sq);
    }

  if (nwanted != nfetched)
    esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nwanted, nfetched);

  /* Only report the count when sequences are not going to stdout. */
  if (ofp != stdout)
    printf("\nRetrieved %d sequences.\n", nfetched);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}