void P7ReadNullModel(char *rndfile, float *null, float *ret_p1) { FILE *fp; char *s; int x; int type = 0; if ((fp = fopen(rndfile, "r")) == NULL) Die("Failed to open null model file %s\n", rndfile); if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; s2upper(s); if (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC; else if (strcmp(s, "AMINO") == 0) type = hmmAMINO; else goto FAILURE; /* check/set alphabet type */ if (Alphabet_type == 0) SetAlphabet(type); else if (Alphabet_type != type) Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile); /* parse the file */ for (x = 0; x < Alphabet_size; x++) { if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; null[x] = atof(s); } if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; *ret_p1 = atof(s); fclose(fp); return; FAILURE: fclose(fp); Die("%s is not in HMMER null model file format", rndfile); }
/* Function: P7ReadPrior() * * Purpose: Input a prior from disk file. */ struct p7prior_s * P7ReadPrior(char *prifile) { FILE *fp; struct p7prior_s *pri; char *sptr; int q, x; if ((fp = fopen(prifile, "r")) == NULL) Die("Failed to open HMMER prior file %s\n", prifile); pri = P7AllocPrior(); /* First entry is the strategy: * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far */ sptr = Getword(fp, sqdARG_STRING); s2upper(sptr); if (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET; else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile); /* Second entry is the alphabet type: * Amino or Nucleic */ sptr = Getword(fp, sqdARG_STRING); s2upper(sptr); if (strcmp(sptr, "AMINO") == 0) { if (Alphabet_type != hmmAMINO) Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile); } else if (strcmp(sptr, "NUCLEIC") == 0) { if (Alphabet_type != hmmNUCLEIC) Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile); } else Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile); /* State transition priors: * # of mixtures. * then for each mixture: * prior P(q) * Dirichlet terms for Tmm, Tmi, Tmd, Tim, Tii, Tid, Tdm, Tdi, Tdd */ pri->tnum = atoi(Getword(fp, sqdARG_INT)); if (pri->tnum < 0) Die("%d is bad; need at least one state transition mixture component", pri->tnum); if (pri->tnum > MAXDCHLET) Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", MAXDCHLET); for (q = 0; q < pri->tnum; q++) { pri->tq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); for (x = 0; x < 7; x++) pri->t[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); } /* Match emission priors: * # of mixtures. 
* then for each mixture: * prior P(q) * Dirichlet terms for Alphabet_size symbols in Alphabet */ pri->mnum = atoi(Getword(fp, sqdARG_INT)); if (pri->mnum < 0) Die("%d is bad; need at least one match emission mixture component", pri->mnum); if (pri->mnum > MAXDCHLET) Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET); for (q = 0; q < pri->mnum; q++) { pri->mq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); for (x = 0; x < Alphabet_size; x++) pri->m[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); } /* Insert emission priors: * # of mixtures. * then for each mixture component: * prior P(q) * Dirichlet terms for Alphabet_size symbols in Alphabet */ pri->inum = atoi(Getword(fp, sqdARG_INT)); if (pri->inum < 0) Die("%d is bad; need at least one insert emission mixture component", pri->inum); if (pri->inum > MAXDCHLET) Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum, MAXDCHLET); for (q = 0; q < pri->inum; q++) { pri->iq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); for (x = 0; x < Alphabet_size; x++) pri->i[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); } fclose(fp); return pri; }
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read the multiple alignment stored in seqfile, derive one
 *           trace per aligned sequence against the HMM, and append
 *           the new sequences and traces to the caller's arrays.
 *
 *           The master trace comes from the alignment map stored in
 *           the HMM (hmm->map) when do_mapped is TRUE -- in which case
 *           the alignment checksum must equal hmm->checksum -- and
 *           from P7ViterbiAlignAlignment() otherwise.
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE to use the HMM's alignment map
 *           rsq      - RETURN: array of raw sequences, extended
 *           dsq      - RETURN: array of digitized sequences, extended
 *           sqinfo   - RETURN: array of SQINFO, extended
 *           tr       - RETURN: array of traces, extended
 *           nseq     - RETURN: number of seqs in the arrays
 *
 * Returns:  (void)
 *           *rsq, *dsq and *sqinfo are replaced by the pointers that
 *           ReallocOrDie() returns, *tr by the array that
 *           MergeTraceArrays() returns, and *nseq grows by the number
 *           of sequences in the alignment. ajFatal() is called when
 *           the file cannot be opened, identified or read, or when
 *           the checksums disagree.
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
                  char ***rsq, char ***dsq, SQINFO **sqinfo,
                  struct p7trace_s ***tr, int *nseq)
{
  int                format;    /* format of alignment file              */
  char             **aseq;      /* aligned seqs                          */
  char             **add_dsq;   /* digitized versions of aseq            */
  char             **add_rsq;   /* dealigned versions of aseq            */
  AINFO              ainfo;     /* info that goes with aseq              */
  int                i;         /* counter over the new sequences        */
  int                first;     /* slot where the first new entry goes   */
  struct p7trace_s  *master;    /* master trace                          */
  struct p7trace_s **addtr;     /* individual traces for aseq            */

  /* Work out what kind of file this is. A missing file reports both
   * messages in turn, exactly like the switch fallthrough it replaces.
   */
  if (! SeqfileFormat(seqfile, &format, NULL))
    {
      if (squid_errno == SQERR_NOFILE)
        ajFatal("Alignment file %s could not be opened for reading", seqfile);
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

  /* Load the alignment and upper-case every row. */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (i = 0; i < ainfo.nseq; i++)
    s2upper(aseq[i]);

  /* Build the master trace; the mapped route checks the checksum first. */
  if (do_mapped)
    {
      if (GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
        ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.", seqfile);
      master = MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen);
    }
  else
    {
      master = P7ViterbiAlignAlignment(aseq, &ainfo, hmm);
    }

  /* Split the master trace into per-sequence traces and merge them in. */
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);

  /* New entries are appended after the *nseq entries already present. */
  first = *nseq;

  /* Raw sequences: the strings move into *rsq, so only the temporary
   * pointer array is freed.
   */
  *rsq = ReallocOrDie((*rsq), sizeof(char *) * (first + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &add_rsq);
  for (i = 0; i < ainfo.nseq; i++)
    (*rsq)[first + i] = add_rsq[i];
  free(add_rsq);

  /* Digitized sequences: same hand-over as above. */
  *dsq = ReallocOrDie((*dsq), sizeof(char *) * (first + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &add_dsq);
  for (i = 0; i < ainfo.nseq; i++)
    (*dsq)[first + i] = add_dsq[i];
  free(add_dsq);

  /* Per-sequence info is copied entry by entry. */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (first + ainfo.nseq));
  for (i = 0; i < ainfo.nseq; i++)
    SeqinfoCopy(&((*sqinfo)[first + i]), &(ainfo.sqinfo[i]));

  *nseq = first + ainfo.nseq;

  /* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
  return;
}
/* Function: WriteSeq()
 *
 * Purpose:  Write a single sequence to outf in the format selected by
 *           outform. A format-specific header is printed first; the
 *           residues follow in lines of at most `width' residues,
 *           optionally numbered, indented and broken into
 *           space-separated groups, and the last line is followed by
 *           a format-specific terminator.
 *
 * Args:     outf    - open stream to write to
 *           outform - output format code (kGenBank, kPIR, kEMBL, ...)
 *           seq     - sequence to write; passed to s2upper() for
 *                     kZuker (presumably upper-cased in place --
 *                     confirm against s2upper())
 *           sqinfo  - information about the sequence; optional fields
 *                     are only read when their SQINFO_* flag is set
 *
 * Returns:  the number of sequence lines written. kSelex and kRaw
 *           write one line and return 1; kClustal and kMSF are
 *           refused with a warning and also return 1.
 */
int
WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
{
  int  numline = 0;             /* TRUE to prefix lines with coords      */
  int  lines = 0, spacer = 0, width = 50, tab = 0;
  int  i, j, l, l1, ibase;
  char endstr[10];              /* terminator printed after last line    */
  char s[100];                  /* buffer for sequence                   */
  char ss[100];                 /* buffer for structure                  */
  int  checksum = 0;
  int  seqlen;
  int  which_case;              /* 0 = do nothing. 1 = upper case. 2 = lower case */
  int  dostruc;                 /* TRUE to print structure lines         */

  which_case = 0;
  dostruc    = FALSE;
  /* sqinfo->len is only meaningful when SQINFO_LEN is set; everything
   * below uses seqlen instead of reading sqinfo->len directly.
   */
  seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : (int) strlen(seq);

  /* intercept Selex-format requests - SRE */
  if (outform == kSelex)
    {
      fprintf(outf, "%10s %s\n", sqinfo->name, seq);
      return 1;
    }

  if (outform == kClustal || outform == kMSF)
    {
      Warn("Tried to write Clustal or MSF with WriteSeq() -- bad, bad.");
      return 1;
    }

  strcpy(endstr, "");
  l1 = 0;

  /* 10Nov91: write this out in all possible formats: */
  checksum = GCGchecksum(seq, seqlen);

  switch (outform)
    {
    case kUnknown:              /* no header, just sequence */
      strcpy(endstr, "\n");     /* end w/ extra blank line */
      break;

    case kGenBank:
      fprintf(outf, "LOCUS %s %d bp\n",
              (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name,
              seqlen);
      fprintf(outf, "DEFINITION %s\n",
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      fprintf(outf, "ACCESSION %s\n",
              (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
      fprintf(outf, "ORIGIN \n");
      spacer  = 11;
      numline = 1;
      strcpy(endstr, "\n//");
      break;

    case kGCGdata:
      fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen);
      fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      break;

    case kPIR:
      fprintf(outf, "ENTRY %s\n",
              (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
      fprintf(outf, "TITLE %s\n",
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      fprintf(outf, "ACCESSION %s\n",
              (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
      /* Report seqlen, not sqinfo->len: the latter is not valid when
       * SQINFO_LEN is unset, and every other format prints seqlen.
       */
      fprintf(outf, "SUMMARY #Length %d #Checksum %d\n", seqlen, checksum);
      fprintf(outf, "SEQUENCE\n");
      fprintf(outf, " 5 10 15 20 25 30\n");
      spacer  = 2;              /* spaces after every residue */
      numline = 1;              /* number lines w/ coords     */
      width   = 30;             /* 30 aa per line             */
      strcpy(endstr, "\n///");
      break;

    case kSquid:
      fprintf(outf, "NAM %s\n", sqinfo->name);
      if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
        fprintf(outf, "SRC %s %s %d..%d::%d\n",
                (sqinfo->flags & SQINFO_ID)    ? sqinfo->id    : "-",
                (sqinfo->flags & SQINFO_ACC)   ? sqinfo->acc   : "-",
                (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0,
                (sqinfo->flags & SQINFO_STOP)  ? sqinfo->stop  : 0,
                (sqinfo->flags & SQINFO_OLEN)  ? sqinfo->olen  : 0);
      if (sqinfo->flags & SQINFO_DESC)
        fprintf(outf, "DES %s\n", sqinfo->desc);
      if (sqinfo->flags & SQINFO_SS)
        {
          fprintf(outf, "SEQ +SS\n");
          dostruc = TRUE;       /* print structure lines too */
        }
      else
        fprintf(outf, "SEQ\n");
      numline = 1;              /* number seq lines w/ coords */
      strcpy(endstr, "\n++");
      break;

    case kEMBL:
      fprintf(outf, "ID %s\n",
              (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
      fprintf(outf, "AC %s\n",
              (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
      fprintf(outf, "DE %s\n",
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
      fprintf(outf, "SQ %d BP\n", seqlen);
      strcpy(endstr, "\n//");   /* 11Oct90: bug fix */
      tab    = 5;               /** added 31jan91 */
      spacer = 11;              /** added 31jan91 */
      break;

    case kGCG:
      fprintf(outf, "%s\n", sqinfo->name);
      if (sqinfo->flags & SQINFO_ACC)
        fprintf(outf, "ACCESSION %s\n", sqinfo->acc);
      if (sqinfo->flags & SQINFO_DESC)
        fprintf(outf, "DEFINITION %s\n", sqinfo->desc);
      fprintf(outf, " %s Length: %d (today) Check: %d ..\n",
              sqinfo->name, seqlen, checksum);
      spacer  = 11;
      numline = 1;
      strcpy(endstr, "\n");     /* this is insurance to help prevent misreads at eof */
      break;

    case kStrider:              /* ?? map ? */
      fprintf(outf, "; ### from DNA Strider ;-)\n");
      fprintf(outf, "; DNA sequence %s, %d bases, %d checksum.\n;\n",
              sqinfo->name, seqlen, checksum);
      strcpy(endstr, "\n//");
      break;

      /* SRE: Don had Zuker default to Pearson, which is not
         intuitive or helpful, since Zuker's MFOLD can't read
         Pearson format. More useful to use kIG */
    case kZuker:
      which_case = 1;           /* MFOLD requires upper case. */
      /*FALLTHRU*/
    case kIG:
      fprintf(outf, ";%s %s\n", sqinfo->name,
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
      fprintf(outf, "%s\n", sqinfo->name);
      strcpy(endstr, "1");      /* == linear dna */
      break;

    case kRaw:                  /* Raw: just print the whole sequence. */
      fprintf(outf, "%s\n", seq);
      return 1;

    default:
    case kPearson:
      fprintf(outf, ">%s %s\n", sqinfo->name,
              (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
      break;
    }

  if (which_case == 1) s2upper(seq);
  if (which_case == 2) s2lower(seq);

  /* s[] and ss[] hold one output line: up to `width' residues plus
   * the group-separating spaces plus the terminating NUL. The widths
   * set above (30 or 50) stay well inside the 100-byte buffers.
   */
  width = MIN(width, 100);

  /* i     : index of the next residue in seq
   * l     : write position in s[] / ss[] (spaces included)
   * l1    : residues placed on the current line (spaces excluded)
   * ibase : 1-based coordinate of the first residue on the line
   */
  i     = 0;
  l     = 0;
  ibase = 1;
  while (i < seqlen)
    {
      if (l1 == 0)
        {                       /* start of an output line */
          if (numline) fprintf(outf, "%8d ", ibase);
          for (j = 0; j < tab; j++)
            fputc(' ', outf);
        }

      /* A space opens every group of (spacer - 1) residues. */
      if ((spacer != 0) && ((l + 1) % spacer == 1))
        {
          s[l]  = ' ';
          ss[l] = ' ';
          l++;
        }

      s[l]  = seq[i];
      ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
      l++;
      i++;
      l1++;                     /* don't count spaces for width */

      if (l1 == width || i == seqlen)
        {                       /* line is full, or sequence is done */
          s[l] = ss[l] = '\0';
          l  = 0;
          l1 = 0;
          if (dostruc)
            {
              /* Sequence line first, then the structure line under it;
               * the terminator goes after the structure line.
               */
              fprintf(outf, "%s\n", s);
              if (numline) fprintf(outf, " ");
              for (j = 0; j < tab; j++)
                fputc(' ', outf);
              if (i == seqlen) fprintf(outf, "%s%s\n", ss, endstr);
              else             fprintf(outf, "%s\n", ss);
            }
          else
            {
              if (i == seqlen) fprintf(outf, "%s%s\n", s, endstr);
              else             fprintf(outf, "%s\n", s);
            }
          lines++;
          ibase = i + 1;
        }
    }
  return lines;
}