/* Function: WriteMSF()
 * Date:     SRE, Mon May 31 11:25:18 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in MSF format to an open file.
 *
 * Args:     fp  - file that's open for writing.
 *           msa - alignment to write.
 *
 *           The sequence type in msa->type selects the header that is
 *           written. If msa->type is kOtherSeq on entry, it is replaced
 *           by the result of GuessAlignmentSeqtype(), so <msa> may be
 *           modified by this call. If the type is still kOtherSeq after
 *           guessing, or is not one of kRNA, kDNA, kAmino, Die() is
 *           called.
 *
 *           The names and aligned sequences in <msa> are not altered;
 *           all GCG-specific rewriting happens on private copies.
 *
 * Returns:  (void)
 */
void
WriteMSF(FILE *fp, MSA *msa)
{
  time_t      now;              /* current time as a time_t                    */
  struct tm  *now_tm;           /* broken-down local time; NULL on failure     */
  char        date[64];         /* today's date in GCG's format "October 3, 1996 15:57" */
  char      **gcg_aseq;         /* aligned sequences with gaps converted to GCG format  */
  char      **gcg_sqname;       /* sequence names with GCG-valid character sets         */
  int         idx;              /* counter for sequences                       */
  char       *s;                /* pointer into sqname or seq                  */
  int         len;              /* tmp variable for name/line lengths          */
  int         namelen;          /* maximum name length used                    */
  int         pos;              /* position counter                            */
  char        buffer[51];       /* buffer for writing seq                      */
  int         i;                /* another position counter                    */

  /*****************************************************************
   * Make copies of sequence names and sequences.
   *   GCG recommends that name characters should only contain
   *   alphanumeric characters, -, or _
   *   Some GCG and GCG-compatible software is sensitive to this.
   *   We silently convert all other characters to '_'.
   *
   *   For sequences, GCG allows only ~ and . for gaps.
   *   Otherwise, everything is interpreted as a residue.
   *   ~ means an external gap. . means an internal gap.
   *****************************************************************/

  /* make copies that we can edit */
  gcg_aseq   = MallocOrDie(sizeof(char *) * msa->nseq);
  gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      gcg_aseq[idx]   = sre_strdup(msa->aseq[idx],   msa->alen);
      gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
    }

  /* alter names as needed.
   * The cast to unsigned char matters: passing a negative char value
   * to isalnum() is undefined behaviour.
   */
  for (idx = 0; idx < msa->nseq; idx++)
    for (s = gcg_sqname[idx]; *s != '\0'; s++)
      if (! isalnum((unsigned char) *s) && *s != '-' && *s != '_')
        *s = '_';

  /* alter gap chars in seq */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      /* leading run of gaps: external */
      for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
        *s = '~';
      /* every gap after the first residue: provisionally internal */
      for (; *s != '\0'; s++)
        if (isgap(*s)) *s = '.';
      /* trailing run of gaps: external. Index 0 needs no check here;
       * if it is a gap, the leading pass above already marked it.
       */
      for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
        gcg_aseq[idx][pos] = '~';
    }

  /* calculate max namelen used (the sanitized copies have the same
   * lengths as the originals; only characters were substituted)
   */
  namelen = 0;
  for (idx = 0; idx < msa->nseq; idx++)
    if ((len = (int) strlen(gcg_sqname[idx])) > namelen)
      namelen = len;

  /*****************************************************
   * Write the MSF header
   *****************************************************/

  /* required file type line */
  if (msa->type == kOtherSeq)
    msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);

  if      (msa->type == kRNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kDNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kOtherSeq)
    Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
  else
    Die("Invalid sequence type %d in WriteMSF()\n", msa->type);

  /* free text comments */
  if (msa->ncomment > 0)
    {
      for (idx = 0; idx < msa->ncomment; idx++)
        fprintf(fp, "%s\n", msa->comment[idx]);
      fprintf(fp, "\n");
    }

  /* required checksum line */
  now    = time(NULL);
  now_tm = localtime(&now);     /* may be NULL; strftime() must not see that */
  if (now_tm == NULL ||
      strftime(date, sizeof(date), "%B %d, %Y %H:%M", now_tm) == 0)
    Die("What time is it on earth? strftime() failed in WriteMSF().\n");

  /* Type is 'P' only for protein. Both RNA and DNA are nucleic acid
   * and get 'N', in agreement with the !!NA_ line written above.
   */
  fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
          msa->name != NULL ? msa->name : "squid.msf",
          msa->alen,
          msa->type == kAmino ? 'P' : 'N',
          date,
          GCGMultchecksum(gcg_aseq, msa->nseq));
  fprintf(fp, "\n");

  /*****************************************************
   * Names/weights section
   *****************************************************/

  for (idx = 0; idx < msa->nseq; idx++)
    {
      fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
              namelen, namelen,
              gcg_sqname[idx],
              msa->alen,
              GCGchecksum(gcg_aseq[idx], msa->alen),
              msa->wgt[idx]);
    }
  fprintf(fp, "\n");
  fprintf(fp, "//\n");

  /*****************************************************
   * Write the sequences, 50 columns per block
   *****************************************************/

  for (pos = 0; pos < msa->alen; pos += 50)
    {
      fprintf(fp, "\n");        /* Blank line between sequence blocks */

      /* Coordinate line. len is the number of columns in this block;
       * only the last block can be shorter than 50.
       */
      len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
      if (len > 10)
        /* start coordinate on the left, end coordinate on the right;
         * the padding accounts for one extra space per group of 10.
         */
        fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
                pos+1,
                len + ((len-1)/10) - 12, "",
                pos + len);
      else
        fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);

      for (idx = 0; idx < msa->nseq; idx++)
        {
          fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);

          /* get next line's worth of 50 from seq; strncpy() does not
           * terminate when it copies a full 50, so do it by hand.
           */
          strncpy(buffer, gcg_aseq[idx] + pos, 50);
          buffer[50] = '\0';

          /* draw the sequence line, a space before each group of 10 */
          for (i = 0; i < len; i++)
            {
              if (! (i % 10)) fputc(' ', fp);
              fputc(buffer[i], fp);
            }
          fputc('\n', fp);
        }
    }

  Free2DArray((void **) gcg_aseq,   msa->nseq);
  Free2DArray((void **) gcg_sqname, msa->nseq);
  return;
}
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read the multiple alignment stored in <seqfile>, align it
 *           to <hmm>, and append the results to the caller's existing
 *           arrays of raw sequences, digitized sequences, sequence
 *           info records and traces.
 *
 *           The master trace comes from the HMM's alignment map when
 *           <do_mapped> is TRUE (after the alignment's checksum has
 *           been compared against hmm->checksum), and from
 *           P7ViterbiAlignAlignment() otherwise.
 *
 * Args:     seqfile   - name of alignment file
 *           hmm       - model to align to
 *           do_mapped - TRUE if we're to use the HMM's alignment map
 *           rsq       - RETURN: array of rseqs to add to
 *           dsq       - RETURN: array of dsq to add to
 *           sqinfo    - RETURN: array of SQINFO to add to
 *           tr        - RETURN: array of traces to add to
 *           nseq      - RETURN: number of seqs
 *
 * Returns:  (void). *rsq, *dsq, *sqinfo and *tr are replaced by grown
 *           arrays, and *nseq is increased by the number of sequences
 *           in the alignment. Any failure to open, identify or read
 *           the file, or a checksum mismatch, is reported via ajFatal().
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
                  char ***rsq, char ***dsq, SQINFO **sqinfo,
                  struct p7trace_s ***tr, int *nseq)
{
  int                fmt;       /* detected format of the alignment file */
  char             **ali;       /* aligned sequences read from file      */
  char             **dig_new;   /* digitized versions of the new seqs    */
  char             **raw_new;   /* dealigned versions of the new seqs    */
  AINFO              ainfo;     /* info that goes with ali               */
  int                k;         /* counter over the new sequences        */
  struct p7trace_s  *mtr;       /* master trace                          */
  struct p7trace_s **newtr;     /* individual traces for the new seqs    */

  /* Work out the file format. Both failure messages are issued in
   * sequence for SQERR_NOFILE, as in a fallthrough switch.
   */
  if (! SeqfileFormat(seqfile, &fmt, NULL))
    {
      if (squid_errno == SQERR_NOFILE)
        ajFatal("Alignment file %s could not be opened for reading", seqfile);
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

  /* Read the alignment and upper-case every row */
  if (! ReadAlignment(seqfile, fmt, &ali, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (k = 0; k < ainfo.nseq; k++)
    s2upper(ali[k]);

  /* A mapped alignment must be the one the HMM's map was built from */
  if (do_mapped)
    {
      if (GCGMultchecksum(ali, ainfo.nseq) != hmm->checksum)
        ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.",
                seqfile);
    }

  /* Master trace: from the map, or by aligning the alignment */
  mtr = do_mapped ? MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen)
                  : P7ViterbiAlignAlignment(ali, &ainfo, hmm);

  /* Split into per-sequence traces and append them to the caller's */
  ImposeMasterTrace(ali, ainfo.nseq, mtr, &newtr);
  *tr = MergeTraceArrays(*tr, *nseq, newtr, ainfo.nseq);

  /* Append the raw (dealigned) sequences; only the holder array is
   * freed, the strings themselves now belong to *rsq.
   */
  *rsq = ReallocOrDie((*rsq), sizeof(char *) * (*nseq + ainfo.nseq));
  DealignAseqs(ali, ainfo.nseq, &raw_new);
  for (k = 0; k < ainfo.nseq; k++)
    (*rsq)[*nseq + k] = raw_new[k];
  free(raw_new);

  /* Append the digitized sequences, same ownership handover */
  *dsq = ReallocOrDie((*dsq), sizeof(char *) * (*nseq + ainfo.nseq));
  DigitizeAlignment(ali, &ainfo, &dig_new);
  for (k = 0; k < ainfo.nseq; k++)
    (*dsq)[*nseq + k] = dig_new[k];
  free(dig_new);

  /* Append copies of the per-sequence info records */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (*nseq + ainfo.nseq));
  for (k = 0; k < ainfo.nseq; k++)
    SeqinfoCopy(&((*sqinfo)[*nseq + k]), &(ainfo.sqinfo[k]));

  *nseq += ainfo.nseq;

  /* Cleanup */
  P7FreeTrace(mtr);
  FreeAlignment(ali, &ainfo);
  return;
}