/**
 * @brief Reads all sequences from a file into a multiple sequence struct.
 *
 * Sequences are appended to prMSeq one by one. The sequence type is
 * determined from the first sequence read, the alignment status is set via
 * SeqsAreAligned(), an untouched copy of every sequence is kept in
 * prMSeq->orig_seq and the working copy in prMSeq->seq is normalised to the
 * alphabet of the final sequence type.
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to SEQTYPE_UNKNOWN for autodetect
 * (guessed from first sequence). Any other value overrides the guess.
 * @param[in] iSeqFmt
 * Sequence file format as handed to SeqfileOpen(). If SQFILE_UNKNOWN,
 * FASTA is assumed for stdin ('-') and for file names ending in '.gz'.
 * @param[in] bIsProfile
 * Passed through to SeqsAreAligned().
 * @param[in] bDealignInputSeqs
 * Passed through to SeqsAreAligned().
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error. On error, sequences appended before the
 * failure remain in prMSeq.
 *
 * @note
 * - Depends heavily on squid
 * - Sequence file format will be guessed
 * - If supported by squid, gzipped files can be read as well.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */
    const char *pcAlphabet = NULL; /* residues allowed for the final seqtype */
    char cAnyResidue = '\0'; /* replacement for residues not in pcAlphabet */

    assert(NULL != prMSeq);
    assert(NULL != seqfile);

    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt
        && (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     */
    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }

    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     */
    while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) {
        /* computed once; also keeps the %d format arguments below of type int */
        int iSeqLen = (int)strlen(cur_seq);

        if (prMSeq->nseqs+1 > iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            goto read_error;
        }
        if (iSeqLen > iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iSeqLen, iMaxSeqLen);
            goto read_error;
        }
        if (0 == iSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues", cur_sqinfo.name);
            goto read_error;
        }

        /* always guess type from first seq. use squid function and
         * convert value. Done before the sequence is stored in prMSeq so
         * that the error path below does not leave an uncounted copy behind.
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(cur_seq);
            switch (type) {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                goto read_error;
            }
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo */
        prMSeq->seq = (char **)
            CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);

        prMSeq->sqinfo = (SQINFO *)
            CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s",
            prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);

    /* The alphabet depends only on the (now final) sequence type, so it is
     * selected once instead of once per residue. For any other type no
     * residue replacement takes place.
     */
    if (SEQTYPE_PROTEIN == prMSeq->seqtype) {
        pcAlphabet = AMINO_ALPHABET;
        cAnyResidue = AMINOACID_ANY;
    } else if (SEQTYPE_DNA == prMSeq->seqtype) {
        pcAlphabet = DNA_ALPHABET;
        cAnyResidue = NUCLEOTIDE_ANY;
    } else if (SEQTYPE_RNA == prMSeq->seqtype) {
        pcAlphabet = RNA_ALPHABET;
        cAnyResidue = NUCLEOTIDE_ANY;
    }

    /* keep original sequence as copy and convert "working" sequence */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
        char *pcRes;

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);

        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA */
        if (SEQTYPE_DNA == prMSeq->seqtype) {
            ToDNA(prMSeq->seq[iSeqIdx]);
        }
        if (SEQTYPE_RNA == prMSeq->seqtype) {
            ToRNA(prMSeq->seq[iSeqIdx]);
        }

        if (NULL == pcAlphabet) {
            continue;
        }

        /* then check of each character. Walking up to the terminator avoids
         * calling strlen() for every position.
         */
        for (pcRes = prMSeq->seq[iSeqIdx]; '\0' != *pcRes; pcRes++) {
            if (isgap(*pcRes)) {
                continue;
            }
            /* toupper() is only defined for unsigned char values and EOF */
            if (NULL == strchr(pcAlphabet, toupper((unsigned char)*pcRes))) {
                *pcRes = cAnyResidue;
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired
     */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;

read_error:
    /* common exit for failures inside the read loop: release the sequence
     * that was just read and close the input file
     */
    FreeSequence(cur_seq, &cur_sqinfo);
    SeqfileClose(dbfp);
    return -1;
}
/** * * @brief the 'real' main function * */ int MyMain(int argc, char **argv) { mseq_t *prMSeq = NULL; mseq_t *prMSeqProfile1 = NULL; mseq_t *prMSeqProfile2 = NULL; cmdline_opts_t cmdline_opts; /* Must happen first: setup logger */ LogDefaultSetup(&rLog); /*Log(&rLog, LOG_WARN, "This is a non-public realase of %s. Please do not distribute.", PACKAGE_NAME);*/ /*Log(&rLog, LOG_WARN, "This is a beta version of %s, for protein only.", PACKAGE_NAME);*/ /* FS, r237 -> 238 */ SetDefaultUserOpts(&(cmdline_opts)); ParseCommandLine(&cmdline_opts, argc, argv); if (NULL != cmdline_opts.pcLogFile) { prLogFile = fopen(cmdline_opts.pcLogFile, "w"); LogSetFP(&rLog, LOG_INFO, prLogFile); LogSetFP(&rLog, LOG_VERBOSE, prLogFile); LogSetFP(&rLog, LOG_DEBUG, prLogFile); } InitClustalOmega(cmdline_opts.iThreads); if (rLog.iLogLevelEnabled < LOG_INFO) { PrintUserOpts(LogGetFP(&rLog, LOG_INFO), & cmdline_opts); PrintAlnOpts(LogGetFP(&rLog, LOG_INFO), & (cmdline_opts.aln_opts)); } /* Read sequence file * */ if (NULL != cmdline_opts.pcSeqInfile) { NewMSeq(&prMSeq); if (ReadSequences(prMSeq, cmdline_opts.pcSeqInfile, cmdline_opts.iSeqType, cmdline_opts.iSeqInFormat, cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen)) { Log(&rLog, LOG_FATAL, "Reading sequence file '%s' failed", cmdline_opts.pcSeqInfile); } #if TRACE { int iAux; for (iAux=0; iAux<prMSeq->nseqs; iAux++) { Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", iAux, prMSeq->seq[iAux]); LogSqInfo(&prMSeq->sqinfo[iAux]); } } #endif } /* k-tuple pairwise distance calculation seg-faults if * only one sequence, simply exit early. 
* note that for profile/profile alignment prMSeq is NULL * FS, r222->r223 */ if (prMSeq && (prMSeq->nseqs <= 1)){ Log(&rLog, LOG_FATAL, "File '%s' contains %d sequence%s, nothing to align", cmdline_opts.pcSeqInfile, prMSeq->nseqs, 1==prMSeq->nseqs?"":"s"); } /* Dealign if requested and neccessary */ if (NULL != prMSeq) { if (TRUE == prMSeq->aligned && cmdline_opts.bDealignInputSeqs) { Log(&rLog, LOG_INFO, "Dealigning already aligned input sequences as requested."); DealignMSeq(prMSeq); } } /* Read profile1 * */ if (NULL != cmdline_opts.pcProfile1Infile) { NewMSeq(&prMSeqProfile1); if (ReadSequences(prMSeqProfile1, cmdline_opts.pcProfile1Infile, cmdline_opts.iSeqType, cmdline_opts.iSeqInFormat, cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen)) { Log(&rLog, LOG_FATAL, "Reading sequences from profile file '%s' failed", cmdline_opts.pcProfile1Infile); } /* FIXME: commented out. FS, r240 -> r241 * for explanation see below */ /*if (1==prMSeqProfile1->nseqs) { Log(&rLog, LOG_FATAL, "'%s' contains only one sequence and can therefore not be used as a profile", cmdline_opts.pcProfile1Infile); }*/ if (FALSE == prMSeqProfile1->aligned) { Log(&rLog, LOG_FATAL, "Sequences in '%s' are not aligned, i.e. this is not a profile", cmdline_opts.pcProfile1Infile); } } /* Read profile2 * */ if (NULL != cmdline_opts.pcProfile2Infile) { NewMSeq(&prMSeqProfile2); if (ReadSequences(prMSeqProfile2, cmdline_opts.pcProfile2Infile, cmdline_opts.iSeqType, cmdline_opts.iSeqInFormat, cmdline_opts.iMaxNumSeq, cmdline_opts.iMaxSeqLen)) { Log(&rLog, LOG_FATAL, "Reading sequences from profile file '%s' failed", cmdline_opts.pcProfile2Infile); } /* FIXME: there is no (clean) way to align a single sequence to a profile. * if we go down the -i route, it causes a seg-fault in the pair-wise * k-tuple distance calculation. However, single sequences can be * understood as 1-profiles. Therefore we have to allow for 1-profiles. 
* FS, r240 -> r241 */ /*if (1==prMSeqProfile2->nseqs) { Log(&rLog, LOG_FATAL, "'%s' contains only one sequence and can therefore not be used as a profile", cmdline_opts.pcProfile2Infile); }*/ if (FALSE == prMSeqProfile1->aligned) { Log(&rLog, LOG_FATAL, "Sequences in '%s' are not aligned, i.e. this is not a profile", cmdline_opts.pcProfile2Infile); } } /* Depending on the input we got perform * * (i) normal alignment: seq + optional profile * or * (ii) profile profile alignment * */ if (NULL != prMSeq) { if (Align(prMSeq, prMSeqProfile1, & cmdline_opts.aln_opts)) { Log(&rLog, LOG_FATAL, "An error occured during the alignment"); } if (WriteAlignment(prMSeq, cmdline_opts.pcAlnOutfile, cmdline_opts.iAlnOutFormat)) { Log(&rLog, LOG_FATAL, "Could not save alignment to %s", cmdline_opts.pcAlnOutfile); } #if 0 { bool bSampling = FALSE; /* better set to TRUE for many sequences */ bool bReportAll = TRUE; AliStat(prMSeq, bSampling, bReportAll); } #endif } else if (NULL != prMSeqProfile1 && NULL != prMSeqProfile2) { if (AlignProfiles(prMSeqProfile1, prMSeqProfile2, cmdline_opts.aln_opts.rHhalignPara)) { Log(&rLog, LOG_FATAL, "An error occured during the alignment"); } if (WriteAlignment(prMSeqProfile1, cmdline_opts.pcAlnOutfile, cmdline_opts.iAlnOutFormat)) { Log(&rLog, LOG_FATAL, "Could not save alignment to %s", cmdline_opts.pcAlnOutfile); } } /* cleanup */ if (NULL != prMSeq) { FreeMSeq(&prMSeq); } if (NULL != prMSeqProfile1) { FreeMSeq(&prMSeqProfile1); } if (NULL != prMSeqProfile2) { FreeMSeq(&prMSeqProfile2); } FreeUserOpts(&cmdline_opts); Log(&rLog, LOG_DEBUG, "Successful program exit"); if (NULL != cmdline_opts.pcLogFile) { fclose(prLogFile); } return EXIT_SUCCESS; }