/** * @brief FIXME add doc * */ void PrintUserOpts(FILE *prFile, cmdline_opts_t *opts) { /* keep in same order as in struct. FIXME could this be derived from argtable? */ fprintf(prFile, "seq-type = %s\n", SeqTypeToStr(opts->iSeqType)); fprintf(prFile, "seq-in-fmt = %s\n", SeqfileFormat2String(opts->iSeqInFormat)); fprintf(prFile, "option: seq-in = %s\n", NULL != opts->pcSeqInfile? opts->pcSeqInfile: "(null)"); fprintf(prFile, "option: dealign = %d\n", opts->bDealignInputSeqs); fprintf(prFile, "option: profile1 = %s\n", NULL != opts->pcProfile1Infile? opts->pcProfile1Infile: "(null)"); fprintf(prFile, "option: profile2 = %s\n", NULL != opts->pcProfile2Infile? opts->pcProfile2Infile: "(null)"); fprintf(prFile, "option: max-num-seq = %d\n", opts->iMaxNumSeq); fprintf(prFile, "option: max-seq-len = %d\n", opts->iMaxSeqLen); fprintf(prFile, "option: aln-out-file = %s\n", NULL != opts->pcAlnOutfile? opts->pcAlnOutfile: "(null)"); fprintf(prFile, "option: aln-out-format = %s\n", SeqfileFormat2String(opts->iAlnOutFormat)); fprintf(prFile, "option: force-file-overwrite = %d\n", opts->bForceFileOverwrite); fprintf(prFile, "option: threads = %d\n", opts->iThreads); fprintf(prFile, "option: logFile = %s\n", opts->pcLogFile); }
/** * @brief reads sequences from file * * @param[out] prMSeq * Multiple sequence struct. Must be preallocated. * FIXME: would make more sense to allocate it here. * @param[in] seqfile * Sequence file name. If '-' sequence will be read from stdin. * @param[in] iSeqType * int-encoded sequence type. Set to * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence) * @param[in] iMaxNumSeq * Return an error, if more than iMaxNumSeq have been read * @param[in] iMaxSeqLen * Return an error, if a seq longer than iMaxSeqLen has been read * * @return 0 on success, -1 on error * * @note * - Depends heavily on squid * - Sequence file format will be guessed * - If supported by squid, gzipped files can be read as well. */ int ReadSequences(mseq_t *prMSeq, char *seqfile, int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs, int iMaxNumSeq, int iMaxSeqLen) { SQFILE *dbfp; /* sequence file descriptor */ char *cur_seq; SQINFO cur_sqinfo; int iSeqIdx; /* sequence counter */ int iSeqPos; /* sequence string position counter */ assert(NULL!=seqfile); /* Try to work around inability to autodetect from a pipe or .gz: * assume FASTA format */ if (SQFILE_UNKNOWN == iSeqFmt && (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) { iSeqFmt = SQFILE_FASTA; } /* Using squid routines to read input. taken from seqstat_main.c. we don't * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen * etc. NOTE this also means we discard some information, e.g. when * reading from and writing to a stockholm file, all extra MSA * info/annotation will be lost. * */ if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) { Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile); return -1; } /* FIXME squid's ReadSeq() will exit with fatal error if format is * unknown. This will be a problem for a GUI. Same is true for many squid * other functions. * * The original squid:ReadSeq() dealigns sequences on input. We * use a patched version. * */ while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) { if (prMSeq->nseqs+1>iMaxNumSeq) { Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'", iMaxNumSeq, cur_sqinfo.name, seqfile); return -1; } if ((int)strlen(cur_seq)>iMaxSeqLen) { Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)", cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen); return -1; } if ((int)strlen(cur_seq)==0) { Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues", cur_sqinfo.name); return -1; } /* FIXME: use modified version of AddSeq() that allows handing down SqInfo */ prMSeq->seq = (char **) CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *)); prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq); prMSeq->sqinfo = (SQINFO *) CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO)); SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo); #ifdef TRACE Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]); LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]); #endif /* always guess type from first seq. use squid function and * convert value */ if (0 == prMSeq->nseqs) { int type = Seqtype(prMSeq->seq[prMSeq->nseqs]); switch (type) { case kDNA: prMSeq->seqtype = SEQTYPE_DNA; break; case kRNA: prMSeq->seqtype = SEQTYPE_RNA; break; case kAmino: prMSeq->seqtype = SEQTYPE_PROTEIN; break; case kOtherSeq: prMSeq->seqtype = SEQTYPE_UNKNOWN; break; default: Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__); } /* override with given sequence type but check with * automatically detected type and warn if necessary */ if (SEQTYPE_UNKNOWN != iSeqType) { if (prMSeq->seqtype != iSeqType) { Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested", SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType)); prMSeq->seqtype = iSeqType; } } /* if type could not be determined and was not set return error */ if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) { Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence"); FreeSequence(cur_seq, &cur_sqinfo); SeqfileClose(dbfp); return -1; } } Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s", prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len, prMSeq->seq[prMSeq->nseqs]); /* FIXME IPUAC and/or case conversion? If yes see * corresponding squid functions. Special treatment of * Stockholm tilde-gaps for ktuple code? */ prMSeq->nseqs++; FreeSequence(cur_seq, &cur_sqinfo); } SeqfileClose(dbfp); /*#if ALLOW_ONLY_PROTEIN if (SEQTYPE_PROTEIN != prMSeq->seqtype) { Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.", SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME); } #endif*/ /* Check if sequences are aligned */ prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs); /* keep original sequence as copy and convert "working" sequence * */ prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *)); for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) { prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]); /* convert unknown characters according to set seqtype * be conservative, i.e. don't allow any fancy ambiguity * characters to make sure that ktuple code etc. works. */ /* first on the fly conversion between DNA and RNA */ if (prMSeq->seqtype==SEQTYPE_DNA) ToDNA(prMSeq->seq[iSeqIdx]); if (prMSeq->seqtype==SEQTYPE_RNA) ToRNA(prMSeq->seq[iSeqIdx]); /* then check of each character */ for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) { char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]); if (isgap(*res)) continue; if (prMSeq->seqtype==SEQTYPE_PROTEIN) { if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) { *res = AMINOACID_ANY; } } else if (prMSeq->seqtype==SEQTYPE_DNA) { if (NULL == strchr(DNA_ALPHABET, toupper(*res))) { *res = NUCLEOTIDE_ANY; } } else if (prMSeq->seqtype==SEQTYPE_RNA) { if (NULL == strchr(RNA_ALPHABET, toupper(*res))) { *res = NUCLEOTIDE_ANY; } } } } /* order in which sequences appear in guide-tree * only allocate if different output-order desired */ prMSeq->tree_order = NULL; prMSeq->filename = CkStrdup(seqfile); Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s", prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename); return 0; }