static void readGCGdata(struct ReadSeqVars *V) { int binary = FALSE; /* whether data are binary or not */ int blen; /* length of binary sequence */ /* first line contains ">>>>" followed by name */ if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0) { binary = TRUE; SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); blen = atoi(sqd_parse[2]); } else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0) SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); else Die("bogus GCGdata format? %s", V->sbuffer); /* second line contains free text description */ getline2(V); SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC); if (binary) { /* allocate for blen characters +3... (allow for 3 bytes of slop) */ if (blen >= V->maxseq) { V->maxseq = blen; if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) Die("malloc failed"); } /* read (blen+3)/4 bytes from file */ if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4)) Die("fread failed"); V->seqlen = blen; /* convert binary code to seq */ GCGBinaryToSequence(V->seq, blen); } else readLoop(0, endGCGdata, V); while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>')))) getline2(V); }
/**
 * @brief reads sequences from file
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iSeqFmt
 * squid sequence file format code. If SQFILE_UNKNOWN, the format is left
 * to squid, except for stdin and .gz input (which cannot be
 * autodetected), where FASTA is assumed.
 * @param[in] bIsProfile
 * Handed on unchanged to SeqsAreAligned()
 * @param[in] bDealignInputSeqs
 * Handed on unchanged to SeqsAreAligned()
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error. On every error that happens after
 * the file was opened, the sequence currently being read is freed and
 * the file is closed before returning.
 *
 * @note
 * - Depends heavily on squid
 * - Sequence file format will be guessed
 * - If supported by squid, gzipped files can be read as well.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */

    assert(NULL!=seqfile);

    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt &&
        (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */
    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }

    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     */
    while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) {
        /* computed once; also keeps the %d conversions below type-correct */
        int iCurLen = (int)strlen(cur_seq);

        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            goto read_error;
        }
        if (iCurLen>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iCurLen, iMaxSeqLen);
            goto read_error;
        }
        if (iCurLen==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            goto read_error;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */
        prMSeq->seq = (char **)
            CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);

        prMSeq->sqinfo = (SQINFO *)
            CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif

        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type) {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                goto read_error;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

/*#if ALLOW_ONLY_PROTEIN
    if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
        Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
            SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
    }
#endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);

    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
        char *res;

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);

        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA
         */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character; walk the string once instead of
         * calling strlen() on every iteration
         */
        for (res = prMSeq->seq[iSeqIdx]; *res != '\0'; res++) {
            /* <ctype.h> functions need a value representable as unsigned char */
            int iUpper;

            if (isgap(*res))
                continue;
            iUpper = toupper((unsigned char)*res);

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, iUpper)) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired
     */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);

    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;

read_error:
    /* shared exit for errors inside the read loop: release the sequence
     * ReadSeq() just handed us and close the input file
     */
    FreeSequence(cur_seq, &cur_sqinfo);
    SeqfileClose(dbfp);
    return -1;
}
main (int argc, char ** argv ) { char *seqfile; /* name of sequence file */ SQINFO sqinfo; /* extra info about sequence */ SQFILE *dbfp; /* open sequence file */ int fmt,ofmt=106; /* format of seqfile */ /* 106 is PHYLIP format in SQUID */ char *seq; /* sequence */ int type; /* kAmino, kDNA, kRNA, or kOtherSeq */ sequence * seqs, * cds_seqs; sequence tmp_seqs[2], tmp_cds_seqs[2]; char *optname; char *optarg, *t; int optind; int be_quiet; int seqct = 0,cdsct = 0; int min_aln_len = 0; int do_oneline = 0; char * output_filename = 0, *submat_file = 0; int showaln = 1; int showheader=1; FILE *ofd, *fd; alignment *cds_aln; alignment * opt_alignment = NULL; /* place for pairwise alignment */ int len,i,j, k, jk,ik,aln_count, rc; pairwise_distances pwMLdist, pwNGdist; int firsttime = 1; struct timeval tp; pwMLdist.N = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.S = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.SEdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.SEdN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.t = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.kappa= make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); /* pwMLdist.N = pwMLdist.dN = pwMLdist.S = 0; pwMLdist.dS = pwMLdist.dNdS = pwMLdist.SEdS = 0; pwMLdist.SEdN = pwMLdist.t = pwMLdist.kappa= 0; pwNGdist.dN = pwNGdist.dS = pwNGdist.dNdS = 0; */ Alntype = default_aln_type; /* Command line Parse */ fmt = SQFILE_UNKNOWN; /* default: autodetect format */ be_quiet = FALSE; type = kOtherSeq; /* for our purposes this is only pairwise alignments, but * would rather do it correctly in case we move to MSA case */ while (Getopt(argc, 
argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "--matrix") == 0) submat_file = optarg; else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE; else if (strcmp(optname, "--gapopen") == 0) { Gapopen = atoi(optarg); if( Gapopen < 0 ) Gapopen *= -1; } else if (strcmp(optname, "--gapext") == 0) { Gapext = atoi(optarg); if( Gapext < 0 ) Gapext *= -1; } else if (strcmp(optname, "--informat") == 0) { fmt = String2SeqfileFormat(optarg); if (fmt == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if (strcmp(optname, "--outformat") == 0) { ofmt = String2SeqfileFormat(optarg); if (ofmt == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if( strcmp(optname, "--global") == 0 ) { Alntype = global; } else if (strcmp(optname, "-h") == 0) { puts(usage); puts(experts); exit(EXIT_SUCCESS); } else if ( strcmp(optname, "-v") == 0 ) { Verbose = 1; } else if ( strcmp(optname, "--gapchar") == 0 ) { GapChar = optarg[0]; } else if( strcmp(optname, "--output") == 0 ) { output_filename = optarg; } else if( strcmp(optname, "--showtable" ) == 0 ) { showaln = 0; } else if( strcmp(optname, "--noheader" ) == 0 ) { showheader = 0; } } if (argc - optind < 1) Die("%s\n", usage); if( ! submat_file ) { if( (t = getenv("SUBOPTDIR")) != 0 || (t = getenv("SUBOPT_DIR")) != 0 ) { submat_file = calloc(strlen(t) + 24, sizeof(char)); sprintf(submat_file, "%s/%s",t,Default_submat); } else { submat_file = calloc(strlen((void *)Default_submat) + 24, sizeof(char)); sprintf(submat_file, "../%s",Default_submat); } } /* open matrix */ fd = fopen(submat_file, "r"); if( ! ParsePAMFile(fd,&ScoringMatrix, &MatrixScale) ) { fprintf(stderr, "Cannot parse or open matrix file %s\n",submat_file); free(submat_file); exit(EXIT_SUCCESS); } if( output_filename && strlen(output_filename) != 1 && output_filename[0] != '-') { ofd = fopen(output_filename,"w"); if( ! 
ofd ) { fprintf(stderr, "could not open file %s",output_filename); goto end; } } else ofd = stdout; while( optind < argc ) { seqfile = argv[optind++]; /* Try to work around inability to autodetect from a pipe or .gz: * assume FASTA format */ if (fmt == SQFILE_UNKNOWN && (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) fmt = SQFILE_FASTA; if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) { FreeSequence(NULL, &sqinfo); seqct++; } cds_seqs = (sequence *)calloc(seqct, sizeof(sequence)); seqs = (sequence *)calloc(seqct, sizeof(sequence)); SeqfileRewind(dbfp); seqct=0; while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) { sqinfo.type = Seqtype(seq); if( sqinfo.type == kDNA || sqinfo.type == kRNA ) { seqs[seqct].seqstr = Translate(seq,stdcode1); /* Let's remove the last codon if it is a stop codon */ len = strlen(seqs[seqct].seqstr); if( Verbose ) fprintf(stderr,"seqct is %d length is %d\n",seqct, len); if( seqs[seqct].seqstr[len-1] == '*' ) { seqs[seqct].seqstr[len-1] = '\0'; seq[strlen(seq) - 3] = '\0'; } cds_seqs[cdsct].seqstr = seq; seqs[seqct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char)); cds_seqs[cdsct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char)); strcpy(seqs[seqct].seqname,sqinfo.name ); strcpy(cds_seqs[cdsct].seqname,sqinfo.name); cds_seqs[cdsct].length = sqinfo.len; cds_seqs[cdsct].alphabet = ( sqinfo.type == kDNA ) ? 
dna : rna; seqs[seqct].length = strlen(seqs[seqct].seqstr); seqs[seqct].alphabet = protein; cdsct++; seqct++; } else { fprintf(stderr,"Expect CDS sequences (DNA or RNA) not Protein\n"); goto end; } FreeSequence(NULL, &sqinfo); if( Verbose && seqct > 3 ) break; } if( seqct < 2 ) { fprintf(stderr,"Must have provided a valid file with at least 2 sequences in it"); goto end; } for( i=0; i < seqct; i++ ) { for(k=i+1; k < seqct; k++ ) { if( (opt_alignment = (alignment *)calloc(1,sizeof(alignment *))) == NULL) { fprintf(stderr,"Could not allocate memory\n"); goto end; } opt_alignment->msa = NULL; rc = optimal_align(&seqs[i],&seqs[k],opt_alignment); if( rc != 1 ) { fprintf(stderr,"Could not make an optimal alignment\n"); goto end; } else { tmp_cds_seqs[0] = cds_seqs[i]; tmp_cds_seqs[1] = cds_seqs[k]; rc = mrtrans(opt_alignment, tmp_cds_seqs, &cds_aln,0); if( rc != 0 ) { fprintf(stderr, "Could not map the coding sequence to the protein alignemnt for aln %d: %d\n",i,rc); goto end; } if( showaln ) { if( ofmt >= 100 ) { MSAFileWrite(ofd,cds_aln->msa, ofmt,do_oneline); } else { for(j=0; j < cds_aln->msa->nseq; j++ ) { WriteSeq(ofd, ofmt, cds_aln->msa->aseq[j], &(cds_aln->sqinfo[j]) ); } } } else { if( showheader && firsttime ) { fprintf(ofd,"SEQ1\tSEQ2\tSCORE\tdN\tdS\tOMEGA\tN\tS\tkappa\tt\tLENGTH\n"); firsttime = 0; } if( do_kaks_yn00(cds_aln->msa, &pwMLdist,&pwNGdist) < 0 ) { fprintf(stderr, "warning: problem with align for %s %s\n", cds_aln->msa->sqname[0], cds_aln->msa->sqname[1]); continue; } for(ik = 0; ik < NUM_PW_SEQS; ik++ ) { for( jk = ik+1; jk < NUM_PW_SEQS; jk++ ) { fprintf(ofd,"%s\t%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%d\n", cds_aln->sqinfo[ik].name, cds_aln->sqinfo[jk].name, opt_alignment->score, pwMLdist.dN[ik][jk],pwMLdist.dS[ik][jk], pwMLdist.dNdS[ik][jk], pwMLdist.N[ik][jk], pwMLdist.S[ik][jk], pwMLdist.kappa[ik][jk], pwMLdist.t[ik][jk], opt_alignment->msa->alen); } } } } cleanup_alignment(cds_aln); cleanup_alignment(opt_alignment); } } } if( ofd && ofd != 
stdout ) fclose(ofd); end: free(submat_file); Free2DArray((void **)ScoringMatrix,27); for(i =0; i< seqct; i++ ) { free(seqs[i].seqstr); free(seqs[i].seqname); seqs[i].seqstr = seqs[i].seqname = 0; } for(i = 0; i < cdsct; i++) { free(cds_seqs[i].seqstr); free(cds_seqs[i].seqname); cds_seqs[i].seqstr = cds_seqs[i].seqname = 0; } cleanup_matrix((void **)pwMLdist.N,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dN,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.S,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.SEdS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.SEdN,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.t,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dNdS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.kappa,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dN,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dS,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dNdS,NUM_PW_SEQS); free(pwNGdist.dNdS); free(pwNGdist.dN); free(pwNGdist.dS); free(pwMLdist.dNdS); free(pwMLdist.dN); free(pwMLdist.dS); free(pwMLdist.N); free(pwMLdist.S); free(pwMLdist.SEdS); free(pwMLdist.SEdN); free(pwMLdist.t); free(pwMLdist.kappa); return 0; }
/* Function: MSAFileOpen() * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis] * * Purpose: Open an alignment database file and prepare * for reading one alignment, or sequentially * in the (rare) case of multiple MSA databases * (e.g. Stockholm format). * * Args: filename - name of file to open * if "-", read stdin * if it ends in ".gz", read from pipe to gunzip -dc * format - format of file (e.g. MSAFILE_STOCKHOLM) * env - environment variable for path (e.g. BLASTDB) * * Returns: opened MSAFILE * on success. * NULL on failure: * usually, because the file doesn't exist; * for gzip'ed files, may also mean that gzip isn't in the path. */ MSAFILE * MSAFileOpen(char *filename, int format, char *env) { MSAFILE *afp; afp = MallocOrDie(sizeof(MSAFILE)); if (strcmp(filename, "-") == 0) { afp->f = stdin; afp->do_stdin = TRUE; afp->do_gzip = FALSE; afp->fname = sre_strdup("[STDIN]", -1); afp->ssi = NULL; /* can't index stdin because we can't seek*/ } #ifndef SRE_STRICT_ANSI /* popen(), pclose() aren't portable to non-POSIX systems; disable */ else if (Strparse("^.*\\.gz$", filename, 0)) { char cmd[256]; /* Note that popen() will return "successfully" * if file doesn't exist, because gzip works fine * and prints an error! So we have to check for * existence of file ourself. */ if (! FileExists(filename)) Die("%s: file does not exist", filename); if (strlen(filename) + strlen("gzip -dc ") >= 256) Die("filename > 255 char in MSAFileOpen()"); sprintf(cmd, "gzip -dc %s", filename); if ((afp->f = popen(cmd, "r")) == NULL) return NULL; afp->do_stdin = FALSE; afp->do_gzip = TRUE; afp->fname = sre_strdup(filename, -1); /* we can't index a .gz file, because we can't seek in a pipe afaik */ afp->ssi = NULL; } #endif /*SRE_STRICT_ANSI*/ else { char *ssifile; char *dir; /* When we open a file, it may be either in the current * directory, or in the directory indicated by the env * argument - and we have to construct the SSI filename accordingly. 
*/ if ((afp->f = fopen(filename, "r")) != NULL) { ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5)); sprintf(ssifile, "%s.ssi", filename); } else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL) { char *full; full = FileConcat(dir, filename); ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5)); sprintf(ssifile, "%s.ssi", full); free(dir); } else return NULL; afp->do_stdin = FALSE; afp->do_gzip = FALSE; afp->fname = sre_strdup(filename, -1); afp->ssi = NULL; /* Open the SSI index file. If it doesn't exist, or * it's corrupt, or some error happens, afp->ssi stays NULL. */ SSIOpen(ssifile, &(afp->ssi)); free(ssifile); } /* Invoke autodetection if we haven't already been told what * to expect. */ if (format == MSAFILE_UNKNOWN) { if (afp->do_stdin == TRUE || afp->do_gzip) Die("Can't autodetect alignment file format from a stdin or gzip pipe"); format = MSAFileFormat(afp); if (format == MSAFILE_UNKNOWN) Die("Can't determine format of multiple alignment file %s", afp->fname); } afp->format = format; afp->linenumber = 0; afp->buf = NULL; afp->buflen = 0; return afp; }
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments
 *
 * Diagnostics:
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     header_type = kOtherSeq;  /* type named by the optional "!!" line */
  int     have_header = 0;          /* 1 if that "!!" line was present      */
  int     alleged_alen;             /* parsed below, not cross-checked here */
  int     alleged_type;
  int     alleged_checksum;         /* parsed below, not cross-checked here */
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   */
  if      (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
    header_type = kAmino;
    have_header = 1;
  } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
    header_type = kRNA;
    have_header = 1;
  }
  if (have_header && (s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* Allocate only now, so the early returns above have nothing to free. */
  msa = MSAAlloc(10, 0);
  if (have_header) msa->type = header_type;

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present.
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
          Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
        {
          /* The three parenthesised fields are sqd_parse[1..3]. */
          alleged_alen     = atoi(sqd_parse[1]);
          switch (*(sqd_parse[2])) {
          case 'N' : alleged_type = kRNA;      break;
          case 'P' : alleged_type = kAmino;    break;
          case 'X' : alleged_type = kOtherSeq; break;
          default  : alleged_type = kOtherSeq;
          }
          alleged_checksum = atoi(sqd_parse[3]);
          if (msa->type == kOtherSeq) msa->type = alleged_type;
          break;                /* we're done with comment section. */
        }
      if (! IsBlankline(s))
        MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL);

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */

      if      (*s == '\n')   continue;                 /* skip blank lines */
      else if (*s == '!')    MSAAddComment(msa, s);
      else if ((sp  = strstr(s, "Name:")) != NULL)
        {
          /* We take the name and the weight, and that's it */
          sp   += 5;
          tok   = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
          if (tok == NULL)
            Die("No sequence name on line %d in name section of MSF file %s\n",
                afp->linenumber, afp->fname);
          sqidx = GKIStoreKey(msa->index, tok);
          if (sqidx >= msa->nseqalloc) MSAExpand(msa);
          msa->sqname[sqidx] = sre_strdup(tok, slen);
          msa->nseq++;

          if ((sp = strstr(sp, "Weight:")) == NULL)
            Die("No Weight: on line %d for %s in name section of MSF file %s\n",
                afp->linenumber, msa->sqname[sqidx],  afp->fname);
          sp += 7;
          tok = sre_strtok(&sp, " \t", &slen);
          if (tok == NULL)
            Die("No weight value on line %d for %s in name section of MSF file %s\n",
                afp->linenumber, msa->sqname[sqidx],  afp->fname);
          msa->wgt[sqidx] = atof(tok);
          msa->flags |= MSA_SET_WGT;
        }
      else if (strncmp(s, "//", 2) == 0)
        break;
      else
        {
          Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
              afp->linenumber, afp->fname, s);
          squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
          return NULL;
        }
    }

  /* And now we're in the sequence section.
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      sp  = s;
      if ((name = sre_strtok(&sp, " \t", NULL))  == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n",  &slen)) == NULL) continue;

      /* The test for a coord line: digits starting both fields.
       * <ctype.h> functions need a value representable as unsigned char. */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
        continue;

      /* It's not blank, and it's not a coord line: must be sequence
       */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;  /* not a sequence we recognize */

      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
    }

  /* We've left blanks in the aseqs; take them back out.
   */
  for (sqidx = 0; sqidx <  msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
        Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);

      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
        {
          if (*s == ' ' || *s == '\t') {
            msa->sqlen[sqidx]--;
          } else {
            *sp = *s;
            sp++;
          }
        }
      *sp = '\0';
    }

  MSAVerifyParse(msa);          /* verifies, and also sets alen and wgt. */
  return msa;
}
/* Function: SeqfileOpen() * * Purpose : Open a sequence database file and prepare for reading * sequentially. * * Args: filename - name of file to open * format - format of file * env - environment variable for path (e.g. BLASTDB) * * Returns opened SQFILE ptr, or NULL on failure. */ SQFILE * SeqfileOpen(char *filename, int format, char *env) { SQFILE *dbfp; dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE)); dbfp->format = format; dbfp->longline = FALSE; /* Open our file handle. * Three possibilities: * 1. normal file open * 2. filename = "-"; read from stdin * 3. filename = "*.gz"; read thru pipe from gzip * If we're reading from stdin or a pipe, we can't reliably * back up, so we can't do two-pass parsers like the interleaved alignment * formats. */ if (strcmp(filename, "-") == 0) { if (IsInterleavedFormat(format)) Die("Can't read interleaved alignment formats thru stdin, sorry"); dbfp->f = stdin; dbfp->do_stdin = TRUE; dbfp->do_gzip = FALSE; } else if (Strparse("^.*\\.gz$", filename, 0) == 0) { char cmd[256]; if (IsInterleavedFormat(format)) Die("Can't read interleaved alignment formats thru gunzip, sorry"); if (strlen(filename) + strlen("gzip -dc ") >= 256) { squid_errno = SQERR_PARAMETER; return NULL; } sprintf(cmd, "gzip -dc %s", filename); if ((dbfp->f = popen(cmd, "r")) == NULL) { squid_errno = SQERR_NOFILE; return NULL; } /* file (or gzip!) doesn't exist */ dbfp->do_stdin = FALSE; dbfp->do_gzip = TRUE; } else { if ((dbfp->f = fopen(filename, "r")) == NULL && (dbfp->f = EnvFileOpen(filename, env)) == NULL) { squid_errno = SQERR_NOFILE; return NULL; } dbfp->do_stdin = FALSE; dbfp->do_gzip = FALSE; } /* The hack for sequential access of an interleaved alignment file: * read the alignment in, we'll copy sequences out one at a time. */ dbfp->ali_aseqs = NULL; if (IsInterleavedFormat(format)) { if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo))) return NULL; dbfp->ali_curridx = 0; return dbfp; } /* Load the first line. 
*/ getline2(dbfp); return dbfp; }