Example #1
0
static void
readGCGdata(struct ReadSeqVars *V)
{
  int    is_2bit = FALSE;	/* TRUE when the header announces 2BIT packed data */
  int    nres    = 0;		/* residue count taken from the "Len:" field       */
  size_t npacked;		/* number of packed bytes to pull from the file    */

  /* The header line starts with ">>>>" followed by the sequence name.
   * A Strparse() result of 0 selects the corresponding branch; the
   * captured fields are then available in sqd_parse[1..].
   */
  if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0)
    {
      is_2bit = TRUE;
      SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
      nres = atoi(sqd_parse[2]);
    }
  else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0)
    {
      SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
    }
  else
    {
      Die("bogus GCGdata format? %s", V->sbuffer);
    }

  /* The next line is a free-text description. */
  getline2(V);
  SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);

  if (! is_2bit)
    {
      /* ASCII record: hand over to the generic line-oriented reader. */
      readLoop(0, endGCGdata, V);
    }
  else
    {
      /* Grow the sequence buffer when needed; maxseq+4 bytes are
       * requested so there is slack beyond the nres residues.
       */
      if (nres >= V->maxseq)
        {
          V->maxseq = nres;
          V->seq    = (char *) realloc(V->seq, sizeof(char) * (V->maxseq + 4));
          if (V->seq == NULL)
            Die("malloc failed");
        }

      /* (nres+3)/4 bytes are read for nres residues, then expanded
       * in place by GCGBinaryToSequence().
       */
      npacked = (size_t) ((nres + 3) / 4);
      if (fread(V->seq, sizeof(char), npacked, V->f) < npacked)
        Die("fread failed");
      V->seqlen = nres;
      GCGBinaryToSequence(V->seq, nres);
    }

  /* Advance until end of file or until the buffered line begins with '>'. */
  while (! feof(V->f) && *V->sbuffer != '>')
    getline2(V);
}
Example #2
0
/**
 * @brief reads sequences from file
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iSeqFmt
 * int-encoded squid sequence file format. If SQFILE_UNKNOWN and the
 * input is '-' or ends in '.gz', FASTA is assumed; otherwise the value
 * is handed to SeqfileOpen() unchanged.
 * @param[in] bIsProfile
 * Passed through to SeqsAreAligned() when determining the aligned flag.
 * @param[in] bDealignInputSeqs
 * Passed through to SeqsAreAligned() when determining the aligned flag.
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 *  - On every error return taken after the file was opened, the
 *    current squid record is freed and the file is closed.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */
    int iSeqPos; /* sequence string position counter */

    assert(NULL!=seqfile);


    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt  &&
            (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */

    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }


    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     */
    while (ReadSeq(dbfp, dbfp->format,
                   &cur_seq,
                   &cur_sqinfo)) {
        /* length of the record just read; computed once and reused */
        int iCurSeqLen = (int)strlen(cur_seq);

        /* Each validation failure below releases the current record and
         * the file handle before returning, matching the cleanup done
         * for the "couldn't guess sequence type" failure further down.
         */
        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            FreeSequence(cur_seq, &cur_sqinfo);
            SeqfileClose(dbfp);
            return -1;
        }
        if (iCurSeqLen>iMaxSeqLen) {
            /* %d needs an int argument; strlen() returns size_t */
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, iCurSeqLen, iMaxSeqLen);
            FreeSequence(cur_seq, &cur_sqinfo);
            SeqfileClose(dbfp);
            return -1;
        }
        if (iCurSeqLen==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            FreeSequence(cur_seq, &cur_sqinfo);
            SeqfileClose(dbfp);
            return -1;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */

        prMSeq->seq =  (char **)
                       CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);


        prMSeq->sqinfo =  (SQINFO *)
                          CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif
        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type)  {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                FreeSequence(cur_seq, &cur_sqinfo);
                SeqfileClose(dbfp);
                return -1;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /*#if ALLOW_ONLY_PROTEIN
        if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
            Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
                  SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
        }
    #endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);


    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
        int iLen;

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);


        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA
         */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character. The loop below only overwrites
         * single characters, so the length is measured once up front
         * instead of on every iteration.
         */
        iLen = (int)strlen(prMSeq->seq[iSeqIdx]);
        for (iSeqPos=0; iSeqPos<iLen; iSeqPos++) {
            char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
            int iUpper;

            if (isgap(*res))
                continue;

            /* ctype functions require a value representable as
             * unsigned char (or EOF) */
            iUpper = toupper((unsigned char)*res);

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, iUpper)) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, iUpper)) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;
}
Example #3
0
main (int argc, char ** argv ) 
{
  char     *seqfile;            /* name of sequence file     */
  SQINFO    sqinfo;             /* extra info about sequence */
  SQFILE   *dbfp;		/* open sequence file        */
  int       fmt,ofmt=106;	/* format of seqfile         */
                                /* 106 is PHYLIP format in SQUID */
  char     *seq;		/* sequence                  */
  int       type;		/* kAmino, kDNA, kRNA, or kOtherSeq */
  sequence  * seqs, * cds_seqs;
  sequence  tmp_seqs[2], tmp_cds_seqs[2];
  char  *optname;
  char  *optarg, *t;
  int    optind;
  int    be_quiet;
  int    seqct = 0,cdsct = 0;
  int    min_aln_len      = 0;
  int    do_oneline       = 0;
  char   * output_filename = 0, *submat_file = 0;
  int    showaln = 1;
  int    showheader=1;
  FILE  *ofd, *fd;
  alignment   *cds_aln;
  alignment * opt_alignment = NULL;  /* place for pairwise alignment */

  int    len,i,j, k, jk,ik,aln_count, rc;
  pairwise_distances pwMLdist, pwNGdist;
  int firsttime = 1;
  
  struct timeval tp;

  pwMLdist.N    = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.dN   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.S    = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.dS   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.SEdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.SEdN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.t    = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.kappa= make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);

  pwNGdist.dN   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwNGdist.dS   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwNGdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
/*
  pwMLdist.N    = pwMLdist.dN   = pwMLdist.S    = 0;
  pwMLdist.dS   = pwMLdist.dNdS = pwMLdist.SEdS = 0;
  pwMLdist.SEdN = pwMLdist.t    = pwMLdist.kappa= 0;
      
  pwNGdist.dN   = pwNGdist.dS   = pwNGdist.dNdS = 0;
*/

  Alntype = default_aln_type;
  
  /* Command line Parse */
  fmt       = SQFILE_UNKNOWN;	/* default: autodetect format  */
  be_quiet  = FALSE;
  type      =  kOtherSeq;

  /* for our purposes this is only pairwise alignments, but
   * would rather do it correctly in case we move to MSA case 
   */
  
  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, 
		&optind, &optname, &optarg))
    {
      if      (strcmp(optname, "--matrix") == 0)  submat_file = optarg; 
      else if (strcmp(optname, "--quiet")   == 0)  be_quiet  = TRUE; 
      else if (strcmp(optname, "--gapopen") == 0)  {
	Gapopen = atoi(optarg); 
	if( Gapopen < 0 ) Gapopen *= -1;
	
      } else if (strcmp(optname, "--gapext")  == 0)  {
	Gapext = atoi(optarg); 
	if( Gapext < 0 ) Gapext *= -1;

      } else if (strcmp(optname, "--informat") == 0) {
	fmt = String2SeqfileFormat(optarg);
	if (fmt == SQFILE_UNKNOWN) 
	  Die("unrecognized sequence file format \"%s\"", optarg);
      } else if (strcmp(optname, "--outformat") == 0) {
	ofmt = String2SeqfileFormat(optarg);
	if (ofmt == SQFILE_UNKNOWN) 
	  Die("unrecognized sequence file format \"%s\"", optarg);
      }  else if( strcmp(optname, "--global") == 0 ) {
	Alntype = global;
      } else if (strcmp(optname, "-h") == 0) {
	puts(usage);
	puts(experts);
        exit(EXIT_SUCCESS);
      } else if ( strcmp(optname, "-v") == 0 ) {
	Verbose = 1;
      } else if ( strcmp(optname, "--gapchar") == 0 ) {
	GapChar = optarg[0];
      }  else if(  strcmp(optname, "--output") == 0 ) {
	output_filename = optarg;	  
      } else if( strcmp(optname, "--showtable" ) == 0  ) {
	showaln = 0;
      } else if( strcmp(optname, "--noheader" ) == 0 ) {
	showheader = 0;
      }      
    }

  if (argc - optind < 1) Die("%s\n", usage);

  if( ! submat_file ) { 
    if( (t = getenv("SUBOPTDIR")) != 0 || 
	(t = getenv("SUBOPT_DIR")) != 0 ) {
      submat_file = calloc(strlen(t) + 24, sizeof(char));
      sprintf(submat_file, "%s/%s",t,Default_submat);
    } else { 
      submat_file = calloc(strlen((void *)Default_submat) + 24, sizeof(char));
      sprintf(submat_file, "../%s",Default_submat);
    }
  }
  /* open matrix */
  fd = fopen(submat_file, "r");
  
  if( ! ParsePAMFile(fd,&ScoringMatrix, &MatrixScale) ) {
    fprintf(stderr, "Cannot parse or open matrix file %s\n",submat_file);
    free(submat_file);
    exit(EXIT_SUCCESS);
  }
  

  if( output_filename && strlen(output_filename) != 1 &&
      output_filename[0] != '-') {      
    ofd = fopen(output_filename,"w");
    if( ! ofd ) {
      fprintf(stderr, "could not open file %s",output_filename);
      goto end;
    }
  } else 
    ofd = stdout;

  while( optind < argc ) {
    seqfile = argv[optind++];
    
    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (fmt == SQFILE_UNKNOWN &&
	(Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0))
      fmt = SQFILE_FASTA;
    
    if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
      Die("Failed to open sequence file %s for reading", seqfile);
    
    while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
    { 
      FreeSequence(NULL, &sqinfo);
      seqct++;
    }
    

    cds_seqs = (sequence *)calloc(seqct, sizeof(sequence));
    seqs     = (sequence *)calloc(seqct, sizeof(sequence));
    SeqfileRewind(dbfp);
    seqct=0;

    while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
    {
      sqinfo.type = Seqtype(seq);
      if( sqinfo.type == kDNA || sqinfo.type == kRNA ) {

	seqs[seqct].seqstr = Translate(seq,stdcode1);
	/* Let's remove the last codon if it is a stop codon */	
	len = strlen(seqs[seqct].seqstr);
	if( Verbose ) 
	  fprintf(stderr,"seqct is %d length is %d\n",seqct,
		  len);

	if( seqs[seqct].seqstr[len-1] == '*' ) {
	  seqs[seqct].seqstr[len-1] = '\0';
	  seq[strlen(seq) - 3] = '\0';
	}
	cds_seqs[cdsct].seqstr = seq;
	seqs[seqct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char));
	cds_seqs[cdsct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char));
	strcpy(seqs[seqct].seqname,sqinfo.name );
	strcpy(cds_seqs[cdsct].seqname,sqinfo.name);	
	cds_seqs[cdsct].length = sqinfo.len;
	cds_seqs[cdsct].alphabet = ( sqinfo.type == kDNA ) ? dna : rna;
	seqs[seqct].length = strlen(seqs[seqct].seqstr);
	
	seqs[seqct].alphabet = protein;
	cdsct++; seqct++;
      } else {
	fprintf(stderr,"Expect CDS sequences (DNA or RNA) not Protein\n");
	goto end;
      }    
      FreeSequence(NULL, &sqinfo);
      if( Verbose && seqct > 3 ) 
	break;
    }
    
    if( seqct < 2 ) {
      fprintf(stderr,"Must have provided a valid file with at least 2 sequences in it");
      goto end;
    }
    
    for( i=0; i  < seqct; i++ ) {
      for(k=i+1; k < seqct; k++ ) {	
	if( (opt_alignment = (alignment *)calloc(1,sizeof(alignment *))) == NULL) {
	  fprintf(stderr,"Could not allocate memory\n");
	  goto end;
	}

	opt_alignment->msa = NULL;
	rc = optimal_align(&seqs[i],&seqs[k],opt_alignment);
  
	if( rc != 1 ) {
	  fprintf(stderr,"Could not make an optimal alignment\n");
	  goto end;
	} else {
	  tmp_cds_seqs[0] = cds_seqs[i];
	  tmp_cds_seqs[1] = cds_seqs[k];
	  rc = mrtrans(opt_alignment, tmp_cds_seqs, &cds_aln,0);
	  if( rc != 0  ) { 
	    fprintf(stderr, "Could not map the coding sequence to the protein alignemnt for aln %d: %d\n",i,rc);
	    goto end;
	  }
	  if( showaln ) {
	    if( ofmt >= 100 ) {
	      MSAFileWrite(ofd,cds_aln->msa, ofmt,do_oneline);
	    } else { 
	      for(j=0; j < cds_aln->msa->nseq; j++ ) {	
		WriteSeq(ofd, ofmt, 
			 cds_aln->msa->aseq[j],
			 &(cds_aln->sqinfo[j]) );
	      }
	    }	    
	  } else {
	    if( showheader && firsttime ) {
	      fprintf(ofd,"SEQ1\tSEQ2\tSCORE\tdN\tdS\tOMEGA\tN\tS\tkappa\tt\tLENGTH\n");
	      firsttime = 0;
	    }
	    if( do_kaks_yn00(cds_aln->msa, &pwMLdist,&pwNGdist) < 0 ) {
	      fprintf(stderr, "warning: problem with align for %s %s\n",
		      cds_aln->msa->sqname[0], cds_aln->msa->sqname[1]);
	      continue;
	    }

	    for(ik = 0; ik < NUM_PW_SEQS; ik++ ) {	  
	      for( jk = ik+1; jk < NUM_PW_SEQS; jk++ ) {
		fprintf(ofd,"%s\t%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%d\n",
			cds_aln->sqinfo[ik].name,
			cds_aln->sqinfo[jk].name,
			opt_alignment->score,
			pwMLdist.dN[ik][jk],pwMLdist.dS[ik][jk], 
			pwMLdist.dNdS[ik][jk],
			pwMLdist.N[ik][jk],
			pwMLdist.S[ik][jk],
			pwMLdist.kappa[ik][jk],
			pwMLdist.t[ik][jk],
			opt_alignment->msa->alen);
	      }
	    }  
	  }
	}
	cleanup_alignment(cds_aln);
	cleanup_alignment(opt_alignment); 
      }
    }
  }
  if( ofd && ofd != stdout )
    fclose(ofd);

  end:
  free(submat_file);
  Free2DArray((void **)ScoringMatrix,27);
  for(i =0; i< seqct; i++ ) {
    free(seqs[i].seqstr);
    free(seqs[i].seqname);    
    seqs[i].seqstr = seqs[i].seqname = 0;
  }
  for(i = 0; i < cdsct; i++) {
    free(cds_seqs[i].seqstr);
    free(cds_seqs[i].seqname);    
    cds_seqs[i].seqstr = cds_seqs[i].seqname = 0;
  }
  
  cleanup_matrix((void **)pwMLdist.N,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.dN,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.S,NUM_PW_SEQS);
  
  cleanup_matrix((void **)pwMLdist.dS,NUM_PW_SEQS);

  cleanup_matrix((void **)pwMLdist.SEdS,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.SEdN,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.t,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.dNdS,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.kappa,NUM_PW_SEQS);

  cleanup_matrix((void **)pwNGdist.dN,NUM_PW_SEQS);
  cleanup_matrix((void **)pwNGdist.dS,NUM_PW_SEQS);
  cleanup_matrix((void **)pwNGdist.dNdS,NUM_PW_SEQS);


  free(pwNGdist.dNdS);
  free(pwNGdist.dN);
  free(pwNGdist.dS);

  free(pwMLdist.dNdS);
  free(pwMLdist.dN);
  free(pwMLdist.dS);
  free(pwMLdist.N);
  free(pwMLdist.S);
  free(pwMLdist.SEdS);
  free(pwMLdist.SEdN);
  free(pwMLdist.t);
  free(pwMLdist.kappa);
  
  return 0;
}
Example #4
0
/* Function: MSAFileOpen()
 * Date:     SRE, Tue May 18 13:22:01 1999 [St. Louis]
 *
 * Purpose:  Open an alignment database file and prepare
 *           for reading one alignment, or sequentially
 *           in the (rare) case of multiple MSA databases
 *           (e.g. Stockholm format).
 *           
 * Args:     filename - name of file to open
 *                      if "-", read stdin
 *                      if it ends in ".gz", read from pipe to gunzip -dc
 *           format   - format of file (e.g. MSAFILE_STOCKHOLM);
 *                      MSAFILE_UNKNOWN requests autodetection, which is
 *                      refused (via Die()) for stdin and gzip pipes
 *           env      - environment variable for path (e.g. BLASTDB)
 *
 * Returns:  opened MSAFILE * on success.
 *           NULL on failure: 
 *             usually, because the file doesn't exist;
 *             for gzip'ed files, may also mean that gzip isn't in the path.
 *           The partially built MSAFILE is freed before NULL is returned.
 *
 * Diagnostics:
 *           Calls Die() if a ".gz" file does not exist, if the gzip
 *           command line would not fit its buffer, or if the format
 *           cannot be determined.
 */
MSAFILE *
MSAFileOpen(char *filename, int format, char *env)
{
  MSAFILE *afp;
  
  afp        = MallocOrDie(sizeof(MSAFILE));
  if (strcmp(filename, "-") == 0)
    {
      afp->f         = stdin;
      afp->do_stdin  = TRUE; 
      afp->do_gzip   = FALSE;
      afp->fname     = sre_strdup("[STDIN]", -1);
      afp->ssi       = NULL;	/* can't index stdin because we can't seek*/
    }
#ifndef SRE_STRICT_ANSI		
  /* popen(), pclose() aren't portable to non-POSIX systems; disable */
  else if (Strparse("^.*\\.gz$", filename, 0))
    {
      char cmd[256];

      /* Note that popen() will return "successfully"
       * if file doesn't exist, because gzip works fine
       * and prints an error! So we have to check for
       * existence of file ourself.
       */
      if (! FileExists(filename))
	Die("%s: file does not exist", filename);
      if (strlen(filename) + strlen("gzip -dc ") >= 256)
	Die("filename > 255 char in MSAFileOpen()"); 
      /* NOTE(review): filename is pasted unquoted into a shell command;
       * names containing spaces or shell metacharacters will be
       * misinterpreted by the shell. Left as is to keep behavior.
       */
      sprintf(cmd, "gzip -dc %s", filename);
      if ((afp->f = popen(cmd, "r")) == NULL)
	{
	  free(afp);		/* nothing else has been attached yet */
	  return NULL;
	}

      afp->do_stdin = FALSE;
      afp->do_gzip  = TRUE;
      afp->fname    = sre_strdup(filename, -1);
      /* we can't index a .gz file, because we can't seek in a pipe afaik */
      afp->ssi      = NULL;	
    }
#endif /*SRE_STRICT_ANSI*/
  else
    {
      char *ssifile;
      char *dir;

      /* When we open a file, it may be either in the current
       * directory, or in the directory indicated by the env
       * argument - and we have to construct the SSI filename accordingly.
       */
      if ((afp->f = fopen(filename, "r")) != NULL)
	{
	  ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5));
	  sprintf(ssifile, "%s.ssi", filename);
	}
      else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL)
	{
	  char *full;
	  full = FileConcat(dir, filename);
	  /* room for full + ".ssi" + NUL */
	  ssifile = MallocOrDie(sizeof(char) * (strlen(full) + 5));
	  sprintf(ssifile, "%s.ssi", full);
	  /* NOTE(review): assumes FileConcat() returns heap memory owned
	   * by the caller - confirm. */
	  free(full);
	  free(dir);
	}
      else
	{
	  free(afp);		/* nothing else has been attached yet */
	  return NULL;
	}

      afp->do_stdin = FALSE;
      afp->do_gzip  = FALSE;
      afp->fname    = sre_strdup(filename, -1);
      afp->ssi      = NULL;

      /* Open the SSI index file. If it doesn't exist, or
       * it's corrupt, or some error happens, afp->ssi stays NULL.
       */
      SSIOpen(ssifile, &(afp->ssi));
      free(ssifile);
    }

  /* Invoke autodetection if we haven't already been told what
   * to expect.
   */
  if (format == MSAFILE_UNKNOWN)
    {
      if (afp->do_stdin == TRUE || afp->do_gzip)
	Die("Can't autodetect alignment file format from a stdin or gzip pipe");
      format = MSAFileFormat(afp);
      if (format == MSAFILE_UNKNOWN)
	Die("Can't determine format of multiple alignment file %s", afp->fname);
    }

  afp->format     = format;
  afp->linenumber = 0;
  afp->buf        = NULL;
  afp->buflen     = 0;

  return afp;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *           
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments; any MSA allocated so far is
 *           freed before NULL is returned.
 *
 * Diagnostics: 
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     alleged_alen;		/* parsed from the MSF: line; not checked here */
  int     alleged_type;
  int     alleged_checksum;	/* parsed from the MSF: line; not checked here */
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   */
  msa = MSAAlloc(10, 0);
  if      (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kAmino;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kRNA;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  }

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present. 
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
	  Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
	{
	  /* The three parenthesized fields are read from sqd_parse[1],
	   * sqd_parse[2] and sqd_parse[3], in pattern order:
	   * length, type letter, checksum.
	   */
	  alleged_alen     = atoi(sqd_parse[1]);
	  switch (*(sqd_parse[2])) {
	  case 'N' : alleged_type = kRNA;      break;
	  case 'P' : alleged_type = kAmino;    break;  
	  case 'X' : alleged_type = kOtherSeq; break;
	  default  : alleged_type = kOtherSeq; 
	  }
	  alleged_checksum = atoi(sqd_parse[3]);
	  (void) alleged_alen;	   /* kept for reference; not verified here */
	  (void) alleged_checksum;
	  /* an explicit "!!" header line, if present, takes precedence */
	  if (msa->type == kOtherSeq) msa->type = alleged_type;
	  break;		/* we're done with comment section. */
	}
      if (! IsBlankline(s)) 
	MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL); 

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */

      if      (*s == '\n')   continue;                 /* skip blank lines */
      else if (*s == '!')    MSAAddComment(msa, s);
      else if ((sp  = strstr(s, "Name:")) != NULL) 
	{
				/* We take the name and the weight, and that's it */
	  sp   += 5;
	  tok   = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
	  if (tok == NULL)
	    Die("No sequence name after Name: on line %d in name section of MSF file %s\n",
		afp->linenumber, afp->fname);
	  sqidx = GKIStoreKey(msa->index, tok);
	  if (sqidx >= msa->nseqalloc) MSAExpand(msa);
	  msa->sqname[sqidx] = sre_strdup(tok, slen);
	  msa->nseq++;

	  if ((sp = strstr(sp, "Weight:")) == NULL)
	    Die("No Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx],  afp->fname);
	  sp += 7;
	  tok = sre_strtok(&sp, " \t", &slen);
	  if (tok == NULL)
	    Die("No value after Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx],  afp->fname);
	  msa->wgt[sqidx] = atof(tok);
	  msa->flags |= MSA_SET_WGT;
	}
      else if (strncmp(s, "//", 2) == 0)
	break;
      else
	{
	  Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
	      afp->linenumber, afp->fname, s);
	  squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
	  return NULL;
	}

    }

  /* And now we're in the sequence section. 
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      sp  = s;
      if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n",  &slen)) == NULL) continue;
      
      /* The test for a coord line: digits starting both fields.
       * ctype functions need a value in unsigned char range.
       */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
	continue;
  
      /* It's not blank, and it's not a coord line: must be sequence
       */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;	/* not a sequence we recognize */
      
      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); 
    }
  
  /* We've left blanks in the aseqs; take them back out.
   */
  for (sqidx = 0; sqidx <  msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
	Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
      
      /* compact in place: s scans, sp marks the next write position */
      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
	{
	  if (*s == ' ' || *s == '\t') {
	    msa->sqlen[sqidx]--;
	  } else {
	    *sp = *s;
	    sp++;
	  }
	}
      *sp = '\0';
    }
  
  MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
  return msa;
}
Example #6
0
/* Function: SeqfileOpen()
 * 
 * Purpose : Open a sequence database file and prepare for reading
 *           sequentially.
 *           
 * Args:     filename - name of file to open
 *                      "-" reads stdin; a name ending in ".gz" is read
 *                      through a pipe from "gzip -dc"
 *           format   - format of file
 *           env      - environment variable for path (e.g. BLASTDB)                     
 *
 *           Returns opened SQFILE ptr, or NULL on failure. On failure
 *           the partially built SQFILE is freed; squid_errno is set for
 *           the open failures.
 *
 *           Calls Die() when an interleaved alignment format is
 *           requested for stdin or a gzip pipe.
 */
SQFILE *
SeqfileOpen(char *filename, int format, char *env)
{
  SQFILE *dbfp;

  dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
  dbfp->format   = format;
  dbfp->longline = FALSE;

  /* Open our file handle.
   * Three possibilities:
   *    1. normal file open
   *    2. filename = "-";    read from stdin
   *    3. filename = "*.gz"; read thru pipe from gzip 
   * If we're reading from stdin or a pipe, we can't reliably
   * back up, so we can't do two-pass parsers like the interleaved alignment   
   * formats.
   */
  if (strcmp(filename, "-") == 0)
    {
      if (IsInterleavedFormat(format))
	Die("Can't read interleaved alignment formats thru stdin, sorry");

      dbfp->f         = stdin;
      dbfp->do_stdin  = TRUE; 
      dbfp->do_gzip   = FALSE;
    }
  else if (Strparse("^.*\\.gz$", filename, 0) == 0)
    {
      char cmd[256];

      if (IsInterleavedFormat(format))
	Die("Can't read interleaved alignment formats thru gunzip, sorry");

      if (strlen(filename) + strlen("gzip -dc ") >= 256)
	{ squid_errno = SQERR_PARAMETER; free(dbfp); return NULL; }
      /* NOTE(review): filename is pasted unquoted into a shell command;
       * names containing spaces or shell metacharacters will be
       * misinterpreted by the shell. Left as is to keep behavior.
       */
      sprintf(cmd, "gzip -dc %s", filename);
      if ((dbfp->f = popen(cmd, "r")) == NULL)
	{ squid_errno = SQERR_NOFILE; free(dbfp); return NULL; } /* file (or gzip!) doesn't exist */
      dbfp->do_stdin = FALSE;
      dbfp->do_gzip  = TRUE;
    }
  else
    {
      if ((dbfp->f = fopen(filename, "r")) == NULL &&
	  (dbfp->f = EnvFileOpen(filename, env)) == NULL)
	{  squid_errno = SQERR_NOFILE; free(dbfp); return NULL; }
      dbfp->do_stdin = FALSE;
      dbfp->do_gzip  = FALSE;
    }
  
  /* The hack for sequential access of an interleaved alignment file:
   * read the alignment in, we'll copy sequences out one at a time.
   */
  dbfp->ali_aseqs = NULL;
  if (IsInterleavedFormat(format))
    {
      if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo)))
	{
	  /* Release what was acquired above. Only a stream opened with
	   * fopen()/EnvFileOpen() is closed here; stdin and popen()
	   * pipes are left alone.
	   */
	  if (! dbfp->do_stdin && ! dbfp->do_gzip)
	    fclose(dbfp->f);
	  free(dbfp);
	  return NULL;
	}
      dbfp->ali_curridx = 0;
      return dbfp;
    }

  /* Load the first line.
   */
  getline2(dbfp);

  return dbfp;
}