Exemple #1
0
/* Function: IsSELEXFormat()
 * 
 * Return TRUE if filename may be in SELEX format.
 * 
 * Accuracy is sacrificed for speed; a TRUE return does
 * *not* guarantee that the file will pass the stricter
 * error-checking of ReadSELEX(). All it checks is that
 * the first 500 non-comment lines of a file are 
 * blank, or if there's a second "word" on the line
 * it looks like sequence (i.e., it's not kOtherSeq).
 * 
 * Returns TRUE or FALSE.
 */
int
IsSELEXFormat(char *filename)
{
  FILE *fp;                     /* ptr to open sequence file */
  char  buffer[LINEBUFLEN];
  char *sptr;                   /* ptr to first word          */
  int   linenum;


  if ((fp = fopen(filename, "r")) == NULL)
    { squid_errno = SQERR_NOFILE; return 0; }

  linenum = 0;
  while (linenum < 500 && 
	 fgets(buffer, LINEBUFLEN, fp) != NULL)
    {
      linenum++;
				/* dead giveaways for extended SELEX */
      if      (strncmp(buffer, "#=AU", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=ID", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=AC", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=DE", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=GA", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=TC", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=NC", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE;
      else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE;

				/* a comment? */
      if (strchr(commentsyms, *buffer) != NULL) continue;

				/* a blank line? */
      if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue;

				/* a one-word line (name only)
				   is possible, though rare */
      if ((sptr = strtok(NULL, "\n")) == NULL) continue;
      
      if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;}
    }

 DONE:
  fclose(fp);
  return 1;
}
Exemple #2
0
/* Function: DetermineAlphabet()
 * 
 * Purpose:  From a set of sequences (raw or aligned), make a good
 *           guess whether they're RNA, DNA, protein, or something
 *           else, and set alphabet accordingly.
 */
int
DetermineAlphabet(char  **rseqs, int  nseq)
{
  int idx;
  int othercount, dnacount, rnacount, aminocount;
  
  othercount = dnacount = rnacount = aminocount = 0;
  for (idx = 0; idx < nseq; idx++)
    {
      switch (Seqtype(rseqs[idx])) {
      case kRNA:      rnacount++;   break;
      case kDNA:      dnacount++;   break;
      case kAmino:    aminocount++; break;
      case kOtherSeq: othercount++; break;
      default: Die("No such alphabet type");
      }
    }
  if      (dnacount == nseq)   Alphabet_type = kDNA;
  else if (rnacount == nseq)   Alphabet_type = kRNA;
  else if (aminocount == nseq) Alphabet_type = kAmino;
  else if (dnacount > rnacount && dnacount > aminocount && dnacount > othercount) {
    Warn("Looks like DNA sequence, hope that's right");
    Alphabet_type = kDNA;
  }
  else if (rnacount > dnacount && rnacount > aminocount && rnacount > othercount) {
    Warn("Looks like RNA sequence, hope that's right");
    Alphabet_type = kRNA;
  }
  else if (aminocount > dnacount && aminocount > rnacount && aminocount > othercount) {
    Warn("Looks like protein sequence, hope that's right");
    Alphabet_type = kAmino;
  }
  else Die("Sorry, I can't tell if that's protein or DNA"); 

  switch(Alphabet_type) {
  case kRNA:   strncpy(Alphabet, RNA_ALPHABET, 4);   Alphabet_size = 4;  break;
  case kDNA:   strncpy(Alphabet, DNA_ALPHABET, 4);   Alphabet_size = 4;  break;
  case kAmino: strncpy(Alphabet, AMINO_ALPHABET, 20); Alphabet_size = 20; break;
  default: Die("No support for non-DNA/RNA or protein alphabets");  
  }

  return 1;
}
Exemple #3
0
/* Function: DetermineAlphabet()
 * 
 * Purpose:  From a set of sequences (raw or aligned), make a good
 *           guess whether they're Nucleic, Amino, or something
 *           else, and set alphabet accordingly. 
 *           
 *           If Alphabet_type is already set, that means our
 *           autodetection was overridden from the command line, 
 *           and we just set the other globals accordingly.  
 */
void
DetermineAlphabet(char **rseqs, int  nseq)
{
  int idx;
  int other, nucleic, amino;
  int type;
  
  /* Autodetection of alphabet type.
   */
  type = hmmNOTSETYET;
  other = nucleic = amino = 0;
  for (idx = 0; idx < nseq; idx++) {
    switch (Seqtype(rseqs[idx])) {
    case kRNA:      nucleic++;   break;
    case kDNA:      nucleic++;   break;
    case kAmino:    amino++;     break;
    case kOtherSeq: other++;     break;
    default: Die("No such alphabet type");
    }
  }

  if      (nucleic == nseq) type = hmmNUCLEIC;
  else if (amino   == nseq) type = hmmAMINO;
  else if (nucleic > amino && nucleic > other) {
    Warn("Looks like nucleic acid sequence, hope that's right");
    type = hmmNUCLEIC;
  }
  else if (amino > nucleic && amino > other) {
    Warn("Looks like amino acid sequence, hope that's right");
    type = hmmAMINO;
  }
  else Die("Sorry, I can't tell if that's protein or DNA"); 

  /* Now set up the alphabet.
   */
  SetAlphabet(type);
}
Exemple #4
0
/**
 * @brief reads sequences from file
 *
 * @param[out] prMSeq
 * Multiple sequence struct. Must be preallocated.
 * FIXME: would make more sense to allocate it here.
 * @param[in] seqfile
 * Sequence file name. If '-' sequence will be read from stdin.
 * @param[in] iSeqType
 * int-encoded sequence type. Set to
 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
 * @param[in] iMaxNumSeq
 * Return an error, if more than iMaxNumSeq have been read
 * @param[in] iMaxSeqLen
 * Return an error, if a seq longer than iMaxSeqLen has been read
 *
 * @return 0 on success, -1 on error
 *
 * @note
 *  - Depends heavily on squid
 *  - Sequence file format will be guessed
 *  - If supported by squid, gzipped files can be read as well.
 */
int
ReadSequences(mseq_t *prMSeq, char *seqfile,
              int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
              int iMaxNumSeq, int iMaxSeqLen)
{
    SQFILE *dbfp; /* sequence file descriptor */
    char *cur_seq;
    SQINFO cur_sqinfo;
    int iSeqIdx; /* sequence counter */
    int iSeqPos; /* sequence string position counter */

    assert(NULL!=seqfile);


    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (SQFILE_UNKNOWN == iSeqFmt  &&
            (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
        iSeqFmt = SQFILE_FASTA;
    }

    /* Using squid routines to read input. taken from seqstat_main.c. we don't
     * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
     * etc. NOTE this also means we discard some information, e.g. when
     * reading from and writing to a stockholm file, all extra MSA
     * info/annotation will be lost.
     *
     */

    if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
        Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
        return -1;
    }


    /* FIXME squid's ReadSeq() will exit with fatal error if format is
     * unknown. This will be a problem for a GUI. Same is true for many squid
     * other functions.
     *
     * The original squid:ReadSeq() dealigns sequences on input. We
     * use a patched version.
     *
     */
    while (ReadSeq(dbfp, dbfp->format,
                   &cur_seq,
                   &cur_sqinfo)) {

        if (prMSeq->nseqs+1>iMaxNumSeq) {
            Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
                iMaxNumSeq, cur_sqinfo.name, seqfile);
            return -1;
        }
        if ((int)strlen(cur_seq)>iMaxSeqLen) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
                cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen);
            return -1;
        }
        if ((int)strlen(cur_seq)==0) {
            Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
                cur_sqinfo.name);
            return -1;
        }

        /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
         */

        prMSeq->seq =  (char **)
                       CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
        prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);


        prMSeq->sqinfo =  (SQINFO *)
                          CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
        SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);

#ifdef TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
        LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
#endif
        /* always guess type from first seq. use squid function and
         * convert value
         */
        if (0 == prMSeq->nseqs) {
            int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
            switch (type)  {
            case kDNA:
                prMSeq->seqtype = SEQTYPE_DNA;
                break;
            case kRNA:
                prMSeq->seqtype = SEQTYPE_RNA;
                break;
            case kAmino:
                prMSeq->seqtype = SEQTYPE_PROTEIN;
                break;
            case kOtherSeq:
                prMSeq->seqtype = SEQTYPE_UNKNOWN;
                break;
            default:
                Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
            }

            /* override with given sequence type but check with
             * automatically detected type and warn if necessary
             */
            if (SEQTYPE_UNKNOWN != iSeqType) {
                if (prMSeq->seqtype != iSeqType) {
                    Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
                        SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
                    prMSeq->seqtype = iSeqType;
                }
            }
            /* if type could not be determined and was not set return error */
            if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
                Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
                FreeSequence(cur_seq, &cur_sqinfo);
                SeqfileClose(dbfp);
                return -1;
            }
        }

        Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
            prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
            prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
            prMSeq->seq[prMSeq->nseqs]);

        /* FIXME IPUAC and/or case conversion? If yes see
         * corresponding squid functions. Special treatment of
         * Stockholm tilde-gaps for ktuple code?
         */

        prMSeq->nseqs++;

        FreeSequence(cur_seq, &cur_sqinfo);
    }
    SeqfileClose(dbfp);

    /*#if ALLOW_ONLY_PROTEIN
        if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
            Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
                  SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
        }
    #endif*/

    /* Check if sequences are aligned */
    prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);


    /* keep original sequence as copy and convert "working" sequence
     *
     */
    prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
    for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {

        prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);


        /* convert unknown characters according to set seqtype
         * be conservative, i.e. don't allow any fancy ambiguity
         * characters to make sure that ktuple code etc. works.
         */

        /* first on the fly conversion between DNA and RNA
         */
        if (prMSeq->seqtype==SEQTYPE_DNA)
            ToDNA(prMSeq->seq[iSeqIdx]);
        if (prMSeq->seqtype==SEQTYPE_RNA)
            ToRNA(prMSeq->seq[iSeqIdx]);

        /* then check of each character
         */
        for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) {
            char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
            if (isgap(*res))
                continue;

            if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
                if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) {
                    *res = AMINOACID_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_DNA) {
                if (NULL == strchr(DNA_ALPHABET, toupper(*res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            } else if (prMSeq->seqtype==SEQTYPE_RNA) {
                if (NULL == strchr(RNA_ALPHABET, toupper(*res))) {
                    *res = NUCLEOTIDE_ANY;
                }
            }
        }
    }

    /* order in which sequences appear in guide-tree
     * only allocate if different output-order desired */
    prMSeq->tree_order = NULL;

    prMSeq->filename = CkStrdup(seqfile);
    Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
        prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);

    return 0;
}
main (int argc, char ** argv ) 
{
  char     *seqfile;            /* name of sequence file     */
  SQINFO    sqinfo;             /* extra info about sequence */
  SQFILE   *dbfp;		/* open sequence file        */
  int       fmt,ofmt=106;	/* format of seqfile         */
                                /* 106 is PHYLIP format in SQUID */
  char     *seq;		/* sequence                  */
  int       type;		/* kAmino, kDNA, kRNA, or kOtherSeq */
  sequence  * seqs, * cds_seqs;
  sequence  tmp_seqs[2], tmp_cds_seqs[2];
  char  *optname;
  char  *optarg, *t;
  int    optind;
  int    be_quiet;
  int    seqct = 0,cdsct = 0;
  int    min_aln_len      = 0;
  int    do_oneline       = 0;
  char   * output_filename = 0, *submat_file = 0;
  int    showaln = 1;
  int    showheader=1;
  FILE  *ofd, *fd;
  alignment   *cds_aln;
  alignment * opt_alignment = NULL;  /* place for pairwise alignment */

  int    len,i,j, k, jk,ik,aln_count, rc;
  pairwise_distances pwMLdist, pwNGdist;
  int firsttime = 1;
  
  struct timeval tp;

  pwMLdist.N    = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.dN   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.S    = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.dS   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.SEdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.SEdN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.t    = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwMLdist.kappa= make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);

  pwNGdist.dN   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwNGdist.dS   = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
  pwNGdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS);
/*
  pwMLdist.N    = pwMLdist.dN   = pwMLdist.S    = 0;
  pwMLdist.dS   = pwMLdist.dNdS = pwMLdist.SEdS = 0;
  pwMLdist.SEdN = pwMLdist.t    = pwMLdist.kappa= 0;
      
  pwNGdist.dN   = pwNGdist.dS   = pwNGdist.dNdS = 0;
*/

  Alntype = default_aln_type;
  
  /* Command line Parse */
  fmt       = SQFILE_UNKNOWN;	/* default: autodetect format  */
  be_quiet  = FALSE;
  type      =  kOtherSeq;

  /* for our purposes this is only pairwise alignments, but
   * would rather do it correctly in case we move to MSA case 
   */
  
  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, 
		&optind, &optname, &optarg))
    {
      if      (strcmp(optname, "--matrix") == 0)  submat_file = optarg; 
      else if (strcmp(optname, "--quiet")   == 0)  be_quiet  = TRUE; 
      else if (strcmp(optname, "--gapopen") == 0)  {
	Gapopen = atoi(optarg); 
	if( Gapopen < 0 ) Gapopen *= -1;
	
      } else if (strcmp(optname, "--gapext")  == 0)  {
	Gapext = atoi(optarg); 
	if( Gapext < 0 ) Gapext *= -1;

      } else if (strcmp(optname, "--informat") == 0) {
	fmt = String2SeqfileFormat(optarg);
	if (fmt == SQFILE_UNKNOWN) 
	  Die("unrecognized sequence file format \"%s\"", optarg);
      } else if (strcmp(optname, "--outformat") == 0) {
	ofmt = String2SeqfileFormat(optarg);
	if (ofmt == SQFILE_UNKNOWN) 
	  Die("unrecognized sequence file format \"%s\"", optarg);
      }  else if( strcmp(optname, "--global") == 0 ) {
	Alntype = global;
      } else if (strcmp(optname, "-h") == 0) {
	puts(usage);
	puts(experts);
        exit(EXIT_SUCCESS);
      } else if ( strcmp(optname, "-v") == 0 ) {
	Verbose = 1;
      } else if ( strcmp(optname, "--gapchar") == 0 ) {
	GapChar = optarg[0];
      }  else if(  strcmp(optname, "--output") == 0 ) {
	output_filename = optarg;	  
      } else if( strcmp(optname, "--showtable" ) == 0  ) {
	showaln = 0;
      } else if( strcmp(optname, "--noheader" ) == 0 ) {
	showheader = 0;
      }      
    }

  if (argc - optind < 1) Die("%s\n", usage);

  if( ! submat_file ) { 
    if( (t = getenv("SUBOPTDIR")) != 0 || 
	(t = getenv("SUBOPT_DIR")) != 0 ) {
      submat_file = calloc(strlen(t) + 24, sizeof(char));
      sprintf(submat_file, "%s/%s",t,Default_submat);
    } else { 
      submat_file = calloc(strlen((void *)Default_submat) + 24, sizeof(char));
      sprintf(submat_file, "../%s",Default_submat);
    }
  }
  /* open matrix */
  fd = fopen(submat_file, "r");
  
  if( ! ParsePAMFile(fd,&ScoringMatrix, &MatrixScale) ) {
    fprintf(stderr, "Cannot parse or open matrix file %s\n",submat_file);
    free(submat_file);
    exit(EXIT_SUCCESS);
  }
  

  if( output_filename && strlen(output_filename) != 1 &&
      output_filename[0] != '-') {      
    ofd = fopen(output_filename,"w");
    if( ! ofd ) {
      fprintf(stderr, "could not open file %s",output_filename);
      goto end;
    }
  } else 
    ofd = stdout;

  while( optind < argc ) {
    seqfile = argv[optind++];
    
    /* Try to work around inability to autodetect from a pipe or .gz:
     * assume FASTA format
     */
    if (fmt == SQFILE_UNKNOWN &&
	(Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0))
      fmt = SQFILE_FASTA;
    
    if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
      Die("Failed to open sequence file %s for reading", seqfile);
    
    while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
    { 
      FreeSequence(NULL, &sqinfo);
      seqct++;
    }
    

    cds_seqs = (sequence *)calloc(seqct, sizeof(sequence));
    seqs     = (sequence *)calloc(seqct, sizeof(sequence));
    SeqfileRewind(dbfp);
    seqct=0;

    while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
    {
      sqinfo.type = Seqtype(seq);
      if( sqinfo.type == kDNA || sqinfo.type == kRNA ) {

	seqs[seqct].seqstr = Translate(seq,stdcode1);
	/* Let's remove the last codon if it is a stop codon */	
	len = strlen(seqs[seqct].seqstr);
	if( Verbose ) 
	  fprintf(stderr,"seqct is %d length is %d\n",seqct,
		  len);

	if( seqs[seqct].seqstr[len-1] == '*' ) {
	  seqs[seqct].seqstr[len-1] = '\0';
	  seq[strlen(seq) - 3] = '\0';
	}
	cds_seqs[cdsct].seqstr = seq;
	seqs[seqct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char));
	cds_seqs[cdsct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char));
	strcpy(seqs[seqct].seqname,sqinfo.name );
	strcpy(cds_seqs[cdsct].seqname,sqinfo.name);	
	cds_seqs[cdsct].length = sqinfo.len;
	cds_seqs[cdsct].alphabet = ( sqinfo.type == kDNA ) ? dna : rna;
	seqs[seqct].length = strlen(seqs[seqct].seqstr);
	
	seqs[seqct].alphabet = protein;
	cdsct++; seqct++;
      } else {
	fprintf(stderr,"Expect CDS sequences (DNA or RNA) not Protein\n");
	goto end;
      }    
      FreeSequence(NULL, &sqinfo);
      if( Verbose && seqct > 3 ) 
	break;
    }
    
    if( seqct < 2 ) {
      fprintf(stderr,"Must have provided a valid file with at least 2 sequences in it");
      goto end;
    }
    
    for( i=0; i  < seqct; i++ ) {
      for(k=i+1; k < seqct; k++ ) {	
	if( (opt_alignment = (alignment *)calloc(1,sizeof(alignment *))) == NULL) {
	  fprintf(stderr,"Could not allocate memory\n");
	  goto end;
	}

	opt_alignment->msa = NULL;
	rc = optimal_align(&seqs[i],&seqs[k],opt_alignment);
  
	if( rc != 1 ) {
	  fprintf(stderr,"Could not make an optimal alignment\n");
	  goto end;
	} else {
	  tmp_cds_seqs[0] = cds_seqs[i];
	  tmp_cds_seqs[1] = cds_seqs[k];
	  rc = mrtrans(opt_alignment, tmp_cds_seqs, &cds_aln,0);
	  if( rc != 0  ) { 
	    fprintf(stderr, "Could not map the coding sequence to the protein alignemnt for aln %d: %d\n",i,rc);
	    goto end;
	  }
	  if( showaln ) {
	    if( ofmt >= 100 ) {
	      MSAFileWrite(ofd,cds_aln->msa, ofmt,do_oneline);
	    } else { 
	      for(j=0; j < cds_aln->msa->nseq; j++ ) {	
		WriteSeq(ofd, ofmt, 
			 cds_aln->msa->aseq[j],
			 &(cds_aln->sqinfo[j]) );
	      }
	    }	    
	  } else {
	    if( showheader && firsttime ) {
	      fprintf(ofd,"SEQ1\tSEQ2\tSCORE\tdN\tdS\tOMEGA\tN\tS\tkappa\tt\tLENGTH\n");
	      firsttime = 0;
	    }
	    if( do_kaks_yn00(cds_aln->msa, &pwMLdist,&pwNGdist) < 0 ) {
	      fprintf(stderr, "warning: problem with align for %s %s\n",
		      cds_aln->msa->sqname[0], cds_aln->msa->sqname[1]);
	      continue;
	    }

	    for(ik = 0; ik < NUM_PW_SEQS; ik++ ) {	  
	      for( jk = ik+1; jk < NUM_PW_SEQS; jk++ ) {
		fprintf(ofd,"%s\t%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%d\n",
			cds_aln->sqinfo[ik].name,
			cds_aln->sqinfo[jk].name,
			opt_alignment->score,
			pwMLdist.dN[ik][jk],pwMLdist.dS[ik][jk], 
			pwMLdist.dNdS[ik][jk],
			pwMLdist.N[ik][jk],
			pwMLdist.S[ik][jk],
			pwMLdist.kappa[ik][jk],
			pwMLdist.t[ik][jk],
			opt_alignment->msa->alen);
	      }
	    }  
	  }
	}
	cleanup_alignment(cds_aln);
	cleanup_alignment(opt_alignment); 
      }
    }
  }
  if( ofd && ofd != stdout )
    fclose(ofd);

  end:
  free(submat_file);
  Free2DArray((void **)ScoringMatrix,27);
  for(i =0; i< seqct; i++ ) {
    free(seqs[i].seqstr);
    free(seqs[i].seqname);    
    seqs[i].seqstr = seqs[i].seqname = 0;
  }
  for(i = 0; i < cdsct; i++) {
    free(cds_seqs[i].seqstr);
    free(cds_seqs[i].seqname);    
    cds_seqs[i].seqstr = cds_seqs[i].seqname = 0;
  }
  
  cleanup_matrix((void **)pwMLdist.N,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.dN,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.S,NUM_PW_SEQS);
  
  cleanup_matrix((void **)pwMLdist.dS,NUM_PW_SEQS);

  cleanup_matrix((void **)pwMLdist.SEdS,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.SEdN,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.t,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.dNdS,NUM_PW_SEQS);
  cleanup_matrix((void **)pwMLdist.kappa,NUM_PW_SEQS);

  cleanup_matrix((void **)pwNGdist.dN,NUM_PW_SEQS);
  cleanup_matrix((void **)pwNGdist.dS,NUM_PW_SEQS);
  cleanup_matrix((void **)pwNGdist.dNdS,NUM_PW_SEQS);


  free(pwNGdist.dNdS);
  free(pwNGdist.dN);
  free(pwNGdist.dS);

  free(pwMLdist.dNdS);
  free(pwMLdist.dN);
  free(pwMLdist.dS);
  free(pwMLdist.N);
  free(pwMLdist.S);
  free(pwMLdist.SEdS);
  free(pwMLdist.SEdN);
  free(pwMLdist.t);
  free(pwMLdist.kappa);
  
  return 0;
}
Exemple #6
0
/* Function: SeqfileFormat()
 * 
 * Purpose:  Determine format of seqfile, and return it
 *           through ret_format. From Gilbert's seqFileFormat().
 *           
 *           If filename is "-", we will read from stdin and
 *           assume that the stream is coming in FASTA format --
 *           either unaligned or aligned.
 *
 * Args:     filename   - name of sequence file      
 *           ret_format - RETURN: format code for file, see squid.h 
 *                        for codes.
 *           env        - name of environment variable containing
 *                        a directory path that filename might also be
 *                        found in. "BLASTDB", for example. Can be NULL.
 *           
 * Return:   1 on success, 0 on failure.
 */          
int
SeqfileFormat(char *filename, int  *ret_format, char *env)
{
  int   foundIG      = 0;
  int   foundStrider = 0;
  int   foundGB      = 0; 
  int   foundEMBL    = 0; 
  int   foundPearson = 0;
  int   foundZuker   = 0;
  int   gotGCGdata   = 0;
  int   gotPIR       = 0;
  int   gotSquid     = 0;
  int   gotuw        = 0;
  int   gotMSF       = 0;
  int   gotClustal   = 0;
  int   done         = 0;
  int   format       = kUnknown;
  int   nlines= 0, dnalines= 0;
  int   splen = 0;
  char  sp[LINEBUFLEN];
  FILE *fseq;

  /* First check if filename is "-": special case indicating
   * a FASTA pipe.
   */
  if (strcmp(filename, "-") == 0)
    { *ret_format = kPearson; return 1; }

#define ReadOneLine(sp)   \
  { done |= (feof(fseq)); \
    readline( fseq, sp);  \
    if (!done) { splen = (int) strlen(sp); ++nlines; } }

  if ((fseq = fopen(filename, "r")) == NULL &&
      (fseq = EnvFileOpen(filename, env)) == NULL)
    { squid_errno = SQERR_NOFILE;  return 0; }

  /* Look at a line at a time
   */
  while ( !done ) {
    ReadOneLine(sp);

    if (sp==NULL || *sp=='\0')
      /*EMPTY*/ ; 

    /* high probability identities: */
    
    else if (strstr(sp, " MSF:")   != NULL &&
	     strstr(sp, " Type:")  != NULL &&
	     strstr(sp, " Check:") != NULL)
      gotMSF = 1;

    else if (strncmp(sp, "CLUSTAL ", 8) == 0 && 
	     strstr( sp, "multiple sequence alignment"))
      gotClustal = 1;

    else if (strstr(sp," Check: ") != NULL)
      gotuw= 1;

    else if (strncmp(sp, "///", 3) == 0 || strncmp(sp, "ENTRY ", 6) == 0)
      gotPIR = 1;

    else if (strncmp(sp, "++", 2) == 0 || strncmp(sp, "NAM ", 4) == 0)
      gotSquid = 1;

    else if (strncmp(sp, ">>>>", 4) == 0 && strstr(sp, "Len: "))
      gotGCGdata = 1;

    /* uncertain identities: */

    else if (*sp ==';') {
      if (strstr(sp,"Strider") !=NULL) foundStrider= 1;
      else foundIG= 1;
    }
    else if (strncmp(sp,"LOCUS",5) == 0 || strncmp(sp,"ORIGIN",5) == 0)
      foundGB= 1;

    else if (*sp == '>') {
      foundPearson  = 1;
    }

    else if (strstr(sp,"ID   ") == sp || strstr(sp,"SQ   ") == sp)
      foundEMBL= 1;

    else if (*sp == '(')
      foundZuker= 1;

    else {
      switch (Seqtype( sp )) {
      case kDNA:
      case kRNA: if (splen>20) dnalines++; break;
      default:   break;
      }
    }

    if      (gotMSF)     {format = kMSF;     done = 1; }
    else if (gotClustal) {format = kClustal; done = 1; }
    else if (gotSquid)   {format = kSquid;   done = 1; }
    else if (gotPIR)     {format = kPIR;     done = 1; }
    else if (gotGCGdata) {format = kGCGdata; done = 1; }
    else if (gotuw)  
      {
	if (foundIG) format= kIG;  /* a TOIG file from GCG for certain */
	else format= kGCG;
	done= 1;
      }
    else if ((dnalines > 1) || done || (nlines > 500)) {
      /* decide on most likely format */
      /* multichar idents: */
      if (foundStrider)      format= kStrider;
      else if (foundGB)      format= kGenBank;
      else if (foundEMBL)    format= kEMBL;
      /* single char idents: */
      else if (foundIG)      format= kIG;
      else if (foundPearson) format= kPearson;
      else if (foundZuker)   format= kZuker;
      /* spacing ident: */
      else if (IsSELEXFormat(filename)) format= kSelex;
      /* no format chars: */
      else 
	{
	  squid_errno = SQERR_FORMAT;
	  return 0;
	}

      done= 1;
    }
  }

  if (fseq!=NULL) fclose(fseq);

  *ret_format = format;
  return 1;
#undef  ReadOneLine
}