コード例 #1
0
ファイル: esl-sfetch.c プロジェクト: ElofssonLab/TOPCONS2
static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage1);
  esl_usage (stdout, argv0, usage2);
  esl_usage (stdout, argv0, usage3);
  puts("\n where general options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
  puts("\n Options for retrieving subsequences:");
  esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
  puts("\n  On command line, subseq coords are separated by any nonnumeric, nonspace character(s).");
  puts("  for example, -c 23..100 or -c 23/100 or -c 23-100 all work.\n");
  puts("  Additionally, to retrieve a suffix to the end, omit the end coord or set it to zero; -c 23.. ");
  puts("  will work, as will -c 23..0\n");
  puts("  By default, the subseq will be named <source name>/<from>-<to>. To assign a name of");
  puts("  your choice, use -n <newname>.\n");
  puts("  In retrieving subsequences listed in a file (-C -f, or just -Cf), each line of the file");
  puts("  is in GDF format: <newname> <from> <to> <source seqname>, space/tab delimited.\n");
  puts("  When <start> coordinate is greater than <end>, for DNA or RNA, the reverse complement is");
  puts("  retrieved; in protein sequence, this is an error. The -r option is another way to revcomp.");
  puts("\n other options:");
  esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
  exit(0);
}
コード例 #2
0
static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage1);
  puts("\n where general options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
  exit(0);
}
コード例 #3
0
ファイル: esl-cluster.c プロジェクト: ElofssonLab/TOPCONS2
static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage);
  puts("\n options:");
  esl_opt_DisplayHelp(stdout, go, 0, 2, 80);
  exit(0);
}
コード例 #4
0
static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage);
  puts("\n where general options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
  puts("\n options controlling segment randomization method:");
  esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
  puts("\n options declaring a particular alphabet:");
  esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
  puts("\n other options:");
  esl_opt_DisplayHelp(stdout, go, 4, 2, 80);
  exit(0);
}
コード例 #5
0
static int
process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_fmfile, char **ret_qfile)
{
  ESL_GETOPTS *go = esl_getopts_Create(options);
  int          status;

  if (esl_opt_ProcessEnvironment(go)         != eslOK)  { if (printf("Failed to process environment: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK)  { if (printf("Failed to parse command line: %s\n", go->errbuf)  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
  if (esl_opt_VerifyConfig(go)               != eslOK)  { if (printf("Failed to parse command line: %s\n", go->errbuf)  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }

  /* help format: */
  if (esl_opt_GetBoolean(go, "-h") == TRUE) 
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);

      if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 120=textwidth*/

      if (puts("\nSpecial options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80); /* 2= group; 2 = indentation; 120=textwidth*/

      exit(0);
  }

  if (esl_opt_ArgNumber(go)                  != 2)    { if (puts("Incorrect number of command line arguments.")     < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
  if ((*ret_qfile  = esl_opt_GetArg(go, 1)) == NULL)  { if (puts("Failed to get <qfile> argument on command line")  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
  if ((*ret_fmfile = esl_opt_GetArg(go, 2)) == NULL)  { if (puts("Failed to get <fmfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }

  /* Validate any attempted use of stdin streams */
  if (esl_strcmp(*ret_fmfile, "-") == 0 && esl_strcmp(*ret_qfile, "-") == 0) 
    { if (puts("Either <fmfile> or <qfile> may be '-' (to read from stdin), but not both.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }

  *ret_go = go;
  return eslOK;

 FAILURE:  /* all errors handled here are user errors, so be polite.  */
  esl_usage(stdout, argv[0], usage);
  puts("\nwhere basic options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 80=textwidth*/
  printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
  esl_getopts_Destroy(go);
  exit(1);

 ERROR:
  if (go) esl_getopts_Destroy(go);
  exit(status);
}
コード例 #6
0
ファイル: esl-shuffle.c プロジェクト: EddyRivasLab/easel
static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage1);
  esl_usage (stdout, argv0, usage2);
  esl_usage (stdout, argv0, usage3);
  esl_usage (stdout, argv0, usage4);
  puts("\n where general options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
  puts("\n options for shuffling input sequences (default mode):");
  esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
  puts("\n options for generating sequences de novo (w/ -G option):");
  esl_opt_DisplayHelp(stdout, go, 4, 2, 80);
  puts("\n other infrequently used options:");
  esl_opt_DisplayHelp(stdout, go, 5, 2, 80);
  exit(0);
}
コード例 #7
0
ファイル: esl-mask.c プロジェクト: appris/appris
static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage);

  puts("\n where general options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80);

  puts("");
  puts("The <seqfile> is a sequence file in any accepted format, such as FASTA.");
  puts("It may be indexed (see esl-sfetch --index) for faster performance.");
  puts("");
  puts("The <maskfile> is a space-delimited file; each data line with 3 columns:");
  puts("  field 1: <seqname> to fetch from <sqfile>");
  puts("  field 2: <start> coordinate for mask operation, 1..n");
  puts("  field 3: <end> coordinate for mask operation, 1..n");
  puts("Lines starting with # are comments, and ignored.)");
  puts("");
  exit(0);
}
コード例 #8
0
ファイル: esl-translate.c プロジェクト: nathanweeks/easel
int
main(int argc, char **argv)
{
  ESL_GETOPTS   *go          = NULL;
  ESL_ALPHABET  *nt_abc      = esl_alphabet_Create(eslDNA);
  ESL_ALPHABET  *aa_abc      = esl_alphabet_Create(eslAMINO);
  ESL_GENCODE   *gcode       = NULL;
  ESL_GENCODE_WORKSTATE *wrk    = NULL;
  char          *seqfile     = NULL;
  int            informat    = eslSQFILE_UNKNOWN;
  ESL_SQFILE    *sqfp        = NULL;
  int            status;
  
  /***************************************************************** 
   * command line parsing
   *****************************************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  
  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, /*docgroup=*/0, /*indent=*/2, /*textwidth=*/80);

      puts("\nAvailable NCBI genetic code tables (for -c <id>):");
      esl_gencode_DumpAltCodeTable(stdout);

      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
 
  seqfile = esl_opt_GetArg(go, 1);

  if (esl_opt_IsOn(go, "--informat") &&
      (informat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); 

  status = esl_sqfile_OpenDigital(nt_abc, seqfile, informat, /*env=*/NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("Failed to find (or open) sequence file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Failed to recognize format of sequence file %s", seqfile);
  else if (status != eslOK)        esl_fatal("Failure in opening sequence file %s; code %d", seqfile, status);

  /* A limitation. The esl_sqio_ReadWindow() interface needs to use SSI positioning
   * to read reverse complement, and that doesn't work on nonrewindable streams.
   */
  if ( esl_opt_GetBoolean(go, "-W") && ! esl_sqfile_IsRewindable(sqfp) && ! esl_opt_GetBoolean(go, "--watson"))
    esl_fatal("esl-translate can't read reverse complement from a nonrewindable stream (stdin pipe, .gz file, etc).");

  /* Set up the genetic code. Default = NCBI 1, the standard code; allow ORFs to start at any aa
   */
  gcode = esl_gencode_Create(nt_abc, aa_abc);
  esl_gencode_Set(gcode, esl_opt_GetInteger(go, "-c"));  // default = 1, the standard genetic code

  if      (esl_opt_GetBoolean(go, "-m"))   esl_gencode_SetInitiatorOnlyAUG(gcode);
  else if (! esl_opt_GetBoolean(go, "-M")) esl_gencode_SetInitiatorAny(gcode);      // note this is the default, if neither -m or -M are set


  /* Set up the workstate structure, which contains both stateful 
   * info about our position in <sqfp> and the DNA <sq>, as well as
   * one-time config info from options
   */
  wrk = esl_gencode_WorkstateCreate(go, gcode);


  /* The two styles of main processing loop:
   */
  if (esl_opt_GetBoolean(go, "-W"))  do_by_windows(gcode, wrk, sqfp);
  else                               do_by_sequences(gcode, wrk, sqfp);


  esl_gencode_WorkstateDestroy(wrk);
  esl_sqfile_Close(sqfp);
  esl_gencode_Destroy(gcode);
  esl_alphabet_Destroy(aa_abc);
  esl_alphabet_Destroy(nt_abc);
  esl_getopts_Destroy(go);
  return 0;
}
コード例 #9
0
ファイル: esl-construct.c プロジェクト: EddyRivasLab/easel
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  ESL_ALPHABET *abc     = NULL;	/* biological alphabet             */
  char         *alifile = NULL;	/* alignment file name             */
  int           fmt;		/* format code for alifiles        */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file             */
  ESL_MSA      *msa     = NULL;	/* multiple sequence alignment     */
  int           status;		/* easel return code               */

  int           do_info = TRUE;                /* TRUE if -i */
  int           do_max = FALSE;                /* TRUE if -x */
  int           do_ffreq = FALSE;              /* TRUE if --ffreq */
  int           do_fmin  = FALSE;              /* TRUE if --fmin */
  float         fthresh = 0.;                  /* <x> from -f <x> */
  int           do_remove_bps = FALSE;         /* TRUE if -r */
  int           do_consistent = FALSE;         /* TRUE if -c */
  int           do_indi2cons = FALSE;          /* TRUE if --indi <x> */
  int           have_cons;                     /* TRUE if first alignment has consensus sequence */
  int           do_newcons = FALSE;            /* TRUE if we're creating a new consensus structure
						* and outputing a new alignment (if -x -f -c or --indi)
						*/
  int           do_a = FALSE;                  /* TRUE if -a */
  char         *indi;                          /* for <x> from --indi <x> */
  int           nindi_read;                    /* number of individual sequence SS lines we've read for current alignment */

  int           a;		               /* counter over seqs               */
  int           i, i2;		               /* counter over residues */
  int           j, j2;		               /* counter over residues */
  int           nali;                          /* counter over alignments */
  int         **bp = NULL;                     /* bp[i][j] is number of individual bps exist between aln cols i and j */
  int          *cur_ct = NULL;                 /* ct array of basepairs for current sequence */
  int          *cons_ct = NULL;                /* ct array of basepairs for SS_cons being created */
  int          *xcons_ct = NULL;               /* ct array of basepairs for existing SS_cons */
  int          *ngaps = NULL;                  /* number of gaps in each alignment position */
  FILE         *ofp;		               /* output file (default is stdout) */
  int           be_verbose = FALSE;            /* TRUE to print extra info */
  int           seqthresh;                     /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/
  char         *sscons = NULL;                 /* the new SS_cons line */
  FILE         *lfp = NULL;                    /* file to list sequences with conflicting bps to */
  int           nlist = 0;                     /* number of sequences listed to list file */
  int          *nconflictsA;                   /* number of conflicting bps in seq a's individual structure annotation */
  int           nconflicts_total = 0;          /* total number of conflicts */
  int           nconflicts_list = 0;           /* total number of conflicts in sequences listed to file <x> from -l <x> */
  int           noverlaps_total = 0;           /* total number of overlaps */
  int           nconsistent_total = 0;         /* total number of consistent bps */
  int           nbps_total = 0;                /* total number of bps */
  int          *nconsistentA;                  /* number of consistent bps in seq a's individual structure annotation */
  int          *noverlapsA;                    /* number of bps in seq a's indi structure that overlap with consensus structure */
  int          *nbpsA;                         /* number of bps in seq a's indi structure that overlap with consensus structure */
  int           ncons_bps = 0;                 /* number of bps in consensus structure */
  int           max_noverlaps_aidx;
  int           max_nconsistent_aidx;
  int           max_nbps_aidx;
  int          *removebp;                      /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */
  int          *has_conflict;    
  int          *nmates_l2r;                    /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */
  int          *nmates_r2l;                    /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left  mates i for j */

  int           lmax;                          /* with -l, maximum number of conflicts to allow */
  int           namewidth = 18;                 /* length of 'SS_cons(consensus)' */
  char         *namedashes = NULL;             /* to store underline for seq name */

  /* --fmin related variables */
  int nbps = 0;
  int prev_nbps = -1;
  float fmin;
  int inconsistent_flag;
  int pknot_flag;
  int k,l;

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\nwhere basic options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\noptions for defining a new consensus structure (all of these require -o):");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\noptions for listing sequences based on structure:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile  = esl_opt_GetArg(go, 1);

  fmt = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--dna"))  abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))  abc = esl_alphabet_Create(eslRNA);

  if ( (status = esl_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
    esl_msafile_OpenFailure(afp, status);

  /* open output file */
  if (esl_opt_GetString(go, "-o") != NULL) {
    if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) 
	esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
  } else ofp = NULL;
  if (esl_opt_GetString(go, "-l") != NULL) { 
    if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) 
	esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l"));
  }

  /* determine if we're creating a structure */
  do_max = esl_opt_GetBoolean(go, "-x");
    
  if(!(esl_opt_IsDefault(go, "--ffreq"))) { 
    do_ffreq = TRUE; 
    fthresh = esl_opt_GetReal(go, "--ffreq"); 
  }
  if(!(esl_opt_IsDefault(go, "--fmin"))) { 
    do_fmin = TRUE; 
  }
  do_remove_bps = esl_opt_GetBoolean(go, "-r"); 
  do_consistent = esl_opt_GetBoolean(go, "-c");
  if(!(esl_opt_IsDefault(go, "--indi"))) { 
    do_indi2cons = TRUE; 
  }
  if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { 
    do_newcons = TRUE;
  }
  do_a = esl_opt_GetBoolean(go, "-a");
  if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { 
    do_info = FALSE;
  }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/
  nali = 0;
  have_cons = FALSE;
  lmax = esl_opt_GetInteger(go, "--lmax");
  if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; 

  while ((status = esl_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) esl_msafile_ReadFailure(afp, status);
      nali++;

      /* determine max length name */
      namewidth = 18; /* length of 'SS_cons(consensus)' */
      for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i]));
      if(namedashes != NULL) { free(namedashes); }
      ESL_ALLOC(namedashes, sizeof(char) * namewidth+1);
      namedashes[namewidth] = '\0';
      for(i = 0; i < namewidth; i++) namedashes[i] = '-';

      ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1));
      ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1));
      ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1));
      esl_vec_ISet(cur_ct, (msa->alen+1), 0);
      esl_vec_ISet(cons_ct, (msa->alen+1), 0);
      esl_vec_ISet(xcons_ct, (msa->alen+1), 0);
      esl_vec_ISet(removebp, (msa->alen+1), FALSE);
      esl_vec_ISet(has_conflict, (msa->alen+1), FALSE);
      esl_vec_ISet(nmates_l2r, (msa->alen+1), 0);
      esl_vec_ISet(nmates_r2l, (msa->alen+1), 0);

      ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq);
      ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq);
      ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq);
      ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq);
      esl_vec_ISet(nconflictsA, msa->nseq, 0);
      esl_vec_ISet(noverlapsA, msa->nseq, 0);
      esl_vec_ISet(nconsistentA, msa->nseq, 0);
      esl_vec_ISet(nbpsA, msa->nseq, 0);

      max_noverlaps_aidx = max_nconsistent_aidx = max_nbps_aidx = 0;
      nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0;
      for(i = 1; i <= msa->alen; i++) { 
	ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1));
	esl_vec_ISet(bp[i], (msa->alen+1), 0);
      }

      /* make sure we have ss_cons and indi ss if we need it */
      if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_max)             esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_consistent)      esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_indi2cons)       esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_ffreq)           esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_fmin)            esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali);

      if(msa->ss_cons != NULL) { 
	if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { 
	  esl_fatal("Existing SS_cons for alignment %d is invalid.", nali);
	}
	ncons_bps = 0;
	for(i = 1; i <= msa->alen; i++) 
	  if(xcons_ct[i] != 0 && i < xcons_ct[i]) 
	    ncons_bps++;

	if(nali > 1 && !have_cons)
	  esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); 
	if(nali == 1) have_cons = TRUE;
      }
      else if (lfp != NULL) { 
	esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else if (do_remove_bps) { 
	esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else if (do_consistent) { 
	esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else { 
	if(nali > 1 && have_cons)
	  esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, they all must.", nali); 
      }

      if(do_info) { 
	printf("# Per-sequence basepair information:\n"); 
	printf("# Alignment file: %s\n", alifile);
	printf("# Alignment idx:  %d\n", nali);
	if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); }
	if(have_cons) { 
	  printf("#\n");
	  printf("# indibp: number of basepairs in the individual sequence SS annotation\n");
	  printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n");
	  printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n");
	  printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n");
	  printf("#\n");
	  printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n");
	  printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n");
	  printf("#\n");
	  printf("# %-*s  %6s  %6s  %6s  %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct");
	  printf("# %-*s  %6s  %6s  %6s  %6s\n", namewidth, namedashes, "------", "------", "-----", "------");
	}
	else { 
	  printf("# %-*s  %6s\n", namewidth, "seqname", "nbp");
	  printf("# %-*s  %6s\n", namewidth, namedashes, "------");
	}
      }

      nindi_read = 0;
      for (a = 0; a < msa->nseq; a++) { 
	if(msa->ss != NULL && msa->ss[a] != NULL) { 
	  if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { 
	    esl_fatal("SS annotation for sequence %d, aln %d  is invalid.\n", (a+1), nali);
	  }
	  nindi_read++;
	  for(i = 1; i <= msa->alen; i++) { 
	    if(i < cur_ct[i]) { 
	      bp[i][cur_ct[i]]++;
	      if(bp[i][cur_ct[i]] == 1) { 
		nmates_l2r[i]++;
		nmates_r2l[cur_ct[i]]++;
	      }
	    }
	  }

	  for(i = 1; i <= msa->alen; i++) { 
	    if(cur_ct[i] != 0 && i < cur_ct[i]) { 
	      if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++;
	      if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { 
		if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); }
		nconflictsA[a]++;
		/* indi bp i:cur_ct[i] conflicts with i:xcons_ct[i] */
		removebp[i]           = TRUE;
		removebp[xcons_ct[i]] = TRUE;
	      }
	      else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { 
		if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); }
		nconflictsA[a]++;
		/* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */
		removebp[cur_ct[i]] = TRUE;
		removebp[xcons_ct[cur_ct[i]]] = TRUE;
	      }
	      else nconsistentA[a]++;
	    }		  
	  }
	  if(nconflictsA[a] > lmax) { 
	    if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); 
	    nconflicts_list += nconflictsA[a];
	    nlist++;
	  }
	  nbpsA[a] = nconflictsA[a] + nconsistentA[a];
	  nconflicts_total += nconflictsA[a];
	  nconsistent_total += nconsistentA[a];
	  noverlaps_total += noverlapsA[a];
	  nbps_total += nbpsA[a];

	  if(do_info && have_cons)  printf("  %-*s  %6d  %6d  %6d  %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); 
	  if(do_info && !have_cons) printf("  %-*s  %6d\n", namewidth, msa->sqname[a], nbpsA[a]);
	  if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a;
	  if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a;
	  if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a;
	}
	else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); }
      }

      if(do_info && have_cons) { 
	if(nindi_read > 0) printf("\n"); 
	printf("  %-*s  %6d  %6d  %6d  %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); 
	if(nindi_read > 0) { 
	  printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? (float) noverlaps_total / (float) nbps_total : 0.);
	  printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.);
	  printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.);
	}
	else { 
	  printf("# No sequences in the alignment have GR SS annotation.\n");
	}
      }

      if(lfp != NULL) { 
	printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); 
      }

      /* determine number of gaps per alignment column */
      if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR;

      /* -x: determine max bp structure OR
       * -a: list all conflicts in individual structures */
      if(do_max || do_a) { 
	for(i = 1; i <= msa->alen; i++) { 
	  if(nmates_l2r[i] > 1) {/* list the conflicts */
	    has_conflict[i] = TRUE;
	    for(j = 1; j <= msa->alen; j++) { 
	      if(bp[i][j] > 0) { 
		if(do_a) printf("More than 1 right mates for left  mate %4d   %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); 
		has_conflict[j] = TRUE;
	      }
	    }
	  }
	}
	for(i = 1; i <= msa->alen; i++) { 
	  if(nmates_r2l[i] > 1) {/* list the conflicts */
	    has_conflict[i] = TRUE;
	    for(j = 1; j <= msa->alen; j++) { 
	      if(bp[j][i] > 0) { 
		if(do_a) printf("More than 1 left  mates for right mate %4d   %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); 
		has_conflict[j] = TRUE;
	      }
	    }
	  }
	}
	for(i = 1; i <= msa->alen; i++) { 
	  /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/
	  if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { 
	    j = i+1; 
	    while(bp[i][j] == 0) j++;
	    cons_ct[i] = j;
	    cons_ct[j] = i;
	  }
	}

	/* remove pseudoknotted bps greedily */
	for(i = 1; i <= msa->alen; i++) { 
	  j = cons_ct[i]; 
	  if(j != 0 && i < j) { 
	    for(i2 = i+1; i2 <= msa->alen; i2++) { 
	      j2 = cons_ct[i2];
	      if(j2 != 0 && i2 < j2) { 
		if((i2 < j) && (j < j2)) { 
		  /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/
		  /* note: remove both if they have equal number of sequences */
		  if(bp[i][j] <= bp[i2][j2]) { 
		    /*printf("rm %4d:%4d\n", i, j);*/
		    cons_ct[cons_ct[i]] = 0;
		    cons_ct[i]          = 0;
		  }
		  if(bp[i][j] >= bp[i2][j2]) { 
		    /*printf("rm %4d:%4d\n", i2, j2);*/
		    cons_ct[cons_ct[i2]] = 0;
		    cons_ct[i2]          = 0;
		  }
		}
	      }
	    }
	  }
	}
      }
      
      /***************************************/
      /*PARANOID, second check for knots 
      for(i = 1; i <= msa->alen; i++) { 
	j = cons_ct[i]; 
	if(j != 0 && i < j) { 
	  printf("BP: %4d:%4d\n", i, j);
	  for(i2 = 1; i2 <= msa->alen; i2++) { 
	    j2 = cons_ct[i2];
	    if(j2 != 0 && i2 < j2) { 
	      if((i2 < j) && (j < j2)) { 
		if((i < i2)) { 
		  printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);
		}
	      }
	    }
	  }
	}
      }
      ******************************************/

      /***************************************/
      /*PARANOID, check cons_ct for consistency
      for(i = 1; i <= msa->alen; i++) { 
	if(cons_ct[i] != 0) { 
	  if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); }
	}
      }
      */
      /*PARANOID, write out SS_cons 
      for(i = 1; i <= msa->alen; i++) { 
	if(i < cons_ct[i]) printf("<"); 
	else if(cons_ct[i] != 0) { printf(">"); }
	else printf(".");
      }
      printf("\n");
      */
      /***************************************/

      /* textize alignment */
      if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); 

      /* --fmin */
      if(do_fmin) { 
	/* define ss_cons */
	prev_nbps = -1;
	fthresh = 0.99;
	inconsistent_flag = pknot_flag = FALSE;
	printf("# Defining consensus structure:\n");
	printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n");
	printf("# if > <x> individual SS contain i:j as a pair\n");
	printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n");
	printf("# A consistent structure has each position involved in 0 or 1 basepairs.\n");
	printf("#\n");
	printf("# Alignment file: %s\n", alifile);
	printf("# Alignment idx:  %d\n", nali);
	printf("# Number of seqs: %d\n", msa->nseq);
	printf("#\n");
	printf("# %5s  %23s  %6s\n", "<x>", "nseq-required-with-bp", "numbps");
	printf("# %5s  %23s  %6s\n", "-----", "-----------------------", "------");
	while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { 
	  nbps = 0;
	  seqthresh = (int) (fthresh * msa->nseq);
	  /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/
	  esl_vec_ISet(cons_ct, msa->alen+1, 0);
	  for(i = 1; i <= msa->alen; i++) { 
	    for(j = i+1; j <= msa->alen; j++) { 
	      if(bp[i][j] > seqthresh) { 
		if(cons_ct[i] != 0 || cons_ct[j] != 0) { 
		  inconsistent_flag = TRUE;
		}
		/* check for pseudoknots */
		for(k = i+1; k < j; k++) { 
		  l = cons_ct[k];
		  if((k < l) && (l > j)) { 
		    pknot_flag = TRUE;
		  }
		  if((k > l) && (l != 0) && (l < i)) { 
		    pknot_flag = TRUE;
		  }
		}
		cons_ct[i] = j;
		cons_ct[j] = i;
		nbps++;
	      }
	    }
	  }
	  if(inconsistent_flag) 
	    printf("  %.3f  %23d  %s\n", fthresh, seqthresh+1, "inconsistent");
	  else if(pknot_flag) 
	    printf("  %.3f  %23d  %s\n", fthresh, seqthresh+1, "pseudoknotted");
	  else { 
	    if(nbps != prev_nbps) { 
	      printf("  %.3f  %23d  %6d\n", fthresh, seqthresh+1, nbps);
	    }
	    fmin = fthresh;
	  }
	  fthresh -= 0.01;
	  prev_nbps = nbps;
	}
	fthresh = fmin;
	esl_vec_ISet(cons_ct, msa->alen+1, 0);
      }

      /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */
      if(do_ffreq || do_fmin) { 
	if(do_fmin)  { printf("#\n# <x> determined to be %.3f\n", fthresh); }
	if(do_ffreq) { 
	  printf("# Defining consensus structure:\n");
	  printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n");
	  printf("# if > %f individual SS contain i:j as a pair\n", fthresh);
	}
	esl_vec_ISet(cons_ct, msa->alen+1, 0);
	/* define ss_cons */
	  seqthresh = (int) (fthresh * msa->nseq);
	  /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/
	  for(i = 1; i <= msa->alen; i++) { 
	    for(j = i+1; j <= msa->alen; j++) { 
	      if(bp[i][j] > seqthresh) { 
		if(cons_ct[i] != 0) { 
		  esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j);
		}
		if(cons_ct[j] != 0) { 
		  esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j);
		}
		cons_ct[i] = j;
		cons_ct[j] = i;
	      }
	    }
	  }
	}

      /* -r: redefine consensus struct by removing any bps that conflict with individual structures */
      if(do_remove_bps) { 
	for(i = 1; i <= msa->alen; i++) { 
	  if(!(removebp[i])) {
	    cons_ct[i]          = xcons_ct[i];
	    cons_ct[cons_ct[i]] = i;
	  }
	  else {
	    printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]);
	    cons_ct[xcons_ct[i]] = 0; 
	    cons_ct[i]           = 0; 
	  }
	}
      }
      
      /* -c:     define consensus structure as indi sequence with highest number of consistent bps with structure  OR */
      /* --indi: define consensus structure as indi sequence <x> from --indi <x> */
      if(do_consistent || do_indi2cons) {
	if(do_indi2cons) { 
	  indi = esl_opt_GetString(go, "--indi");
	  for(a = 0; a < msa->nseq; a++) { 
	    if(strcmp(indi, msa->sqname[a]) == 0) break;
	  }
	  if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi);
	}
	else { /* do_consistent */
	  a = max_nconsistent_aidx;
	}
	if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]);
	if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { 
	  esl_fatal("Second pass... SS annotation for sequence %d, aln %d  is invalid.\n", (a), nali);
	}	
	printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]);
	if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) {
	  if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; }
	  if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR;
	  printf("# Defined new RF as %s sequence\n", msa->sqname[a]);
	}
      }
      
      /* write out alignment with new SS_cons */
      if(do_newcons) { 
	if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR;
	if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; }
	if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR;
	status = esl_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM));
	if      (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n");
	else if (status != eslOK)   esl_fatal("Writing alignment file failed with error %d\n", status);
      }
      
      free(sscons);
      free(cur_ct);
      free(cons_ct);
      free(xcons_ct);
      for(i = 1; i <= msa->alen; i++) free(bp[i]);
      free(bp);
      esl_msa_Destroy(msa);
    }
  if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);

  /* Cleanup, normal return
   */
  if(lfp != NULL) fclose(lfp);
  if(ofp != NULL) { 
    printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o"));
    fclose(ofp);
  }
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  if(afp) esl_msafile_Close(afp);
  if(go)  esl_getopts_Destroy(go);
  if(msa) esl_msa_Destroy(msa);
  if(lfp) fclose(lfp);
  if(ofp) fclose(ofp);
  esl_fatal("ERROR\n");
  return 1;
  
}
コード例 #10
0
ファイル: esl-compalign.c プロジェクト: EddyRivasLab/easel
int
main(int argc, char **argv)
{
  ESL_GETOPTS *go;		/* application configuration       */
  int          kstatus, tstatus;/* return code from Easel routine  */
  int          fmt;		/* expected format of kfile, tfile */
  char        *kfile, *tfile;   /* known, test structure file      */
  ESL_MSAFILE *kfp, *tfp;       /* open kfile, tfile               */
  ESL_MSA     *ka,  *ta; 	/* known, trusted alignment        */
  int64_t      klen, tlen;	/* lengths of dealigned seqs       */
  int          i;		/* counter over sequences          */
  int          apos;		/* counter over alignment columns  */
  int          rfpos;		/* counter over consensus (non-gap RF) columns  */
  int       is_rfpos;            /* TRUE if current apos is a consensus pos, FALSE if not */
  int          uapos;		/* counter over unaligned residue positions */
  int          nali;            /* number of alignment we're on in each file */

  int        **kp;              /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */
  int        **tp;              /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */
  /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert'
   * after non-gap RF position (x * -1) 
   */
  int        *km_pos;          /* [0..rflen] = x, in known aln,     number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int        *ki_pos;          /* [0..rflen] = x, in known aln,     number of residues inserted after non-gap RF column x */
  int        *tm_pos;          /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int        *ti_pos;          /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */
  int    *cor_tm_pos;          /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int    *cor_ti_pos;          /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */

  int        *km_seq;          /* [0..i..nseq-1] = x, in known aln,     number of residues aligned to non-gap RF columns in seq i; */
  int        *ki_seq;          /* [0..i..nseq-1] = x, in known aln,     number of residues inserted in seq i */
  int        *tm_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */
  int        *ti_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */
  int    *cor_tm_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */
  int    *cor_ti_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */

  int     *seqlen;             /* [0..i..nseq-1] = x, unaligned seq i has length x */
  ESL_ALPHABET *abc = NULL;    /* alphabet for all alignments */
  int      rflen, t_rflen;     /* non-gap RF length (consensus lengths) */
  int   status;
  char *namedashes;
  int ni;
  int namewidth = 8; /* length of 'seq name' */
  int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */
  char *mask = NULL;
  int masklen;
  ESL_DSQ *ks;
  ESL_DSQ *ts;
  FILE *dfp = NULL; /* for --c2dfile */

  /* variables needed for -p and related options */
  int do_post = FALSE; /* TRUE if -p enabled */
  int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */
  int p;               /* counter over integerized posteriors */
  int *ptm = NULL;     /* [0..p..10] number of total   matches with posterior value p (10="*")*/
  int *pti = NULL;     /* [0..p..10] number of total   inserts with posterior value p */
  int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */
  int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */
  int npostvals = 11;  /* number of posterior values 0-9, * */
  int ppidx;           /* index of PP */
  char ppchars[11] = "0123456789*";
  int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */
  // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti;       /* total counts of posteriors */
  // int tot_incor_ptm,tot_incor_pti;                      // SRE: commented out; don't seem to be used; need to silence compiler warning
  char errbuf[eslERRBUFSIZE];

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      exit(EXIT_SUCCESS);
    }

  if (esl_opt_ArgNumber(go) != 2) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  kfile = esl_opt_GetArg(go, 1);
  tfile = esl_opt_GetArg(go, 2);
  
  fmt = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the two Stockholm files.
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus);
  if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus);

  do_post = esl_opt_GetBoolean(go, "-p");

  /* read the mask file if --p-mask is enabled */
  if(! esl_opt_IsDefault(go, "--p-mask")) { 
    if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf);
  }
  /* open the c2dfile for output, if nec */
  if (esl_opt_IsOn(go, "--c2dfile")) { 
    if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile"));
  }

  /***********************************************
   * Do alignment comparisons, one seq at a time;
   * this means looping over all seqs in all alignments.
   ***********************************************/
  nali = 0;
  while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF)
    {
      if (  kstatus                               != eslOK) esl_msafile_ReadFailure(kfp, kstatus);
      if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus);

      nali++;
      if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); 

      /* Sanity check on alignment
       */
      if (ka->nseq != ta->nseq)
	esl_fatal("trusted, test alignments don't have same seq #\n");
      if (ka->rf == NULL)
	esl_fatal("trusted alignment has no reference annotation\n");
      if (ta->rf == NULL)
	esl_fatal("test alignment has no reference annotation\n");

      /* make sure the sequences are all identical */
      ESL_ALLOC(seqlen, sizeof(int) * ka->nseq);
      for(i = 0; i < ka->nseq; i++) { 
	if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); 
	ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2));
	memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ));
	esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen);

	ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2));
	memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ));
	esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen);

	if (tlen != klen)
	  esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in the trusted alignment.", (i+1), klen, tlen);

	if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) 
	  esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]);

	seqlen[i] = tlen;
	free(ks);
	free(ts);
      }

      /* determine non-gap RF length */
      rflen = 0;
      for(apos = 1; apos <= ka->alen; apos++) { 
	if((! esl_abc_CIsGap    (ka->abc, ka->rf[apos-1])) && 
	   (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++;
      }
      t_rflen = 0;
      for(apos = 1; apos <= ta->alen; apos++) { 
	if((! esl_abc_CIsGap       (ta->abc, ta->rf[apos-1])) && 
	   (! esl_abc_CIsMissing   (ta->abc, ta->rf[apos-1]))) t_rflen++;
      }
      if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen);

      /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */
      if(do_post) { 
	if(! esl_opt_IsDefault(go, "--p-mask")) {
	  if(masklen != rflen) { 
	    esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen);
	  }
	}
	if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists");
	ESL_ALLOC(ptm,     sizeof(int) * npostvals);
	ESL_ALLOC(pti,     sizeof(int) * npostvals);
	ESL_ALLOC(cor_ptm, sizeof(int) * npostvals);
	ESL_ALLOC(cor_pti, sizeof(int) * npostvals);
	esl_vec_ISet(ptm, npostvals, 0);
	esl_vec_ISet(pti, npostvals, 0);
	esl_vec_ISet(cor_ptm, npostvals, 0);
	esl_vec_ISet(cor_pti, npostvals, 0);
      }

      /* allocate and initialize our counters */
      ESL_ALLOC(kp, sizeof(int *) * ka->nseq);
      ESL_ALLOC(tp, sizeof(int *) * ta->nseq);
      for(i = 0; i < ka->nseq; i++) { 
	ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1));
	ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1));
	esl_vec_ISet(kp[i], seqlen[i]+1, -987654321);
	esl_vec_ISet(tp[i], seqlen[i]+1, -987654321);
      }

      ESL_ALLOC(km_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(tm_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1));
      esl_vec_ISet(km_pos, rflen+1, 0);
      esl_vec_ISet(ki_pos, rflen+1, 0);
      esl_vec_ISet(tm_pos, rflen+1, 0);
      esl_vec_ISet(ti_pos, rflen+1, 0);
      esl_vec_ISet(cor_tm_pos, rflen+1, 0);
      esl_vec_ISet(cor_ti_pos, rflen+1, 0);

      ESL_ALLOC(km_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq);
      esl_vec_ISet(km_seq, ka->nseq, 0);
      esl_vec_ISet(ki_seq, ka->nseq, 0);
      esl_vec_ISet(tm_seq, ka->nseq, 0);
      esl_vec_ISet(ti_seq, ka->nseq, 0);
      esl_vec_ISet(cor_tm_seq, ka->nseq, 0);
      esl_vec_ISet(cor_ti_seq, ka->nseq, 0);

      /* determine non-gap RF location of each residue in known alignment */
      for(i = 0; i < ka->nseq; i++) { 
	uapos = rfpos = 0;
	for(apos = 1; apos <= ka->alen; apos++) { 
	  is_rfpos = FALSE;
	  if((! esl_abc_CIsGap       (ka->abc, ka->rf[apos-1])) &&
	     (! esl_abc_CIsMissing   (ka->abc, ka->rf[apos-1]))) { 
	    rfpos++; is_rfpos = TRUE;
	  }
	  if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { 
	    uapos++;
	    kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos);
	    if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; }
	    else         { ki_pos[rfpos]++; ki_seq[i]++; }
	  }
	}
      }

      /* determine non-gap RF location of each residue in predicted alignment */
      for(i = 0; i < ta->nseq; i++) { 
	uapos = rfpos = 0;
	for(apos = 1; apos <= ta->alen; apos++) { 
	  is_rfpos = FALSE;
	  if((! esl_abc_CIsGap       (abc, ta->rf[apos-1])) && 
	     (! esl_abc_CIsMissing   (abc, ta->rf[apos-1]))) { 
	    rfpos++; is_rfpos = TRUE;
	    if(do_post) { 
	      do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE;
	    }
	  }
	  if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { 
	    uapos++;
	    tp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos);
	    if(do_post) { 
	      if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos);
	      ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]);
	      if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos);
	    }
	    if(is_rfpos) { 
	      tm_pos[rfpos]++; tm_seq[i]++; 
	      if(do_post_for_this_rfpos) ptm[ppidx]++;
	    }
	    else { 
	      ti_pos[rfpos]++; ti_seq[i]++; 
	      if(do_post) pti[ppidx]++;
	    }
	    if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */
	      if(is_rfpos) { 
		cor_tm_seq[i]++; cor_tm_pos[rfpos]++; 
		if(do_post_for_this_rfpos) cor_ptm[ppidx]++;
	      } 
	      else {
		cor_ti_seq[i]++; cor_ti_pos[rfpos]++; 
		if(do_post) cor_pti[ppidx]++;
	      } 
	    }
	  }
	}
      }
      if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { 
	/* print per sequence statistics */
	/* determine the longest name in msa */
	for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni]));
	ESL_ALLOC(namedashes, sizeof(char) * namewidth+1);
	namedashes[namewidth] = '\0';
	for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-';
	
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, "seq name", "len",    "match columns", "insert columns", "all columns");
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------");
	for(i = 0; i < ta->nseq; i++) { 
	  printf("  %-*s  %6d  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)\n", namewidth, ka->sqname[i], seqlen[i],
		 cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), 
		 cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. : ((float) cor_ti_seq[i] / (float) ki_seq[i]), 
		 (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); 
	}
	cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq);
	cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq);
	km = esl_vec_ISum(km_seq, ka->nseq);
	ki = esl_vec_ISum(ki_seq, ka->nseq);
	
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------");
	printf("# %-*s  %6s  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)\n",
	       namewidth, "*all*", "-", 
	       cor_tm, km, ((float) cor_tm / (float) km), 
	       cor_ti, ki, ((float) cor_ti / (float) ki), 
	       (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); 
	free(namedashes);
	for(i = 0; i < ka->nseq; i++) { 
	  free(kp[i]); 
	  free(tp[i]); 
	}
      }
      else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */
	printf("# %5s  %20s  %20s  %20s\n", "rfpos", "match", "insert", "both");
	printf("# %5s  %20s  %20s  %20s\n", "-----", "--------------------", "--------------------", "--------------------");
	for(rfpos = 0; rfpos <= rflen; rfpos++) { 
	  printf("  %5d  %4d / %4d  (%.3f)  %4d / %4d  (%.3f)  %4d / %4d  (%.3f)\n", rfpos, 
		 
		 cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), 
		 cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. : ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), 
		 (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); 
	}
      }
      else if(do_post) { /* do posterior output */
	if(mask == NULL) { 
	  printf("# %2s  %29s  %29s\n", "",   "      match columns          ", "      insert columns         ");
	  printf("# %2s  %29s  %29s\n", "",   "-----------------------------", "-----------------------------") ;
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "PP", "ncorrect", "ntotal",   "fractcor",  "ncorrect", "ntotal",   "fractcor");
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------");
	}
	else { 
	  printf("# %2s  %29s  %29s\n", "",   " match columns within mask   ", "      insert columns         ");
	  printf("# %2s  %29s  %29s\n", "",   "-----------------------------", "-----------------------------") ;
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "PP", "ncorrect", "ntotal",   "fractcor",  "ncorrect", "ntotal",   "fractcor");
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------");
	}
	cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0;
	//tot_ptm = esl_vec_ISum(ptm, npostvals);
	//tot_pti = esl_vec_ISum(pti, npostvals);
	//tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals);
	//tot_cor_pti = esl_vec_ISum(cor_pti, npostvals);
	//tot_incor_ptm = tot_ptm - tot_cor_ptm;
	//tot_incor_pti = tot_pti - tot_cor_pti;
	for(p = (npostvals-1); p >= 0; p--) { 
	  cm_cor_ptm += cor_ptm[p];
	  cm_cor_pti += cor_pti[p];
	  cm_ptm     += ptm[p];
	  cm_pti     += pti[p];
	  cm_incor_ptm += ptm[p] - cor_ptm[p];
	  cm_incor_pti += pti[p] - cor_pti[p];
	  printf("  %2c  %8d / %8d (%.5f)  %8d / %8d (%.5f)\n", 
		 ppchars[p], cor_ptm[p], ptm[p], 
		 (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], 
		 cor_pti[p], pti[p], 
		 (pti[p] == 0) ? 0. : (float) cor_pti[p] / (float) pti[p]);
	}
      }

      /* handle --c2dfile */
      if (dfp != NULL) { 
	/* match stats, 4 fields, CMYK color values */
	for(rfpos = 1; rfpos <= rflen; rfpos++) { 
	  if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	  }
	  else { 
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		    0., /* cyan */
		    1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */
		    1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */
		    0.);
	  }		 
	}	
	fprintf(dfp, "//\n");
	/* insert stats, 4 fields, CMYK color values */
	rfpos = 0; /* special case, combine insert posn 0 and 1 together */
	if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	  fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	}
	else { 
	  fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		  0., /* cyan */
		  1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */
		  0.,
		  0.);
	}
	/* insert stats posn 2..rflen */
	for(rfpos = 2; rfpos <= rflen; rfpos++) { 
	  if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	  }
	  else { 
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		    0., /* cyan */
		    1. - ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */
		    0.,
		    0.);
	  }
	} 
	fprintf(dfp, "//\n");
      }
      
      if(ptm != NULL) free(ptm);
      if(pti != NULL) free(pti);
      if(cor_ptm != NULL) free(cor_ptm);
      if(cor_ptm != NULL) free(cor_pti);
      free(kp);
      free(tp);
      free(km_seq);
      free(ki_seq);
      free(tm_seq);
      free(ti_seq);
      free(cor_tm_seq);
      free(cor_ti_seq);
      free(km_pos);
      free(ki_pos);
      free(tm_pos);
      free(ti_pos);
      free(cor_tm_pos);
      free(cor_ti_pos);
      free(seqlen);
      esl_msa_Destroy(ka);
      esl_msa_Destroy(ta);
    }

  if(mask != NULL) free(mask);
  if(dfp != NULL) { 
    fclose(dfp);
    printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile"));
  }
	   
  if(abc) esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  esl_msafile_Close(tfp);
  esl_msafile_Close(kfp);
  return 0;

 ERROR:
  return status;
}
コード例 #11
0
ファイル: esl-alimap.c プロジェクト: ElofssonLab/TOPCONS2
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  ESL_ALPHABET *abc     = NULL;	/* biological alphabet             */
  char         *alifile1= NULL;	/* alignment 1 file name           */
  char         *alifile2= NULL;	/* alignment 2 file name           */
  int           fmt;		/* format code for alifiles        */
  ESLX_MSAFILE *afp1    = NULL;	/* open alignment file 1           */
  ESLX_MSAFILE *afp2    = NULL;	/* open alignment file 2           */
  ESL_MSA      *msa1    = NULL;	/* multiple sequence alignment 1   */
  ESL_MSA      *msa2    = NULL;	/* multiple sequence alignment 2   */
  int           status;		/* easel return code               */
  char          errbuf[eslERRBUFSIZE*4];

  int  *msa1_to_msa2_map;       /* map from <msafile1> to <msafile2> */
  char *sub_msa1_to_msa2_mask;  /* with --sub the map from <msafile1> to <msafile2> in mask form */
  FILE *subfp = NULL;

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\nwhere basic options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 2) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile1 = esl_opt_GetArg(go, 1);
  alifile2 = esl_opt_GetArg(go, 2);

  fmt             = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the MSA files
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  if ( (status = eslx_msafile_Open(&abc, alifile1, NULL, fmt, NULL, &afp1)) != eslOK) eslx_msafile_OpenFailure(afp1, status);
  if ( (status = eslx_msafile_Open(&abc, alifile2, NULL, fmt, NULL, &afp2)) != eslOK) eslx_msafile_OpenFailure(afp2, status);

  /******************************************************************
   * Read first alignment from each file, we only use the first one 
   ******************************************************************/

  if ((status = eslx_msafile_Read(afp1, &msa1)) != eslOK) eslx_msafile_ReadFailure(afp1, status);
  if ((status = eslx_msafile_Read(afp2, &msa2)) != eslOK) eslx_msafile_ReadFailure(afp2, status);

  /* map the alignments in msa1 and msa2 */
  if(! esl_opt_IsOn(go, "--submap")) { 
    if((status = map_msas(go, errbuf, msa1, msa2, &msa1_to_msa2_map)) != eslOK) goto ERROR;
    free(msa1_to_msa2_map);
  }

  /* --submap: if nec, map <msafile1> to a subset of it's own columns in <msafile2>  */
  else { /* --submap was enabled */
    if ((subfp = fopen(esl_opt_GetString(go, "--submap"), "w")) == NULL) 
      ESL_FAIL(eslFAIL, errbuf, "Failed to open --submap output file %s\n", esl_opt_GetString(go, "--submap"));
    if((status = map_sub_msas(go, errbuf, msa1, msa2, &sub_msa1_to_msa2_mask)) != eslOK) goto ERROR;
    fprintf(subfp, "%s\n", sub_msa1_to_msa2_mask);
    fclose(subfp);
    subfp = NULL;
    printf("# Mask of 1/0s with 1 indicating aln column in %s maps to a column in %s saved to file %s.\n", alifile1, alifile2, esl_opt_GetString(go, "--submap")); 
    free(sub_msa1_to_msa2_mask);
  }
  
  /* Cleanup, normal return
   */
  eslx_msafile_Close(afp1);
  eslx_msafile_Close(afp2);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  return 0;
  
 ERROR:
  if (afp1)   eslx_msafile_Close(afp1);
  if (afp2)   eslx_msafile_Close(afp2);
  if (go)     esl_getopts_Destroy(go);
  if (msa1)   esl_msa_Destroy(msa1);
  if (msa2)   esl_msa_Destroy(msa2);
  if (subfp)  fclose(subfp);
  esl_fatal(errbuf);
  return 1; /* never reached */
}
コード例 #12
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	               /* application configuration       */
  ESL_ALPHABET *abc     = NULL;      	       /* biological alphabet             */
  char         *alifile = NULL;	               /* alignment file name             */
  int           fmt     = eslMSAFILE_UNKNOWN;  /* format code for alifile         */
  ESLX_MSAFILE *afp     = NULL;		       /* open msa file                   */
  ESL_MSAFILE2 *old_afp = NULL;	               /* open msa file, legacy (--small) */
  ESL_MSA      *msa     = NULL;	               /* one multiple sequence alignment */
  int           nali;		               /* number of alignments read       */
  int           i;		               /* counter over seqs               */
  int64_t       alen;		               /* alignment length                */
  int           nseq;                          /* number of sequences in the msa */
  int64_t       rlen;		               /* a raw (unaligned) seq length    */
  int64_t       small, large;	               /* smallest, largest sequence      */
  int64_t       nres;		               /* total # of residues in msa      */
  double        avgid;		               /* average fractional pair id      */
  int           max_comparisons;               /* maximum # comparisons for avg id */
  int           do_stall;                      /* used to stall when debugging     */
  double      **abc_ct = NULL;                 /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */
  double     ***bp_ct  = NULL;                 /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair *
						* count of each possible basepair over all seqs basepairs are indexed by 'i' the minimum *
						* of 'i:j' for a pair between i and j, where i < j. */
  double       **pp_ct = NULL;                 /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */  
  int  *i_am_rf = NULL;                        /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */
  int  *rf2a_map = NULL;                       /* [0..rfpos..rflen-1] = apos,                     
						* apos is the alignment position (0..msa->alen-1) that     
						* is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */
  int rflen = -1;                              /* nongap RF length */
  char          errbuf[eslERRBUFSIZE];
  int           status;		               /* easel return code               */

  /* optional output files */
  FILE *iinfofp  = NULL; /* output file for --iinfo */
  FILE *pcinfofp = NULL; /* output file for --pcinfo */
  FILE *psinfofp = NULL; /* output file for --psinfo */
  FILE *rinfofp  = NULL; /* output file for --rinfo */
  FILE *icinfofp = NULL; /* output file for --icinfo */
  FILE *listfp   = NULL; /* output file for --list */
  FILE *cinfofp  = NULL; /* output file for --cinfo */
  FILE *bpinfofp = NULL; /* output file for --bpinfo */
  int use_weights;       /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */
  int weights_exist;     /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */ 

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\n optional output files:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile = esl_opt_GetArg(go, 1);

  if (esl_opt_IsOn(go, "--informat") &&
      (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); 
    
  if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM) esl_fatal("--small requires --informat pfam\n"); 

  max_comparisons = 1000;

  do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
  while (do_stall); 

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  /* We'd like to get rid of the legacy msafile interface, but it
   * includes small memory functionality for Pfam format which we have
   * to replace first. For now, use both interfaces, new and legacy
   */
  if ( esl_opt_GetBoolean(go, "--small") )
    {
      if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified.");

      status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp);
      if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
      else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
      else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);
    }
  else
    {
      if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
	eslx_msafile_OpenFailure(afp, status);
    }

  /**************************************
   * Open optional output files, as nec *
   **************************************/
  /* determine name for first list file, if nec */
  if( esl_opt_IsOn(go, "--list")) {
    if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL) 
      esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list"));
  }
  if( esl_opt_IsOn(go, "--icinfo")) {
    if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo"));
  }
  if( esl_opt_IsOn(go, "--rinfo")) {
    if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo"));
  }
  if( esl_opt_IsOn(go, "--pcinfo")) {
    if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo"));
  }
  if( esl_opt_IsOn(go, "--psinfo")) {
    if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo"));
  }
  if( esl_opt_IsOn(go, "--iinfo")) {
    if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo"));
  }
  if( esl_opt_IsOn(go, "--cinfo")) {
    if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo"));
  }
  if( esl_opt_IsOn(go, "--bpinfo")) {
    if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo"));
  }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/

  if (esl_opt_GetBoolean(go, "-1")) {
    puts("#");
    if(! esl_opt_GetBoolean(go, "--small")) { 
      printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id");
      printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---");
    }
    else { 
      printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen");
      printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------");
    }
  }

  nali = 0;
  
  fmt = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format);

  while ( (status = ( esl_opt_GetBoolean(go, "--small") ? 
		      esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL) :
		      eslx_msafile_Read        (afp, &msa))) == eslOK)
    { 
      nali++;
      nres = 0;

      if (! esl_opt_GetBoolean(go, "--small")) { 
	nseq = msa->nseq;
	alen = msa->alen;
	small = large = -1;
	for (i = 0; i < msa->nseq; i++)
	  {
	    rlen  = esl_abc_dsqrlen(msa->abc, msa->ax[i]);
	    nres += rlen;
	    if (small == -1 || rlen < small) small = rlen;
	    if (large == -1 || rlen > large) large = rlen;
	  }

	esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid);
      }
      else { /* --small invoked */
	for(i = 0; i < alen; i++) nres += (int) esl_vec_DSum(abc_ct[i], abc->K);
      }

      if (esl_opt_GetBoolean(go, "-1")) 
	{
	  printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64, 
		 nali, 
		 msa->name,
		 eslx_msafile_DecodeFormat(fmt),
		 nseq,
		 alen,
		 nres);
	  if (! esl_opt_GetBoolean(go, "--small")) { 
	    printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n",
		   small,
		   large,
		   (double) nres / (double) msa->nseq,
		   100.*avgid);
	  }
	  else { 
	    printf(" %10.1f\n", (double) nres / (double) nseq);
	  }
	}
      else
	{
	  printf("Alignment number:    %d\n",     nali);
	  if (msa->name != NULL)
	    printf("Alignment name:      %s\n",        msa->name); 
	  printf("Format:              %s\n",          eslx_msafile_DecodeFormat(fmt));
	  printf("Number of sequences: %d\n",          nseq);
	  printf("Alignment length:    %" PRId64 "\n", alen);
	  printf("Total # residues:    %" PRId64 "\n", nres);
	  if(! esl_opt_GetBoolean(go, "--small")) { 
	    printf("Smallest:            %" PRId64 "\n", small);
	    printf("Largest:             %" PRId64 "\n", large);
	  }
	  printf("Average length:      %.1f\n",        (double) nres / (double) nseq);
	  if(! esl_opt_GetBoolean(go, "--small")) { 
	    printf("Average identity:    %.0f%%\n",      100.*avgid); 
	  }
	  printf("//\n");
	}

      /* Dump data to optional output files, if nec */
      if(esl_opt_IsOn(go, "--list")) {
	if(! esl_opt_GetBoolean(go, "--small")) { 
	    /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */
	    for(i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]);
	}
      }

      /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions
       * and rf2a_map, a map of non-gap RF positions to overall alignment positions */
      if(msa->rf != NULL) {
	if((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK) esl_fatal(errbuf);
      }
      else i_am_rf = NULL;

      weights_exist = check_msa_weights(msa);
      use_weights   = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE;
      
      if( (! esl_opt_GetBoolean(go, "--small")) && 
	  (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo")  || esl_opt_IsOn(go, "--pcinfo") || 
	   esl_opt_IsOn(go, "--cinfo")  || esl_opt_IsOn(go, "--bpinfo")))  {
	/* collect counts of each residue and PPs (if they exist) from the msa */
	if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
	if((status = count_msa(msa, errbuf, nali, 
			       esl_opt_GetBoolean(go, "--noambig"), /* ignore ambiguous residues? */
			       esl_opt_GetBoolean(go, "--weight"),  /* use msa->wgt sequence weights? */
			       &abc_ct, 
			       ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL), /* get basepair counts? */
			       (msa->pp != NULL ? &pp_ct : NULL)))   /* get PP counts? */
	   != eslOK) esl_fatal(errbuf);
      }

      if( esl_opt_IsOn(go, "--icinfo")) {
	if((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--rinfo")) {
	if((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if(esl_opt_IsOn(go, "--pcinfo")) {
	if(pp_ct == NULL) esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
	if((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if(esl_opt_IsOn(go, "--psinfo")) {
	if(msa->pp == NULL) esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
	if((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--iinfo")) {
	if(msa->rf == NULL) esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali);
	if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
	if((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--cinfo")) {
	if((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--bpinfo")) {
	if(msa->ss_cons == NULL) esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali);
	if((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }

      esl_msa_Destroy(msa);
      if(abc_ct != NULL)   { esl_Free2D((void **) abc_ct, alen);          abc_ct   = NULL; }
      if(bp_ct != NULL)    { esl_Free3D((void ***) bp_ct, alen, abc->Kp); bp_ct    = NULL; }
      if(pp_ct != NULL)    { esl_Free2D((void **) pp_ct, alen);           pp_ct    = NULL; }
      if(i_am_rf != NULL)  { free(i_am_rf);                               i_am_rf  = NULL; }
      if(rf2a_map != NULL) { free(rf2a_map);                              rf2a_map = NULL; }
    }
  
  /* If an msa read failed, we've dropped out to here with an informative status code. 
   * we have to handle failures from new vs. legacy msa parsing differently 
   */
  if (esl_opt_GetBoolean(go, "--small")) 
    {
      if      (status == eslEFORMAT)  esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf);	
      else if (status != eslEOF)      esl_fatal("Alignment file read failed with error code %d\n", status);
      else if (nali   == 0)           esl_fatal("No alignments found in file %s\n", alifile);
    }
  else
    {
      if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status);
    }

  /* Cleanup, normal return
   */
  if(listfp != NULL) { 
    fclose(listfp);
    printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list"));
  }
  if(icinfofp != NULL) { 
    fclose(icinfofp);
    printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo")); 
  }
  if(rinfofp != NULL) { 
    fclose(rinfofp);
    printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo")); 
  }
  if(pcinfofp != NULL) { 
    fclose(pcinfofp);
    printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo")); 
  }
  if(psinfofp != NULL) { 
    fclose(psinfofp);
    printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo")); 
  }
  if(iinfofp != NULL) { 
    printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo")); 
    fclose(iinfofp);
  }
  if(cinfofp != NULL) { 
    printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo")); 
    fclose(cinfofp);
  }
  if(bpinfofp != NULL) { 
    printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo")); 
    fclose(bpinfofp);
  }


  if (afp)     eslx_msafile_Close(afp);
  if (old_afp) esl_msafile2_Close(old_afp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}