/*
 * Small test driver for the HSP scan code.
 *
 *   argv[1]  FASTA file; every sequence in it is loaded into a ghash
 *            SeqLookupInterface
 *   argv[2]  FASTA file holding the single query sequence
 *
 * The query is scanned against the index and the resulting hit list is
 * written to stdout in pseudo-blast format.
 *
 * Returns 0 on success, 1 on a usage or input error.
 */
int main(int argc,char ** argv)
{
  SequenceDB * db;
  Sequence * seq;
  SeqLookupInterface * sli;
  HSPScanInterface * hsi;
  LinearHSPmanager * lm;
  HitList * hl;
  CompMat * mat;
  int ret;
  /* start from all-zero: only min_score and max_results are set explicitly */
  HSPScanInterfacePara p = {0};

  /* both file names are read from argv below, so they must be present */
  if( argc < 3 ) {
    fprintf(stderr,"Usage: %s <database fasta file> <query fasta file>\n",
            argc > 0 ? argv[0] : "program");
    return 1;
  }

  p.min_score = 30;
  p.max_results = 200;

  db = single_fasta_SequenceDB(argv[1]);
  if( db == NULL ) {
    fprintf(stderr,"Could not open sequence database %s\n",argv[1]);
    return 1;
  }

  mat = read_Blast_file_CompMat("blosum62.bla");
  if( mat == NULL ) {
    fprintf(stderr,"Could not read comparison matrix blosum62.bla\n");
    return 1;
  }

  sli = new_ghash_SeqLookupInterface();

  /* index every database sequence */
  for(seq = init_SequenceDB(db,&ret); seq != NULL;seq = get_next_SequenceDB(db) ) {
    load_aa_flat_Sequence_SeqLookupInterface(sli,hard_link_Sequence(seq));
  }

  seq = read_fasta_file_Sequence(argv[2]);
  if( seq == NULL ) {
    fprintf(stderr,"Could not read query sequence from %s\n",argv[2]);
    return 1;
  }

  hsi = Wise2_new_one_off_HSPScanInterface(sli,mat,20,10);
  if( hsi == NULL ) {
    fprintf(stderr,"Could not build HSP scan interface\n");
    return 1;
  }

  /* the scan itself goes through the interface's function pointer */
  lm = (*hsi->scan_query)(hsi->data,seq,&p);
  if( lm == NULL ) {
    fprintf(stderr,"Scan of query from %s returned no result set\n",argv[2]);
    return 1;
  }

  hl = Wise2_HitList_from_LinearHSPmanager(lm);

  Wise2_write_pseudoblast_HitList(hl,stdout);

  return 0;
}
/*
 * build_objects
 *
 * Turns the file-scope option strings into the file-scope objects used
 * by the rest of the program: start/end policy, target and query
 * coordinates, gap penalties, the genomic sequence (fasta or EMBL),
 * the protein or HMM query, error rates, the model switches, random
 * models, gene statistics, comparison matrix, codon table and the
 * output stream.
 *
 * A failing step is reported through warn() and the function carries
 * on with the remaining steps, so one call reports as many problems
 * as it can.
 *
 * Returns TRUE if every step succeeded, FALSE otherwise.
 */
boolean build_objects(void)
{
  boolean ret = TRUE;
  Protein * pro_temp;
  Genomic * gen_temp;

  startend = threestatemodel_mode_from_string(startend_string);
  if( startend == TSM_unknown ) {
    warn("String %s was unable to converted into a start/end policy\n",startend_string);
    ret = FALSE;
  }

  /*
   * In the warnings below the %s argument has to be the option string
   * itself; handing the parsed int to %s is undefined behaviour.
   */
  if( tstart_str != NULL ) {
    if( is_integer_string(tstart_str,&tstart) == FALSE || tstart < 0) {
      warn("Could not make %s out as target start",tstart_str);
      ret = FALSE;
    }
  }

  if( tend_str != NULL ) {
    if( is_integer_string(tend_str,&tend) == FALSE || tend < 0) {
      warn("Could not make %s out as target end",tend_str);
      ret = FALSE;
    }
  }

  if( is_integer_string(gap_str,&gap) == FALSE ) {
    warn("Could not make %s out as gap penalty (must be integer at the moment)",gap_str);
    ret = FALSE;
  }

  if( is_integer_string(ext_str,&ext) == FALSE ) {
    warn("Could not make %s out as gap extension penalty (must be integer at the moment)",ext_str);
    ret = FALSE;
  }

  /* load the genomic sequence, either straight from fasta or out of an EMBL entry */
  if( is_embl == FALSE ) {
    if( (gen = read_fasta_file_Genomic(dna_seq_file,length_of_N)) == NULL ) {
      ret = FALSE;
      warn("Could not read genomic sequence in %s",dna_seq_file);
    }
  } else {
    embl = read_EMBL_GenomicRegion_file(dna_seq_file);
    if( embl == NULL ) {
      warn("Could not read genomic EMBL file in %s",dna_seq_file);
      gen = NULL;
      ret = FALSE;
    } else {
      gen = hard_link_Genomic(embl->genomic);
    }
  }

  if( gen != NULL ) {

    if( tstart != -1 || tend != -1 ) {
      if( tstart == -1 )
        tstart = 0;
      if( tend == -1 )
        tend = gen->baseseq->len;
      /*
       * NOTE(review): when only an end was given, tstart is 0 here and
       * truncate_Genomic is called with a start of -1 - confirm that is
       * what truncate_Genomic expects.
       */
      gen_temp = truncate_Genomic(gen,tstart-1,tend);
      if( gen_temp == NULL ) {
        ret = FALSE;
      } else {
        free_Genomic(gen);
        gen = gen_temp;
      }
    }

    if( reverse == TRUE ) {
      if( tstart > tend ) {
        warn("You have already reversed the DNA by using %d - %d truncation. Re-reversing",tstart,tend);
      }

      gen_temp = reverse_complement_Genomic(gen);
      free_Genomic(gen);
      gen = gen_temp;
    }
  }

  /*
   * Can't truncate on GenomicRegion (for good reasons!).
   * but we want only a section of the EMBL file to be used
   *
   * So... swap genomic now. Positions in EMBL are still valid,
   * however - some genes will lose their sequence, which will be damaging. ;)
   *
   * embl is NULL when the EMBL file could not be read, so it has to be
   * checked before it is dereferenced.
   */
  if( is_embl && embl != NULL ) {
    free_Genomic(embl->genomic);
    embl->genomic = hard_link_Genomic(gen); /* pointer could be dead anyway ;) */
  }

  if( target_abs == TRUE ) {
    if( is_embl == TRUE ) {
      warn("Sorry you can't both use absolute positioning and EMBL files as I can't cope with all the coordinate remapping. You'll have to convert to fasta.");
      ret =  FALSE;
    }

    /* gen is NULL when the sequence could not be loaded; that was already reported */
    if( gen != NULL ) {
      gen->baseseq->offset = 1;
      gen->baseseq->end  = strlen(gen->baseseq->seq);
    }
  }

  /* no algorithm given: default depends on whether the query is a HMM */
  if( alg_str == NULL ) {
    if( use_tsm == TRUE ) {
      alg_str = "623L";
    } else {
      alg_str = "623";
    }
  }
  alg = gwrap_alg_type_from_string(alg_str);

  if( qstart_str != NULL ) {
    if( is_integer_string(qstart_str,&qstart) == FALSE || qstart < 0) {
      warn("Could not make %s out as query start",qstart_str);
      ret = FALSE;
    }
  }

  if( qend_str != NULL ) {
    if( is_integer_string(qend_str,&qend) == FALSE || qend < 0) {
      warn("Could not make %s out as query end",qend_str);
      ret = FALSE;
    }
  }

  if( use_tsm == FALSE ) {
    /** query is a protein sequence **/
    if( startend != TSM_default && startend != TSM_global && startend != TSM_local && startend != TSM_endbiased) {
      warn("Proteins can only have local/global/endbias startend policies set, not %s",startend_string);
      ret = FALSE;
    }
    if( (pro = read_fasta_file_Protein(protein_file)) == NULL ) {
      ret = FALSE;
      warn("Could not read Protein sequence in %s",protein_file);
    } else {

      if( qstart != -1 || qend != -1 ) {
        if( qstart == -1 )
          qstart = 0;
        if( qend == -1 )
          qend = pro->baseseq->len;

        pro_temp = truncate_Protein(pro,qstart-1,qend);
        if( pro_temp == NULL ) {
          ret = FALSE;
        } else {
          free_Protein(pro);
          pro = pro_temp;
        }
      }
    }
  } else {
    /** using a HMM **/

    tsm = HMMer2_read_ThreeStateModel(hmm_file);

    if( tsm == NULL ) {
      warn("Could not read hmm from %s\n",hmm_file);
      ret = FALSE;
    } else {

      display_char_in_ThreeStateModel(tsm);

      /* an explicit name on the command line replaces the one in the file */
      if( hmm_name != NULL ) {
        if( tsm->name != NULL )
          ckfree(tsm->name);
        tsm->name = stringalloc(hmm_name);
      }

      /** have to set start/end **/
      set_startend_policy_ThreeStateModel(tsm,startend,30,0.1);
    }
  }

  if( main_block_str != NULL ) {
    if( is_integer_string(main_block_str,&main_block) == FALSE ) {
      warn("Could not get maximum main_block number %s",main_block_str);
      ret = FALSE;
    }
  }

  /* as above: report the offending string, not the (double) destination */
  if( is_double_string(subs_string,&subs_error) == FALSE ) {
    warn("Could not convert %s to a double",subs_string);
    ret = FALSE;
  }

  if( is_double_string(indel_string,&indel_error) == FALSE ) {
    warn("Could not convert %s to a double",indel_string);
    ret = FALSE;
  }

  if( is_double_string(allN_string,&allN) == FALSE ) {
    warn("Could not convert %s to a double",allN_string);
    ret = FALSE;
  }

  if( strcmp(cfreq_string,"model") == 0 ) {
    model_codon = TRUE;
  } else if ( strcmp(cfreq_string,"flat") == 0 ) {
    model_codon = FALSE;
  } else {
    warn("Cannot interpret [%s] as a codon modelling parameter\n",cfreq_string);
    ret = FALSE;
  }

  if( strcmp(splice_string,"model") == 0 ) {
    model_splice = TRUE;
  } else if ( strcmp(splice_string,"flat") == 0 ) {
    model_splice = FALSE;
    gmp->use_gtag_splice = TRUE;
  } else {
    warn("Cannot interpret [%s] as a splice modelling parameter\n",splice_string);
    ret = FALSE;
  }

  if( strcmp(null_string,"syn") == 0 ) {
    use_syn = TRUE;
  } else if ( strcmp(null_string,"flat") == 0 ) {
    use_syn = FALSE;
  } else {
    warn("Cannot interpret [%s] as a null model string\n",null_string);
    ret = FALSE;
  }

  if( strcmp(intron_string,"model") == 0 ) {
    use_tied_model = FALSE;
  } else if ( strcmp(intron_string,"tied") == 0 ) {
    use_tied_model = TRUE;
  } else {
    warn("Cannot interpret [%s] as a intron tieing switch\n",intron_string);
    ret = FALSE;
  }

  if( (rm = default_RandomModel()) == NULL) {
    warn("Could not make default random model\n");
    ret = FALSE;
  }

  if( use_new_stats == 0 ) {
    if( (gf = read_GeneFrequency21_file(gene_file)) == NULL) {
      ret = FALSE;
      warn("Could not read a GeneFrequency file in %s",gene_file);
    }
  } else {
    if( (gs = GeneStats_from_GeneModelParam(gmp)) == NULL ) {
      ret = FALSE;
      warn("Could not read gene statistics in %s",new_gene_file);
    }
  }

  /* a missing matrix is only an error when the query is a protein */
  if( (mat = read_Blast_file_CompMat(matrix_file)) == NULL) {
    if( use_tsm == TRUE ) {
      info("I could not read the Comparison matrix file in %s; however, you are using a HMM so it is not needed. Please set the WISECONFIGDIR or WISEPERSONALDIR variable correctly to prevent this message.",matrix_file);
    } else {
      warn("Could not read Comparison matrix file in %s",matrix_file);
      ret = FALSE;
    }
  }

  if( (ct = read_CodonTable_file(codon_file)) == NULL) {
    ret = FALSE;
    warn("Could not read codon table file in %s",codon_file);
  }

  if( (ofp = openfile(output_file,"W")) ==  NULL) {
    warn("Could not open %s as an output file",output_file);
    ret = FALSE;
  }

  rmd = RandomModelDNA_std();
  return ret;

}
Exemple #3
0
/*
 * build_objects
 *
 * Turns the file-scope option strings into the file-scope objects used
 * by the search: start/end policy, the DNA input (one cDNA or a
 * sequence database), random models, comparison matrix, gap penalties,
 * query coordinates, the ThreeStateDB built from the protein/HMM
 * input, cutoffs, error rates, algorithm types, codon table, output
 * stream, cDNA parser and codon mapper.
 *
 * A failing step is reported through warn() and the function carries
 * on with the remaining steps, so one call reports as many problems
 * as it can.
 *
 * Returns TRUE if every step succeeded, FALSE otherwise.
 */
boolean build_objects(void)
{
  boolean ret = TRUE;
  Protein * pro_temp;
  SequenceDB * psdb;

  startend = threestatemodel_mode_from_string(startend_string);
  if( startend == TSM_unknown ) {
    warn("String %s was unable to converted into a start/end policy\n",startend_string);
    ret = FALSE;
  }

  if( use_single_dna == TRUE ) {
    cdna = read_fasta_file_cDNA(dna_seq_file);
    if( cdna == NULL ) {
      warn("Could not open single dna sequence in %s",dna_seq_file);
      ret = FALSE;
    }
  } else {
    sdb = single_fasta_SequenceDB(dna_seq_file);
    if( sdb == NULL ) {
      warn("Could not build a sequence database on %s",dna_seq_file);
      ret = FALSE;
    }
  }

  /*
   * The random model is needed by the ThreeStateDB constructors below,
   * so it is made - and checked - once, here. Making a second one later
   * would only overwrite this pointer.
   */
  if( (rm = default_RandomModel()) == NULL) {
    warn("Could not make default random model\n");
    ret = FALSE;
  }

  /* a missing matrix is only an error when the query is not a HMM */
  if( (mat = read_Blast_file_CompMat(matrix_file)) == NULL) {
    if( use_tsm == TRUE ) {
      info("I could not read the Comparison matrix file in %s; however, you are using a HMM so it is not needed. Please set the WISECONFIGDIR or WISEPERSONALDIR variable correctly to prevent this message.",matrix_file);
    } else {
      warn("Could not read Comparison matrix file in %s",matrix_file);
      ret = FALSE;
    }
  }

  if( is_integer_string(gap_str,&gap) == FALSE ) {
    warn("Could not get gap string number %s",gap_str);
    ret = FALSE;
  }

  if( is_integer_string(ext_str,&ext) == FALSE ) {
    warn("Could not get ext string number %s",ext_str);
    ret = FALSE;
  }

  /*
   * In the warnings below the %s argument has to be the option string
   * itself; handing the parsed int to %s is undefined behaviour.
   */
  if( qstart_str != NULL ) {
    if( is_integer_string(qstart_str,&qstart) == FALSE || qstart < 0) {
      warn("Could not make %s out as query start",qstart_str);
      ret = FALSE;
    }
  }

  if( qend_str != NULL ) {
    if( is_integer_string(qend_str,&qend) == FALSE || qend < 0) {
      warn("Could not make %s out as query end",qend_str);
      ret = FALSE;
    }
  }

  if( aln_number_str != NULL ) {
    if( is_integer_string(aln_number_str,&aln_number) == FALSE || aln_number < 0) {
      warn("Weird aln number string %s...\n",aln_number_str);
      ret = FALSE;
    }
  }

  if( report_str != NULL ) {
    if( is_integer_string(report_str,&report_stagger) == FALSE ) {
      warn("Weird report stagger asked for %s",report_str);
      ret = FALSE;
    }
  }

  /* build the ThreeStateDB from whichever kind of protein input was chosen */
  if( use_pfam1 == TRUE ) {
    tsmdb = new_PfamHmmer1DB_ThreeStateDB(protein_file);
    if( tsmdb == NULL ) {
      warn("Could not build a threestatemodel database from %s",protein_file);
      ret = FALSE;
    } else if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) {
      warn("Unable to set global/local switch on threestatedb");
      ret = FALSE;
    }

  } else if ( use_pfam2 == TRUE ) {
    tsmdb = HMMer2_ThreeStateDB(protein_file);
    if( tsmdb == NULL ) {
      warn("Could not build a threestatemodel database from %s",protein_file);
      ret = FALSE;
    } else if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) {
      warn("Unable to set global/local switch on threestatedb");
      ret = FALSE;
    }

  } else if ( use_tsm == TRUE) {
    /** using a HMM **/

    tsm = HMMer2_read_ThreeStateModel(protein_file);

    if( tsm == NULL ) {
      warn("Could not read hmm from %s\n",protein_file);
      ret = FALSE;
    } else {

      display_char_in_ThreeStateModel(tsm);

      /* explicit name wins; otherwise fall back to the file name if the model has none */
      if( hmm_name != NULL ) {
        if( tsm->name != NULL )
          ckfree(tsm->name);
        tsm->name = stringalloc(hmm_name);
      } else {
        if( tsm->name == NULL ) {
          tsm->name = stringalloc(protein_file);
        }
      }

      /** have to set start/end **/
      set_startend_policy_ThreeStateModel(tsm,startend,15,0.2);
      tsmdb = new_single_ThreeStateDB(tsm,rm);
      if( tsmdb == NULL ) {
        warn("Could not build a threestatemodel database from a single tsm. Weird!");
        ret = FALSE;
      }
    }
  } else if( use_single_pro ) {

    if( startend != TSM_default && startend != TSM_global && startend != TSM_local ) {
      warn("Proteins can only have local/global startend policies set, not %s",startend_string);
      ret = FALSE;
    }

    if( (pro = read_fasta_file_Protein(protein_file)) == NULL ) {
      ret = FALSE;
      warn("Could not read Protein sequence in %s",protein_file);
    } else {
      if( qstart != -1 || qend != -1 ) {
        if( qstart == -1 )
          qstart = 0;
        if( qend == -1 )
          qend = pro->baseseq->len;

        pro_temp = truncate_Protein(pro,qstart-1,qend);
        if( pro_temp == NULL ) {
          ret = FALSE;
        } else {
          free_Protein(pro);
          pro = pro_temp;
        }
      }

      if( startend == TSM_global)
        tsm = global_ThreeStateModel_from_half_bit_Sequence(pro,mat,rm,-gap,-ext);
      else
        tsm = ThreeStateModel_from_half_bit_Sequence(pro,mat,rm,-gap,-ext);

      if( tsm == NULL ) {
        warn("Could not build ThreeStateModel from a single protein sequence...");
        ret = FALSE;
      } else {
        tsmdb = new_single_ThreeStateDB(tsm,rm);
        if( tsmdb == NULL ) {
          warn("Could not build a threestatemodel database from a single tsm. Weird!");
          ret = FALSE;
        }
      }
    }

  } else if (use_db_pro == TRUE ) {
    psdb = single_fasta_SequenceDB(protein_file);
    if( psdb == NULL ) {
      warn("Could not build a sequence database on %s",protein_file);
      ret = FALSE;
    } else {
      tsmdb = new_proteindb_ThreeStateDB(psdb,mat,-gap,-ext);
      free_SequenceDB(psdb);
      if( tsmdb == NULL ) {
        warn("Could not build a threestatemodel database from %s",protein_file);
        ret = FALSE;
      }
    }
  } else {
    /* nothing to search with: this has to fail the build, not just warn */
    warn("No protein input file! Yikes!");
    ret = FALSE;
  }

  if( main_block_str != NULL ) {
    if( is_integer_string(main_block_str,&main_block) == FALSE ) {
      warn("Could not get maximum main_block number %s",main_block_str);
      ret = FALSE;
    }
  }

  if( evalue_search_str != NULL && is_double_string(evalue_search_str,&evalue_search_cutoff) == FALSE ) {
    warn("Could not convert %s to a double",evalue_search_str);
    ret = FALSE;
  }

  if( is_double_string(search_cutoff_str,&search_cutoff) == FALSE ) {
    warn("Could not convert %s to a double",search_cutoff_str);
    ret = FALSE;
  }

  /* as above: report the offending string, not the (double) destination */
  if( is_double_string(subs_string,&subs_error) == FALSE ) {
    warn("Could not convert %s to a double",subs_string);
    ret = FALSE;
  }

  if( is_double_string(indel_string,&indel_error) == FALSE ) {
    warn("Could not convert %s to a double",indel_string);
    ret = FALSE;
  }

  if( is_double_string(allN_string,&allN) == FALSE ) {
    warn("Could not convert %s to a double",allN_string);
    ret = FALSE;
  }

  if( strcmp(null_string,"syn") == 0 ) {
    use_syn = TRUE;
  } else if ( strcmp(null_string,"flat") == 0 ) {
    use_syn = FALSE;
  } else {
    warn("Cannot interpret [%s] as a null model string\n",null_string);
    ret = FALSE;
  }

  /* search algorithm: "312" unless one was given */
  if( alg_str == NULL ) {
    alg_str = "312";
  }
  alg = alg_estwrap_from_string(alg_str);

  /* alignment algorithm */
  if( aln_alg_str == NULL ) {
    /* if it is a protein, don't loop */
    if( use_single_pro == TRUE || use_db_pro == TRUE )
      aln_alg_str = "333";
    else
      aln_alg_str = "333L";
  }
  aln_alg = alg_estwrap_from_string(aln_alg_str);

  if( (ct = read_CodonTable_file(codon_file)) == NULL) {
    ret = FALSE;
    warn("Could not read codon table file in %s",codon_file);
  }

  if( (ofp = openfile(output_file,"W")) ==  NULL) {
    warn("Could not open %s as an output file",output_file);
    ret = FALSE;
  }

  rmd = RandomModelDNA_std();

  cps = flat_cDNAParser(indel_error);

  /* the codon mapper is derived from the codon table, so skip it when the table failed to load */
  if( ct != NULL ) {
    cm = flat_CodonMapper(ct);
    sprinkle_errors_over_CodonMapper(cm,subs_error);
  }

  return ret;

}
/*
 * Builds the HSPScanInterface selected by the flags in i.
 *
 *   i    - implementation choice (direct sequence, corba, mysql,
 *          wiseserver, compressed or multiscan) and its settings
 *   pic  - index constructor, used for the direct-sequence case
 *   slp  - load parameters, used when a sequence file is loaded locally
 *
 * The comparison matrix is read from i->matrix_file and released again
 * before returning. Back ends that were not compiled in (SCAN_CORBA,
 * SCAN_MYSQL, SCAN_WISESERVER, SCAN_COMPRESS) end in fatal().
 *
 * Returns the new interface; asserts that one was actually built.
 */
HSPScanInterface * new_HSPScanInterface_from_ScanWiseHSPImpl(ScanWiseHSPImpl * i,ProteinIndexConstructor * pic,SeqLookupLoadPara * slp)
{
  /* NULL so that the assert below is meaningful when no branch assigns it */
  HSPScanInterface * out = NULL;
  SeqLookupInterface * sli;
  SequenceDB * db;
  CompMat * mat;

  mat = read_Blast_file_CompMat(i->matrix_file);

  if( i->use_corba == FALSE && i->use_mysql == FALSE && i->use_wiseserver == FALSE && i->use_compress == FALSE && i->use_multiscan == FALSE) {
    /* no server of any kind: index a local fasta file */
    if( i->direct_sequence == NULL ) {
      fatal("If no server based sequence, must have direct sequence");
    } else {
      db = single_fasta_SequenceDB(i->direct_sequence);
      if( db == NULL ) {
        fatal("Could not open sequence database %s",i->direct_sequence);
      } else {
        sli = new_SeqLookupInterface_from_ProteinIndexConstructor(pic);

        load_SequenceDB_SeqLookupLoadPara(slp,db,sli);

        free_SequenceDB(db);

        out = new_one_off_HSPScanInterface(sli,mat,15,40);
      }
    }
  } else if( i->use_corba == TRUE ) {
#ifdef SCAN_CORBA
    if( i->ior_file == NULL ) {
      fatal("Corba specified, but no ior file given");
    }

    out = new_corba_HSPScan(sorb,i->ior_file,mat);
#else
    fatal("Asking for CORBA, but scanwisep was not compiled with SCAN_CORBA defined.");
#endif
  } else if ( i->use_mysql == TRUE ) {
#ifdef SCAN_MYSQL
    out = new_HSPScanInterface_MysqlProteinIndex(i->host,i->dbname,i->username,i->password,mat,i->step);
#else
    fatal("Asking for mysql, but scanwisep was not compiled with SCAN_MYSQL defined");
#endif
  } else if ( i->use_wiseserver == TRUE ) {
#ifdef SCAN_WISESERVER
    out = new_wise_transfer_HSPScanInterface(i->host,i->port);
#else
    fatal("Asking for wiseserver, but scanwisep was not compiled with SCAN_WISESERVER defined");
#endif
  } else if ( i->use_compress == TRUE ) {
#ifdef SCAN_COMPRESS
    /* same local loading as the direct case, but into a compressed lookup */
    if( i->direct_sequence == NULL ) {
      fatal("If no server based sequence, must have direct sequence");
    } else {
      db = single_fasta_SequenceDB(i->direct_sequence);
      if( db == NULL ) {
        fatal("Could not open sequence database %s",i->direct_sequence);
      } else {
        sli = new_direct_CompressedProteinLookup();

        load_SequenceDB_SeqLookupLoadPara(slp,db,sli);

        free_SequenceDB(db);

        out = new_one_off_HSPScanInterface(sli,mat,15,40);
      }
    }
#else
    fatal("Asking for compressed, but scanwisep was not compiled with SCAN_COMPRESS defined");
#endif
  } else if( i->use_multiscan == TRUE ) {
    if( i->multiscan_file == NULL ) {
      fatal("Must provide a file for a multiple server scan");
    }

    out = new_multiclient_HSPScanInterface(i->multiscan_file);
  }

  assert(out != NULL);

  free_CompMat(mat); /* hard linked internally */
  return out;
}
/*
 * Scan client: for every sequence in the fasta file argv[1], runs the
 * query through the HSP scan interface chosen on the command line,
 * sorts the HSP sets, converts them to a HitList, scores the hits and
 * writes them to stdout.
 *
 * Options handled directly here (beyond those consumed by the *_from_argv
 * constructors): verbose, mott, besthsp, dbsize.
 *
 * A one-line summary (number of queries and elapsed wall-clock
 * seconds) goes to stderr at the end. Returns 0 on completion; bad
 * usage exits with status 12.
 */
int main(int argc,char ** argv)
{
  DPRunImpl * dpri = NULL;
  ScanWiseHSPImpl * scani = NULL;
  HSP2HitListImpl * hsp2hiti = NULL;
  HitListOutputImpl * hloi = NULL;
  ProteinIndexConstructor * pic = NULL;

  HSPScanInterface * hsi;
  HSPScanInterfacePara * para;
  SearchStatInterface * ssi;
  SearchStatInterface * ssl;
  SeqLookupLoadPara * slp;

  HSPset2HitPairPara * hsp2hit;
  CompMat * mat;
  SequenceDB * db;
  Sequence * seq;
  int ret;
  int i;
  int effective_db_size = 300000;
  int kk;

  int count = 0;

  LinearHSPmanager * lm;
  HitList * hl;

  boolean use_mott = 1;

  boolean trunc_best_hsp = 0;
  boolean verbose = 0;
  static struct rusage use;

  struct timeval t0, t1;

  gettimeofday(&t0, NULL);

  dpri      = new_DPRunImpl_from_argv(&argc,argv);

  /* must be checked before dpri is written through just below */
  if( dpri == NULL ) {
    fatal("Unable to build DPRun implementation. Bad arguments");
  }

  dpri->memory = DPIM_Explicit;

  scani     = new_ScanWiseHSPImpl_from_argv(&argc,argv);

  hsp2hiti  = new_HSP2HitListImpl_from_argv(&argc,argv);

  hloi = new_HitListOutputImpl_from_argv(&argc,argv);

  slp = new_SeqLookupLoadPara_from_argv(&argc,argv);

  pic = new_ProteinIndexConstructor_from_argv(&argc,argv);

  hsp2hit = new_HSPset2HitPairPara_from_argv(&argc,argv);

  para = new_HSPScanInterfacePara_from_argv(&argc,argv);

  verbose = strip_out_boolean_argument(&argc,argv,"verbose") ;

  strip_out_boolean_def_argument(&argc,argv,"mott",&use_mott);

  strip_out_boolean_def_argument(&argc,argv,"besthsp",&trunc_best_hsp);

  strip_out_integer_argument(&argc,argv,"dbsize",&effective_db_size);

#ifdef SCAN_CORBA
  sorb = get_Wise2Corba_Singleton(&argc,argv,"orbit-local-orb");
#endif

  strip_out_standard_options(&argc,argv,show_help,show_version);
  if( argc != 2 ) {
    show_help(stdout);
    exit(12);
  }

  /* ugly, but we don't want to bounce matrices around the network... */

  mat = read_Blast_file_CompMat("BLOSUM62.bla");

  erroroff(REPORT);

  hsi = new_HSPScanInterface_from_ScanWiseHSPImpl(scani,pic,slp);

  ssi = new_Mott_SearchStatInterface();

  ssl = new_lookup_SearchStatInterface(40,2.3);

  if( verbose ) {
    info("contacted database");
  }

  db = single_fasta_SequenceDB(argv[1]);

  if( db == NULL ) {
    fatal("Could not open sequence db...\n");
  }

  for(seq = init_SequenceDB(db,&ret); seq != NULL;seq = get_next_SequenceDB(db) ) {

    count++;

    /* validate and upper-case the query in place */
    for(i=0;i<seq->len;i++) {
      /* <ctype.h> functions need an unsigned char value; plain char may be negative */
      unsigned char ch = (unsigned char) seq->seq[i];

      if( !isalpha(ch) ) {
        fatal("Sequence position %d [%c] is not valid",i,seq->seq[i]);
      }
      seq->seq[i] = toupper(ch);
    }

    info("Processing %s",seq->name);

    /*
     * Resource usage is sampled after each stage below; nothing in
     * this function reads the result back.
     */
    getrusage(RUSAGE_SELF,&use);

    lm = (*hsi->scan_query)(hsi->data,seq,para);

    if( lm == NULL ) {
      fatal("Scan of %s returned no result set",seq->name);
    }

    fprintf(stderr,"Got linear manager is %d entries\n",lm->len);

    /* results that arrive without a matrix get the locally read one */
    if( lm->mat == NULL ) {
      if( mat == NULL ) {
        fatal("Could not read comparison matrix BLOSUM62.bla");
      }
      lm->mat = hard_link_CompMat(mat);
    }

    getrusage(RUSAGE_SELF,&use);

    sort_LinearHSPmanager(lm,compare_HSPset_score);

    /*
     * -besthsp: keep only the top set. Guarded on len so that an empty
     * result is not turned into a one-entry list.
     */
    if( trunc_best_hsp == 1 && lm->len > 1 ) {
      for(kk=1;kk<lm->len;kk++) {
        free_HSPset(lm->set[kk]);
        lm->set[kk] = NULL;
      }
      lm->len = 1;
    }

    getrusage(RUSAGE_SELF,&use);

    hl   = HitList_from_HSP_HSP2HitListImpl(hsp2hiti,lm,dpri,hsp2hit);

    getrusage(RUSAGE_SELF,&use);

    free_LinearHSPmanager(lm);

    if( use_mott == 1 ) {
      apply_SearchStat_to_HitList(hl,ssi,effective_db_size);
    } else {
      /* without the Mott statistics the bit score is half the raw score */
      for(kk=0;kk<hl->len;kk++) {
        hl->pair[kk]->bit_score = hl->pair[kk]->raw_score / 2.0;
      }
    }

    sort_HitList_by_score(hl);

    show_HitList_HitListOutputImpl(hloi,hl,stdout);

    getrusage(RUSAGE_SELF,&use);

    free_HitList(hl);
    free_Sequence(seq);
  }

  free_DPRunImpl(dpri);
  free_HSPScanInterface(hsi);

  gettimeofday(&t1, NULL);
  fprintf(stderr, "[client stats] queries, time (s): %d %f\n",
          count,
          (t1.tv_sec - t0.tv_sec) +
          (t1.tv_usec - t0.tv_usec) * 1e-6);

  return 0;

}
Exemple #6
0
int main(int argc,char ** argv)
{
  Sequence * query;
  Sequence * target;
  CompMat * comp;
  char * comp_file;
  int gap = (12);
  int ext = (2);
  int a = 120;
  int b = 10;
  int c = 3;
  ComplexSequence * query_cs;
  ComplexSequence * target_cs;
  ComplexSequenceEvalSet * evalfunc;

  boolean show_label_output = FALSE;
  boolean show_fancy_output = FALSE;
  boolean use_abc = FALSE;

  PackAln * pal;
  AlnBlock * alb;

  DPRunImpl * dpri = NULL;

  /*
   * Process command line options
   * -h or -help gives us help
   * -g for gap value (an int) - rely on commandline error processing
   * -e for ext value (an int) - rely on commandline error processing
   * -m for matrix (a char)
   * -l - label output
   * -f - fancy output
   *
   *
   * Use calls to commandline.h functions
   *
   */
  
  if( strip_out_boolean_argument(&argc,argv,"h") == TRUE || strip_out_boolean_argument(&argc,argv,"-help") == TRUE) {
    show_help(stdout);
    exit(1);
  }

  dpri = new_DPRunImpl_from_argv(&argc,argv);
  if( dpri == NULL ) {
    fatal("Unable to build DPRun implementation. Bad arguments");
  }

  show_label_output = strip_out_boolean_argument(&argc,argv,"l");
  show_fancy_output = strip_out_boolean_argument(&argc,argv,"f");


  /** if all FALSE, set fancy to TRUE **/

  if( show_label_output == FALSE ) 
    show_fancy_output = TRUE;


  (void) strip_out_integer_argument(&argc,argv,"g",&gap);
  (void) strip_out_integer_argument(&argc,argv,"e",&ext);
  (void) strip_out_integer_argument(&argc,argv,"a",&a);
  (void) strip_out_integer_argument(&argc,argv,"b",&b);
  (void) strip_out_integer_argument(&argc,argv,"c",&c);

  use_abc = strip_out_boolean_argument(&argc,argv,"abc"); 
  
  comp_file = strip_out_assigned_argument(&argc,argv,"m");
  if( comp_file == NULL)
    comp_file = "blosum62.bla";

  
  
  if( argc != 3 ) {
    warn("Must have two arguments for sequence 1 and sequence 2 %d",argc);
    show_help(stdout);
    exit(1);
  }
  
  /*
   * Read in two sequences
   */
  
  if( (query=read_fasta_file_Sequence(argv[1])) == NULL ) {
    fatal("Unable to read the sequence in file %s",argv[1]);
  }
  
  if( (target=read_fasta_file_Sequence(argv[2])) == NULL ) {
    fatal("Unable to read the sequence in file %s",argv[2]);
  }
  
  
  /*
   * Open a blosum matrix. This will be opened from WISECONFIGDIR
   * or WISEPERSONALDIR if it is not present in the current directory.
   */
  
  comp = read_Blast_file_CompMat(comp_file);
  
  if( comp == NULL ) {
    fatal("unable to read file %s",comp_file);
  }
  
  /* if abc - factor up matrix! */

  if( use_abc == TRUE ) {
    factor_CompMat(comp,10);
  }


  /*
   * Make an alignment. I don't care about the implementation:
   * hand it over to sw_wrap function to do it
   *
   */		 

  if( use_abc ) {
    evalfunc = default_aminoacid_ComplexSequenceEvalSet();
  
    query_cs = new_ComplexSequence(query,evalfunc);
    if( query_cs == NULL )
      fatal("Cannot build cs objects!");
    target_cs = new_ComplexSequence(target,evalfunc);
    if( target_cs == NULL )
      fatal("Cannot build cs objects!");

    pal = PackAln_bestmemory_abc(query_cs,target_cs,comp,-a,-b,-c,NULL,dpri);
    alb = convert_PackAln_to_AlnBlock_abc(pal);
    free_ComplexSequence(query_cs);
    free_ComplexSequence(target_cs);
  } else {
    alb = Align_Sequences_ProteinSmithWaterman(query,target,comp,-gap,-ext,dpri);
  }


  /*
   * show output. If multiple outputs, divide using //
   */


  if( show_label_output == TRUE ) {
    show_flat_AlnBlock(alb,stdout);
    puts("//\n");
  }

  if( show_fancy_output == TRUE ) {
    write_pretty_seq_align(alb,query,target,15,50,stdout);
    puts("//\n");
  }

  /*
   * Destroy the memory.
   */	

  free_Sequence(query);
  free_Sequence(target);
  free_CompMat(comp);
  free_AlnBlock(alb);

  return 0;
}
Exemple #7
0
/*
 * Aligns a cDNA sequence (argv[1]) against a genomic sequence (argv[2])
 * with the CdnaWise10 model, then prints the alignment, the predicted
 * gene structure and the translation of each non-pseudo gene to stdout.
 *
 * Options handled here:
 *   -u / -v   start / end of the genomic region to use
 *   -s / -t   start / end of the cDNA region to use
 * A missing start defaults to 1 and a missing end to the length of the
 * sequence concerned.
 *
 * Returns 0 on success, 1 on a usage or input error.
 */
int main(int argc,char ** argv)
{
  Sequence * cdna;
  Sequence * gen;
  Sequence * active_gen;
  Sequence * active_cdna;

  int i;
  int dstart = -1;
  int dend   = -1;

  int cstart = -1;
  int cend   = -1;

  CodonTable * ct = NULL;
  CodonMatrixScore * cm = NULL;
  RandomCodon * rndcodon = NULL;
  RandomCodonScore * rndcodonscore = NULL;
  DnaMatrix * dm   = NULL;

  DPRunImpl * dpri = NULL;

  GeneModel * gm;
  GeneModelParam * gmp;
  GeneParser21 * gp21;
  GeneParser21Score * gp21s;
  GeneParser4Score * gp;

  ComplexSequenceEvalSet * cdna_cses;
  ComplexSequenceEvalSet * gen_cses;

  ComplexSequence * cs_cdna;
  ComplexSequence * cs_gen;

  Genomic * gent;
  GenomicRegion * gr;

  CompMat  * cmat;
  CompProb * cprob;
  char * matfile = "blosum62.bla";
  Protein * trans;

  PackAln * pal;
  AlnBlock * alb;

  FILE * ofp = stdout;

  dpri = new_DPRunImpl_from_argv(&argc,argv);
  gmp  = new_GeneModelParam_from_argv(&argc,argv);

  strip_out_integer_argument(&argc,argv,"u",&dstart);
  strip_out_integer_argument(&argc,argv,"v",&dend);

  strip_out_integer_argument(&argc,argv,"s",&cstart);
  strip_out_integer_argument(&argc,argv,"t",&cend);

  strip_out_standard_options(&argc,argv,show_help,show_version);

  /* exactly the two sequence files must be left once the options are stripped */
  if( argc != 3 ) {
    show_help(stdout);
    return 1;
  }

  ct = read_CodonTable_file(codon_file);
  if( ct == NULL ) {
    fprintf(stderr,"Could not read codon table file in %s\n",codon_file);
    return 1;
  }

  cmat = read_Blast_file_CompMat(matfile);
  if( cmat == NULL ) {
    fprintf(stderr,"Could not read comparison matrix file in %s\n",matfile);
    return 1;
  }

  cprob = CompProb_from_halfbit(cmat);
  cm = naive_CodonMatrixScore_from_prob(ct,cprob);

  gm = GeneModel_from_GeneModelParam(gmp);
  if( gm == NULL ) {
    fprintf(stderr,"Could not build gene model\n");
    return 1;
  }

  cdna = read_fasta_file_Sequence(argv[1]);
  if( cdna == NULL ) {
    fprintf(stderr,"Unable to read the sequence in file %s\n",argv[1]);
    return 1;
  }

  gen = read_fasta_file_Sequence(argv[2]);
  if( gen == NULL ) {
    fprintf(stderr,"Unable to read the sequence in file %s\n",argv[2]);
    return 1;
  }

  /* genomic region to align */
  if( dstart != -1 || dend != -1 ) {
    if( dstart == -1 ) {
      dstart = 1;
    }
    if( dend == -1 ) {
      dend = gen->len;
    }
    active_gen = magic_trunc_Sequence(gen,dstart,dend);
  } else {
    active_gen = hard_link_Sequence(gen);
  }

  /*
   * cDNA region to align. This has to come from cdna, not gen:
   * taking it from gen would align the genomic sequence against
   * itself and leave the cDNA file unused apart from its name.
   */
  if( cstart != -1 || cend != -1 ) {
    if( cstart == -1 ) {
      cstart = 1;
    }
    if( cend == -1 ) {
      cend = cdna->len;
    }
    active_cdna = magic_trunc_Sequence(cdna,cstart,cend);
  } else {
    active_cdna = hard_link_Sequence(cdna);
  }

  rndcodon = RandomCodon_from_raw_CodonFrequency(gm->codon,ct);
  fold_in_RandomModelDNA_into_RandomCodon(rndcodon,gm->rnd);

  rndcodonscore = RandomCodonScore_from_RandomCodon(rndcodon);

  assert(active_cdna);
  assert(active_gen);

  cdna_cses = default_cDNA_ComplexSequenceEvalSet();
  gen_cses  = new_ComplexSequenceEvalSet_from_GeneModel(gm);

  cs_cdna = new_ComplexSequence(active_cdna,cdna_cses);
  cs_gen  = new_ComplexSequence(active_gen,gen_cses);

  gp21 = std_GeneParser21();
  GeneParser21_fold_in_RandomModelDNA(gp21,gm->rnd);
  gp21s = GeneParser21Score_from_GeneParser21(gp21);
  gp = GeneParser4Score_from_GeneParser21Score(gp21s);

  /* DNA match/mismatch scores of +1 / -1 half-bits */
  dm = identity_DnaMatrix(Probability2Score(halfbit2Probability(1)),Probability2Score(halfbit2Probability(-1)));

  assert(cs_cdna);
  assert(cs_gen);
  assert(gp);
  assert(rndcodonscore);
  assert(dm);
  assert(dpri);

  pal = PackAln_bestmemory_CdnaWise10(cs_cdna,cs_gen,gp,cm,rndcodonscore,dm,
                                      Probability2Score(halfbit2Probability(-12)),
                                      Probability2Score(halfbit2Probability(-2)),
                                      Probability2Score(halfbit2Probability(-5)),
                                      Probability2Score(halfbit2Probability(0)),
                                      NULL,
                                      dpri);
  if( pal == NULL ) {
    fprintf(stderr,"Unable to make an alignment from %s and %s\n",argv[1],argv[2]);
    return 1;
  }

  alb = convert_PackAln_to_AlnBlock_CdnaWise10(pal);

  gent = Genomic_from_Sequence(gen);
  assert(gent);

  gr = new_GenomicRegion(gent);
  assert(gr);

  add_Genes_to_GenomicRegion_GeneWise(gr,active_gen->offset,active_gen->end,alb,cdna->name,0,NULL);

  mapped_ascii_AlnBlock(alb,Score2Bits,0,ofp);

  show_pretty_GenomicRegion(gr,0,ofp);

  for(i=0;i<gr->len;i++) {
    if( gr->gene[i]->ispseudo == TRUE ) {
      fprintf(ofp,"#Gene %d is a pseudo gene - no translation possible\n",i);
    } else {
      trans = get_Protein_from_Translation(gr->gene[i]->transcript[0]->translation[0],ct);
      if( trans != NULL ) {
        write_fasta_Sequence(trans->baseseq,ofp);
      }
    }
  }

  return 0;
}
Exemple #8
0
/*
 * Smith-Waterman alignment of the two protein sequences in argv[1]
 * and argv[2], printed to stdout.
 *
 * Options:
 *   -h / --help  show help and exit
 *   -g <int>     gap penalty        (default 12)
 *   -e <int>     extension penalty  (default 2)
 *   -m <file>    comparison matrix  (default blosum62.bla)
 *   -r           raw alignment output
 *   -l           label output
 *   -f           fancy output (also switched on when neither -r nor -l is given)
 *
 * Each output that is shown is followed by a "//" divider.
 * Returns 0 on success; usage errors exit with status 1.
 */
int main(int argc,char ** argv)
{
  Sequence * query;
  Sequence * target;
  ComplexSequence * query_cs;
  ComplexSequence * target_cs;
  ComplexSequenceEvalSet  * evalfunc;
  CompMat * comp;
  char * comp_file;
  int gap = (12);
  int ext = (2);

  boolean show_raw_output = FALSE;
  boolean show_label_output = FALSE;
  boolean show_fancy_output = FALSE;

  PackAln * pal;
  AlnBlock * alb;

  /*
   * Process command line options using the commandline.h functions
   */

  if( strip_out_boolean_argument(&argc,argv,"h") == TRUE || strip_out_boolean_argument(&argc,argv,"-help") == TRUE) {
    show_help(stdout);
    exit(1);
  }

  show_raw_output = strip_out_boolean_argument(&argc,argv,"r");
  show_label_output = strip_out_boolean_argument(&argc,argv,"l");
  show_fancy_output = strip_out_boolean_argument(&argc,argv,"f");

  /** neither raw nor label asked for: make sure fancy output is on **/

  if( show_raw_output == FALSE && show_label_output == FALSE )
    show_fancy_output = TRUE;

  (void) strip_out_integer_argument(&argc,argv,"g",&gap);
  (void) strip_out_integer_argument(&argc,argv,"e",&ext);

  comp_file = strip_out_assigned_argument(&argc,argv,"m");
  if( comp_file == NULL)
    comp_file = "blosum62.bla";

  if( argc != 3 ) {
    warn("Must have two arguments for sequence 1 and sequence 2 %d",argc);
    show_help(stdout);
    exit(1);
  }

  /*
   * Read in two sequences
   */

  if( (query=read_fasta_file_Sequence(argv[1])) == NULL ) {
    fatal("Unable to read the sequence in file %s",argv[1]);
  }

  if( (target=read_fasta_file_Sequence(argv[2])) == NULL ) {
    fatal("Unable to read the sequence in file %s",argv[2]);
  }

  /*
   * Open a blosum matrix. This will be opened from WISECONFIGDIR
   * or WISEPERSONALDIR if it is not present in the current directory.
   */

  comp = read_Blast_file_CompMat(comp_file);

  if( comp == NULL ) {
    fatal("unable to read file %s",comp_file);
  }

  /*
   * Convert sequences to ComplexSequences:
   * To do this we need a protein ComplexSequenceEvalSet
   */

  evalfunc = default_aminoacid_ComplexSequenceEvalSet();

  query_cs = new_ComplexSequence(query,evalfunc);
  if( query_cs == NULL ) {
    fatal("Unable to make a protein complex sequence from %s",query->name);
  }

  target_cs = new_ComplexSequence(target,evalfunc);
  if( target_cs == NULL ) {
    fatal("Unable to make a protein complex sequence from %s",target->name);
  }

  /*
   * Make an alignment. I don't care about the implementation:
   * If the sequences are small enough then it should use explicit memory.
   * Long sequences should use divide and conquer methods.
   *
   * Calling PackAln_bestmemory_ProteinSW is the answer
   * This function decides on the best method considering the
   * memory and changes accordingly. It frees the matrix memory
   * at the end as well.
   */

  pal = PackAln_bestmemory_ProteinSW(query_cs,target_cs,comp,-gap,-ext,NULL);

  if( pal == NULL ) {
    fatal("Unable to make an alignment from %s and %s",query->name,target->name);
  }

  /*
   * ok, make other alignment forms, and be ready to show
   */

  alb = convert_PackAln_to_AlnBlock_ProteinSW(pal);

  /* the label and fancy outputs both read alb */
  if( alb == NULL ) {
    fatal("Unable to make an alignment from %s and %s",query->name,target->name);
  }

  /*
   * show output. If multiple outputs, divide using //
   */

  if( show_raw_output == TRUE ) {
    show_simple_PackAln(pal,stdout);
    puts("//\n");
  }

  if( show_label_output == TRUE ) {
    show_flat_AlnBlock(alb,stdout);
    /* divider was missing here, so label and fancy output ran together */
    puts("//\n");
  }

  if( show_fancy_output == TRUE ) {
    write_pretty_seq_align(alb,query,target,15,50,stdout);
    puts("//\n");
  }

  /*
   * Destroy the memory.
   */

  free_Sequence(query);
  free_Sequence(target);
  free_CompMat(comp);
  free_ComplexSequence(query_cs);
  free_ComplexSequence(target_cs);
  free_PackAln(pal);
  free_AlnBlock(alb);

  return 0;
}