Пример #1
0
/*
 * Example driver: loads every sequence of a FASTA database into a lookup
 * index, scans a single query sequence against it and writes the resulting
 * hit list to stdout in pseudo-BLAST format.
 *
 * Usage: <program> <database.fasta> <query.fasta>
 *
 * Returns 0 on success and 1 when the arguments or input files are unusable.
 */
int main(int argc,char ** argv)
{
  SequenceDB * db;
  Sequence * seq;
  SeqLookupInterface * sli;
  HSPScanInterface * hsi;
  LinearHSPmanager * lm;
  HitList * hl;
  CompMat * mat;
  int ret;
  /* zeroed so that any field not assigned below has a defined value */
  HSPScanInterfacePara p = {0};

  /* both positional arguments are dereferenced below */
  if( argc < 3 ) {
    fprintf(stderr,"Usage: %s <database.fasta> <query.fasta>\n",argv[0]);
    return 1;
  }

  p.min_score= 30;
  p.max_results = 200;

  db = single_fasta_SequenceDB(argv[1]);
  if( db == NULL ) {
    fprintf(stderr,"Could not open sequence database %s\n",argv[1]);
    return 1;
  }

  mat = read_Blast_file_CompMat("blosum62.bla");
  if( mat == NULL ) {
    fprintf(stderr,"Could not read comparison matrix blosum62.bla\n");
    return 1;
  }

  sli = new_ghash_SeqLookupInterface();

  /* the index receives its own hard link to every database sequence */
  for(seq = init_SequenceDB(db,&ret); seq != NULL;seq = get_next_SequenceDB(db) ) {
    load_aa_flat_Sequence_SeqLookupInterface(sli,hard_link_Sequence(seq));
  }


  seq = read_fasta_file_Sequence(argv[2]);

  assert(seq);

  hsi = Wise2_new_one_off_HSPScanInterface(sli,mat,20,10);

  /* scan_query is a function pointer carried by the interface object */
  lm = (*hsi->scan_query)(hsi->data,seq,&p); 

  hl = Wise2_HitList_from_LinearHSPmanager(lm);

  Wise2_write_pseudoblast_HitList(hl,stdout);

  return 0;
}
Пример #2
0
/*
 * Builds the HSPScanInterface selected by the flags of a ScanWiseHSPImpl.
 *
 * i    - implementation options; exactly one of use_corba, use_mysql,
 *        use_wiseserver, use_compress, use_multiscan is expected to be TRUE,
 *        or all FALSE for an in-memory index built from i->direct_sequence
 * pic  - constructor used for the in-memory lookup index
 * slp  - parameters used when loading the sequence database into an index
 *
 * The comparison matrix read from i->matrix_file is released before
 * returning. Problems are reported through fatal(); the final assert
 * guarantees that a NULL interface is never returned to the caller.
 */
HSPScanInterface * new_HSPScanInterface_from_ScanWiseHSPImpl(ScanWiseHSPImpl * i,ProteinIndexConstructor * pic,SeqLookupLoadPara * slp)
{
  /* starts as NULL so the closing assert is meaningful when no branch
     below assigns it */
  HSPScanInterface * out = NULL;
  SeqLookupInterface * sli;
  SequenceDB * db;
  CompMat * mat;
  
  mat = read_Blast_file_CompMat(i->matrix_file);
 
  if( i->use_corba == FALSE && i->use_mysql == FALSE && i->use_wiseserver == FALSE && i->use_compress == FALSE && i->use_multiscan == FALSE) {
    /* no server requested: index the sequence file directly in memory */
    if( i->direct_sequence == NULL ) {
      fatal("If no server based sequence, must have direct sequence");
    } else {
      db = single_fasta_SequenceDB(i->direct_sequence);

      if( db == NULL ) {
        fatal("Could not open sequence database %s",i->direct_sequence);
      } else {
        sli = new_SeqLookupInterface_from_ProteinIndexConstructor(pic);

        load_SequenceDB_SeqLookupLoadPara(slp,db,sli);

        free_SequenceDB(db);

        out = new_one_off_HSPScanInterface(sli,mat,15,40);
      }
    }
  } else if( i->use_corba == TRUE ) {
#ifdef SCAN_CORBA 
    if( i->ior_file == NULL ) {
      fatal("Corba specified, but no ior file given");
    }


    out = new_corba_HSPScan(sorb,i->ior_file,mat);
#else
    fatal("Asking for CORBA, but scanwisep was not compiled with SCAN_CORBA defined.");
#endif
  } else if ( i->use_mysql == TRUE ) {
#ifdef SCAN_MYSQL
    out = new_HSPScanInterface_MysqlProteinIndex(i->host,i->dbname,i->username,i->password,mat,i->step);
#else
    fatal("Asking for mysql, but scanwisep was not compiled with SCAN_MYSQL defined");
#endif
  } else if ( i->use_wiseserver == TRUE ) {

#ifdef SCAN_WISESERVER
    out = new_wise_transfer_HSPScanInterface(i->host,i->port);
#else
    fatal("Asking for wiseserver, but scanwisep was not compiled with SCAN_WISESERVER defined");
#endif
  } else if ( i->use_compress == TRUE ) {

#ifdef SCAN_COMPRESS
    if( i->direct_sequence == NULL ) {
      fatal("Compressed lookup requires a direct sequence file");
    } else {
      db = single_fasta_SequenceDB(i->direct_sequence);

      if( db == NULL ) {
        fatal("Could not open sequence database %s",i->direct_sequence);
      } else {
        sli = new_direct_CompressedProteinLookup();

        load_SequenceDB_SeqLookupLoadPara(slp,db,sli);

        free_SequenceDB(db);

        out = new_one_off_HSPScanInterface(sli,mat,15,40);
      }
    }

#else
    fatal("Asking for compressed, but scanwisep was not compiled with SCAN_COMPRESS defined");
#endif
  } else if( i->use_multiscan == TRUE ) {
    if( i->multiscan_file == NULL ) {
      fatal("Must provide a file for a multiple server scan");
    }
    
    out = new_multiclient_HSPScanInterface(i->multiscan_file);
  }



  assert(out != NULL);


  free_CompMat(mat); /* hard linked internally */
  return out;
}
Пример #3
0
boolean build_objects(void)
{
  boolean ret = TRUE;
  Protein * pro_temp;
  SequenceDB * psdb;



  startend = threestatemodel_mode_from_string(startend_string);
  if( startend == TSM_unknown ) {
    warn("String %s was unable to converted into a start/end policy\n",startend_string);
    ret = FALSE;
  }

  if( use_single_dna == TRUE ) {
    cdna = read_fasta_file_cDNA(dna_seq_file);
    if( cdna == NULL ) {
      warn("Could not open single dna sequence in %s",dna_seq_file);
      ret = FALSE;
    }
  } else {
    sdb = single_fasta_SequenceDB(dna_seq_file);
    
 
    if( sdb == NULL ) {
      warn("Could not build a sequence database on %s",dna_seq_file);
      ret = FALSE;
    }
  }

  rm = default_RandomModel();


  if( (mat = read_Blast_file_CompMat(matrix_file)) == NULL) {
    if( use_tsm == TRUE ) {
      info("I could not read the Comparison matrix file in %s; however, you are using a HMM so it is not needed. Please set the WISECONFIGDIR or WISEPERSONALDIR variable correctly to prevent this message.",matrix_file);
    } else {
      warn("Could not read Comparison matrix file in %s",matrix_file);
      ret = FALSE;
    }
  }
      
  if( is_integer_string(gap_str,&gap) == FALSE ) {
    warn("Could not get gap string number %s",gap_str);
    ret = FALSE;
  }

  if( is_integer_string(ext_str,&ext) == FALSE ) {
    warn("Could not get ext string number %s",ext_str);
    ret = FALSE;
  }

  if( qstart_str != NULL ) {
    if( is_integer_string(qstart_str,&qstart) == FALSE || qstart < 0) {
      warn("Could not make %s out as query start",qstart);
      ret = FALSE;
    }
  }

  if( qend_str != NULL ) {
    if( is_integer_string(qend_str,&qend) == FALSE || qend < 0) {
      warn("Could not make %s out as query end",qend);
      ret = FALSE;
    }
  }


  if( aln_number_str != NULL ) {
    if( is_integer_string(aln_number_str,&aln_number) == FALSE || aln_number < 0) {
      warn("Weird aln number string %s...\n",aln_number_str);
      ret = FALSE;
    }
  }

  if( report_str != NULL ) {
    if( is_integer_string(report_str,&report_stagger) == FALSE ) {
      warn("Weird report stagger asked for %s",report_str);
      ret = FALSE;
    }
  }


  if( use_pfam1 == TRUE ) {
    tsmdb = new_PfamHmmer1DB_ThreeStateDB(protein_file);
    if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) {
      warn("Unable to set global/local switch on threestatedb");
      ret = FALSE;
    }

  } else if ( use_pfam2 == TRUE ) {
    tsmdb = HMMer2_ThreeStateDB(protein_file);
    if( set_search_type_ThreeStateDB(tsmdb,startend_string) == FALSE) {
      warn("Unable to set global/local switch on threestatedb");
      ret = FALSE;
    }

  } else if ( use_tsm == TRUE) {
    /** using a HMM **/

    tsm = HMMer2_read_ThreeStateModel(protein_file);

    if( tsm == NULL ) {
      warn("Could not read hmm from %s\n",protein_file);
      ret = FALSE;
    }  else {

      display_char_in_ThreeStateModel(tsm);
      if( hmm_name != NULL ) {
	if( tsm->name != NULL ) 
	  ckfree(tsm->name);
	tsm->name = stringalloc(hmm_name);
      } else {
	if( tsm->name == NULL ) {
	  tsm->name = stringalloc(protein_file);
	}
      }

      
      
      /** have to set start/end **/

      set_startend_policy_ThreeStateModel(tsm,startend,15,0.2);
      tsmdb = new_single_ThreeStateDB(tsm,rm);
      if( tsmdb == NULL ) {
	warn("Could not build a threestatemodel database from a single tsm. Weird!");
	ret = FALSE;
      }
    } /* end of else tsm != NULL */
  } /* end of else is tsm */
  else if( use_single_pro ) {


    if( startend != TSM_default && startend != TSM_global && startend != TSM_local ) {
      warn("Proteins can only have local/global startend policies set, not %s",startend_string);
      ret = FALSE;
    }

    if( (pro = read_fasta_file_Protein(protein_file)) == NULL ) {
      ret = FALSE;
      warn("Could not read Protein sequence in %s",protein_file);
    } else {
      if( qstart != -1 || qend != -1 ) {
	if( qstart == -1 )
	  qstart = 0;
	if( qend == -1 ) 
	  qend = pro->baseseq->len;

	pro_temp = truncate_Protein(pro,qstart-1,qend);
	if( pro_temp == NULL ){
	  ret = FALSE;
	} else {
	  free_Protein(pro);
	  pro = pro_temp;
	}
      }


      if( startend == TSM_global) 
	tsm = global_ThreeStateModel_from_half_bit_Sequence(pro,mat,rm,-gap,-ext);
      else
	tsm = ThreeStateModel_from_half_bit_Sequence(pro,mat,rm,-gap,-ext);

      if( tsm == NULL ) {
	warn("Could not build ThreeStateModel from a single protein sequence...");
	ret = FALSE; 
      } else {
	tsmdb = new_single_ThreeStateDB(tsm,rm);
	if( tsmdb == NULL ) {
	  warn("Could not build a threestatemodel database from a single tsm. Weird!");
	  ret = FALSE;
	}
      } /* end of could build a TSM */
    } /* else is a real protein */  

  } /* end of else is single protein */
  else if (use_db_pro == TRUE ) {
    psdb = single_fasta_SequenceDB(protein_file);
    tsmdb = new_proteindb_ThreeStateDB(psdb,mat,-gap,-ext);
    free_SequenceDB(psdb);
  }
  else {
    warn("No protein input file! Yikes!");
  }

  /***
  if( use_tsm == FALSE ) {
  } else {
  ****/


  if( main_block_str != NULL ) {
    if( is_integer_string(main_block_str,&main_block) == FALSE ) {
      warn("Could not get maximum main_block number %s",main_block_str);
      ret = FALSE;
    }
  }


  if( evalue_search_str != NULL && is_double_string(evalue_search_str,&evalue_search_cutoff) == FALSE ) {
    warn("Could not convert %s to a double",evalue_search_str);
    ret = FALSE;
  }
  
  if( is_double_string(search_cutoff_str,&search_cutoff) == FALSE ) {
    warn("Could not convert %s to a double",search_cutoff_str);
    ret = FALSE;
  }


  if( is_double_string(subs_string,&subs_error) == FALSE ) {
    warn("Could not convert %s to a double",subs_error);
    ret = FALSE;
  }

  if( is_double_string(indel_string,&indel_error) == FALSE ) {
    warn("Could not convert %s to a double",indel_error);
    ret = FALSE;
  }


  if( is_double_string(allN_string,&allN) == FALSE ) {
    warn("Could not convert %s to a double",allN_string);
    ret = FALSE;
  }
  


  if( strcmp(null_string,"syn") == 0 ) {
    use_syn = TRUE;
  } else if ( strcmp(null_string,"flat") == 0 ) {
    use_syn = FALSE;
  } else {
    warn("Cannot interpret [%s] as a null model string\n",null_string);
    ret = FALSE;
  }

   
  if( alg_str != NULL ) {
    alg = alg_estwrap_from_string(alg_str);
  } else {
    alg_str = "312";
    alg = alg_estwrap_from_string(alg_str);
  }

  if( aln_alg_str != NULL ) {
    aln_alg = alg_estwrap_from_string(aln_alg_str);
  } else {
    /* if it is a protein, don't loop */
    if( use_single_pro == TRUE || use_db_pro == TRUE ) 
      aln_alg_str = "333";
    else 
      aln_alg_str = "333L";
    aln_alg = alg_estwrap_from_string(aln_alg_str);
  }


  if( (rm = default_RandomModel()) == NULL) {
    warn("Could not make default random model\n");
    ret = FALSE;
  }

  if( (ct = read_CodonTable_file(codon_file)) == NULL) {
    ret = FALSE;
    warn("Could not read codon table file in %s",codon_file);
  }

  if( (ofp = openfile(output_file,"W")) ==  NULL) {
    warn("Could not open %s as an output file",output_file);
    ret = FALSE;
  }

  rmd = RandomModelDNA_std();


  cps = flat_cDNAParser(indel_error);
  cm = flat_CodonMapper(ct);
  sprinkle_errors_over_CodonMapper(cm,subs_error);

  return ret;

}
Пример #4
0
/*
 * Command-line client: builds an HSP scan interface from the command-line
 * options, then scans every sequence of the query FASTA file (the single
 * remaining positional argument) against it and prints a hit list per
 * query to stdout.
 *
 * Options consumed here in addition to those taken by the *_from_argv
 * constructors: "verbose", "mott", "besthsp" and "dbsize".
 *
 * A one-line summary (number of queries and elapsed wall-clock seconds) is
 * written to stderr before returning 0. Bad usage exits with status 12.
 */
int main(int argc,char ** argv)
{
  DPRunImpl * dpri = NULL;
  ScanWiseHSPImpl * scani = NULL;
  HSP2HitListImpl * hsp2hiti = NULL;
  HitListOutputImpl * hloi = NULL;
  ProteinIndexConstructor * pic = NULL;
 


  HSPScanInterface * hsi;
  HSPScanInterfacePara * para;
  SearchStatInterface * ssi;
  SearchStatInterface * ssl;
  SeqLookupLoadPara * slp;

  HSPset2HitPairPara * hsp2hit;
  CompMat * mat;
  SequenceDB * db;
  Sequence * seq;
  int ret;
  int i;
  int effective_db_size = 300000;
  int kk;
  
  int count = 0;

  LinearHSPmanager * lm;
  HitList * hl;

  boolean use_mott = 1;

  boolean trunc_best_hsp = 0;
  boolean verbose = 0;

  struct timeval t0, t1;

  gettimeofday(&t0, NULL);


  dpri      = new_DPRunImpl_from_argv(&argc,argv);

  /* must be checked before the object is written to below */
  if( dpri == NULL ) {
    fatal("Unable to build DPRun implementation. Bad arguments");
  }

  dpri->memory = DPIM_Explicit;

  scani     = new_ScanWiseHSPImpl_from_argv(&argc,argv);
  
  hsp2hiti  = new_HSP2HitListImpl_from_argv(&argc,argv);

  hloi = new_HitListOutputImpl_from_argv(&argc,argv);

  slp = new_SeqLookupLoadPara_from_argv(&argc,argv);

  pic = new_ProteinIndexConstructor_from_argv(&argc,argv);

  hsp2hit = new_HSPset2HitPairPara_from_argv(&argc,argv);

  para = new_HSPScanInterfacePara_from_argv(&argc,argv);

  verbose = strip_out_boolean_argument(&argc,argv,"verbose") ;


  strip_out_boolean_def_argument(&argc,argv,"mott",&use_mott);

  strip_out_boolean_def_argument(&argc,argv,"besthsp",&trunc_best_hsp);

  strip_out_integer_argument(&argc,argv,"dbsize",&effective_db_size);

  

#ifdef SCAN_CORBA
  sorb = get_Wise2Corba_Singleton(&argc,argv,"orbit-local-orb");
#endif

  /* once all options are stripped, exactly one argument must remain */
  strip_out_standard_options(&argc,argv,show_help,show_version);
  if( argc != 2 ) {
    show_help(stdout);
    exit(12);
  }

  /* ugly, but we don't want to bounce matrices around the network... */

  mat = read_Blast_file_CompMat("BLOSUM62.bla");
  
  erroroff(REPORT);

  hsi = new_HSPScanInterface_from_ScanWiseHSPImpl(scani,pic,slp);

  ssi = new_Mott_SearchStatInterface();

  ssl = new_lookup_SearchStatInterface(40,2.3);


  if( verbose ) {
    info("contacted database");
  }

  db = single_fasta_SequenceDB(argv[1]);

  if( db == NULL ) {
    fatal("Could not open sequence db...\n");
  }

  for(seq = init_SequenceDB(db,&ret); seq != NULL;seq = get_next_SequenceDB(db) ) {

    count++;

    /* validate and upper-case the query; the casts keep the <ctype.h>
       calls defined for characters with negative char values */
    for(i=0;i<seq->len;i++) {
      if( !isalpha((unsigned char)seq->seq[i]) ) {
        fatal("Sequence position %d [%c] is not valid",i,seq->seq[i]);
      }
      seq->seq[i] = (char)toupper((unsigned char)seq->seq[i]);
    }

    info("Processing %s",seq->name);

    lm = (*hsi->scan_query)(hsi->data,seq,para);

    if( lm == NULL ) {
      fatal("Scan of %s returned no HSP manager",seq->name);
    }

    fprintf(stderr,"Got linear manager is %d entries\n",lm->len);

    /* supply the locally read matrix when the manager came without one */
    if( lm->mat == NULL ) {
      lm->mat = hard_link_CompMat(mat);
    }

    sort_LinearHSPmanager(lm,compare_HSPset_score);


    /* keep only the first set after sorting; an empty or single-entry
       manager is left untouched so that len never exceeds what is stored */
    if( trunc_best_hsp == 1 && lm->len > 1 ) {
      for(kk=1;kk<lm->len;kk++) {
        free_HSPset(lm->set[kk]);
        lm->set[kk] = NULL;
      }
      lm->len = 1;
    }

    hl   = HitList_from_HSP_HSP2HitListImpl(hsp2hiti,lm,dpri,hsp2hit);

    free_LinearHSPmanager(lm);

    /* either apply the Mott statistics or fall back to raw_score / 2 */
    if( use_mott == 1 ) {
      apply_SearchStat_to_HitList(hl,ssi,effective_db_size);
    } else {
      for(kk=0;kk<hl->len;kk++) {
        hl->pair[kk]->bit_score = hl->pair[kk]->raw_score / 2.0; 
      }
    }

    sort_HitList_by_score(hl);

    show_HitList_HitListOutputImpl(hloi,hl,stdout);

    free_HitList(hl);
    free_Sequence(seq);
  }
    

  free_SequenceDB(db);
  if( mat != NULL ) {
    free_CompMat(mat);
  }
  free_DPRunImpl(dpri);
  free_HSPScanInterface(hsi);

  gettimeofday(&t1, NULL);
  fprintf(stderr, "[client stats] queries, time (s): %d %f\n",
                count,
                (t1.tv_sec - t0.tv_sec) +
                (t1.tv_usec - t0.tv_usec) * 1e-6);

  return 0;

}