Esempio n. 1
// g_geneseq: per-group callback that writes a base-by-base / codon-by-codon text
// view of one transcript region to the "gsview" output stream and, when
// aux->R_plot is set, also writes R statements describing the same region to the
// "gsview.R" stream.
//
//   vars : the group of variants to display (indexed with vars(v), sized with vars.size()).
//   p    : opaque pointer, cast below to Opt_geneseq*; the fields read here are
//          R_plot, ref, protdb, protdom, only_variant_sites and pheno.
//
// Returns nothing; returns early (silently) if the region has no subregions or if
// the locus database / group id cannot be resolved.
//
// NOTE(review): this listing has visibly lost its braces, its `else` keywords and
// several operands (e.g. the second argument of get_region(), the empty operands in
// `pout << <<`, the right-hand side of `annot_trans[i] ==`).  It will not compile
// as shown.  The comments below describe only the statements that are visible and
// flag places where something appears to be missing; confirm against the upstream
// source before relying on them.
void g_geneseq( VariantGroup & vars , void * p )

  // Main text output stream, and the options object passed through the void*.
  Out & pout = Out::stream( "gsview" );
  Opt_geneseq * aux = (Opt_geneseq*)p;
  // rout is only non-NULL when an R plot was requested.
  Out * rout = aux->R_plot ? &Out::stream( "gsview.R" ) : NULL ;   
  if ( aux->R_plot ) geneseq_define_Rfunc( rout );

  // Look up the region in the default locus group.
  // NOTE(review): the second argument of get_region() is missing in this listing.
  Region region = g.locdb.get_region( PLINKSeq::DEFAULT_LOC_GROUP() , ) ;  

  // Nothing to show for a region without subregions.
  if ( region.subregion.size() == 0 ) return;

  // Strand is read from the first subregion's meta-information: >0 is treated as
  // the positive strand, 0 is a fatal error ("no strand info").
  int s = region.subregion[0].meta.get1_int( PLINKSeq::TRANSCRIPT_STRAND() ) ;
  bool positive_strand = s > 0 ;

  if ( s == 0 ) Helper::halt( "no strand info" );

  // get list of all 'events' (variants, intron/exon boundaries, reference variants)
  // events   : positions inserted below (variant positions, reference-variant starts)
  // evars    : position -> variant at that position
  // elocstart: CDS exon start position -> exon index
  // elocstop : CDS exon stop position  -> exon index
  // cds_num  : subregion index -> exon index
  // erefvars : position -> reference variant starting there
  std::set<int> events;  
  std::map<int,const Variant*> evars;
  std::map<int,int> elocstart;
  std::map<int,int> elocstop;
  std::map<int,int> cds_num;
  std::map<int,const RefVariant*> erefvars;
  // Register every variant by its position.
  // NOTE(review): `pos` (strand-signed position) is computed but not used by the
  // two visible statements that follow, which use the unsigned position.
  for (int v=0; v<vars.size(); v++)
      int pos = positive_strand ? vars(v).position() : -vars(v).position() ;
      events.insert( vars(v).position() ) ;
      evars[ vars(v).position() ] = &vars(v);
  // enumerate CDS exons first

  int num_cds_exons = 0;
  int cds_bp = 0;
  int genomic_start = 0;
  int genomic_stop = 0;

  // NOTE(review): this loop variable `s` reuses the name of the strand variable
  // declared above.  No increment of num_cds_exons is visible in this listing;
  // presumably it was lost with the braces -- confirm.
  for (int s = 0 ; s < region.subregion.size(); s++) 
      // only consider CDS exons      

      if ( ! region.subregion[s].CDS() ) continue;      

      cds_num[s] = num_cds_exons;
      int pos1 = region.subregion[s].start.position();
      int pos2 = region.subregion[s].stop.position();
      // genomic_start takes the start of the first CDS exon seen (while still 0);
      // genomic_stop tracks the largest CDS stop seen.
      if ( genomic_start == 0 ) genomic_start = pos1;
      if ( pos2 > genomic_stop ) genomic_stop = pos2;

      // track start & stop CDS (and record exon #)
      elocstart[ pos1 ] = num_cds_exons;
      elocstop[ pos2 ] = num_cds_exons;

      // total CDS extent in base-pairs
      cds_bp += region.subregion[s].stop.position() - region.subregion[s].start.position()  + 1;

  // reference variant

  // Reference variants for this region; erefvars stores pointers to elements of
  // rvars, so rvars must stay alive while erefvars is used.
  // NOTE(review): no advance of the iterator `i` is visible in this listing.
  std::set<RefVariant> rvars = g.refdb.lookup( region , aux->ref );
  std::set<RefVariant>::iterator i = rvars.begin();
  while ( i != rvars.end() ) 

      // for now, just place 'start' of reference-variant events
      //int pos = positive_strand ? i->start() : - i->start() ; 
      int pos = i->start();
      events.insert( pos );
      erefvars[ pos ] = &(*i);

      //      std::cout << "found " << pos << "\n";

  // One-line, '|'-separated summary header: exon count, strand, variant count,
  // optional refvar count, coordinates, span in kb and number of coding bases.
  // NOTE(review): the first streamed operand and the first argument of alias()
  // are missing in this listing.
  pout << << " | "
       << g.locdb.alias( , false ) << " | "
       << num_cds_exons << " CDS exons | " 
       << ( positive_strand ? "+ve strand | " : "-ve strand | " )
       << vars.size() << " variants | ";
  if ( rvars.size() > 0 ) 
    pout << rvars.size() << " refvars | ";      

  pout << region.coordinate() << " | " 
       << ( region.stop.position() - region.start.position() + 1 )/1000.0 << " kb | "      
       << cds_bp << " coding bases\n";
  // attach LOCDB

  // Give up quietly if the default locus group cannot be resolved.
  LocDBase * locdb = g.resolve_locgroup( PLINKSeq::DEFAULT_LOC_GROUP() ) ;
  if ( ! locdb ) return;
  int gid = locdb->lookup_group_id( PLINKSeq::DEFAULT_LOC_GROUP() );  
  if ( gid == 0 ) return;

  // Translate to AA sequence

  // `aa` holds the translated sequence; it is indexed one character per residue
  // via aa.substr( apos , 1 ) when rows are printed further down.
  Annotate::setDB( locdb , &g.seqdb );
  std::string aa = Annotate::translate_reference( region , false );

  // Protein feature/domain annotations
  std::map<int,std::string> pdm; //protein domain map

  // Build pdm: 0-based amino-acid index -> space-separated list of
  // "source::feature_id:feature_name" strings for every selected feature covering
  // that residue.  A feature is selected if its source_id is in aux->protdom, or
  // if aux->protdom contains "*", "ALL" or "all".
  // NOTE(review): the argument of fetch() is missing in this listing, no advance
  // of `ii` is visible, and the loop variable `int aa` shadows the std::string
  // `aa` declared above.
  if ( aux->protdb )
      std::set<Feature> features = aux->protdb->fetch( );

      bool all_prot = aux->protdom.find( "*" ) != aux->protdom.end() 
	|| aux->protdom.find( "ALL" ) != aux->protdom.end() 
	|| aux->protdom.find( "all" ) != aux->protdom.end(); 

      std::set<Feature>::iterator ii = features.begin();
      while ( ii != features.end() )
	  if ( all_prot || aux->protdom.find( ii->source_id ) != aux->protdom.end() )
		  // Subtract 1 to have 0-based aa coordinates:
	      for (int aa = ii->pstart - 1; aa <= ii->pstop - 1; aa++)
		  if ( pdm[aa] != "" ) pdm[aa] += " ";
		  pdm[ aa ] += ii->source_id + "::" + ii->feature_id + ":" + ii->feature_name + " ";

  // Display full AA sequence, with 1-codon padding either side

  // Walk from pmin to pmax in transcript direction: on the negative strand the
  // two ends are swapped and the step is -1.
  int pmin = genomic_start ;
  int pmax = genomic_stop  ;
  if ( ! positive_strand )
      int t = pmin;
      pmin = pmax;
      pmax = t;

  // Both ends are moved 9 bases outward from the CDS extent.
  pmin += positive_strand ? -9 : +9 ;
  pmax += positive_strand ? +9 : -9 ;
  int step = positive_strand ? +1 : -1;
  // codon position, cycle 0,1,2
  // transcript CDS should always start at 0

  // cpos: position within the current codon; gpos: position within the current
  // printed row (a row is emitted when gpos reaches 3).
  // NOTE(review): no increments of cpos or gpos are visible in this listing.
  int cpos = 0;  
  int gpos = 0;

  // will always start just before CDS, but in 'printable' region, thus:::
  // cds        : currently inside a CDS exon
  // printing   : currently inside the window that gets printed
  // exon       : exon number in transcript order (0-based; printed as exon+1)
  // exon0      : exon index as stored in elocstart/elocstop
  // last_exon  : 1-based number of the exon most recently left (0 = none yet)
  // split_codon: the exon just left ended part-way through a codon
  bool cds = false;
  bool printing = true;
  int exon = 0;
  int exon0 = 0;
  int last_exon = 0;
  bool split_codon = false;

  // Per-row accumulators, reset after each row is emitted.
  std::string codon = "";
  std::string prt_codon = "";
  std::string prt_intronic = "";
  int chr = region.start.chromosome();
  // apos: index of the next residue to take from the translated sequence.
  int apos = 0;
  std::string refannot = "";
  std::string varannot = "";
  bool has_evar = false;
  // NOTE(review): stop_here equals pmax+step, which is exactly the loop's
  // termination value, so the `bp == stop_here` test inside the loop cannot be
  // true as written here.
  int stop_here = positive_strand ? pmax+1 : pmax-1;
  std::stringstream ss;

  // Main walk over every base position from pmin to pmax inclusive.
  for ( int bp = pmin ; bp != pmax+step ; bp += step )
      const int searchbp = bp;
      // Are we entering or leaving a printable region?
      // (cpos-adjusted start, to make sure we are always in-sync)
      // Positive strand: printing turns on when an exon start lies 9+cpos bases
      // ahead, and off when an exon stop lies 7+(3-cpos) bases behind.
      // NOTE(review): the second if/else-if pair below mirrors the first with
      // start/stop exchanged and is presumably the negative-strand branch; the
      // `else` separating them is not visible.
      if ( positive_strand )
	  if      ( elocstart.find( searchbp + 9 + cpos ) != elocstart.end() ) 
	      printing = true;

	      // space a new one
	      if ( ! aux->only_variant_sites ) pout << "\n";

	  else if ( elocstop.find( searchbp - ( 7 + (3 - cpos)  )  ) != elocstop.end() ) printing = false; 	  	  
	  if ( elocstop.find( searchbp - ( 9 + cpos ) ) != elocstop.end() ) 
	      printing = true;
	      // space a new one
	      if ( ! aux->only_variant_sites ) pout << "\n";
	  else if ( elocstart.find( searchbp + 7 + (3-cpos) ) != elocstart.end() ) printing = false;

      // hitting first position in a new CDS?
      // (an exon start on the positive strand, an exon stop on the negative strand)
      if ( ( positive_strand && elocstart.find( searchbp ) != elocstart.end() ) ||
	   ( (!positive_strand) && elocstop.find( searchbp ) != elocstop.end() ) )
	  cds = true;	  
	  // track exon number

	  // exon0 keeps the stored index; on the negative strand `exon` is
	  // reversed so that exons are numbered in transcript order.
	  exon = positive_strand ? elocstart[ searchbp ] : elocstop[ searchbp ];
	  exon0 = exon;
	  if ( ! positive_strand ) exon = num_cds_exons - (exon+1) ;	  

      // Have we gone one past the end of a current exon?
      // NOTE(review): the operator joining the two parenthesised conditions below
      // is missing in this listing.
      if ( cds ) 
	if ( ( positive_strand && elocstop.find( searchbp - step ) != elocstop.end() ) 
	     ( (!positive_strand) && elocstart.find( searchbp - step ) != elocstart.end() ) 
	    cds = false;
	    // track last exon code, for use printing introns
	    last_exon = exon+1;

	    // Sanity check: the boundary just passed should belong to the exon we
	    // thought we were in; otherwise only a warning is logged.
	    if ( positive_strand )
		if ( elocstop[ searchbp - step ] != exon ) plog.warn("internal inconsistency");
		if ( elocstart[ searchbp - step ] != exon0 ) plog.warn("internal inconsistency");

	    // are we ending a CDS mid-frame? 
	    if ( cpos == 1 ) 
		split_codon = true;
		//prt_codon = ".";
	    else if ( cpos == 2 )   
		split_codon = true;
		//prt_codon = "..";
      if ( bp == stop_here ) { continue; } 

      // append variant information

      // For a variant at this position, build a bracketed annotation of the form
      // "[<allele placed at its codon offset>|<counts>|<annotation>(|<protein>)]"
      // and append it to varannot.
      if ( printing && evars.find( searchbp ) !=  evars.end() )
	  const Variant * pvar = evars.find( searchbp )->second;

	  has_evar = true;
	  std::stringstream ss2;
	  ss2 << "[" ;
	  int case_count = 0 , case_tot = 0; 
	  int control_count = 0 , control_tot = 0;	  
	  bool ma, control_ma;
	  // With a phenotype, counts are taken separately for CASE and CONTROL;
	  // the third visible call (no phenotype arguments) is presumably the
	  // `else` branch.
	  // NOTE(review): control_ma is only assigned on the aux->pheno path but is
	  // read unconditionally below -- confirm it cannot be read unassigned.
	  if ( aux->pheno )       
	      ma = pvar->n_minor_allele( &case_count , &case_tot , NULL , CASE );
	      control_ma = pvar->n_minor_allele( &control_count , &control_tot , NULL , CONTROL );
	    ma = pvar->n_minor_allele( &case_count , &case_tot );
	  // always report non-ref
	  if ( ! ma ) case_count = case_tot - case_count;
	  if ( ! control_ma ) control_count = control_tot - control_count;
	  // print minor/major allele(s)
	  // The alternate allele is padded with '.' so it lines up with its
	  // position (cpos) inside the three-base codon.
	  if ( cpos == 0 ) 
	    ss2 << "" << pvar->alternate() << "..";
	  else if ( cpos == 1 ) 
	    ss2 << "." << pvar->alternate() << ".";
	    ss2 << ".." << pvar->alternate() ;
	  // "A/U=case:control" when a phenotype is set; "MAC=count" is presumably
	  // the no-phenotype alternative.
	  if ( aux->pheno )
	    ss2 << "|A/U=" << case_count << ":" << control_count ;
	    ss2 << "|MAC=" << case_count ;
	  ss2 << "|";

	  // always use PSEQ engine for annotation
	  // 	  if ( pvar->meta.has_field( PLINKSeq::META_ANNOT() ) )
	  // 	    ss2 << pvar->meta.get1_string( PLINKSeq::META_ANNOT() );
	  // 	    {
	  // 	  else 
	  // The const is cast away because annotate() takes a non-const Variant&.
	  bool exonic = Annotate::annotate( (Variant&)*pvar , region );
	  // pull out correct transcript annotation
	  // The three vectors are treated as parallel arrays (only used when their
	  // sizes agree); "." is kept when no entry matches.
	  // NOTE(review): the right-hand side of `annot_trans[i] ==` is missing here.
	  std::vector<std::string> annot = pvar->meta.get_string( PLINKSeq::ANNOT_TYPE() );
	  std::vector<std::string> annot_trans = pvar->meta.get_string( PLINKSeq::ANNOT_GENE() );
	  std::vector<std::string> annot_prot = pvar->meta.get_string( PLINKSeq::ANNOT_PROTEIN() );
	  std::string a = ".";
	  std::string ap = ".";
	  if ( annot.size() == annot_trans.size() && annot.size() == annot_prot.size() )
	      for( int i=0;i<annot.size(); i++)
		  if ( annot_trans[i] == ) 
		      a = annot[i]; ap = annot_prot[i];
	  ss2 << a ;
	  // The protein-level annotation is only shown when annotate() returned true.
	  if ( exonic ) 
	      ss2 << "|" << ap;
	  ss2 << "]";
	  if ( varannot != "" ) varannot += " ";
	  varannot += ss2.str();

      // append ref-variant information

      // A reference variant starting here is streamed as text and appended to
      // refannot, space-separated from any earlier entry.
      if ( printing && erefvars.find( searchbp ) !=  erefvars.end() )
	  const RefVariant * refvar = erefvars.find( searchbp )->second;
	  std::stringstream ss2;
	  ss2 << *refvar;
	  if ( refannot != "" ) refannot += " ";
	  refannot += ss2.str();
      // cpos == 2 <==> last base in codon (since still on 0-based base count within exon: 0,1,2), so only annotate once
      // [Can change to 'cpos == 0' if want to annotate the first part of the 'split codon' instead of the last part]:
      if ( printing && aux->protdb && cds && cpos == 2 && pdm.find( apos ) != pdm.end() )
	  refannot += pdm[ apos ];

      // Only show CDS

      // At the first base of a row (gpos == 0) write the row label: "exon N" plus
      // chr:pos inside a CDS, otherwise an "intron a/b" label ("*/1" before the
      // first exon, "N/*" when `exon` is the last exon index).
      // NOTE(review): the `else` keywords separating these alternatives are not
      // visible in this listing.
      if ( printing )
	  if ( gpos == 0 ) 
	    if ( cds )
	      ss << "exon " << exon+1 << ( exon < 9 ? " " : "" ) << "     " << Helper::chrCode( chr ) << ":" << searchbp;  	  
		if ( last_exon == 0 ) 
		  ss << "intron */1 " << ( exon < 9 ? " " : "" );
		else if ( exon == num_cds_exons - 1 ) 
		  ss << "intron " << last_exon << "/* " << ( exon < 9 ? " " : "" );
		  ss << "intron " << last_exon << "/" << last_exon+1 << " " << ( exon < 9 ? " " : "" );
		ss << Helper::chrCode( chr ) << ":" << searchbp;
      if ( printing )  

	  // Append the reference base at (chr, bp) to the coding or the intronic
	  // column, and a '.' placeholder to the other one.
	  // NOTE(review): the `else` between the two groups is not visible here.
	  if ( cds )
	      codon        += g.seqdb.lookup( chr , bp ) ; 
	      prt_codon    += g.seqdb.lookup( chr , bp ) ; 
	      prt_intronic += ".";
	      prt_codon    += ".";	  
	      prt_intronic += g.seqdb.lookup( chr , bp ) ; 	      

	  // Print row
	  // After three bases, finish the row: intronic column, codon column,
	  // one-letter residue, residue name looked up in Annotate::aa, residue
	  // index, then the reference and variant annotations ("." when empty).
	  // A residue is only taken from `aa` (advancing apos) when a full codon was
	  // read inside a CDS; otherwise '>' marks a split codon and '.' non-coding.
	  if ( gpos == 3 )
	      gpos = 0;	      
	      std::string aa_code = cds && cpos == 3 ? aa.substr( apos++ , 1 ) : ( split_codon ? ">" : "." ) ;
	      std::string aa_name = cds && cpos == 3 ? Annotate::aa[ aa_code ] : ( split_codon ? ">>>" : " . " );
	      ss << "\t" << prt_intronic << " " << prt_codon
		 << " " << aa_code << " " << aa_name << " " << ( cds ? Helper::int2str(apos) : ( split_codon ? "> " : ". " ) ) 
		 << "\t" << (refannot == "" ? "." : refannot ) 
		 << "\t" << (varannot == "" ? "." : varannot ) 		       
		 << "\n";
	      // print? 
	      // With only_variant_sites set, rows without a variant are dropped.
	      if ( (!aux->only_variant_sites) || has_evar ) 
		pout << ss.str();
	      // reset all codon-specific stuff
	      // (the partial codon is kept when it is split across an intron)
	      if ( ! split_codon ) codon = "";
	      prt_codon = "";
	      prt_intronic = "";
	      refannot = "";
	      varannot = "";	      
	      has_evar = false;
	      split_codon = false;
	      // and clear stream
	      ss.str( std::string() );
      // Wrap the row and codon counters back to 0.
      if ( gpos == 3 ) gpos = 0;
      if ( cpos == 3 ) cpos = 0;


  // Create R plot?

  // Everything written through *rout below is R source text; rout is only
  // non-NULL when aux->R_plot is set.
  if ( aux->R_plot ) 

      //  Upfront stuff 

      // Header assignments: transcript/gene names, chromosome code, reference
      // group name, empty lists, transcript span, strand and derived lengths
      // (the plotted span is the transcript padded by 1000 bases on each side).
      // NOTE(review): several streamed operands are missing in this listing, and
      // vars(0) is accessed without a visible check that vars is non-empty.
      *rout << "## Populate data-structures for " << << "\n"
	    << "\n"
	    << "transname = \"" << << "\"\n"
	    << "genename = \"" << g.locdb.alias( , false ) << "\"\n" 
	    << "chrcode = \"" << Helper::chrCode( vars(0).chromosome() ) << "\"\n"
	    << "refname = \"" << ( aux->ref ? args.as_string( "ref" ) : "" ) << "\"\n"
	    << "var <- list();ref <- list();dom <- list();exon  <- list() \n"

	    << "# main transcript \n"

	    << "trans <<- c( " << region.start.position() << " , " << region.stop.position() << ") \n" 

	    << "# strand for main (and only) transcript (+1, -1, 0) \n"

	    << " strand <<- " << ( positive_strand ? "+1" : "-1" ) << "\n" 
	    << "# determine border \n"	
	    << " translen <<- trans[2] - trans[1] + 1 \n"
	    << " total <<- round(c( trans[1] - 1000 , trans[2] + 1000 ) ) \n"
	    << " totallen <<- total[2] - total[1] + 1  \n"

	    << " ## exon structure \n";

      // look at all CDS exons

      // One exon[[k]] <- c(start, stop) line per CDS subregion.
      // NOTE(review): no increment of `exc` is visible in this listing.
      int exc = 1;
      for (int s = 0 ; s < region.subregion.size(); s++) 
	  if ( ! region.subregion[s].CDS() ) continue;      	  
	  *rout << " exon[[" << exc << "]] <- c( " << region.subregion[s].start.position() << " , " << region.subregion[s].stop.position() << " ) \n";
      *rout << " cdslength <<- " << cds_bp << " \n";

      // non-CDS exons

      // One exon_notcds[[k]] line per subregion for which exon() is true.
      exc = 1;
      *rout << "exon_notcds <- list() \n";
      for (int s = 0 ; s < region.subregion.size(); s++) 
	  if ( ! region.subregion[s].exon() ) continue;      	  
	  *rout << " exon_notcds[[" << exc << "]] <- c( " << region.subregion[s].start.position() << " , " << region.subregion[s].stop.position() << " ) \n";

      // All other overlapping transcripts

      std::string g_chr_code = Helper::chrCode( region.start.chromosome() );
      int g_chr = region.start.chromosome();
      int g_bp1 = region.start.position();
      int g_bp2 = region.stop.position();

      // Regions returned for the default group and this chromosome/interval.
      std::set<Region> others = g.locdb.get_regions( g.locdb.lookup_group_id( PLINKSeq::DEFAULT_LOC_GROUP() ) , g_chr , g_bp1 , g_bp2 );      

      *rout << "others <- list() \n";

      // Each other transcript becomes an R list with its name and two lists of
      // c(start, stop) pairs: CDS subregions and exon() subregions.
      // NOTE(review): the right-hand side of `ii->name !=` is missing, and no
      // advance of `ii` or increment of `cnt1` is visible in this listing.
      if ( others.size() > 0 ) 
	  int cnt1 = 1;
	  std::set<Region>::iterator ii = others.begin();
	  while ( ii != others.end() )
	      // for this transcript, exclude self
	      if ( ii->subregion.size() > 0 && ii->name != )
		  *rout << "others[[" << cnt1 << "]] <- list( name = \"" << ii->name << "\" , \n ";
		  *rout << "exon_cds = list( " ;

		  // cnt2 counts emitted entries so that a comma precedes all but the first.
		  int cnt2 = 1;
		  for (int s = 0 ; s < ii->subregion.size(); s++) 
		      if ( ii->subregion[s].CDS() ) { *rout << (cnt2>1?",":"") << " c( " << ii->subregion[s].start.position() << " , " << ii->subregion[s].stop.position() << " ) \n"; ++cnt2; }
		  *rout << " ) , \n";
		  *rout << "exon_notcds = list( " ;
		  cnt2 = 1;
		  for (int s = 0 ; s < ii->subregion.size(); s++) 
		    if ( ii->subregion[s].exon() ) { *rout << (cnt2>1?",":"") << " c( " << ii->subregion[s].start.position() << " , " << ii->subregion[s].stop.position() << " ) \n"; ++cnt2; }
		  *rout << " ) \n";
		  *rout << " ) \n";


      // Variants;

      // One var[[v+1]] record per variant: position range, name, annotation,
      // protein-level annotation detail, and the two counts written as `a` and `u`.
      *rout << " ## variants, with MAC (by cases), with annotation \n";
      for (int v=0;v<vars.size();v++)
	  std::string annot = ".";
	  std::string annotdet = ".";
	  int cnta , cntu;

	  int case_count = 0 , case_tot = 0; 
	  int control_count = 0 , control_tot = 0;	  
	  bool ma, control_ma;
	  Variant * pvar = &vars(v);

	  // Same counting scheme as in the text view above.
	  // NOTE(review): as above, control_ma is only assigned on the aux->pheno
	  // path but read unconditionally; cnta and cntu are declared without
	  // initialisers and the if/else structure of their assignment is not
	  // recoverable from this listing -- confirm both.
	  if ( aux->pheno )       
	      ma = pvar->n_minor_allele( &case_count , &case_tot , NULL , CASE );
	      control_ma = pvar->n_minor_allele( &control_count , &control_tot , NULL , CONTROL );
	    ma = pvar->n_minor_allele( &case_count , &case_tot );
	  // always report non-ref
	  if ( ! ma ) case_count = case_tot - case_count;
	  if ( ! control_ma ) control_count = control_tot - control_count;
	  if ( ! aux->pheno ) 
	      cntu = -ma;
	      cnta = case_count;
	      cntu = control_count;
	  bool exonic = Annotate::annotate( (Variant&)*pvar , region );

	  // pull out correct transcript annotation
	  // NOTE(review): the right-hand side of `annot_trans[i] ==` is missing here.
	  std::vector<std::string> annot_func = pvar->meta.get_string( PLINKSeq::ANNOT_TYPE() );
	  std::vector<std::string> annot_trans = pvar->meta.get_string( PLINKSeq::ANNOT_GENE() );
	  std::vector<std::string> annot_prot = pvar->meta.get_string( PLINKSeq::ANNOT_PROTEIN() );	  
	  if ( annot_func.size() == annot_trans.size() && annot_func.size() == annot_prot.size() )
	      for( int i=0;i<annot_func.size(); i++)
		  if ( annot_trans[i] == ) 
		      annot = annot_func[i]; 
		      annotdet = annot_prot[i];

	  *rout << "var[[" << v+1 << "]] <- list( pos = c( " << vars(v).position() << " , " <<  vars(v).stop() 
		<< " ) , name = \"" << vars(v).name() 
		<< "\" , annot = \"" << annot << "\" , annotdet = \"" << annotdet 
		<< "\" , a = " << cnta << " , u = " << cntu << " ) \n";


      // Reference variants
      // All reference variants go into ref[[1]]$det, numbered from 1 via rfc.
      // NOTE(review): no advance of the iterator `i` is visible, and `pos` is
      // not used by the visible statement that follows it.
      if ( aux->ref ) 
	  *rout << "## Reference variants\n";

	  *rout << "ref[[1]] <- list( group = \"" << args.as_string( "ref" ) << "\" , det = list() ) \n";
	  std::set<RefVariant>::iterator i = rvars.begin();
	  int rfc = 1;
	  while ( i != rvars.end() ) 
	      int pos = i->start();
	      *rout << "ref[[1]]$det[[" << rfc++ << "]] <- list( pos = c( " << i->start() << " , " << i->stop() << " ) , name = \"" << i->name() << "\" ) \n";

      // Protein domains
      if ( aux->protdb )

	  // just repeat this lookup from above for now...

	  // Selected features are grouped by source_id: sourcemap assigns each
	  // source a 1-based dom[[t]] slot the first time it is seen, and
	  // sourcecnt numbers the entries within that slot.
	  // NOTE(review): the argument of fetch() is missing, no advance of `ii`
	  // is visible, and `pdct` is not used by any visible statement.
	  *rout << "## Protein domains \n";
	  std::set<Feature> features = aux->protdb->fetch( );
	  bool all_prot = aux->protdom.find( "*" ) != aux->protdom.end() 
	    || aux->protdom.find( "ALL" ) != aux->protdom.end() 
	    || aux->protdom.find( "all" ) != aux->protdom.end(); 
	  int pdct = 1;
	  std::map<std::string,int> sourcemap;
	  std::map<std::string,int> sourcecnt;
	  std::set<Feature>::iterator ii = features.begin();
	  while ( ii != features.end() )
	      if ( all_prot || aux->protdom.find( ii->source_id ) != aux->protdom.end() )

		  if ( sourcemap.find( ii->source_id ) == sourcemap.end() )
		      int t = sourcemap.size() + 1 ;
		      sourcemap[ ii->source_id ] = t;
		      *rout << "dom[[" << t << "]] <- list( group = \"" << ii->source_id << "\" , det = list() ) \n";    

		  // 1-based count
		  *rout << "dom[[" << sourcemap[ ii->source_id ] << "]]$det[[" << ++sourcecnt[ ii->source_id ] 
			<< "]] <- list( name = \"" << ii->feature_id << ":" << ii->feature_name << "\" , "
			<< "pos = c( " << ii->gstart << " , " << ii->gstop << " ) , aa = c( " << ii->pstart << " , " << ii->pstop << " ) ) \n ";

  // Perform actual plot, redirected to PDF
  // NOTE(review): rout is NULL when aux->R_plot is false; with the braces lost it
  // is not visible whether this write is still inside the R_plot branch -- confirm.
  // The operand between "plot" and ".pdf" is also missing in this listing.
  *rout << "\n\n#create plot\n"
	<< "pdf( width = 14 , height = 10 , file=\"plot" << << ".pdf\") \n "
	<< "doplot() \n"
	<< " \n\n"
	<< "#---------------------------------------------------------------------------\n\n\n";

  // Separator line closing this group's text view.
  pout << "------------------------------------------------------------\n\n";