Ejemplo n.º 1
0
int splicing_i_gff_constitutive_exons_all(const splicing_gff_t *gff,
					  splicing_exonset_t *exons,
					  int min_length) {

  size_t g, nogenes;
  splicing_vector_int_t events;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));

  SPLICING_CHECK(splicing_exonset_init(exons, 0));
  SPLICING_FINALLY(splicing_exonset_destroy, exons);
  SPLICING_CHECK(splicing_vector_int_init(&events, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &events);
  
  for (g=0; g<nogenes; g++) {
    const char *seqid=
      splicing_strvector_get(&gff->seqids, VECTOR(gff->seqid)[g]);
    int noex, idx, noEvents, i;
    size_t noiso;
    int start=VECTOR(gff->genes)[g];
    int end= g+1 < nogenes ? VECTOR(gff->genes)[g+1] : gff->n;
    splicing_gff_noiso_one(gff, g, &noiso);
    
    /* Collect and sort all events */
    splicing_vector_int_clear(&events);
    for (idx=start+1; idx<end; idx++) {
      if (VECTOR(gff->type)[idx] == SPLICING_TYPE_EXON) {
	SPLICING_CHECK(splicing_vector_int_push_back2
		       (&events, VECTOR(gff->start)[idx],
			-VECTOR(gff->end)[idx]));
      }
    }
    noEvents=splicing_vector_int_size(&events);
    splicing_qsort(VECTOR(events), noEvents, sizeof(int),
		   splicing_i_const_cmp);

    /* Now go over the sorted events and extract the constitutive exons */
    for (noex=0, i=0; i<noEvents; i++) {
      int ev=VECTOR(events)[i];
      if (ev > 0) { noex++; } 
      if (ev < 0) { 
	int prev=VECTOR(events)[i-1];
	if (noex == noiso && (-ev)-prev+1 >= min_length) {
	  /* constitutive exon */
	  SPLICING_CHECK(splicing_exonset_append(exons, seqid, prev, -ev));
	}
	noex--;
      }
    }
  }

  splicing_vector_int_destroy(&events);
  SPLICING_FINALLY_CLEAN(2);	/* + exons */

  return 0;
}
Ejemplo n.º 2
0
Archivo: gff.c Proyecto: mlovci/MISO
int splicing_genomic_to_iso(const splicing_gff_t *gff, size_t gene,
			    const splicing_vector_int_t *position, 
			    splicing_matrix_int_t *isopos) {

  size_t r, i, noiso, noreads=splicing_vector_int_size(position);
  splicing_vector_int_t exstart, exend, exidx, shift;
  
  splicing_gff_noiso_one(gff, gene, &noiso);
  
  SPLICING_CHECK(splicing_vector_int_init(&exstart, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exstart);
  SPLICING_CHECK(splicing_vector_int_init(&exend, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exend);
  SPLICING_CHECK(splicing_vector_int_init(&exidx, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exidx);
  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &exstart, &exend,
					     &exidx, gene));

  SPLICING_CHECK(splicing_vector_int_init(&shift, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &shift);
  
  for (i=0; i<noiso; i++) {
    size_t cs=0, ce=0, ex=0;
    int pos=VECTOR(exidx)[i], pos2=VECTOR(exidx)[i+1];
    while (pos < pos2) {
      cs += VECTOR(exstart)[pos];
      SPLICING_CHECK(splicing_vector_int_push_back(&shift, cs-ce-ex-1));
      ex++; ce += VECTOR(exend)[pos]; pos++;
    }
  }

  SPLICING_CHECK(splicing_matrix_int_resize(isopos, noiso, noreads));
  
  for (r=0; r<noreads; r++) {
    for (i=0; i<noiso; i++) {
      size_t pos=VECTOR(*position)[r];
      size_t startpos=VECTOR(exidx)[i];
      size_t endpos=VECTOR(exidx)[i+1];
      int ex;
      for (ex=startpos; ex < endpos && VECTOR(exend)[ex] < pos; ex++) ;
      if (VECTOR(exstart)[ex] <= pos && pos <= VECTOR(exend)[ex]) {
	MATRIX(*isopos, i, r) = VECTOR(*position)[r] - VECTOR(shift)[ex];
      } else { 
	MATRIX(*isopos, i, r) = -1;
      }
    }
  }

  splicing_vector_int_destroy(&shift);
  splicing_vector_int_destroy(&exidx);
  splicing_vector_int_destroy(&exend);
  splicing_vector_int_destroy(&exstart);
  SPLICING_FINALLY_CLEAN(4);

  return 0;
}
Ejemplo n.º 3
0
Archivo: gff.c Proyecto: mlovci/MISO
int splicing_gff_exon_start_end(const splicing_gff_t *gff, 
				splicing_vector_int_t *start,
				splicing_vector_int_t *end,
				splicing_vector_int_t *idx,
				int gene) {
  
  size_t noiso;
  int i=0, p=0, n=splicing_gff_size(gff);
  int pos;
  size_t nogenes;
  splicing_vector_int_t tmp, tmp2;

  SPLICING_CHECK(splicing_vector_int_init(&tmp, 10));
  SPLICING_FINALLY(splicing_vector_int_destroy, &tmp);
  SPLICING_CHECK(splicing_vector_int_init(&tmp2, 10));
  SPLICING_FINALLY(splicing_vector_int_destroy, &tmp2);

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  if (gene < 0 || gene >= nogenes) { 
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  pos=VECTOR(gff->genes)[gene]+1;
  
  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));
  splicing_vector_int_clear(start);
  splicing_vector_int_clear(end);
  SPLICING_CHECK(splicing_vector_int_resize(idx, noiso+1));
  while (pos < n) {
    if (VECTOR(gff->type)[pos] == SPLICING_TYPE_EXON) { 
      int s=VECTOR(gff->start)[pos];
      int e=VECTOR(gff->end)[pos];
      SPLICING_CHECK(splicing_vector_int_push_back(start, s)); p++;
      SPLICING_CHECK(splicing_vector_int_push_back(end, e));
    } else if (VECTOR(gff->type)[pos] == SPLICING_TYPE_MRNA) {
      VECTOR(*idx)[i] = p;
      if (i!=0) { 
	SPLICING_CHECK(splicing_i_gff_exon_start_end_sort(start, end, idx, 
							  i-1, &tmp, &tmp2));
      }
      i++;
    } else if (VECTOR(gff->type)[pos] == SPLICING_TYPE_GENE) {
      break;
    }
    pos++;
  }
  VECTOR(*idx)[i] = p;
  SPLICING_CHECK(splicing_i_gff_exon_start_end_sort(start, end, idx, i-1, 
						    &tmp, &tmp2));

  splicing_vector_int_destroy(&tmp2);
  splicing_vector_int_destroy(&tmp);
  SPLICING_FINALLY_CLEAN(1);

  return 0;
}
Ejemplo n.º 4
0
int splicing_i_gff_constitutive_exons_full(const splicing_gff_t *gff,
					   splicing_exonset_t *exons,
					   int min_length) {
  size_t g, nogenes;
  splicing_vector_int_t events;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  
  SPLICING_CHECK(splicing_exonset_init(exons, 1));
  SPLICING_FINALLY(splicing_exonset_destroy, exons);
  SPLICING_CHECK(splicing_vector_int_init(&events, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &events);
  
  for (g=0; g<nogenes; g++) {
    const char *seqid=
      splicing_strvector_get(&gff->seqids, VECTOR(gff->seqid)[g]);
    int i, idx, noExons, noSame;
    size_t noiso;
    int start=VECTOR(gff->genes)[g];
    int end= g+1 < nogenes ? VECTOR(gff->genes)[g+1] : gff->n;
    splicing_gff_noiso_one(gff, g, &noiso);
    
    /* Collect and sort all events */
    splicing_vector_int_clear(&events);
    for (idx=start+1; idx<end; idx++) {
      if (VECTOR(gff->type)[idx] == SPLICING_TYPE_EXON) {
	SPLICING_CHECK(splicing_vector_int_push_back2
		       (&events, VECTOR(gff->start)[idx],
			VECTOR(gff->end)[idx]));
      }
    }
    noExons=splicing_vector_int_size(&events)/2;
    splicing_qsort(VECTOR(events), noExons, sizeof(int)*2,
		   splicing_i_const_cmp2);
    
    /* Now go over them and check how many times each exon appears */
    for (noSame=1, i=2; i<noExons*2; i+=2) { 
      int start=VECTOR(events)[i];
      int end=VECTOR(events)[i+1];
      if (start == VECTOR(events)[i-2] && VECTOR(events)[i-1] == end) {
	noSame++;
      } else {
	noSame=1;
      }
      if (noSame == noiso) {
	SPLICING_CHECK(splicing_exonset_append(exons, seqid, start, end));
      }
    }
  }
  
  splicing_vector_int_destroy(&events);
  SPLICING_FINALLY_CLEAN(2);	/* + exons */

  return 0;
}
Ejemplo n.º 5
0
int splicing_gff_converter_init(const splicing_gff_t *gff, size_t gene,
				splicing_gff_converter_t *converter) {

  int i; 

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &converter->noiso));

  SPLICING_VECTOR_INT_INIT_FINALLY(&converter->exstart, 0);
  SPLICING_VECTOR_INT_INIT_FINALLY(&converter->exend, 0);
  SPLICING_VECTOR_INT_INIT_FINALLY(&converter->exidx, 0);
  SPLICING_VECTOR_INT_INIT_FINALLY(&converter->shift, 0);
  SPLICING_VECTOR_INT_INIT_FINALLY(&converter->exlim, 0);
  
  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &converter->exstart, 
					     &converter->exend, 
					     &converter->exidx, gene));

  /* Calculate the shift */
  for (i=0; i < converter->noiso; i++) {
    size_t cs=0, ce=0, ex=0;
    int pos=VECTOR(converter->exidx)[i], pos2=VECTOR(converter->exidx)[i+1];
    while (pos < pos2) {
      cs += VECTOR(converter->exstart)[pos];
      SPLICING_CHECK(splicing_vector_int_push_back(&converter->shift, 
						   cs-ce-ex-1));
      ex++; ce += VECTOR(converter->exend)[pos]; pos++;
    }
  }
  
  /* Calculate the exlim */
  for (i=0; i < converter->noiso; i++) { 
    size_t cs=0;
    int pos=VECTOR(converter->exidx)[i], pos2=VECTOR(converter->exidx)[i+1];
    while (pos < pos2) {
      size_t l=
	VECTOR(converter->exend)[pos] - VECTOR(converter->exstart)[pos]+1;
      cs += l;
      SPLICING_CHECK(splicing_vector_int_push_back(&converter->exlim, cs+1));
      pos++;
    }
  }

  SPLICING_FINALLY_CLEAN(5);

  return 0;
}
Ejemplo n.º 6
0
Archivo: miso.c Proyecto: mlovci/MISO
int splicing_miso(const splicing_gff_t *gff, size_t gene,
		  const splicing_vector_int_t *position,
		  const char **cigarstr, int readLength, 
		  int noIterations, int noBurnIn, int noLag,
		  const splicing_vector_t *hyperp, 
		  splicing_matrix_t *samples, splicing_vector_t *logLik,
		  splicing_matrix_t *match_matrix, 
		  splicing_matrix_t *class_templates,
		  splicing_vector_t *class_counts,
		  splicing_vector_int_t *assignment,
		  splicing_miso_rundata_t *rundata) {

  double acceptP, cJS, pJS, sigma;
  int noReads = splicing_vector_int_size(position);
  splicing_vector_int_t *myass=assignment, vass;
  size_t noiso;
  splicing_vector_t vpsi, vpsiNew, valpha, valphaNew, 
    *psi=&vpsi, *psiNew=&vpsiNew, *alpha=&valpha, *alphaNew=&valphaNew;
  int noSamples = (noIterations - noBurnIn + 1) / noLag;
  int i, m, lagCounter=0, noS=0;
  splicing_matrix_t *mymatch_matrix=match_matrix, vmatch_matrix;
  splicing_vector_int_t match_order;
  splicing_vector_int_t effisolen;
  splicing_vector_t isoscores;

  if ( (class_templates ? 1 : 0) + (class_counts ? 1 : 0) == 1) {
    SPLICING_ERROR("Only one of `class_templates' and `class_counts' is "
		   "given", SPLICING_EINVAL);
  }

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));

  rundata->noIso=noiso;
  rundata->noIters=noIterations;
  rundata->noBurnIn=noBurnIn;
  rundata->noLag=noLag;
  rundata->noAccepted = rundata->noRejected = 0;

  if (assignment) { 
    SPLICING_CHECK(splicing_vector_int_resize(myass, noReads));
    splicing_vector_int_null(myass);
  } else {
    myass=&vass;
    SPLICING_CHECK(splicing_vector_int_init(myass, noReads));
    SPLICING_FINALLY(splicing_vector_int_destroy, myass);
  }
  SPLICING_CHECK(splicing_vector_init(&vpsi, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &vpsi);
  SPLICING_CHECK(splicing_vector_init(&vpsiNew, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &vpsiNew);
  SPLICING_CHECK(splicing_vector_init(&valpha, noiso-1));
  SPLICING_FINALLY(splicing_vector_destroy, &valpha);
  SPLICING_CHECK(splicing_vector_init(&valphaNew, noiso-1));
  SPLICING_FINALLY(splicing_vector_destroy, &valphaNew);
  
  if (match_matrix) { 
    SPLICING_CHECK(splicing_matrix_resize(match_matrix, noiso, noReads));
  } else {
    mymatch_matrix=&vmatch_matrix;
    SPLICING_CHECK(splicing_matrix_init(mymatch_matrix, noiso, noReads));
    SPLICING_FINALLY(splicing_matrix_destroy, mymatch_matrix);
  }
  SPLICING_CHECK(splicing_vector_int_init(&match_order, noReads));
  SPLICING_FINALLY(splicing_vector_int_destroy, &match_order);
  SPLICING_CHECK(splicing_matchIso(gff, gene, position, cigarstr, 
				   mymatch_matrix));
  SPLICING_CHECK(splicing_order_matches(mymatch_matrix, &match_order));

  if (class_templates && class_counts) { 
    SPLICING_CHECK(splicing_i_miso_classes(mymatch_matrix, &match_order, 
					   class_templates, class_counts, 
					   /*bin_class_templates=*/ 0,
					   /*bin_class_counts=*/ 0));
  }

  SPLICING_CHECK(splicing_vector_int_init(&effisolen, noiso));
  SPLICING_FINALLY(splicing_vector_int_destroy, &effisolen);
  SPLICING_CHECK(splicing_vector_init(&isoscores, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &isoscores);
  SPLICING_CHECK(splicing_gff_isolength_one(gff, gene, &effisolen));
  for (i=0; i<noiso; i++) { 
    int l=VECTOR(effisolen)[i]-readLength+1;
    VECTOR(effisolen)[i] = l > 0 ? l : 0;
    VECTOR(isoscores)[i] = -log((double) l);
  }

  SPLICING_CHECK(splicing_matrix_resize(samples, noiso, noSamples));
  SPLICING_CHECK(splicing_vector_resize(logLik, noSamples));

  /* Initialize Psi(0) randomly */

  SPLICING_CHECK(splicing_drift_proposal(/* mode= */ 0, 0, 0, 0, 0, 0, 
					 noiso, psi, alpha, &sigma, 0));
  SPLICING_CHECK(splicing_drift_proposal(/* mode= */ 1, psi, alpha, sigma,
					 0, 0, noiso, psi, alpha, 0, 0));
  
  /* Initialize assignments of reads */  
  
  SPLICING_CHECK(splicing_reassign_samples(mymatch_matrix, &match_order, psi, 
					   noiso, myass));
  
  /* foreach Iteration m=1, ..., M do */

  for (m=0; m < noIterations; m++) {

    SPLICING_CHECK(splicing_drift_proposal(/* mode= */ 1, psi, alpha, sigma,
					   0, 0, noiso, psiNew, alphaNew, 0,
					   0));

    SPLICING_CHECK(splicing_metropolis_hastings_ratio(myass, noReads, psiNew,
						      alphaNew, psi, alpha,
						      sigma, noiso, 
						      &effisolen, hyperp,
						      &isoscores, 
						      m > 0 ? 1 : 0, 
						      &acceptP, &cJS, &pJS));
    
    if (acceptP >= 1 || RNG_UNIF01() < acceptP) {
      splicing_vector_t *tmp;
      tmp=psi; psi=psiNew; psiNew=tmp;
      tmp=alpha; alpha=alphaNew; alphaNew=tmp;
      cJS = pJS;
      rundata->noAccepted ++;
    } else {
      rundata->noRejected ++;
    }
    
    if (m >= noBurnIn) {
      if (lagCounter == noLag - 1) {
	memcpy(&MATRIX(*samples, 0, noS), VECTOR(*psi), 
	       noiso * sizeof(double));
	VECTOR(*logLik)[noS] = cJS;
	noS++;
	lagCounter = 0;
      } else {
	lagCounter ++;
      }
    }
    
    SPLICING_CHECK(splicing_reassign_samples(mymatch_matrix, &match_order, 
					     psi, noiso, myass));

  } /* for m < noIterations */

  splicing_vector_destroy(&isoscores);
  splicing_vector_int_destroy(&effisolen);
  splicing_vector_int_destroy(&match_order);
  SPLICING_FINALLY_CLEAN(3);
  if (!match_matrix) {
    splicing_matrix_destroy(mymatch_matrix);
    SPLICING_FINALLY_CLEAN(1);
  }
  splicing_vector_destroy(&valphaNew);
  splicing_vector_destroy(&valpha);
  splicing_vector_destroy(&vpsiNew);
  splicing_vector_destroy(&vpsi);
  SPLICING_FINALLY_CLEAN(4);
  
  if (!assignment) { 
    splicing_vector_int_destroy(myass);
    SPLICING_FINALLY_CLEAN(1);
  }

  return 0;
}
Ejemplo n.º 7
0
Archivo: gff.c Proyecto: mlovci/MISO
int splicing_iso_to_genomic(const splicing_gff_t *gff, size_t gene, 
			    const splicing_vector_int_t *isoform,
			    const splicing_vector_int_t *exstart,
			    const splicing_vector_int_t *exend,
			    const splicing_vector_int_t *exidx,
			    splicing_vector_int_t *position) {

  size_t i, noiso, n=splicing_vector_int_size(position);
  splicing_vector_int_t exlim, shift;
  splicing_vector_int_t vexstart, vexend, vexidx, 
    *myexstart=(splicing_vector_int_t *) exstart, 
    *myexend=(splicing_vector_int_t *) exend, 
    *myexidx=(splicing_vector_int_t *) exidx;
  size_t pos, pos2;

  if (!exstart || !exend || !exidx) {
    myexstart=&vexstart;
    myexend=&vexend;
    myexidx=&vexidx;
    SPLICING_CHECK(splicing_vector_int_init(myexstart, 0));
    SPLICING_FINALLY(splicing_vector_int_destroy, myexstart);
    SPLICING_CHECK(splicing_vector_int_init(myexend, 0));
    SPLICING_FINALLY(splicing_vector_int_destroy, myexend);
    SPLICING_CHECK(splicing_vector_int_init(myexidx, 0));
    SPLICING_FINALLY(splicing_vector_int_destroy, myexidx);
    SPLICING_CHECK(splicing_gff_exon_start_end(gff, myexstart, myexend, 
					       myexidx, gene));
  }

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));

  SPLICING_CHECK(splicing_vector_int_init(&exlim, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exlim);
  SPLICING_CHECK(splicing_vector_int_init(&shift, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &shift);

  for (i=0; i<noiso; i++) {
    size_t cs=0, ce=0, ex=0;
    int pos=VECTOR(*myexidx)[i], pos2=VECTOR(*myexidx)[i+1];
    while (pos < pos2) {
      cs += VECTOR(*myexstart)[pos];
      SPLICING_CHECK(splicing_vector_int_push_back(&shift, cs-ce-ex-1));
      ex++; ce += VECTOR(*myexend)[pos]; pos++;
    }
  }

  for (i=0; i<noiso; i++) { 
    size_t cs=0;
    int pos=VECTOR(*myexidx)[i], pos2=VECTOR(*myexidx)[i+1];
    while (pos < pos2) {
      size_t l=VECTOR(*myexend)[pos]-VECTOR(*myexstart)[pos]+1;
      cs += l;
      SPLICING_CHECK(splicing_vector_int_push_back(&exlim, cs+1));
      pos++;
    }
  }  

  for (i=0; i<n; i++) {
    int iso=VECTOR(*isoform)[i];
    size_t pos=VECTOR(*position)[i];
    int ex;
    for (ex=VECTOR(*myexidx)[iso]; VECTOR(exlim)[ex] <= pos; ex++) ;
    VECTOR(*position)[i] = pos + VECTOR(shift)[ex];
  }

  splicing_vector_int_destroy(&shift);
  splicing_vector_int_destroy(&exlim);
  SPLICING_FINALLY_CLEAN(2);

  if (!exstart || !exend || !exidx) {
    splicing_vector_int_destroy(myexidx);
    splicing_vector_int_destroy(myexend);
    splicing_vector_int_destroy(myexstart);
    SPLICING_FINALLY_CLEAN(3);
  }
  
  return 0;
}
Ejemplo n.º 8
0
static PyObject* pysplicing_miso(PyObject *self, PyObject *args) {
  PyObject *gff, *readpos, *readcigar, *hyperp=0;
  int gene, readLength, noIterations=5000, noBurnIn=500, noLag=10;
  splicing_gff_t *mygff;
  splicing_strvector_t myreadcigar;
  splicing_vector_int_t myreadpos;
  splicing_vector_t myhyperp;
  splicing_matrix_t samples;
  splicing_vector_t logLik;
  splicing_matrix_t class_templates;
  splicing_vector_t class_counts;
  splicing_vector_int_t assignment;
  splicing_miso_rundata_t rundata;
  PyObject *r1, *r2, *r3, *r4, *r5, *r6;
  
  if (!PyArg_ParseTuple(args, "OiOOi|iiiO", &gff, &gene, &readpos, &readcigar,
			&readLength, &noIterations, &noBurnIn, &noLag, 
			&hyperp)) { return NULL; }
  
  mygff=PyCObject_AsVoidPtr(gff);

  SPLICING_PYCHECK(splicing_matrix_init(&samples, 0, 0));
  SPLICING_FINALLY(splicing_matrix_destroy, &samples);
  SPLICING_PYCHECK(splicing_vector_init(&logLik, 0));
  SPLICING_FINALLY(splicing_vector_destroy, &logLik);
  SPLICING_PYCHECK(splicing_matrix_init(&class_templates, 0, 0));
  SPLICING_FINALLY(splicing_matrix_destroy, &class_templates);
  SPLICING_PYCHECK(splicing_vector_init(&class_counts, 0));
  SPLICING_FINALLY(splicing_vector_destroy, &class_counts);
  SPLICING_PYCHECK(splicing_vector_int_init(&assignment, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &assignment);
  if (pysplicing_to_vector_int(readpos, &myreadpos)) { return NULL; }
  SPLICING_FINALLY(splicing_vector_int_destroy, &myreadpos);
  if (hyperp) { 
    if (pysplicing_to_vector(hyperp, &myhyperp)) { return NULL; }
    SPLICING_FINALLY(splicing_vector_destroy, &myhyperp);
  } else {
    size_t i, noiso;
    SPLICING_PYCHECK(splicing_gff_noiso_one(mygff, gene, &noiso));
    SPLICING_PYCHECK(splicing_vector_init(&myhyperp, noiso));
    SPLICING_FINALLY(splicing_vector_destroy, &myhyperp);
    for (i=0; i<noiso; i++) { VECTOR(myhyperp)[i] = 1.0; }
  }
  if (pysplicing_to_strvector(readcigar, &myreadcigar)) { return NULL; };
  SPLICING_FINALLY(splicing_strvector_destroy, &myreadcigar);
  
  SPLICING_PYCHECK(splicing_miso(mygff, gene, &myreadpos, 
				 (const char**) myreadcigar.table, 
				 readLength, noIterations, noBurnIn, noLag,
				 &myhyperp, &samples, &logLik, 
				 /*match_matrix=*/ 0, &class_templates,
				 &class_counts, &assignment, &rundata));

  splicing_vector_destroy(&myhyperp);
  splicing_vector_int_destroy(&myreadpos);
  splicing_strvector_destroy(&myreadcigar);
  SPLICING_FINALLY_CLEAN(3);
  
  r6=pysplicing_from_miso_rundata(&rundata);

  r5=pysplicing_from_vector_int(&assignment);
  splicing_vector_int_destroy(&assignment); SPLICING_FINALLY_CLEAN(1);

  r4=pysplicing_from_vector(&class_counts);
  splicing_vector_destroy(&class_counts); SPLICING_FINALLY_CLEAN(1);

  splicing_matrix_transpose(&class_templates);
  r3=pysplicing_from_matrix(&class_templates);
  splicing_matrix_destroy(&class_templates); SPLICING_FINALLY_CLEAN(1);

  r2=pysplicing_from_vector(&logLik);
  splicing_vector_destroy(&logLik); SPLICING_FINALLY_CLEAN(1);

  r1=pysplicing_from_matrix(&samples);
  splicing_matrix_destroy(&samples); SPLICING_FINALLY_CLEAN(1);
  
  return Py_BuildValue("OOOOOO", r1, r2, r3, r4, r5, r6);
}
Ejemplo n.º 9
0
int splicing_simulate_reads(const splicing_gff_t *gff, int gene,
			    const splicing_vector_t *expression,
			    int noreads, int readLength,
			    splicing_vector_int_t *isoform, 
			    splicing_vector_int_t *position, 
			    splicing_strvector_t *cigar, 
			    splicing_vector_t *sample_prob) {
  
  size_t i, p, noiso, goodiso=0, nogenes;
  splicing_vector_int_t effisolen;
  splicing_vector_t sampleprob;
  double rand, sumpsi=0.0;
  splicing_vector_int_t exstart, exend, exidx;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  if (gene < 0 || gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  /* TODO: more error checks */

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));
    
  SPLICING_CHECK(splicing_vector_int_init(&effisolen, noiso));
  SPLICING_FINALLY(splicing_vector_int_destroy, &effisolen);
  SPLICING_CHECK(splicing_vector_init(&sampleprob, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &sampleprob);
  SPLICING_CHECK(splicing_vector_int_resize(isoform, noreads));
  SPLICING_CHECK(splicing_gff_isolength_one(gff, gene, &effisolen));
  for (i=0; i<noiso; i++) {
    int l=VECTOR(effisolen)[i]-readLength+1;
    VECTOR(effisolen)[i] = l > 0 ? l : 0;
    VECTOR(sampleprob)[i] = VECTOR(*expression)[i] * VECTOR(effisolen)[i];
    if (VECTOR(sampleprob)[i] != 0) { goodiso++; }
    sumpsi += VECTOR(sampleprob)[i];
  }

  if (goodiso==0) {
    SPLICING_ERROR("No isoform is possible", SPLICING_FAILURE);
  }

  if (sample_prob) {
    SPLICING_CHECK(splicing_vector_update(sample_prob, &sampleprob));
  }

  for (i=1; i<noiso; i++) {
    VECTOR(sampleprob)[i] += VECTOR(sampleprob)[i-1];
  }

  for (i=0; i<noreads; i++) {
    int w;
    if (noiso==1) {
      w=0;
    } else if (noiso==2) {
      rand = RNG_UNIF01() * sumpsi;
      w = (rand < VECTOR(sampleprob)[0]) ? 0 : 1;
    } else {
      rand = RNG_UNIF01() * sumpsi;
      for (w=0; rand > VECTOR(sampleprob)[w]; w++) ;
    }
    VECTOR(*isoform)[i]=w;
  }
  
  splicing_vector_destroy(&sampleprob);
  SPLICING_FINALLY_CLEAN(1);

  /* OK, we have the isoforms, now we need the read positions, 
     these are uniformly sampled from the individual isoforms. */

  SPLICING_CHECK(splicing_vector_int_resize(position, noreads));
  SPLICING_CHECK(splicing_vector_int_init(&exstart, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exstart);
  SPLICING_CHECK(splicing_vector_int_init(&exend, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exend);
  SPLICING_CHECK(splicing_vector_int_init(&exidx, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exidx);
  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &exstart, &exend, &exidx,
					     gene));

  /* Positions in isoform coordinates first */

  for (i=0; i<noreads; i++) { 
    int iso=VECTOR(*isoform)[i];
    int len=VECTOR(effisolen)[iso];
    VECTOR(*position)[i]=RNG_INTEGER(1, len);
  }

  /* Translate isoform coordinates to genomic coordintes */

  /* TODO: some of this is already calculated */
  SPLICING_CHECK(splicing_iso_to_genomic(gff, gene, isoform, /*converter=*/ 0,
					 position));

  /* CIGAR strings */

  splicing_strvector_clear(cigar);
  SPLICING_CHECK(splicing_strvector_reserve(cigar, noreads));
  for (i=0; i<noreads; i++) {
    char tmp[1000], *tmp2=tmp;
    int iso=VECTOR(*isoform)[i];
    size_t rs=VECTOR(*position)[i];
    int ex=0;
    int rl=readLength;
    for (ex=VECTOR(exidx)[iso]; VECTOR(exend)[ex] < rs; ex++) ;
    while (VECTOR(exend)[ex] < rs+rl-1) {
      tmp2 += snprintf(tmp2, sizeof(tmp)/sizeof(char)-(tmp2-tmp)-1, "%iM%iN",
		       (int) (VECTOR(exend)[ex]-rs+1), 
		       (int) (VECTOR(exstart)[ex+1]-VECTOR(exend)[ex]-1));
      if (tmp2 >= tmp + sizeof(tmp)/sizeof(char)) {
	SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL);
      }
      rl -= (VECTOR(exend)[ex] - rs + 1);
      rs = VECTOR(exstart)[ex+1];
      ex++;
    }
    tmp2 += snprintf(tmp2, sizeof(tmp)/sizeof(char)-(tmp2-tmp)-1, "%iM", rl);
    if (tmp2 >= tmp + sizeof(tmp)/sizeof(char)) {
      SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL); }
    SPLICING_CHECK(splicing_strvector_append(cigar, tmp));
  }

  splicing_vector_int_destroy(&exidx);
  splicing_vector_int_destroy(&exend);
  splicing_vector_int_destroy(&exstart);
  splicing_vector_int_destroy(&effisolen);
  SPLICING_FINALLY_CLEAN(4);
  
  return 0;
}
Ejemplo n.º 10
0
int splicing_simulate_paired_reads(const splicing_gff_t *gff, int gene,
				   const splicing_vector_t *expression,
				   int noreads, int readLength,
				   const splicing_vector_t *fragmentProb,
				   int fragmentStart, double normalMean,
				   double normalVar, double numDevs,
				   splicing_vector_int_t *isoform,
				   splicing_vector_int_t *position,
				   splicing_strvector_t *cigar, 
				   splicing_vector_t *sampleprob) {
  
  size_t i, j, noiso, il, nogenes;
  splicing_vector_t *mysampleprob=sampleprob, vsampleprob;
  splicing_vector_t px, cpx;
  double sumpx, sumpsi=0.0;
  splicing_vector_int_t isolen;
  int goodiso=0;
  splicing_vector_int_t exstart, exend, exidx;
  splicing_vector_t *myfragmentProb=(splicing_vector_t*) fragmentProb,
    vfragmentProb;
  int fs, fl;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  if (gene < 0 || gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  /* TODO: more error checks */

  if (!fragmentProb) { 
    myfragmentProb=&vfragmentProb;
    SPLICING_CHECK(splicing_vector_init(&vfragmentProb, 0));
    SPLICING_FINALLY(splicing_vector_destroy, &vfragmentProb);
    SPLICING_CHECK(splicing_normal_fragment(normalMean, normalVar, numDevs, 
					    readLength, myfragmentProb,
					    &fragmentStart));
    splicing_vector_scale(myfragmentProb, 
			  1.0/splicing_vector_sum(myfragmentProb));
  }

  il=splicing_vector_size(myfragmentProb);
  fs=fragmentStart;
  fl=fragmentStart+il-1;

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));
    
  if ( fabs(splicing_vector_sum(myfragmentProb) - 1.0) > 1e-10 ) {
    SPLICING_ERROR("Fragment length distribution does not sum up to 1", 
		   SPLICING_EINVAL);
  }

  SPLICING_CHECK(splicing_vector_int_init(&isolen, noiso));
  SPLICING_FINALLY(splicing_vector_int_destroy, &isolen);
  SPLICING_CHECK(splicing_gff_isolength_one(gff, gene, &isolen));
  
  SPLICING_CHECK(splicing_vector_copy(&px, myfragmentProb));
  SPLICING_FINALLY(splicing_vector_destroy, &px);
  SPLICING_CHECK(splicing_vector_init(&cpx, il));
  SPLICING_FINALLY(splicing_vector_destroy, &cpx);

  if (!sampleprob) {
    mysampleprob=&vsampleprob;
    SPLICING_CHECK(splicing_vector_init(mysampleprob, noiso));
    SPLICING_FINALLY(splicing_vector_destroy, mysampleprob);
  } else {
    SPLICING_CHECK(splicing_vector_resize(mysampleprob, noiso));
  }

  for (sumpx=VECTOR(px)[0], i=1; i<il; i++) {
    VECTOR(px)[i] += VECTOR(px)[i-1];
    sumpx += VECTOR(px)[i];
  }
  VECTOR(cpx)[0] = VECTOR(px)[0];
  for (i=1; i<il; i++) {
    VECTOR(cpx)[i] = VECTOR(cpx)[i-1] + VECTOR(px)[i];
  }

  for (i=0; i<noiso; i++) {
    int ilen=VECTOR(isolen)[i];
    int r1= ilen >= fl ? ilen - fl + 1 : 0;
    int r2= ilen >= fs ? (ilen >= fl ? fl - fs : ilen - fs + 1) : 0;
    /* int r3= fs - 1; */
    double sp=0.0;
    if (r1 > 0) { sp += r1; } 
    if (r2 > 0) { sp += VECTOR(cpx)[r2-1]; }
    VECTOR(*mysampleprob)[i] = sp * VECTOR(*expression)[i];
    if (VECTOR(*mysampleprob)[i] != 0) { goodiso += 1; }
    sumpsi += VECTOR(*mysampleprob)[i];
  }

  if (goodiso == 0) {
    SPLICING_ERROR("No isoform is possible", SPLICING_FAILURE);
  }

  for (i=1; i<noiso; i++) {
    VECTOR(*mysampleprob)[i] += VECTOR(*mysampleprob)[i-1];
  }

  SPLICING_CHECK(splicing_vector_int_resize(isoform, noreads*2));

  for (i=0; i<2*noreads; i+=2) {
    int w;
    double rand;
    if (noiso==1) {
      w=0;
    } else if (noiso==2) {
      rand = RNG_UNIF01() * sumpsi;
      w = (rand < VECTOR(*mysampleprob)[0]) ? 0 : 1;
    } else {
      rand = RNG_UNIF01() * sumpsi;
      for (w=0; rand > VECTOR(*mysampleprob)[w]; w++) ;
    }
    VECTOR(*isoform)[i]=VECTOR(*isoform)[i+1]=w;
  }

  if (!sampleprob) { 
    splicing_vector_destroy(mysampleprob);
    SPLICING_FINALLY_CLEAN(1);
  } else {
    for (i=noiso-1; i>0; i--) {
      VECTOR(*mysampleprob)[i] -= VECTOR(*mysampleprob)[i-1];
    }
  }

  /* We have the isoforms, now get the read positions. */
  
  SPLICING_CHECK(splicing_vector_int_resize(position, noreads*2));
  SPLICING_CHECK(splicing_vector_int_init(&exstart, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exstart);
  SPLICING_CHECK(splicing_vector_int_init(&exend, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exend);
  SPLICING_CHECK(splicing_vector_int_init(&exidx, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exidx);
  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &exstart, &exend, &exidx,
					     gene));
  
  /* Positions in isoform coordinates first. 
     These are sampled based on the fragment length distribution. */

  for (i=0, j=0; i<noreads; i++) {
    int iso=VECTOR(*isoform)[2*i];
    int ilen=VECTOR(isolen)[iso];
    int r1= ilen >= fl ? ilen - fl + 1 : 0;
    int r2= ilen >= fs ? (ilen >= fl ? fl - fs : ilen - fs + 1) : 0;
    /* int r3= fs - 1; */
    int pos, fragment;
    double sp=0.0;
    if (r1 > 0) { sp += r1; } 
    if (r2 > 0) { sp += VECTOR(cpx)[r2-1]; }
    double rand=RNG_UNIF(0, sp);
    if (rand < r1) { 
      pos = ceil(rand);
    } else {
      int w;
      rand -= r1;
      for (w=0; VECTOR(cpx)[w] < rand; w++) ;
      pos = r1 + r2 - w;
    }

    if (pos <= r1) {
      rand=RNG_UNIF(0, 1.0);
    } else {
      rand=RNG_UNIF(0, VECTOR(px)[r1+r2-pos]);
    }
    for (fragment=0; VECTOR(px)[fragment] < rand; fragment++) ;
    fragment += fragmentStart;

    VECTOR(*position)[j++] = pos;
    VECTOR(*position)[j++] = pos+fragment-readLength;
    
  }

  /* Translate positions to genomic coordinates */

  /* TODO: some of this is already calculated */
  SPLICING_CHECK(splicing_iso_to_genomic(gff, gene, isoform, /*converter=*/ 0,
					 position));

  /* CIGAR strings */

  splicing_strvector_clear(cigar);
  SPLICING_CHECK(splicing_strvector_reserve(cigar, 2*noreads));
  for (j=0; j<2*noreads; j++) {
    char tmp[1000], *tmp2=tmp;
    int iso=VECTOR(*isoform)[j];
    size_t rs=VECTOR(*position)[j];
    int ex=0;
    int rl=readLength;
    for (ex=VECTOR(exidx)[iso]; VECTOR(exend)[ex] < rs; ex++) ;
    while (rs + rl - 1 > VECTOR(exend)[ex]) {
      tmp2 += snprintf(tmp2, sizeof(tmp)/sizeof(char)-(tmp2-tmp)-1, "%iM%iN",
		       (int) (VECTOR(exend)[ex]-rs+1), 
		       (int) (VECTOR(exstart)[ex+1]-VECTOR(exend)[ex]-1));
      if (tmp2 >= tmp + sizeof(tmp)/sizeof(char)) {
	SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL);
      }
      rl -= (VECTOR(exend)[ex] - rs + 1);
      rs = VECTOR(exstart)[ex+1];
      ex++;
    }
    tmp2 += snprintf(tmp2, sizeof(tmp)/sizeof(char)-(tmp2-tmp)-1, "%iM", rl);
    if (tmp2 >= tmp + sizeof(tmp)/sizeof(char)) {
      SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL);
    }
    SPLICING_CHECK(splicing_strvector_append(cigar, tmp));
  }

  splicing_vector_int_destroy(&exidx);
  splicing_vector_int_destroy(&exend);
  splicing_vector_int_destroy(&exstart);
  splicing_vector_destroy(&cpx);
  splicing_vector_destroy(&px);
  splicing_vector_int_destroy(&isolen);
  SPLICING_FINALLY_CLEAN(6);

  if (!fragmentProb) { 
    splicing_vector_destroy(myfragmentProb); 
    SPLICING_FINALLY_CLEAN(1);
  }

  return 0;
}