Beispiel #1
1
/*
 * Compute the singular values of `matrix' with LAPACK's DGESDD.
 *
 * Only singular values are requested (JOBZ = 'N'), so the U and VT
 * arguments are dummies that are never referenced. DGESDD is run on a
 * private copy (`tmp'), so `matrix' itself is left untouched.
 *
 * matrix: the matrix to decompose.
 * values: resized to min(nrow, ncol) and handed to DGESDD as the output
 *         array for the singular values.
 *
 * Returns 0 on success. SPLICING_ELAPACK is raised when either the
 * workspace query or the decomposition reports a non-zero `info'.
 */
int splicing_dgesdd(const splicing_matrix_t *matrix,
                    splicing_vector_t *values) {

  splicing_matrix_t tmp;
  int m=splicing_matrix_nrow(matrix);
  int n=splicing_matrix_ncol(matrix);
  int lda=m, minmn= m < n ? m : n;
  int lwork=-1;
  int info=0;
  splicing_vector_t work;
  splicing_vector_int_t iwork;
  char jobz='N';
  int dummy=1;
  double dummy2;

  SPLICING_CHECK(splicing_matrix_copy(&tmp, matrix));
  SPLICING_FINALLY(splicing_matrix_destroy, &tmp);
  SPLICING_CHECK(splicing_vector_init(&work, 1));
  SPLICING_FINALLY(splicing_vector_destroy, &work);
  /* DGESDD documents an integer workspace of 8*min(M,N) elements. */
  SPLICING_CHECK(splicing_vector_int_init(&iwork, 8*minmn));
  SPLICING_FINALLY(splicing_vector_int_destroy, &iwork);

  SPLICING_CHECK(splicing_vector_resize(values, minmn));

  /* Workspace query: with lwork == -1 DGESDD only stores the optimal
     work size in work[0]. */
  splicingdgesdd_(&jobz, &m, &n, &MATRIX(tmp,0,0), &lda, VECTOR(*values),
                  /*U=*/ &dummy2, /*LDU=*/ &dummy,
                  /*VT=*/ &dummy2, /*LDVT=*/ &dummy,
                  VECTOR(work), &lwork, VECTOR(iwork), &info);

  /* work[0] is only meaningful if the query itself succeeded. */
  if (info != 0) {
    SPLICING_ERROR("Cannot calculate SVD", SPLICING_ELAPACK);
  }

  lwork = (int) VECTOR(work)[0];
  SPLICING_CHECK(splicing_vector_resize(&work, lwork));

  /* Now do the SVD */
  splicingdgesdd_(&jobz, &m, &n, &MATRIX(tmp,0,0), &lda, VECTOR(*values),
                  /*U=*/ &dummy2, /*LDU=*/ &dummy,
                  /*VT=*/ &dummy2, /*LDVT=*/ &dummy,
                  VECTOR(work), &lwork, VECTOR(iwork), &info);

  if (info != 0) {
    SPLICING_ERROR("Cannot calculate SVD", SPLICING_ELAPACK);
  }

  splicing_vector_destroy(&work);
  splicing_vector_int_destroy(&iwork);
  splicing_matrix_destroy(&tmp);
  SPLICING_FINALLY_CLEAN(3);

  return 0;
}
Beispiel #2
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Read one field from `input' into `buffer' and NUL-terminate it.
 *
 * input:   stream to read from, one character at a time.
 * buffer:  destination; must have room for `maxlen' bytes.
 * maxlen:  capacity of `buffer', including the terminating NUL.
 * len:     set to the number of characters stored (terminator excluded).
 * delim:   character that ends the field; it is consumed, not stored.
 * newline: if non-zero, '\n' or '\r' also ends the field; if zero, a
 *          line break inside the field is a parse error.
 *
 * Returns 0 when a field was read. SPLICING_PARSEERROR is raised on end
 * of file, on a field that does not fit into the buffer together with
 * its terminator, and on a disallowed line break.
 */
int splicing_io_get_string(FILE *input, char *buffer, size_t maxlen,
                           size_t *len, char delim, int newline) {
  char *out = buffer;
  int ch;

  *len = 0;
  for (;;) {
    int at_eol;

    ch = fgetc(input);
    at_eol = (ch == '\n' || ch == '\r');

    if (ch == EOF || *len == maxlen) {
      /* Input ended early, or there is no room left for the terminator. */
      SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
    } else if (ch == delim || (newline && at_eol)) {
      /* Regular end of the field. */
      *out = '\0';
      return 0;
    } else if (at_eol) {
      /* Line break where the caller did not allow one. */
      SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
    } else {
      *out++ = (char) ch;
      ++*len;
    }
  }

  /* Not reached: the loop above only exits through a return. */
  return 1;
}
Beispiel #3
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Read one `delim'-terminated field from `input' and parse it as a
 * real number, with support for a "not available" marker.
 *
 * input:  stream to read from.
 * real:   receives the parsed value, or SPLICING_NA_REAL if the field
 *         starts with `nachar'.
 * delim:  field delimiter, passed on to splicing_io_get_string().
 * nachar: first character of a field that denotes a missing value.
 *
 * The field is read into a 30-byte local buffer; line breaks inside the
 * field are not allowed (newline=0).
 *
 * Returns 0 on success. SPLICING_PARSEERROR is raised if the field
 * cannot be read, if it is empty, or if it is not entirely a number.
 */
int splicing_io_get_real_na(FILE *input, double *real, char delim,
                            char nachar) {
  char buffer[30];
  char *bufend;
  size_t len;
  int eof = splicing_io_get_string(input, buffer, sizeof(buffer), &len,
                                   delim, /*newline=*/ 0);

  if (eof) {
    SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
  }

  if (len > 0 && buffer[0]==nachar) {
    *real=SPLICING_NA_REAL;
    return 0;
  }

  *real = strtod(buffer, &bufend);
  /* bufend == buffer means strtod() converted nothing (e.g. an empty
     field); without this check such a field would be accepted as 0.0. */
  if (bufend == buffer || *bufend != '\0') {
    SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
  }

  return 0;
}
Beispiel #4
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Read one `delim'-terminated field from `input' and parse it as a
 * base-10 integer, with support for a "not available" marker.
 *
 * input:   stream to read from.
 * integer: receives the parsed value, or SPLICING_NA_INTEGER if the
 *          field starts with `nachar'.
 * delim:   field delimiter, passed on to splicing_io_get_string().
 * nachar:  first character of a field that denotes a missing value.
 *
 * The field is read into a 30-byte local buffer; line breaks inside the
 * field are not allowed (newline=0). The strtol() result is narrowed to
 * int without a range check.
 *
 * Returns 0 on success. SPLICING_PARSEERROR is raised if the field
 * cannot be read, if it is empty, or if it is not entirely an integer.
 */
int splicing_io_get_integer_na(FILE *input, int *integer, char delim,
                               char nachar) {
  char buffer[30];
  char *bufend;
  size_t len;
  int eof = splicing_io_get_string(input, buffer, sizeof(buffer), &len,
                                   delim, /*newline=*/ 0);

  if (eof) {
    SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
  }

  if (len > 0 && buffer[0]==nachar) {
    *integer=SPLICING_NA_INTEGER;
    return 0;
  }

  *integer = (int) strtol(buffer, &bufend, /*base=*/ 10);
  /* bufend == buffer means strtol() converted nothing (e.g. an empty
     field); without this check such a field would be accepted as 0. */
  if (bufend == buffer || *bufend != '\0') {
    SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
  }

  return 0;
}
Beispiel #5
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Parse a GFF attribute string of the form "key=value;key=value;..."
 * in place and extract the "ID" and "Parent" values.
 *
 * attr:   NUL-terminated attribute string. It is modified: every '='
 *         that ends a key and every ';' that ends a value is overwritten
 *         with '\0'.
 * ID:     set to point at the value of the "ID" key inside `attr', or
 *         to SPLICING_STRVECTOR_ZERO if there is no such key.
 * parent: same for the "Parent" key.
 *
 * White space before a key is skipped, and white space left over after
 * the last attribute (for example after a trailing ';') is ignored.
 * Keys other than "ID" and "Parent" are skipped. If a key occurs more
 * than once, the last occurrence wins.
 *
 * Returns 0 on success. SPLICING_PARSEERROR is raised when a key is not
 * followed by '='.
 */
int splicing_io_parse_attributes(char *attr, char **ID, char**parent) {
  char *kw, *vl;

  *ID=SPLICING_STRVECTOR_ZERO;
  *parent=SPLICING_STRVECTOR_ZERO;

  while (*attr != '\0') {
    /* Skip white space. The cast is required because passing a negative
       plain char to isspace() is undefined behaviour. */
    while (*attr != '\0' && isspace((unsigned char) *attr)) {
      attr++;
    }
    /* Only white space was left: nothing more to parse. */
    if (*attr == '\0') {
      break;
    }

    /* Keyword */
    kw=attr;
    while (*attr != '\0' && *attr != '=') {
      attr++;
    }
    if (*attr == '\0') {
      SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
    }
    *attr='\0';
    attr++;

    /* Value: runs up to the next ';' or the end of the string. */
    vl=attr;
    while (*attr != '\0' && *attr != ';') {
      attr++;
    }
    if (*attr == ';') {
      *attr='\0';
      attr++;
    }

    if (!strcmp("ID", kw)) {
      *ID=vl;
    } else if (!strcmp("Parent", kw)) {
      *parent=vl;
    }
  }

  return 0;
}
Beispiel #6
0
/*
 * Count the exons of every isoform (mRNA record) of one gene.
 *
 * gff:     the annotation to query.
 * gene:    index of the gene; must be smaller than the number of genes.
 * noexons: resized to the number of mRNA records found for the gene;
 *          element k receives the number of exon records that follow
 *          the k-th mRNA record (up to the next mRNA record or the end
 *          of the gene).
 *
 * The gene's records are those from gff->genes[gene] up to, but not
 * including, gff->genes[gene+1] (or gff->n for the last gene).
 *
 * Returns 0 on success. SPLICING_EINVAL is raised for an invalid gene
 * index. A gene without mRNA records yields an empty `noexons'.
 */
int splicing_gff_noexons_one(const splicing_gff_t *gff, size_t gene,
                             splicing_vector_int_t *noexons) {

  size_t nogenes, first, last, idx;
  size_t noiso=0, pos=0, il=0;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));

  if (gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  first=VECTOR(gff->genes)[gene];
  last= gene+1 == nogenes ? gff->n : VECTOR(gff->genes)[gene+1];

  /* First pass: number of isoforms. */
  for (idx=first; idx < last; idx++) {
    if (VECTOR(gff->type)[idx] == SPLICING_TYPE_MRNA) { noiso += 1; }
  }

  SPLICING_CHECK(splicing_vector_int_resize(noexons, noiso));

  /* Without any mRNA there is no slot to write to; the unconditional
     final store below would otherwise write past an empty vector. */
  if (noiso == 0) {
    return 0;
  }

  /* Second pass: find the first mRNA record, then count the exons. */
  for (idx=first;
       idx < last && VECTOR(gff->type)[idx] != SPLICING_TYPE_MRNA;
       idx++) ;

  for (idx++; idx < last; idx++) {
    if (VECTOR(gff->type)[idx] == SPLICING_TYPE_MRNA) {
      /* A new isoform starts: store the count of the previous one. */
      VECTOR(*noexons)[pos++]=il;
      il=0;
    } else if (VECTOR(gff->type)[idx] == SPLICING_TYPE_EXON) {
      il++;
    }
  }
  /* Store the count of the last isoform. */
  VECTOR(*noexons)[pos]=il;

  return 0;
}
Beispiel #7
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Collect the exon start and end coordinates of all isoforms of a gene.
 *
 * gff:   the annotation to query.
 * start: cleared, then receives the start of every exon record of the
 *        gene, isoform by isoform.
 * end:   cleared, then receives the matching exon ends.
 * idx:   resized to (number of isoforms + 1); idx[k] is the position in
 *        `start'/`end' where the exons of isoform k begin, and the last
 *        element is the total number of exons collected.
 * gene:  index of the gene; must be in [0, number of genes).
 *
 * Records are scanned from the one after the gene record until the next
 * gene record or the end of the annotation. Once all exons of an
 * isoform have been collected they are passed to
 * splicing_i_gff_exon_start_end_sort().
 *
 * Returns 0 on success. SPLICING_EINVAL is raised for an invalid gene
 * index.
 */
int splicing_gff_exon_start_end(const splicing_gff_t *gff,
                                splicing_vector_int_t *start,
                                splicing_vector_int_t *end,
                                splicing_vector_int_t *idx,
                                int gene) {

  size_t noiso;
  int i=0, p=0, n=splicing_gff_size(gff);
  int pos;
  size_t nogenes;
  splicing_vector_int_t tmp, tmp2;

  /* Scratch vectors handed to the sort helper. */
  SPLICING_CHECK(splicing_vector_int_init(&tmp, 10));
  SPLICING_FINALLY(splicing_vector_int_destroy, &tmp);
  SPLICING_CHECK(splicing_vector_int_init(&tmp2, 10));
  SPLICING_FINALLY(splicing_vector_int_destroy, &tmp2);

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  if (gene < 0 || (size_t) gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  pos=VECTOR(gff->genes)[gene]+1;

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));
  splicing_vector_int_clear(start);
  splicing_vector_int_clear(end);
  SPLICING_CHECK(splicing_vector_int_resize(idx, noiso+1));

  while (pos < n) {
    if (VECTOR(gff->type)[pos] == SPLICING_TYPE_EXON) {
      int s=VECTOR(gff->start)[pos];
      int e=VECTOR(gff->end)[pos];
      SPLICING_CHECK(splicing_vector_int_push_back(start, s)); p++;
      SPLICING_CHECK(splicing_vector_int_push_back(end, e));
    } else if (VECTOR(gff->type)[pos] == SPLICING_TYPE_MRNA) {
      VECTOR(*idx)[i] = p;
      /* The previous isoform (if any) is complete now: sort it. */
      if (i!=0) {
        SPLICING_CHECK(splicing_i_gff_exon_start_end_sort(start, end, idx,
                                                          i-1, &tmp, &tmp2));
      }
      i++;
    } else if (VECTOR(gff->type)[pos] == SPLICING_TYPE_GENE) {
      /* Next gene reached. */
      break;
    }
    pos++;
  }
  VECTOR(*idx)[i] = p;

  /* Sort the last isoform. If no mRNA record was seen there is no
     isoform to sort, and i-1 would be -1; presumably that is not a
     valid isoform index for the helper, mirroring the guard above. */
  if (i > 0) {
    SPLICING_CHECK(splicing_i_gff_exon_start_end_sort(start, end, idx, i-1,
                                                      &tmp, &tmp2));
  }

  splicing_vector_int_destroy(&tmp2);
  splicing_vector_int_destroy(&tmp);
  /* Two destructors were registered above, so two must be removed. */
  SPLICING_FINALLY_CLEAN(2);

  return 0;
}
Beispiel #8
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Initialize an empty GFF annotation object.
 *
 * gff:  the object to initialize. All its string and numeric vectors
 *       are created with length zero, the record/gene/transcript
 *       counters are set to zero and the "last seen" caches are reset.
 * size: currently ignored.
 *
 * Every vector is registered for cleanup while the remaining ones are
 * being created, so a failure part-way through does not leak; the
 * registrations are removed again on success.
 *
 * Returns 0 on success.
 *
 * TODO: do not ignore size
 */
int splicing_gff_init(splicing_gff_t *gff, size_t size) {

  /* `size' is unsigned, so there is nothing to validate; the cast just
     marks the parameter as intentionally unused for now. */
  (void) size;

  SPLICING_CHECK(splicing_strvector_init(&gff->seqids, 0));
  SPLICING_FINALLY(splicing_strvector_destroy, &gff->seqids);
  SPLICING_CHECK(splicing_strvector_init(&gff->sources, 0));
  SPLICING_FINALLY(splicing_strvector_destroy, &gff->sources);
  SPLICING_CHECK(splicing_strvector_init(&gff->ID, 0));
  SPLICING_FINALLY(splicing_strvector_destroy, &gff->ID);

  SPLICING_CHECK(splicing_vector_int_init(&gff->genes, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->genes);
  SPLICING_CHECK(splicing_vector_int_init(&gff->transcripts, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->transcripts);
  SPLICING_CHECK(splicing_vector_int_init(&gff->seqid, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->seqid);
  SPLICING_CHECK(splicing_vector_int_init(&gff->source, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->source);
  SPLICING_CHECK(splicing_vector_int_init(&gff->strand, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->strand);
  SPLICING_CHECK(splicing_vector_int_init(&gff->type, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->type);
  SPLICING_CHECK(splicing_vector_int_init(&gff->start, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->start);
  SPLICING_CHECK(splicing_vector_int_init(&gff->end, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->end);
  SPLICING_CHECK(splicing_vector_init(&gff->score, 0));
  SPLICING_FINALLY(splicing_vector_destroy, &gff->score);
  SPLICING_CHECK(splicing_vector_int_init(&gff->phase, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->phase);
  SPLICING_CHECK(splicing_vector_int_init(&gff->parent, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &gff->parent);

  gff->n=0;
  gff->nogenes=0;
  gff->notranscripts=0;

  gff->last_gene_id = gff->last_mrna_id = SPLICING_STRVECTOR_ZERO;
  gff->last_gene_no = gff->last_mrna_no = -1;
  gff->last_seqid = gff->last_source = SPLICING_STRVECTOR_ZERO;

  /* 3 string vectors + 11 numeric vectors were registered above. */
  SPLICING_FINALLY_CLEAN(14);

  return 0;
}
Beispiel #9
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Look up the start and end coordinate of one gene.
 *
 * gff:   the annotation to query.
 * gene:  index of the gene; must be smaller than the length of
 *        gff->genes.
 * start: receives gff->start of the gene's record.
 * end:   receives gff->end of the gene's record.
 *
 * Returns 0 on success. SPLICING_EINVAL is raised for an invalid gene
 * index.
 */
int splicing_gff_gene_start_end_one(const splicing_gff_t *gff, size_t gene,
                                    size_t *start, size_t *end) {

  size_t nogenes=splicing_vector_int_size(&gff->genes);
  size_t idx;

  /* `gene' is unsigned, so only the upper bound needs checking. */
  if (gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  /* gff->genes maps a gene index to the index of its record. */
  idx=VECTOR(gff->genes)[gene];
  *start=VECTOR(gff->start)[idx];
  *end=VECTOR(gff->end)[idx];

  return 0;
}
Beispiel #10
0
/*
 * Find constitutive exons, dispatching on `mode'.
 *
 * gff:        the annotation to query.
 * exons:      output exon set, filled by the selected implementation.
 * min_length: passed through unchanged to the implementation.
 * mode:       SPLICING_CONSTITUTIVE_ALL selects
 *             splicing_i_gff_constitutive_exons_all(),
 *             SPLICING_CONSTITUTIVE_FULL selects
 *             splicing_i_gff_constitutive_exons_full().
 *
 * Returns the result of the selected implementation. SPLICING_EINVAL is
 * raised for any other `mode'.
 */
int splicing_gff_constitutive_exons(const splicing_gff_t *gff,
                                    splicing_exonset_t *exons,
                                    int min_length,
                                    splicing_constitutive_mode_t mode) {

  switch (mode) {
  case SPLICING_CONSTITUTIVE_ALL:
    return splicing_i_gff_constitutive_exons_all(gff, exons, min_length);
  case SPLICING_CONSTITUTIVE_FULL:
    return splicing_i_gff_constitutive_exons_full(gff, exons, min_length);
  default:
    break;
  }

  SPLICING_ERROR("Unknown `mode' argument for constitutive exon finding",
                 SPLICING_EINVAL);

  /* Guarantees that no path falls off the end of this non-void
     function, whatever SPLICING_ERROR expands to. */
  return SPLICING_EINVAL;
}
Beispiel #11
0
/*
 * Comparison callback used to order GFF record indices.
 *
 * data: the splicing_gff_t the indices refer to.
 * a, b: pointers to two int record indices.
 *
 * For each record its parent and grandparent are looked up in
 * gff->parent, where -1 means "none". Records are ordered
 *   1. by the id of their top-most ancestor (the "gene id"),
 *   2. then by the id of the record below it (the "mRNA id"),
 *   3. then records without a parent before records with one, and
 *      records without a grandparent before records with one,
 *   4. then, if both have a grandparent, by start coordinate.
 *
 * Returns a negative, zero or positive value in the usual comparator
 * sense. If two records tie on 1-2 and neither or both lack a parent
 * while neither has a grandparent, SPLICING_EINVAL is raised.
 */
int splicing_i_gff_reindex_cmp(void *data, const void *a, const void *b) {
  /* Keep the name `gff': STR() is defined elsewhere and may refer to it. */
  splicing_gff_t *gff=(splicing_gff_t *) data;
  int rec_a=*(int*)a;
  int rec_b=*(int*)b;
  int par_a=VECTOR(gff->parent)[rec_a];
  int par_b=VECTOR(gff->parent)[rec_b];
  int gpar_a=-1, gpar_b=-1;
  const char *gene_a, *gene_b, *mrna_a, *mrna_b;
  int cmp;

  if (par_a != -1) { gpar_a=VECTOR(gff->parent)[par_a]; }
  if (par_b != -1) { gpar_b=VECTOR(gff->parent)[par_b]; }

  /* Gene id: the id of the top-most ancestor that exists. */
  if (gpar_a != -1) {
    gene_a=STR(gpar_a);
  } else if (par_a != -1) {
    gene_a=STR(par_a);
  } else {
    gene_a=STR(rec_a);
  }
  if (gpar_b != -1) {
    gene_b=STR(gpar_b);
  } else if (par_b != -1) {
    gene_b=STR(par_b);
  } else {
    gene_b=STR(rec_b);
  }
  cmp=strcmp(gene_a, gene_b);
  if (cmp != 0) {
    return cmp;
  }

  /* mRNA id: the parent's id if there is a grandparent, otherwise the
     record's own id. */
  if (gpar_a != -1) {
    mrna_a=STR(par_a);
  } else {
    mrna_a=STR(rec_a);
  }
  if (gpar_b != -1) {
    mrna_b=STR(par_b);
  } else {
    mrna_b=STR(rec_b);
  }
  cmp=strcmp(mrna_a, mrna_b);
  if (cmp != 0) {
    return cmp;
  }

  /* Same gene and mRNA id: gene first, then mRNA, then everything else
     by start position. */
  if (par_a == -1 && par_b != -1) { return -1; }
  if (par_a != -1 && par_b == -1) { return 1; }
  if (gpar_a == -1 && gpar_b != -1) { return -1; }
  if (gpar_a != -1 && gpar_b == -1) { return 1; }
  if (gpar_a != -1 && gpar_b != -1) {
    int start_a=VECTOR(gff->start)[rec_a];
    int start_b=VECTOR(gff->start)[rec_b];
    return (start_a > start_b) - (start_a < start_b);
  }

  SPLICING_ERROR("Invalid GFF file, cannot order records",
                 SPLICING_EINVAL);
  return 0;
}
Beispiel #12
0
/*
 * Translate one isoform coordinate to genomic coordinates, for every
 * isoform of a gene.
 *
 * gff:       the annotation to query.
 * gene:      index of the gene; only used when a converter has to be
 *            built here.
 * position:  isoform coordinate, must be at least 1.
 * converter: optional pre-built converter for the gene. If NULL, one is
 *            created with splicing_gff_converter_init() and destroyed
 *            before returning.
 * result:    resized to the number of isoforms; element i receives the
 *            translated coordinate for isoform i, or -1 when `position'
 *            is not below the exlim value of any exon of that isoform.
 *
 * Returns 0 on success. SPLICING_EINVAL is raised if position < 1.
 */
int splicing_iso_to_genomic_all(const splicing_gff_t *gff, size_t gene,
                                int position,
                                const splicing_gff_converter_t *converter,
                                splicing_vector_int_t *result) {

  splicing_gff_converter_t local_conv;
  splicing_gff_converter_t *conv = (splicing_gff_converter_t*) converter;
  size_t iso;

  if (position < 1) {
    SPLICING_ERROR("Invalid isoform coordinate, must the larger than zero",
                   SPLICING_EINVAL);
  }

  /* No converter supplied: build a temporary one for this call. */
  if (!converter) {
    conv=&local_conv;
    SPLICING_CHECK(splicing_gff_converter_init(gff, gene, conv));
    SPLICING_FINALLY(splicing_gff_converter_destroy, conv);
  }

  SPLICING_CHECK(splicing_vector_int_resize(result, conv->noiso));

  /* TODO: find impossible positions */
  for (iso=0; iso<conv->noiso; iso++) {
    /* Exons of this isoform are exidx[iso] .. exidx[iso+1]-1. Advance
       to the first one whose limit lies beyond `position'. */
    int ex=VECTOR(conv->exidx)[iso];
    while (ex < VECTOR(conv->exidx)[iso+1] &&
           VECTOR(conv->exlim)[ex] <= position) {
      ex++;
    }

    if (ex < VECTOR(conv->exidx)[iso+1]) {
      VECTOR(*result)[iso] = position + VECTOR(conv->shift)[ex];
    } else {
      /* Ran past the last exon of the isoform. */
      VECTOR(*result)[iso] = -1;
    }
  }

  if (!converter) {
    splicing_gff_converter_destroy(conv);
    SPLICING_FINALLY_CLEAN(1);
  }

  return 0;
}
Beispiel #13
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Count the isoforms (mRNA records) of one gene and, optionally,
 * compute the length of each isoform.
 *
 * gff:    the annotation to query.
 * gene:   index of the gene; must be smaller than the number of genes.
 * noiso:  receives the number of mRNA records of the gene.
 * isolen: optional. If non-NULL it is resized to *noiso, and element k
 *         receives the sum of (end - start + 1) over the exon records
 *         that follow the k-th mRNA record.
 *
 * The gene's records are those from gff->genes[gene] up to, but not
 * including, gff->genes[gene+1] (or gff->n for the last gene).
 *
 * Returns 0 on success. SPLICING_EINVAL is raised for an invalid gene
 * index. For a gene without mRNA records *noiso is 0 and `isolen', if
 * given, is left empty.
 */
int splicing_i_gff_noiso_one(const splicing_gff_t *gff, size_t gene,
                             size_t *noiso, splicing_vector_int_t *isolen) {

  size_t nogenes, first, last, idx;
  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));

  /* `gene' is unsigned, so only the upper bound needs checking. */
  if (gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  first=VECTOR(gff->genes)[gene];
  last= gene+1 == nogenes ? gff->n : VECTOR(gff->genes)[gene+1];

  *noiso = 0;
  for (idx=first; idx < last; idx++) {
    if (VECTOR(gff->type)[idx] == SPLICING_TYPE_MRNA) { *noiso += 1; }
  }

  if (isolen) {
    size_t il=0, pos=0;
    SPLICING_CHECK(splicing_vector_int_resize(isolen, *noiso));

    /* Without any mRNA there is no slot to write to; the unconditional
       final store below would otherwise write past an empty vector. */
    if (*noiso > 0) {
      /* Find the first mRNA record, then accumulate exon lengths. */
      for (idx=first;
           idx < last && VECTOR(gff->type)[idx] != SPLICING_TYPE_MRNA;
           idx++) ;

      for (idx++; idx < last; idx++) {
        if (VECTOR(gff->type)[idx] == SPLICING_TYPE_MRNA) {
          /* A new isoform starts: store the length of the previous one. */
          VECTOR(*isolen)[pos++]=il;
          il = 0;
        } else if (VECTOR(gff->type)[idx] == SPLICING_TYPE_EXON) {
          il += VECTOR(gff->end)[idx] - VECTOR(gff->start)[idx] + 1;
        }
      }
      /* Store the length of the last isoform. */
      VECTOR(*isolen)[pos]=il;
    }
  }

  return 0;
}
Beispiel #14
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Print the exon structure of every isoform of a gene.
 *
 * gff:     the annotation to query.
 * outfile: stream to print to.
 * gene:    index of the gene; must be in [0, number of genes).
 *
 * The output is a "===" separator line, a line with the number of
 * isoforms, and for each isoform a header line followed by one
 * "start-end" line per exon, as reported by
 * splicing_gff_exon_start_end().
 *
 * Returns 0 on success. SPLICING_EINVAL is raised for an invalid gene
 * index.
 */
int splicing_gff_fprint_gene(const splicing_gff_t *gff,
                             FILE *outfile, int gene) {

  splicing_vector_int_t exstart, exend, exidx;
  size_t nogenes, noiso;
  int iso, ex;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));

  if (gene < 0 || gene >= nogenes) {
    SPLICING_ERROR("Invalid gene ID", SPLICING_EINVAL);
  }

  SPLICING_CHECK(splicing_vector_int_init(&exstart, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exstart);
  SPLICING_CHECK(splicing_vector_int_init(&exend, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exend);
  SPLICING_CHECK(splicing_vector_int_init(&exidx, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exidx);

  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &exstart, &exend, &exidx,
                                             gene));

  /* exidx holds one offset per isoform plus a closing sentinel. */
  noiso = splicing_vector_int_size(&exidx)-1;

  fprintf(outfile, "===\nGene with %i isoforms:\n", (int) noiso);

  iso = 0;
  while (iso < noiso) {
    fprintf(outfile, "  Isoform %i:\n", iso);
    ex = VECTOR(exidx)[iso];
    while (ex < VECTOR(exidx)[iso+1]) {
      fprintf(outfile, "    %i-%i\n", VECTOR(exstart)[ex], VECTOR(exend)[ex]);
      ex++;
    }
    iso++;
  }

  /* Release in reverse order of creation. */
  splicing_vector_int_destroy(&exidx);
  splicing_vector_int_destroy(&exend);
  splicing_vector_int_destroy(&exstart);
  SPLICING_FINALLY_CLEAN(3);

  return 0;
}
Beispiel #15
0
/*
 * Sample isoform proportion vectors (psi) for one gene from a
 * read-to-isoform match matrix, using a propose / accept-or-reject
 * loop.
 *
 * match_matrix:    its row count is taken as the number of isoforms and
 *                  its column count as the number of reads.
 * isolen:          isoform lengths; isolen[i] - readLength + 1, clamped
 *                  at zero, is used as the effective length of isoform i.
 * readLength:      read length used in the effective length computation.
 * noIterations:    number of iterations of the main loop.
 * noBurnIn:        iterations with index below noBurnIn are never recorded.
 * noLag:           after burn-in, a sample is recorded every noLag-th
 *                  iteration.
 * hyperp:          passed through to splicing_metropolis_hastings_ratio().
 * samples:         resized to noiso x noSamples; for every recorded
 *                  iteration noiso doubles of psi are copied to the
 *                  storage starting at element (0, sample index).
 * logLik:          resized to noSamples; receives the `cJS' value of
 *                  every recorded iteration.
 * class_templates,
 * class_counts:    optional, but both or neither must be given; filled
 *                  by splicing_i_miso_classes().
 * assignment:      optional; if non-NULL it is resized to noReads, zeroed
 *                  and used as the read assignment vector, otherwise a
 *                  temporary vector is used.
 * rundata:         receives the run parameters and the number of
 *                  accepted and rejected proposals. It is dereferenced
 *                  unconditionally.
 *
 * Returns 0 on success. SPLICING_EINVAL is raised if exactly one of
 * class_templates and class_counts is given.
 *
 * NOTE(review): noLag == 0 would divide by zero in the noSamples
 * computation below - confirm that callers validate it.
 */
int splicing_miso_trinity(const splicing_matrix_t *match_matrix,
			  const splicing_vector_int_t *isolen,
			  int readLength, int noIterations, int noBurnIn,
			  int noLag, const splicing_vector_t *hyperp,
			  splicing_matrix_t *samples, 
			  splicing_vector_t *logLik,
			  splicing_matrix_t *class_templates,
			  splicing_vector_t *class_counts,
			  splicing_vector_int_t *assignment,
			  splicing_miso_rundata_t *rundata) {

  double acceptP, cJS, pJS, sigma;
  int noiso = splicing_matrix_nrow(match_matrix);
  int noReads = splicing_matrix_ncol(match_matrix);
  splicing_vector_int_t *myass=assignment, vass;
  /* psi/psiNew and alpha/alphaNew are pointers so that accepting a
     proposal is a pointer swap instead of a copy. */
  splicing_vector_t vpsi, vpsiNew, valpha, valphaNew, 
    *psi=&vpsi, *psiNew=&vpsiNew, *alpha=&valpha, *alphaNew=&valphaNew;
  /* NOTE(review): the loop below records at most
     (noIterations - noBurnIn) / noLag samples, which can be one less
     than this - confirm whether a trailing unfilled sample is intended. */
  int noSamples = (noIterations - noBurnIn + 1) / noLag;
  int i, m, lagCounter=0, noS=0;
  splicing_vector_int_t match_order;
  splicing_vector_int_t effisolen;
  splicing_vector_t isoscores;

  /* The two class outputs only make sense together. */
  if ( (class_templates ? 1 : 0) + (class_counts ? 1 : 0) == 1) {
    SPLICING_ERROR("Only one of `class_templates' and `class_counts' is "
		   "given", SPLICING_EINVAL);
  }
  
  rundata->noIso=noiso;
  rundata->noIters=noIterations;
  rundata->noBurnIn=noBurnIn;
  rundata->noLag=noLag;
  rundata->noAccepted = rundata->noRejected = 0;

  /* Use the caller's assignment vector if there is one, otherwise a
     temporary that is destroyed at the end of the function. */
  if (assignment) { 
    SPLICING_CHECK(splicing_vector_int_resize(myass, noReads));
    splicing_vector_int_null(myass);
  } else {
    myass=&vass;
    SPLICING_CHECK(splicing_vector_int_init(myass, noReads));
    SPLICING_FINALLY(splicing_vector_int_destroy, myass);
  }
  SPLICING_CHECK(splicing_vector_init(&vpsi, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &vpsi);
  SPLICING_CHECK(splicing_vector_init(&vpsiNew, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &vpsiNew);
  /* alpha has one element less than psi. */
  SPLICING_CHECK(splicing_vector_init(&valpha, noiso-1));
  SPLICING_FINALLY(splicing_vector_destroy, &valpha);
  SPLICING_CHECK(splicing_vector_init(&valphaNew, noiso-1));
  SPLICING_FINALLY(splicing_vector_destroy, &valphaNew);

  SPLICING_CHECK(splicing_vector_int_init(&match_order, noReads));
  SPLICING_FINALLY(splicing_vector_int_destroy, &match_order);
  SPLICING_CHECK(splicing_order_matches(match_matrix, &match_order));

  if (class_templates && class_counts) { 
    SPLICING_CHECK(splicing_i_miso_classes(match_matrix, &match_order, 
					   class_templates, class_counts, 
					   /*bin_class_templates=*/ 0, 
					   /*bin_class_counts=*/ 0));
  }

  SPLICING_CHECK(splicing_vector_int_init(&effisolen, noiso));
  SPLICING_FINALLY(splicing_vector_int_destroy, &effisolen);
  SPLICING_CHECK(splicing_vector_init(&isoscores, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &isoscores);
  for (i=0; i<noiso; i++) { 
    int l=VECTOR(*isolen)[i]-readLength+1;
    VECTOR(effisolen)[i] = l > 0 ? l : 0;
    /* NOTE(review): this uses the unclamped `l'; for l <= 0 the log()
       argument is not positive - confirm this is intended. */
    VECTOR(isoscores)[i] = -log((double) l);
  }

  SPLICING_CHECK(splicing_matrix_resize(samples, noiso, noSamples));
  SPLICING_CHECK(splicing_vector_resize(logLik, noSamples));

  /* Initialize Psi(0) randomly */

  SPLICING_CHECK(splicing_drift_proposal(/* mode= */ 0, 0, 0, 0, 0, 0, 
					 noiso, psi, alpha, &sigma, 0));
  SPLICING_CHECK(splicing_drift_proposal(/* mode= */ 1, psi, alpha, sigma,
					 0, 0, noiso, psi, alpha, 0, 0));

  /* Initialize assignments of reads */  
  
  SPLICING_CHECK(splicing_reassign_samples(match_matrix, &match_order, psi, 
					   noiso, myass));

  /* foreach Iteration m=1, ..., M do */

  for (m=0; m < noIterations; m++) {

    /* Propose a new (psiNew, alphaNew) from the current state. */
    SPLICING_CHECK(splicing_drift_proposal(/* mode= */ 1, psi, alpha, sigma,
					   0, 0, noiso, psiNew, alphaNew, 0,
					   0));
    
    SPLICING_CHECK(splicing_metropolis_hastings_ratio(myass, noReads, psiNew,
						      alphaNew, psi, alpha,
						      sigma, noiso, 
						      &effisolen, hyperp,
						      &isoscores, 
						      m > 0 ? 1 : 0, 
						      &acceptP, &cJS, &pJS));
    
    /* Accept: the proposal becomes the current state (pointer swap) and
       its score pJS becomes the current score. */
    if (acceptP >= 1 || RNG_UNIF01() < acceptP) {
      splicing_vector_t *tmp;
      tmp=psi; psi=psiNew; psiNew=tmp;
      tmp=alpha; alpha=alphaNew; alphaNew=tmp;
      cJS = pJS;
      rundata->noAccepted ++;
    } else {
      rundata->noRejected ++;
    }
    
    /* After burn-in, record the current state once every noLag
       iterations. */
    if (m >= noBurnIn) {
      if (lagCounter == noLag - 1) {
	memcpy(&MATRIX(*samples, 0, noS), VECTOR(*psi), 
	       noiso * sizeof(double));
	VECTOR(*logLik)[noS] = cJS;
	noS++;
	lagCounter = 0;
      } else {
	lagCounter ++;
      }
    }
    
    SPLICING_CHECK(splicing_reassign_samples(match_matrix, &match_order, 
					     psi, noiso, myass));

  } /* for m < noIterations */

  /* Release the seven unconditional temporaries in reverse order of
     creation. */
  splicing_vector_destroy(&isoscores);
  splicing_vector_int_destroy(&effisolen);
  splicing_vector_int_destroy(&match_order);
  splicing_vector_destroy(&valphaNew);
  splicing_vector_destroy(&valpha);
  splicing_vector_destroy(&vpsiNew);
  splicing_vector_destroy(&vpsi);
  SPLICING_FINALLY_CLEAN(7);

  /* The temporary assignment vector was registered first, so it is
     released last. */
  if (!assignment) { 
    splicing_vector_int_destroy(myass);
    SPLICING_FINALLY_CLEAN(1);
  }  
  
  return 0;
}
Beispiel #16
0
/*
 * Simulate paired reads from the isoforms of one gene.
 *
 * gff:           the annotation to query.
 * gene:          index of the gene; must be in [0, number of genes).
 * expression:    one value per isoform, used as a weight when choosing
 *                the isoform a read pair comes from.
 * noreads:       number of read pairs to generate.
 * readLength:    length of a single read.
 * fragmentProb:  optional fragment length distribution; must sum to 1
 *                (tolerance 1e-10). If NULL, a distribution is built by
 *                splicing_normal_fragment() from normalMean, normalVar
 *                and numDevs, and then scaled to sum to 1.
 * fragmentStart: fragment length that belongs to the first element of
 *                fragmentProb; overwritten by splicing_normal_fragment()
 *                when fragmentProb is NULL.
 * normalMean, normalVar, numDevs:
 *                only used when fragmentProb is NULL.
 * isoform:       resized to 2*noreads; both entries of a pair receive
 *                the index of the sampled isoform.
 * position:      resized to 2*noreads; receives the positions of the
 *                two reads of each pair, translated by
 *                splicing_iso_to_genomic().
 * cigar:         cleared, then receives one CIGAR string per read.
 * sampleprob:    optional; if non-NULL it is resized to the number of
 *                isoforms and receives the per-isoform sampling weights.
 *
 * Returns 0 on success. SPLICING_EINVAL is raised for an invalid gene
 * index, for a fragment distribution that does not sum to 1 and for a
 * CIGAR string that does not fit its buffer; SPLICING_FAILURE is raised
 * if every isoform has zero sampling weight.
 */
int splicing_simulate_paired_reads(const splicing_gff_t *gff, int gene,
				   const splicing_vector_t *expression,
				   int noreads, int readLength,
				   const splicing_vector_t *fragmentProb,
				   int fragmentStart, double normalMean,
				   double normalVar, double numDevs,
				   splicing_vector_int_t *isoform,
				   splicing_vector_int_t *position,
				   splicing_strvector_t *cigar, 
				   splicing_vector_t *sampleprob) {
  
  size_t i, j, noiso, il, nogenes;
  splicing_vector_t *mysampleprob=sampleprob, vsampleprob;
  splicing_vector_t px, cpx;
  double sumpx, sumpsi=0.0;
  splicing_vector_int_t isolen;
  int goodiso=0;
  splicing_vector_int_t exstart, exend, exidx;
  splicing_vector_t *myfragmentProb=(splicing_vector_t*) fragmentProb,
    vfragmentProb;
  int fs, fl;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  if (gene < 0 || gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  /* TODO: more error checks */

  /* No distribution supplied: build one and normalize it. */
  if (!fragmentProb) { 
    myfragmentProb=&vfragmentProb;
    SPLICING_CHECK(splicing_vector_init(&vfragmentProb, 0));
    SPLICING_FINALLY(splicing_vector_destroy, &vfragmentProb);
    SPLICING_CHECK(splicing_normal_fragment(normalMean, normalVar, numDevs, 
					    readLength, myfragmentProb,
					    &fragmentStart));
    splicing_vector_scale(myfragmentProb, 
			  1.0/splicing_vector_sum(myfragmentProb));
  }

  /* il: number of fragment lengths; fs/fl: shortest and longest
     fragment length covered by the distribution. */
  il=splicing_vector_size(myfragmentProb);
  fs=fragmentStart;
  fl=fragmentStart+il-1;

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));
    
  if ( fabs(splicing_vector_sum(myfragmentProb) - 1.0) > 1e-10 ) {
    SPLICING_ERROR("Fragment length distribution does not sum up to 1", 
		   SPLICING_EINVAL);
  }

  SPLICING_CHECK(splicing_vector_int_init(&isolen, noiso));
  SPLICING_FINALLY(splicing_vector_int_destroy, &isolen);
  SPLICING_CHECK(splicing_gff_isolength_one(gff, gene, &isolen));
  
  SPLICING_CHECK(splicing_vector_copy(&px, myfragmentProb));
  SPLICING_FINALLY(splicing_vector_destroy, &px);
  SPLICING_CHECK(splicing_vector_init(&cpx, il));
  SPLICING_FINALLY(splicing_vector_destroy, &cpx);

  if (!sampleprob) {
    mysampleprob=&vsampleprob;
    SPLICING_CHECK(splicing_vector_init(mysampleprob, noiso));
    SPLICING_FINALLY(splicing_vector_destroy, mysampleprob);
  } else {
    SPLICING_CHECK(splicing_vector_resize(mysampleprob, noiso));
  }

  /* Turn px into the running sum of the fragment distribution. sumpx
     is accumulated here but not read again in this function. */
  for (sumpx=VECTOR(px)[0], i=1; i<il; i++) {
    VECTOR(px)[i] += VECTOR(px)[i-1];
    sumpx += VECTOR(px)[i];
  }
  /* cpx is the running sum of px, i.e. of the running sum. */
  VECTOR(cpx)[0] = VECTOR(px)[0];
  for (i=1; i<il; i++) {
    VECTOR(cpx)[i] = VECTOR(cpx)[i-1] + VECTOR(px)[i];
  }

  /* Sampling weight of each isoform: r1 + cpx[r2-1], scaled by its
     expression value. */
  for (i=0; i<noiso; i++) {
    int ilen=VECTOR(isolen)[i];
    int r1= ilen >= fl ? ilen - fl + 1 : 0;
    int r2= ilen >= fs ? (ilen >= fl ? fl - fs : ilen - fs + 1) : 0;
    /* int r3= fs - 1; */
    double sp=0.0;
    if (r1 > 0) { sp += r1; } 
    if (r2 > 0) { sp += VECTOR(cpx)[r2-1]; }
    VECTOR(*mysampleprob)[i] = sp * VECTOR(*expression)[i];
    if (VECTOR(*mysampleprob)[i] != 0) { goodiso += 1; }
    sumpsi += VECTOR(*mysampleprob)[i];
  }

  if (goodiso == 0) {
    SPLICING_ERROR("No isoform is possible", SPLICING_FAILURE);
  }

  /* Make the weights cumulative so that a uniform draw in [0, sumpsi)
     can be mapped to an isoform. */
  for (i=1; i<noiso; i++) {
    VECTOR(*mysampleprob)[i] += VECTOR(*mysampleprob)[i-1];
  }

  SPLICING_CHECK(splicing_vector_int_resize(isoform, noreads*2));

  /* Both reads of a pair come from the same isoform. */
  for (i=0; i<2*noreads; i+=2) {
    int w;
    double rand;
    if (noiso==1) {
      w=0;
    } else if (noiso==2) {
      rand = RNG_UNIF01() * sumpsi;
      w = (rand < VECTOR(*mysampleprob)[0]) ? 0 : 1;
    } else {
      rand = RNG_UNIF01() * sumpsi;
      for (w=0; rand > VECTOR(*mysampleprob)[w]; w++) ;
    }
    VECTOR(*isoform)[i]=VECTOR(*isoform)[i+1]=w;
  }

  /* The temporary weight vector is no longer needed; a caller-supplied
     one is converted back from cumulative to per-isoform values. */
  if (!sampleprob) { 
    splicing_vector_destroy(mysampleprob);
    SPLICING_FINALLY_CLEAN(1);
  } else {
    for (i=noiso-1; i>0; i--) {
      VECTOR(*mysampleprob)[i] -= VECTOR(*mysampleprob)[i-1];
    }
  }

  /* We have the isoforms, now get the read positions. */
  
  SPLICING_CHECK(splicing_vector_int_resize(position, noreads*2));
  SPLICING_CHECK(splicing_vector_int_init(&exstart, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exstart);
  SPLICING_CHECK(splicing_vector_int_init(&exend, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exend);
  SPLICING_CHECK(splicing_vector_int_init(&exidx, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exidx);
  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &exstart, &exend, &exidx,
					     gene));
  
  /* Positions in isoform coordinates first. 
     These are sampled based on the fragment length distribution. */

  for (i=0, j=0; i<noreads; i++) {
    int iso=VECTOR(*isoform)[2*i];
    int ilen=VECTOR(isolen)[iso];
    int r1= ilen >= fl ? ilen - fl + 1 : 0;
    int r2= ilen >= fs ? (ilen >= fl ? fl - fs : ilen - fs + 1) : 0;
    /* int r3= fs - 1; */
    int pos, fragment;
    double sp=0.0;
    if (r1 > 0) { sp += r1; } 
    if (r2 > 0) { sp += VECTOR(cpx)[r2-1]; }
    /* Draw the start position: the first r1 positions each carry
       weight 1, the remaining r2 positions are weighted through cpx. */
    double rand=RNG_UNIF(0, sp);
    if (rand < r1) { 
      pos = ceil(rand);
    } else {
      int w;
      rand -= r1;
      for (w=0; VECTOR(cpx)[w] < rand; w++) ;
      pos = r1 + r2 - w;
    }

    /* Draw the fragment length; for positions beyond r1 the draw is
       limited to px[r1+r2-pos]. */
    if (pos <= r1) {
      rand=RNG_UNIF(0, 1.0);
    } else {
      rand=RNG_UNIF(0, VECTOR(px)[r1+r2-pos]);
    }
    for (fragment=0; VECTOR(px)[fragment] < rand; fragment++) ;
    fragment += fragmentStart;

    /* First read starts at pos, the second one ends where the fragment
       ends. */
    VECTOR(*position)[j++] = pos;
    VECTOR(*position)[j++] = pos+fragment-readLength;
    
  }

  /* Translate positions to genomic coordinates */

  /* TODO: some of this is already calculated */
  SPLICING_CHECK(splicing_iso_to_genomic(gff, gene, isoform, /*converter=*/ 0,
					 position));

  /* CIGAR strings */

  splicing_strvector_clear(cigar);
  SPLICING_CHECK(splicing_strvector_reserve(cigar, 2*noreads));
  for (j=0; j<2*noreads; j++) {
    char tmp[1000], *tmp2=tmp;
    int iso=VECTOR(*isoform)[j];
    size_t rs=VECTOR(*position)[j];
    int ex=0;
    int rl=readLength;
    /* Find the exon of this isoform that contains the read start. */
    for (ex=VECTOR(exidx)[iso]; VECTOR(exend)[ex] < rs; ex++) ;
    /* While the read runs past the current exon, emit the matched part
       ("M") and the gap to the next exon ("N"), then continue there. */
    while (rs + rl - 1 > VECTOR(exend)[ex]) {
      tmp2 += snprintf(tmp2, sizeof(tmp)/sizeof(char)-(tmp2-tmp)-1, "%iM%iN",
		       (int) (VECTOR(exend)[ex]-rs+1), 
		       (int) (VECTOR(exstart)[ex+1]-VECTOR(exend)[ex]-1));
      if (tmp2 >= tmp + sizeof(tmp)/sizeof(char)) {
	SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL);
      }
      rl -= (VECTOR(exend)[ex] - rs + 1);
      rs = VECTOR(exstart)[ex+1];
      ex++;
    }
    /* Remaining part of the read inside the last exon. */
    tmp2 += snprintf(tmp2, sizeof(tmp)/sizeof(char)-(tmp2-tmp)-1, "%iM", rl);
    if (tmp2 >= tmp + sizeof(tmp)/sizeof(char)) {
      SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL);
    }
    SPLICING_CHECK(splicing_strvector_append(cigar, tmp));
  }

  /* Release the six unconditional temporaries in reverse order of
     creation. */
  splicing_vector_int_destroy(&exidx);
  splicing_vector_int_destroy(&exend);
  splicing_vector_int_destroy(&exstart);
  splicing_vector_destroy(&cpx);
  splicing_vector_destroy(&px);
  splicing_vector_int_destroy(&isolen);
  SPLICING_FINALLY_CLEAN(6);

  /* The generated fragment distribution was registered first, so it is
     released last. */
  if (!fragmentProb) { 
    splicing_vector_destroy(myfragmentProb); 
    SPLICING_FINALLY_CLEAN(1);
  }

  return 0;
}
Beispiel #17
0
Datei: gff.c Projekt: mlovci/MISO
/*
 * Read GFF records from `input' and append them to `gff'.
 *
 * input: stream positioned at the start of the GFF data.
 * gff:   an initialized annotation object; every record read is added
 *        with splicing_gff_append().
 *
 * Each record consists of nine fields: seqid, source, type, start, end,
 * score, strand, phase and attributes. The first eight are terminated
 * by a tab, the last one by a line break. Score and phase may be "."
 * for "not available". Only the types "gene", "mRNA", "exon", "CDS",
 * "start_codon" and "stop_codon" are accepted. The strand is "+" or
 * "-"; any other value is stored as SPLICING_STRAND_UNKNOWN. From the
 * attributes only "ID" and "Parent" are used.
 *
 * Reading stops when splicing_io_skip_newline_and_comments() reports
 * the end of the input after a record.
 *
 * Returns 0 on success. SPLICING_PARSEERROR is raised if a record
 * cannot be read completely or has an unknown type.
 */
int splicing_gff_read(FILE *input, splicing_gff_t *gff) {
  int eof=!EOF;
  char seqid[200];
  char source[200];
  char type[200];
  int start, end, phase;
  double score;
  char strand[10];
  char attributes[5000];
  size_t len;
  splicing_type_t realtype;
  splicing_strand_t realstrand=SPLICING_STRAND_UNKNOWN;
  char *ID, *parent;

  do {

    /* Each step is skipped once an earlier one has reported a problem,
       so `eof' ends up non-zero if any field could not be read. */
    eof = eof || splicing_io_skip_newline_and_comments(input);
    eof = eof || splicing_io_get_string(input, seqid,
                                        sizeof(seqid)/sizeof(char), &len,
                                        /*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_string(input, source,
                                        sizeof(source)/sizeof(char), &len,
                                        /*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_string(input, type,
                                        sizeof(type)/sizeof(char), &len,
                                        /*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_integer(input, &start, /*delim=*/ '\t');
    eof = eof || splicing_io_get_integer(input, &end, /*delim=*/ '\t');
    eof = eof || splicing_io_get_real_na(input, &score, /*delim=*/ '\t',
                                         /*nachar=*/ '.');
    eof = eof || splicing_io_get_string(input, strand,
                                        sizeof(strand)/sizeof(char), &len,
                                        /*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_integer_na(input, &phase, /*delim=*/ '\t',
                                            /*nachar=*/ '.');
    eof = eof || splicing_io_get_string(input, attributes,
                                        sizeof(attributes)/sizeof(char), &len,
                                        /*delim=*/ '\n', /*newline=*/ 1);

    if (eof) {
      SPLICING_ERROR("Corrupt GFF file", SPLICING_PARSEERROR);
    }

    /* TODO: do not hardcode these names
       TODO: order them according to their frequency */
    if (!strcmp(type, "gene")) {
      realtype = SPLICING_TYPE_GENE;
    } else if (!strcmp(type, "mRNA")) {
      realtype = SPLICING_TYPE_MRNA;
    } else if (!strcmp(type, "exon")) {
      realtype = SPLICING_TYPE_EXON;
    } else if (!strcmp(type, "CDS")) {
      realtype = SPLICING_TYPE_CDS;
    } else if (!strcmp(type, "start_codon")) {
      realtype = SPLICING_TYPE_START_CODON;
    } else if (!strcmp(type, "stop_codon")) {
      realtype = SPLICING_TYPE_STOP_CODON;
    } else {
      SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
    }

    /* Every record must get its own strand: anything that is not "+"
       or "-" (including ".") maps to UNKNOWN, so a value from the
       previous record can never leak into this one. */
    if (!strcmp(strand, "+")) {
      realstrand=SPLICING_STRAND_PLUS;
    } else if (!strcmp(strand, "-")) {
      realstrand=SPLICING_STRAND_MINUS;
    } else {
      realstrand=SPLICING_STRAND_UNKNOWN;
    }

    /* Parsing the attributes field */
    SPLICING_CHECK(splicing_io_parse_attributes(attributes, &ID, &parent));

    SPLICING_CHECK(splicing_gff_append(gff, seqid, source, realtype, start,
                                       end, score, realstrand, phase, ID,
                                       parent));

    /* Skip blank lines and comments; non-zero means end of input. */
    eof = splicing_io_skip_newline_and_comments(input);

  } while (!eof);

  return 0;
}
Beispiel #18
0
/* Simulate single-end reads from one gene of a GFF annotation.
 *
 * Every read is first assigned to an isoform. The weight of an isoform
 * is its expression value multiplied by its effective length, i.e. the
 * number of start positions a read of length readLength can take
 * (isoform length - readLength + 1, floored at zero). A start position
 * is then drawn from 1..effective length of the chosen isoform,
 * converted with splicing_iso_to_genomic(), and a CIGAR string is
 * assembled from the exon boundaries of the gene.
 *
 * gff         - the annotation.
 * gene        - index of the gene, must be in [0, number of genes).
 * expression  - per-isoform expression values; must have at least as
 *               many elements as the gene has isoforms.
 * noreads     - number of reads to generate, must be non-negative.
 * readLength  - length of the reads, must be positive.
 * isoform     - resized to noreads; receives the isoform index of
 *               each read.
 * position    - resized to noreads; receives the start position of
 *               each read after the call to splicing_iso_to_genomic().
 * cigar       - cleared, then one CIGAR string is appended per read.
 * sample_prob - if not NULL, it is updated (splicing_vector_update)
 *               from the unnormalized, non-cumulative isoform weights.
 *
 * Returns 0 on success. Invalid arguments, a gene without any isoform
 * of non-zero weight, and a CIGAR string that does not fit the internal
 * buffer are reported via SPLICING_ERROR; failures of called functions
 * are propagated via SPLICING_CHECK.
 */
int splicing_simulate_reads(const splicing_gff_t *gff, int gene,
			    const splicing_vector_t *expression,
			    int noreads, int readLength,
			    splicing_vector_int_t *isoform, 
			    splicing_vector_int_t *position, 
			    splicing_strvector_t *cigar, 
			    splicing_vector_t *sample_prob) {
  
  size_t i, noiso, goodiso=0, lastgood=0, nogenes;
  splicing_vector_int_t effisolen;
  splicing_vector_t sampleprob;
  double rand, sumpsi=0.0;
  splicing_vector_int_t exstart, exend, exidx;

  SPLICING_CHECK(splicing_gff_nogenes(gff, &nogenes));
  if (gene < 0 || (size_t) gene >= nogenes) {
    SPLICING_ERROR("Invalid gene id", SPLICING_EINVAL);
  }

  /* A negative count would be converted to a huge unsigned value in
     the loops below. */
  if (noreads < 0) {
    SPLICING_ERROR("Number of reads must be non-negative", SPLICING_EINVAL);
  }
  if (readLength < 1) {
    SPLICING_ERROR("Read length must be positive", SPLICING_EINVAL);
  }

  SPLICING_CHECK(splicing_gff_noiso_one(gff, gene, &noiso));

  /* expression is indexed by isoform below */
  if ((size_t) splicing_vector_size(expression) < noiso) {
    SPLICING_ERROR("Expression vector is shorter than the number of "
		   "isoforms", SPLICING_EINVAL);
  }
    
  SPLICING_CHECK(splicing_vector_int_init(&effisolen, noiso));
  SPLICING_FINALLY(splicing_vector_int_destroy, &effisolen);
  SPLICING_CHECK(splicing_vector_init(&sampleprob, noiso));
  SPLICING_FINALLY(splicing_vector_destroy, &sampleprob);
  SPLICING_CHECK(splicing_vector_int_resize(isoform, noreads));
  SPLICING_CHECK(splicing_gff_isolength_one(gff, gene, &effisolen));

  /* Effective lengths and unnormalized sampling weights. lastgood
     remembers the last isoform that can actually produce a read. */
  for (i=0; i<noiso; i++) {
    int l=VECTOR(effisolen)[i]-readLength+1;
    VECTOR(effisolen)[i] = l > 0 ? l : 0;
    VECTOR(sampleprob)[i] = VECTOR(*expression)[i] * VECTOR(effisolen)[i];
    if (VECTOR(sampleprob)[i] != 0) { 
      goodiso++; 
      lastgood=i;
    }
    sumpsi += VECTOR(sampleprob)[i];
  }

  if (goodiso==0) {
    SPLICING_ERROR("No isoform is possible", SPLICING_FAILURE);
  }

  if (sample_prob) {
    SPLICING_CHECK(splicing_vector_update(sample_prob, &sampleprob));
  }

  /* Turn the weights into cumulative sums */
  for (i=1; i<noiso; i++) {
    VECTOR(sampleprob)[i] += VECTOR(sampleprob)[i-1];
  }

  /* Pick the first isoform whose cumulative weight exceeds the random
     number. The search never goes beyond lastgood, so rounding errors
     can neither run off the end of the vector nor select a trailing 
     isoform of zero weight; '>=' skips leading isoforms of zero weight.
     No random number is drawn if there is a single isoform. */
  for (i=0; i<(size_t) noreads; i++) {
    size_t w=0;
    if (noiso > 1) {
      rand = RNG_UNIF01() * sumpsi;
      while (w < lastgood && rand >= VECTOR(sampleprob)[w]) { 
	w++; 
      }
    }
    VECTOR(*isoform)[i]=(int) w;
  }
  
  splicing_vector_destroy(&sampleprob);
  SPLICING_FINALLY_CLEAN(1);

  /* OK, we have the isoforms, now we need the read positions, 
     these are uniformly sampled from the individual isoforms. */

  SPLICING_CHECK(splicing_vector_int_resize(position, noreads));
  SPLICING_CHECK(splicing_vector_int_init(&exstart, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exstart);
  SPLICING_CHECK(splicing_vector_int_init(&exend, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exend);
  SPLICING_CHECK(splicing_vector_int_init(&exidx, 0));
  SPLICING_FINALLY(splicing_vector_int_destroy, &exidx);
  SPLICING_CHECK(splicing_gff_exon_start_end(gff, &exstart, &exend, &exidx,
					     gene));

  /* Positions in isoform coordinates first */

  for (i=0; i<(size_t) noreads; i++) { 
    int iso=VECTOR(*isoform)[i];
    int len=VECTOR(effisolen)[iso];
    VECTOR(*position)[i]=RNG_INTEGER(1, len);
  }

  /* Translate isoform coordinates to genomic coordinates */

  /* TODO: some of this is already calculated */
  SPLICING_CHECK(splicing_iso_to_genomic(gff, gene, isoform, /*converter=*/ 0,
					 position));

  /* CIGAR strings */

  splicing_strvector_clear(cigar);
  SPLICING_CHECK(splicing_strvector_reserve(cigar, noreads));
  for (i=0; i<(size_t) noreads; i++) {
    char tmp[1000], *tmp2=tmp;
    int iso=VECTOR(*isoform)[i];
    int rs=VECTOR(*position)[i];      /* start of the unprocessed part */
    int rl=readLength;                /* length of the unprocessed part */
    int ex;
    int written;
    size_t room;

    /* Find the exon that contains the start of the read.
       NOTE(review): this relies on exidx/exstart/exend being consistent
       with the position; there is no bound on ex here -- confirm. */
    for (ex=VECTOR(exidx)[iso]; VECTOR(exend)[ex] < rs; ex++) ;

    /* As long as the read runs over the end of the current exon, emit
       the matched part and the gap to the next exon. */
    while (VECTOR(exend)[ex] < rs+rl-1) {
      room = sizeof(tmp) - (size_t) (tmp2-tmp);
      written = snprintf(tmp2, room, "%iM%iN",
			 (int) (VECTOR(exend)[ex]-rs+1), 
			 (int) (VECTOR(exstart)[ex+1]-VECTOR(exend)[ex]-1));
      /* snprintf returns the length it wanted to write, so a value
	 of at least 'room' means that the output was truncated. */
      if (written < 0 || (size_t) written >= room) {
	SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL);
      }
      tmp2 += written;
      rl -= (VECTOR(exend)[ex] - rs + 1);
      rs = VECTOR(exstart)[ex+1];
      ex++;
    }

    /* The rest of the read fits into the current exon */
    room = sizeof(tmp) - (size_t) (tmp2-tmp);
    written = snprintf(tmp2, room, "%iM", rl);
    if (written < 0 || (size_t) written >= room) {
      SPLICING_ERROR("CIGAR string too long", SPLICING_EINVAL); 
    }
    SPLICING_CHECK(splicing_strvector_append(cigar, tmp));
  }

  splicing_vector_int_destroy(&exidx);
  splicing_vector_int_destroy(&exend);
  splicing_vector_int_destroy(&exstart);
  splicing_vector_int_destroy(&effisolen);
  SPLICING_FINALLY_CLEAN(4);
  
  return 0;
}
Beispiel #19
0
/* Compute the complexity of a gene from its assignment matrix.
 *
 * The assignment matrix is built with splicing_assignment_matrix() for
 * single-end reads (paired == 0) or with 
 * splicing_paired_assignment_matrix() for paired-end reads. Only the
 * relative complexity with the 2-norm is implemented: the singular
 * values of the matrix are computed with splicing_dgesdd(), and the
 * result is the first singular value divided by the last one that is
 * not smaller than 1e-14.
 *
 * gff          - the annotation.
 * gene         - the gene to use, passed on to the assignment matrix 
 *                functions.
 * readLength   - read length, passed on to the assignment matrix 
 *                functions.
 * type         - SPLICING_COMPLEXITY_RELATIVE; 
 *                SPLICING_COMPLEXITY_ABSOLUTE is not implemented.
 * norm         - SPLICING_NORM_2; SPLICING_NORM_1 and 
 *                SPLICING_NORM_INFINITY are not implemented.
 * paired       - whether to use the paired-end assignment matrix.
 * fragmentProb, fragmentStart, normalMean, normalVar, numDevs -
 *                only used for paired-end reads, passed on unchanged
 *                to splicing_paired_assignment_matrix().
 * complexity   - the result is stored here.
 *
 * Returns 0 on success. Unimplemented and unknown type/norm values, 
 * and a matrix without a singular value of at least 1e-14 are 
 * reported via SPLICING_ERROR; failures of called functions are 
 * propagated via SPLICING_CHECK.
 */
int splicing_gene_complexity(const splicing_gff_t *gff, size_t gene,
			     int readLength, splicing_complexity_t type,
			     splicing_norm_t norm, int paired,
			     const splicing_vector_t *fragmentProb,
			     int fragmentStart, double normalMean, 
			     double normalVar, double numDevs,
			     double *complexity) {
  
  splicing_matrix_t assignment_matrix;
  splicing_vector_t values;
  int i, n;

  SPLICING_CHECK(splicing_matrix_init(&assignment_matrix, 0, 0));
  SPLICING_FINALLY(splicing_matrix_destroy, &assignment_matrix);

  if (!paired) {
    SPLICING_CHECK(splicing_assignment_matrix(gff, gene, readLength, 
					      &assignment_matrix));
  } else {
    SPLICING_CHECK(splicing_paired_assignment_matrix(gff, gene, readLength, 
						     fragmentProb, 
						     fragmentStart,
						     normalMean, normalVar,
						     numDevs, 
						     &assignment_matrix));
  }

  switch (type) {
  case SPLICING_COMPLEXITY_RELATIVE:
    switch (norm) {

    case SPLICING_NORM_2:

      SPLICING_CHECK(splicing_vector_init(&values, 0));
      SPLICING_FINALLY(splicing_vector_destroy, &values);
      SPLICING_CHECK(splicing_dgesdd(&assignment_matrix, &values));
      n=splicing_vector_size(&values);

      /* Skip the (numerically) zero singular values at the end */
      for (i=n-1; i>=0 && VECTOR(values)[i] < 1e-14; i--) ;

      /* An empty or all-zero matrix has nothing to divide by, and 
	 i == -1 must not be used as an index. */
      if (i < 0) {
	SPLICING_ERROR("Assignment matrix has no non-zero singular value",
		       SPLICING_FAILURE);
      }

      *complexity = VECTOR(values)[0] / VECTOR(values)[i];
      splicing_vector_destroy(&values);
      SPLICING_FINALLY_CLEAN(1);
      break;

    case SPLICING_NORM_1:

      SPLICING_ERROR("One norm not implemented", SPLICING_UNIMPLEMENTED);
      break;

    case SPLICING_NORM_INFINITY:

      SPLICING_ERROR("Infinity norm not implemented", SPLICING_UNIMPLEMENTED);
      break;

    default:

      /* Do not return success without setting *complexity */
      SPLICING_ERROR("Invalid norm", SPLICING_EINVAL);
      break;

    }
    break;
  case SPLICING_COMPLEXITY_ABSOLUTE:
    SPLICING_ERROR("Absolute complexity not implemented", 
		   SPLICING_UNIMPLEMENTED);
    break;
  default:
    /* Do not return success without setting *complexity */
    SPLICING_ERROR("Invalid complexity type", SPLICING_EINVAL);
    break;
  }

  splicing_matrix_destroy(&assignment_matrix);
  SPLICING_FINALLY_CLEAN(1);

  return 0;
}