Ejemplo n.º 1
0
/*
 * Append one gene, its isoforms and their exons to a GFF object.
 *
 * exons     Flat coordinate vector: exon k is read from elements 2k (start)
 *           and 2k+1 (end).
 * isoforms  Exon indices. The exons of one isoform are listed one after the
 *           other and the list is closed by a negative entry.
 * id        ID of the gene record; also the prefix of the generated IDs.
 * seqid, source, strand
 *           Passed unchanged to splicing_gff_append() for every record.
 * extend    GFF object the records are appended to.
 *
 * Records are appended in this order: the gene (spanning the minimum and
 * maximum of 'exons'), then for each isoform an mRNA record named
 * "<id>-isoform-<n>" with the gene as parent, followed by its exons named
 * "<id>-isoform-<n>-exon-<m>" with the mRNA as parent.
 *
 * Returns 0 on success. Failures of splicing_gff_append() are handled by
 * SPLICING_CHECK.
 *
 * NOTE(review): an isoform without any exon (a terminator directly at the
 * start of an isoform) and exon indices outside of 'exons' are still not
 * validated here.
 */
int splicing_create_gene(const splicing_vector_int_t *exons,
			 const splicing_vector_int_t *isoforms,
			 const char *id, const char *seqid,
			 const char *source, splicing_strand_t strand,
			 splicing_gff_t *extend) {

  size_t i=0;
  size_t isolen=splicing_vector_int_size(isoforms);
  size_t genestart=splicing_vector_int_min(exons);
  size_t geneend=splicing_vector_int_max(exons);
  char buffer[5000], buffer2[5000];
  int noiso=0;

  /* TODO: error checks */

  /* Gene */
  SPLICING_CHECK(splicing_gff_append(extend, seqid, source,
				     SPLICING_TYPE_GENE,
				     genestart, geneend,
				     /*score=*/ SPLICING_NA_REAL,
				     strand, /*phase=*/ SPLICING_NA_INTEGER,
				     id, /*parent=*/ 0));

  while (i<isolen) {
    /* The mRNA record spans from the smallest exon start to the largest
       exon end of the isoform, so scan all of its exons first. */
    size_t mmin=VECTOR(*exons)[ 2*VECTOR(*isoforms)[i] ];
    size_t mmax=VECTOR(*exons)[ 2*VECTOR(*isoforms)[i] + 1 ];
    size_t j, exon=0;

    /* The 'j<isolen' bound keeps the scan inside the vector even if the
       last isoform is not closed by a negative entry. */
    for (j=i+1; j<isolen && VECTOR(*isoforms)[j] >= 0; j++) {
      size_t m1=VECTOR(*exons)[ 2*VECTOR(*isoforms)[j] ];
      size_t m2=VECTOR(*exons)[ 2*VECTOR(*isoforms)[j] + 1 ];
      if (m1 < mmin) { mmin = m1; }
      if (m2 > mmax) { mmax = m2; }
    }

    /* snprintf() always terminates the string, the full buffer can be used */
    snprintf(buffer, sizeof(buffer), "%s-isoform-%i", id, noiso);
    SPLICING_CHECK(splicing_gff_append(extend, seqid, source,
				       SPLICING_TYPE_MRNA, mmin, mmax,
				       /*score=*/ SPLICING_NA_REAL, strand,
				       /*phase=*/ SPLICING_NA_INTEGER,
				       buffer, /*parent=*/ id));

    /* Exons of this isoform, numbered from zero within the isoform */
    for (; i<isolen && VECTOR(*isoforms)[i] >= 0; i++) {
      snprintf(buffer2, sizeof(buffer2),
	       "%s-isoform-%i-exon-%i", id, (int) noiso, (int) exon++);
      SPLICING_CHECK(splicing_gff_append(extend, seqid, source,
			 SPLICING_TYPE_EXON,
			 VECTOR(*exons)[ 2*VECTOR(*isoforms)[i] ],
			 VECTOR(*exons)[ 2*VECTOR(*isoforms)[i] + 1 ],
			 /*score=*/ SPLICING_NA_REAL, strand,
			 /*phase=*/ SPLICING_NA_INTEGER, buffer2,
			 /*parent=*/ buffer));
    }

    noiso++;
    i++;			/* step over the terminating entry */
  }

  return 0;
}
Ejemplo n.º 2
0
Archivo: gff.c Proyecto: mlovci/MISO
/*
 * Read GFF records from a stream and append them to a GFF object.
 *
 * input  Stream to read from.
 * gff    GFF object the records are appended to.
 *
 * Every record is read as nine fields: seqid, source, type, start, end,
 * score, strand, phase (tab separated) and the attributes, which run until
 * the end of the line. '.' is passed to the readers as the NA marker of the
 * score and phase fields. The ID and parent handed to splicing_gff_append()
 * come from splicing_io_parse_attributes().
 *
 * Returns 0 on success. A record that cannot be read completely is reported
 * as "Corrupt GFF file", an unknown type as "Invalid GFF file", both with
 * SPLICING_PARSEERROR.
 *
 * NOTE(review): the first record is read unconditionally, so an input that
 * holds no record at all is presumably reported as corrupt as well --
 * confirm that this is intended.
 */
int splicing_gff_read(FILE *input, splicing_gff_t *gff) {
  int eof=0;
  char seqid[200];
  char source[200];
  char type[200];
  int start, end, phase;
  double score;
  char strand[10];
  char attributes[5000];
  size_t len;
  splicing_type_t realtype;
  splicing_strand_t realstrand=SPLICING_STRAND_UNKNOWN;
  char *ID, *parent;

  do {

    /* Each reader is only called if all previous ones succeeded; a nonzero
       result of any of them is treated as a premature end of the record. */
    eof = eof || splicing_io_skip_newline_and_comments(input);
    eof = eof || splicing_io_get_string(input, seqid,
					sizeof(seqid)/sizeof(char), &len,
					/*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_string(input, source,
					sizeof(source)/sizeof(char), &len,
					/*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_string(input, type,
					sizeof(type)/sizeof(char), &len,
					/*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_integer(input, &start, /*delim=*/ '\t');
    eof = eof || splicing_io_get_integer(input, &end, /*delim=*/ '\t');
    eof = eof || splicing_io_get_real_na(input, &score, /*delim=*/ '\t',
					 /*nachar=*/ '.');
    eof = eof || splicing_io_get_string(input, strand,
					sizeof(strand)/sizeof(char), &len,
					/*delim=*/ '\t', /*newline=*/ 0);
    eof = eof || splicing_io_get_integer_na(input, &phase, /*delim=*/ '\t',
					    /*nachar=*/ '.');
    eof = eof || splicing_io_get_string(input, attributes,
					sizeof(attributes)/sizeof(char), &len,
					/*delim=*/ '\n', /*newline=*/ 1);

    if (eof) {
      SPLICING_ERROR("Corrupt GFF file", SPLICING_PARSEERROR);
    }

    /* TODO: do not hardcode these names
       TODO: order them according to their frequency */
    if (!strcmp(type, "gene")) {
      realtype = SPLICING_TYPE_GENE;
    } else if (!strcmp(type, "mRNA")) {
      realtype = SPLICING_TYPE_MRNA;
    } else if (!strcmp(type, "exon")) {
      realtype = SPLICING_TYPE_EXON;
    } else if (!strcmp(type, "CDS")) {
      realtype = SPLICING_TYPE_CDS;
    } else if (!strcmp(type, "start_codon")) {
      realtype = SPLICING_TYPE_START_CODON;
    } else if (!strcmp(type, "stop_codon")) {
      realtype = SPLICING_TYPE_STOP_CODON;
    } else {
      SPLICING_ERROR("Invalid GFF file", SPLICING_PARSEERROR);
    }

    /* Everything but "+" and "-" (this includes ".") is an unknown strand.
       The value is set for every record, so that a record with an
       unrecognized strand does not inherit the strand of the previous one. */
    if (!strcmp(strand, "+")) {
      realstrand=SPLICING_STRAND_PLUS;
    } else if (!strcmp(strand, "-")) {
      realstrand=SPLICING_STRAND_MINUS;
    } else {
      realstrand=SPLICING_STRAND_UNKNOWN;
    }

    /* Parsing the attributes field. The outputs are reset first, so an
       attribute that the parser leaves untouched is seen as missing instead
       of carrying an indeterminate value or the one of the previous
       record. */
    ID=NULL;
    parent=NULL;
    SPLICING_CHECK(splicing_io_parse_attributes(attributes, &ID, &parent));

    SPLICING_CHECK(splicing_gff_append(gff, seqid, source, realtype, start,
				       end, score, realstrand, phase, ID,
				       parent));

    /* Skip to the next record, or notice that there is none */
    eof = splicing_io_skip_newline_and_comments(input);

  } while (!eof);

  return 0;
}
Ejemplo n.º 3
0
/*
 * Convert a Python GFF database object to a C splicing_gff_t.
 *
 * args  Tuple with a single object that has a '_GFFDatabase__entries'
 *       attribute, a sequence of records. The 'seqid', 'source', 'type',
 *       'start', 'end', 'score', 'strand', 'phase' and 'attributes'
 *       attributes are read from every record; 'attributes' is looked up as
 *       a dictionary for the "ID" and the optional "Parent" keys.
 *
 * Returns a PyCObject that wraps the newly allocated splicing_gff_t and has
 * splicing_gff_destroy2 as its destructor, or NULL on error.
 *
 * Errors: a missing attribute or a non-string value leaves the exception of
 * the failing Python call in place. A record without an "ID" or with an
 * unknown type is reported through splicingmodule_handle_splicing_error(),
 * the same way as the other failures in this function. On these paths
 * all references taken here and the partially built GFF are released.
 */
static PyObject* pysplicing_to_gff(PyObject *self, PyObject *args) {

  PyObject *pygff, *entries=0, *result=0;
  size_t i, noRec;
  splicing_gff_t *cgff=0;
  PyObject *IDkey=0, *Parentkey=0;

  if (!PyArg_ParseTuple(args, "O", &pygff)) { return NULL; }

  if (!PyObject_HasAttrString(pygff, "_GFFDatabase__entries")) {
    splicingmodule_handle_splicing_error();
    return NULL;
  }
  entries=PyObject_GetAttrString(pygff, "_GFFDatabase__entries");
  if (!entries) { goto fail; }
  noRec=PySequence_Size(entries);
  /* PySequence_Size() signals failure with -1, which is seen here after
     the conversion to size_t */
  if (noRec == (size_t) -1) { goto fail; }

  IDkey=PyString_FromString("ID");
  Parentkey=PyString_FromString("Parent");
  if (!IDkey || !Parentkey) { goto fail; }

  cgff=malloc(sizeof(splicing_gff_t));
  if (!cgff) { splicingmodule_handle_splicing_error(); goto fail; }
  /* NOTE(review): the result of splicing_gff_init(), if any, is not
     checked -- confirm whether it can fail. */
  splicing_gff_init(cgff, noRec);

  for (i=0; i<noRec; i++) {
    PyObject *rec=0, *seqid=0, *source=0, *type=0, *start=0, *end=0,
      *score=0, *strand=0, *phase=0, *attributes=0, *ID=0, *Parent=0;
    char *Cseqid, *Csource, *CID, *Cparent=0, *Ctype;
    splicing_type_t Ctype2;
    int Cstart, Cend, Cphase;
    double Cscore;
    splicing_strand_t Cstrand;
    int ok=0;

    /* New references; the chain stops at the first lookup that fails */
    if (!(rec=PySequence_GetItem(entries, i)) ||
	!(seqid=PyObject_GetAttrString(rec, "seqid")) ||
	!(source=PyObject_GetAttrString(rec, "source")) ||
	!(type=PyObject_GetAttrString(rec, "type")) ||
	!(start=PyObject_GetAttrString(rec, "start")) ||
	!(end=PyObject_GetAttrString(rec, "end")) ||
	!(score=PyObject_GetAttrString(rec, "score")) ||
	!(strand=PyObject_GetAttrString(rec, "strand")) ||
	!(phase=PyObject_GetAttrString(rec, "phase")) ||
	!(attributes=PyObject_GetAttrString(rec, "attributes"))) {
      goto record_done;
    }

    /* Borrowed references, they must not be released. "Parent" is
       optional, a record without an "ID" is an error. */
    ID=PyDict_GetItem(attributes, IDkey);
    Parent=PyDict_GetItem(attributes, Parentkey);
    if (!ID) {
      splicingmodule_handle_splicing_error();
      goto record_done;
    }

    /* The C strings point into the Python objects, so the objects have to
       stay alive until the record was appended. */
    if (!(Cseqid=PyString_AsString(seqid)) ||
	!(Csource=PyString_AsString(source)) ||
	!(Ctype=PyString_AsString(type)) ||
	!(CID=PyString_AsString(ID))) {
      goto record_done;
    }
    if (Parent && !(Cparent=PyString_AsString(Parent))) {
      goto record_done;
    }

    if (!strcmp(Ctype, "gene")) {
      Ctype2=SPLICING_TYPE_GENE;
    } else if (!strcmp(Ctype, "mRNA")) {
      Ctype2=SPLICING_TYPE_MRNA;
    } else if (!strcmp(Ctype, "exon")) {
      Ctype2=SPLICING_TYPE_EXON;
    } else if (!strcmp(Ctype, "CDS")) {
      Ctype2=SPLICING_TYPE_CDS;
    } else if (!strcmp(Ctype, "start_codon")) {
      Ctype2=SPLICING_TYPE_START_CODON;
    } else if (!strcmp(Ctype, "stop_codon")) {
      Ctype2=SPLICING_TYPE_STOP_CODON;
    } else {
      /* Unknown type: fail instead of appending an indeterminate value */
      splicingmodule_handle_splicing_error();
      goto record_done;
    }

    /* NOTE(review): failures of the numeric conversions are not detected
       here; confirm which values (e.g. for a missing score) the Python
       side can store before adding checks. */
    Cstart=PyInt_AsLong(start);
    Cend=PyInt_AsLong(end);
    Cscore=PyFloat_AsDouble(score);
    Cstrand=PyInt_AsLong(strand);
    Cphase=PyInt_AsLong(phase);

    /* NOTE(review): if SPLICING_PYCHECK returns from this function on
       failure, then the references of this record, 'entries', the keys and
       'cgff' are not released -- check the macro definition. */
    SPLICING_PYCHECK(splicing_gff_append(cgff, Cseqid, Csource, Ctype2,
					 Cstart, Cend, Cscore, Cstrand,
					 Cphase, CID, Cparent));
    ok=1;

  record_done:
    /* Release whatever was acquired for this record */
    if (rec) { Py_DECREF(rec); }
    if (seqid) { Py_DECREF(seqid); }
    if (source) { Py_DECREF(source); }
    if (type) { Py_DECREF(type); }
    if (start) { Py_DECREF(start); }
    if (end) { Py_DECREF(end); }
    if (score) { Py_DECREF(score); }
    if (strand) { Py_DECREF(strand); }
    if (phase) { Py_DECREF(phase); }
    if (attributes) { Py_DECREF(attributes); }
    if (!ok) { goto fail; }
  }

  Py_DECREF(entries);
  Py_DECREF(IDkey);
  Py_DECREF(Parentkey);

  result=PyCObject_FromVoidPtr(cgff, splicing_gff_destroy2);
  /* Without a wrapper object nobody would run the destructor */
  if (!result) { splicing_gff_destroy2(cgff); }
  return result;

 fail:
  /* Release the GFF with the destructor that the wrapper object would use */
  if (cgff) { splicing_gff_destroy2(cgff); }
  if (entries) { Py_DECREF(entries); }
  if (IDkey) { Py_DECREF(IDkey); }
  if (Parentkey) { Py_DECREF(Parentkey); }
  return NULL;
}