Esempio n. 1
0
/*
=head2 Mapper_addMapCoordinates (Perl: add_map_coordinates)

    Arg  1      Mapper *m
                mapper that receives the new pair
    Arg  2      IDType contigId
                id of 'source' sequence
    Arg  3      int contigStart
                start coordinate of 'source' sequence
    Arg  4      int contigEnd
                end coordinate of 'source' sequence
    Arg  5      int contigOri
                relative orientation of source and target (+/- 1)
    Arg  6      IDType chrId
                id of 'target' sequence
    Arg  7      int chrStart
                start coordinate of 'target' sequence
    Arg  8      int chrEnd
                end coordinate of 'target' sequence
    Function    Stores details of mapping between 'source' and
                'target' regions.  One pair object is created and
                filed twice: under chrId in the TO hash and under
                contigId in the FROM hash.  The pair count is
                incremented and the mapper is flagged as unsorted.
    Returntype  none
    Exceptions  prints an error and calls exit(1) when the two
                regions do not span the same length
    Caller      Bio::EnsEMBL::Mapper

=cut
*/
void Mapper_addMapCoordinates(Mapper *m, IDType contigId, int contigStart, int contigEnd,
                              int contigOri, IDType chrId, int chrStart, int chrEnd) {
  // Source and target must span the same number of positions.
  if ((chrEnd - chrStart) != (contigEnd - contigStart)) {
    fprintf(stderr,"ERROR: Cannot deal with mis-lengthed mappings so far\n");
    fprintf(stderr,"Contig %d to %d and chromosome %d to %d\n",
            contigStart, contigEnd, chrStart, chrEnd);
    exit(1);
  }

  MapperPair *newPair = MapperPair_new();

  // Source side of the pair
  MapperUnit *srcUnit = MapperUnit_new();
  srcUnit->start = contigStart;
  srcUnit->end   = contigEnd;
  srcUnit->id    = contigId;

  // Target side of the pair
  MapperUnit *dstUnit = MapperUnit_new();
  dstUnit->start = chrStart;
  dstUnit->end   = chrEnd;
  dstUnit->id    = chrId;

  MapperPair_setUnit(newPair, MAPPER_TO_IND, dstUnit);
  MapperPair_setUnit(newPair, MAPPER_FROM_IND, srcUnit);
  newPair->ori = contigOri;

  // The same pair is reachable from both directions of the mapping.
  IDHash *bySource = Mapper_getPairHash(m, MAPPER_FROM_IND);
  IDHash *byTarget = Mapper_getPairHash(m, MAPPER_TO_IND);
  MapperPairSet *bucket;

  // File under the target id, creating its pair set on first use.
  if (!IDHash_contains(byTarget, chrId)) {
    IDHash_add(byTarget, chrId, MapperPairSet_new());
  }
  bucket = (MapperPairSet *)IDHash_getValue(byTarget, chrId);
  MapperPairSet_addPair(bucket, newPair);

  // File under the source id, creating its pair set on first use.
  if (!IDHash_contains(bySource, contigId)) {
    IDHash_add(bySource, contigId, MapperPairSet_new());
  }
  bucket = (MapperPairSet *)IDHash_getValue(bySource, contigId);
  MapperPairSet_addPair(bucket, newPair);

  Mapper_incPairCount(m);

  // A freshly appended pair invalidates any previous ordering.
  Mapper_setIsSorted(m, 0);
}
Esempio n. 2
0
// Registers an indel mapping between a 'source' (contig) region and a
// 'target' (chr) region.  The pair is flagged with isIndel = 1, filed in both
// the TO hash (under chrId) and the FROM hash (under contigId), the pair count
// is incremented and the mapper is marked unsorted.  Always returns 1.
//
// This is almost identical to Mapper_addMapCoordinates (the isIndel flag is
// set and no equal-length check is made) - it should be refactored.
int Mapper_addIndelCoordinates( Mapper *m, IDType contigId, long contigStart, long contigEnd, int contigOri, IDType chrId, long chrStart, long chrEnd) {
  // One IndelPair object is created and shared by both lists, to and from.
  MapperPair *indelPair = MapperPair_new();

  MapperUnit *srcUnit = MapperUnit_new();
  srcUnit->start = contigStart;
  srcUnit->end   = contigEnd;
  srcUnit->id    = contigId;

  MapperUnit *dstUnit = MapperUnit_new();
  dstUnit->start = chrStart;
  dstUnit->end   = chrEnd;
  dstUnit->id    = chrId;

  MapperPair_setUnit(indelPair, MAPPER_TO_IND, dstUnit);
  MapperPair_setUnit(indelPair, MAPPER_FROM_IND, srcUnit);

  indelPair->ori     = contigOri;
  indelPair->isIndel = 1;

  // Place into the hash on both ids.
  IDHash *bySource = Mapper_getPairHash(m, MAPPER_FROM_IND);
  IDHash *byTarget = Mapper_getPairHash(m, MAPPER_TO_IND);
  MapperPairSet *bucket;

  if (!IDHash_contains(byTarget, chrId)) {
    IDHash_add(byTarget, chrId, MapperPairSet_new());
  }
  bucket = (MapperPairSet *)IDHash_getValue(byTarget, chrId);
  MapperPairSet_addPair(bucket, indelPair);

  if (!IDHash_contains(bySource, contigId)) {
    IDHash_add(bySource, contigId, MapperPairSet_new());
  }
  bucket = (MapperPairSet *)IDHash_getValue(bySource, contigId);
  MapperPairSet_addPair(bucket, indelPair);

  Mapper_incPairCount(m);
  Mapper_setIsSorted(m, 0);

  return 1;
}
Esempio n. 3
0
// Looks up a chromosome by its database id.
// A hit in ca->chrCache is returned directly; otherwise the chromosome table
// is queried and the first row is converted with
// ChromosomeAdaptor_chromosomeFromRow.  Returns NULL when the query yields no
// row.
Chromosome *ChromosomeAdaptor_fetchByDbID(ChromosomeAdaptor *ca, IDType dbID) {
  char query[256];

  // Cached entries need no database round trip.
  if (IDHash_contains(ca->chrCache, dbID)) {
    return IDHash_getValue(ca->chrCache, dbID);
  }

  sprintf(query, "SELECT chromosome_id, name, length"
                 " FROM chromosome"
                 " WHERE  chromosome_id = "
                 IDFMTSTR, dbID);

  StatementHandle *sth = ca->prepare((BaseAdaptor *)ca, query, strlen(query));
  sth->execute(sth);

  Chromosome *found = NULL;
  ResultRow *row = sth->fetchRow(sth);
  if (row != NULL) {
    found = ChromosomeAdaptor_chromosomeFromRow(ca, row);
  }

  // The statement handle is released whether or not a row was found.
  sth->finish(sth);

  return found;
}
Esempio n. 4
0
// Writes a human-readable listing of the mapper's pairs to fp (stderr when fp
// is NULL).  For every id in the pair hash a header line is printed, followed
// by one line per pair giving the from start/end, the to start/end and the to
// id.
// NOTE(review): the lines are labelled "From Hash" but the hash iterated is
// the MAPPER_TO_IND one - confirm which was intended.
void Mapper_dump(Mapper *m, FILE *fp) {
  FILE *out = (fp == NULL) ? stderr : fp;

  IDHash *pairHash = Mapper_getPairHash(m, MAPPER_TO_IND);
  IDType *ids      = IDHash_getKeys(pairHash);
  int     nId      = IDHash_getNumValues(pairHash);
  int     k;

  for (k = 0; k < nId; k++) {
    IDType curId = ids[k];
    MapperPairSet *pairSet = IDHash_getValue(pairHash, curId);
    int p;

    fprintf(out, "From Hash " IDFMTSTR " with %d pairs\n", curId, MapperPairSet_getNumPair(pairSet));

    for (p = 0; p < MapperPairSet_getNumPair(pairSet); p++) {
      MapperPair *cur  = MapperPairSet_getPairAt(pairSet, p);
      MapperUnit *src  = MapperPair_getUnit(cur, MAPPER_FROM_IND);
      MapperUnit *dest = MapperPair_getUnit(cur, MAPPER_TO_IND);

      fprintf(out, "    %ld %ld:%ld %ld " IDFMTSTR "\n",
              src->start, src->end, dest->start, dest->end, dest->id);
    }
  }

  // The key array returned by IDHash_getKeys is released here.
  free(ids);
}
Esempio n. 5
0
// Low level accessor for the ranges registered under an id.
// Only use for read access.
// Returns the Vector stored in the registry for id, or NULL when the registry
// holds no entry for id.
Vector *RangeRegistry_getRanges(RangeRegistry *registry, IDType id) {
  IDHash *byId = RangeRegistry_getRegistry(registry);

  if (!IDHash_contains(byId, id)) {
    return NULL;
  }

  return IDHash_getValue(byId, id);
}
Esempio n. 6
0
// Re-points a translation's start and end exons at their transformed
// counterparts.  exonTransforms is keyed by the exon pointer cast to IDType;
// an exon with no entry in the hash is left in place (it was not mapped).
void Translation_transform(Translation *translation, IDHash *exonTransforms) {
  // Both keys are taken before either exon is replaced.
  IDType startKey = (IDType)Translation_getStartExon(translation);
  IDType endKey   = (IDType)Translation_getEndExon(translation);

  /* CHECK */
  if (IDHash_contains(exonTransforms, startKey)) {
    Translation_setStartExon(translation, IDHash_getValue(exonTransforms, startKey));
  }

  if (IDHash_contains(exonTransforms, endKey)) {
    Translation_setEndExon(translation, IDHash_getValue(exonTransforms, endKey));
  }
}
Esempio n. 7
0
/*
=head2 AssemblyMapper_registerAssembled (Perl: register_assembled)

  Arg [1]    : AssemblyMapper *am
               The assembly mapper whose register is updated.
  Arg [2]    : IDType asmSeqRegionId
               The dbID of the sequence region to register.
  Arg [3]    : int chunkId
               The chunk number of the provided seq_region to register.
  Example    : AssemblyMapper_registerAssembled(am, asmSeqRegionId, 4);
  Description: Flags a given assembled region as registered in this
               assembly mapper.  The register is a hash of hashes:
               seq region id -> chunk id -> marker.  A chunk hash is
               created for the region on first use.  This should only
               be called by this class or the AssemblyMapperAdaptor.
               Do not call this method unless you really know what you
               are doing.
  Return type: None
  Exceptions : None checked here
  Caller     : Internal, AssemblyMapperAdaptor
  Status     : Stable

=cut
*/
void AssemblyMapper_registerAssembled(AssemblyMapper *am, IDType asmSeqRegionId, int chunkId) {
  IDHash *regionTable = AssemblyMapper_getAssembledRegister(am);
  IDType  chunkKey    = (IDType)chunkId;

  // First chunk seen for this region: give it its own chunk hash.
  if (!IDHash_contains(regionTable, asmSeqRegionId)) {
    IDHash_add(regionTable, asmSeqRegionId, IDHash_new(IDHASH_MEDIUM));
  }

  IDHash *chunks = IDHash_getValue(regionTable, asmSeqRegionId);

  // Only presence of the key matters; the stored value is the shared trueVal.
  if (!IDHash_contains(chunks, chunkKey)) {
    IDHash_add(chunks, chunkKey, &trueVal);
  }
}
Esempio n. 8
0
// Reports whether chunk chunkId of assembled seq region asmSeqRegionId has
// been registered in this mapper.  Returns 1 when both the region and the
// chunk are present in the assembled register, 0 otherwise.
int AssemblyMapper_haveRegisteredAssembled(AssemblyMapper *am, IDType asmSeqRegionId, int chunkId) {
  IDHash *regionTable = AssemblyMapper_getAssembledRegister(am);

  if (IDHash_contains(regionTable, asmSeqRegionId)) {
    IDHash *chunks = IDHash_getValue(regionTable, asmSeqRegionId);

    if (IDHash_contains(chunks, (IDType)chunkId)) {
      return 1;
    }
  }

  return 0;
}
Esempio n. 9
0
// Returns how many positions of [start, end] are covered by the ranges
// registered for id.  Returns 0 when start > end, when id has no list, or
// when its list is empty (the latter also prints a warning).
// The binary search presumes the list is ordered by position - TODO confirm
// against the code that fills the registry.
long RangeRegistry_overlapSize(RangeRegistry *registry, IDType id, long start, long end)  {
  if (start > end) {
    return 0;
  }

  IDHash *byId = RangeRegistry_getRegistry(registry);
  if (!IDHash_contains(byId, id)) {
    return 0; // No list for this id, so there can't be any overlap
  }
  Vector *ranges = IDHash_getValue(byId, id);

  int nRange = Vector_getNumElement(ranges);
  if (nRange == 0) {
    fprintf(stderr, "Odd have zero length list in RangeRegistry_overlapSize\n");
    return 0;
  }

  // Binary search for a starting index close to the first relevant range;
  // helps if the list is big.
  int lo = 0;
  int hi = nRange - 1;
  while (hi - lo > 1) {
    int mid = (lo + hi) >> 1;
    CoordPair *probe = Vector_getElementAt(ranges, mid);

    if (CoordPair_getEnd(probe) < start) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  long covered = 0;
  int  idx;
  for (idx = lo; idx < nRange; idx++) {
    CoordPair *cur = Vector_getElementAt(ranges, idx);
    long curStart  = CoordPair_getStart(cur);
    long curEnd    = CoordPair_getEnd(cur);

    // Ranges from here on begin after the query ends.
    if (curStart > end) {
      break;
    }

    // A single range swallowing the whole query settles the answer.
    if (curStart <= start && curEnd >= end) {
      return end - start + 1;
    }

    // Clip the range to the query and count what is left, if anything.
    long clipStart = (curStart > start) ? curStart : start;
    long clipEnd   = (curEnd > end) ? end : curEnd;

    if (clipEnd >= clipStart) {
      covered += clipEnd - clipStart + 1;
    }
  }

  return covered;
}
Esempio n. 10
0
/*
=head2 Mapper_listPairs (Perl: list_pairs)

    Arg  1      Mapper *m
                mapper to query (sorted first if it is not already)
    Arg  2      IDType id
                id of 'source' sequence
    Arg  3      long start
                start coordinate of 'source' sequence
    Arg  4      long end
                end coordinate of 'source' sequence
    Arg  5      char *type
                nature of transform - gives the type of
                coordinates to be transformed *from*
    Function    list all pairs of mappings in a region.  When start
                and end are both -1 every pair stored for id is
                listed.
    Returntype  newly created MapperPairSet holding the matching
                pairs, or NULL when id has no pairs
    Exceptions  prints an error (and carries on) when start > end;
                calls exit(1) when no pair hash exists for the type
    Caller      Bio::EnsEMBL::Mapper

=cut
*/
MapperPairSet *Mapper_listPairs(Mapper *m, IDType id, long start, long end, char *type) {
  if (start > end) {
    fprintf(stderr,"ERROR: Start is greater than end for id " IDFMTSTR ", start %ld, end %ld\n",id,start,end);
  }

  if (Mapper_getIsSorted(m) == 0) {
    Mapper_sort(m);
  }

  // A type equal to the mapper's 'to' type selects the TO hash; any other
  // type falls through to the FROM hash.
  int from = Mapper_compareType(type, Mapper_getTo(m)) ? MAPPER_FROM_IND : MAPPER_TO_IND;

  IDHash *hash = Mapper_getPairHash(m, from);
  if (!hash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }

  if (!IDHash_contains(hash, id)) {
    return NULL;
  }

  MapperPairSet *stored = IDHash_getValue(hash, id);
  MapperPairSet *found  = MapperPairSet_new();

  // start == end == -1 is the "give me everything" request.
  int wantAll = (start == -1 && end == -1);
  int idx;

  for (idx = 0; idx < MapperPairSet_getNumPair(stored); idx++) {
    MapperPair *cur = MapperPairSet_getPairAt(stored, idx);

    if (!wantAll) {
      MapperUnit *unit = MapperPair_getUnit(cur, from);

      // Pair lies wholly before the region: skip it.
      if (unit->end < start) {
        continue;
      }
      // Pair starts after the region: stop scanning.
      if (unit->start > end) {
        break;
      }
    }

    MapperPairSet_addPair(found, cur);
  }

  return found;
}
Esempio n. 11
0
// Fetches the external database entries (xrefs) attached to an Ensembl object.
//
//   dbea     adaptor used to prepare and run the query
//   ensObj   ensembl_id of the object; must be non-zero
//   ensType  ensembl_object_type string; must be non-NULL
//
// Returns a new Vector holding one DBEntry per distinct xref id.  Because the
// query outer-joins the synonym table, an xref can come back on several rows;
// the first row builds the DBEntry and every row contributes its synonym (when
// present).
//
// Exits with status 1 when ensObj or ensType is missing, or when the query
// does not fit the local buffer.
// NOTE(review): ensType is interpolated into the SQL text unescaped - callers
// must pass trusted type names only.
Vector *DBEntryAdaptor_fetchByObjectType(DBEntryAdaptor *dbea, IDType ensObj, char *ensType) {
  Vector *out;
  char qStr[1024];
  int qLen;
  StatementHandle *sth;
  ResultRow *row;
  IDHash *seen;
  
  if (!ensObj) {
    fprintf(stderr,"Error: Can't fetchByObjectType without an object\n");
    exit(1);
  }

  if (!ensType) {
    fprintf(stderr,"Error: Can't fetchByObjectType without a type\n");
    exit(1);
  }

// Not sure if idt identities are right way round
  // snprintf bounds the write: ensType is caller supplied, so an over-long
  // value must not be allowed to run past the end of qStr.
  qLen = snprintf(qStr, sizeof(qStr),
    "SELECT xref.xref_id, xref.dbprimary_acc, xref.display_label, xref.version,"
    "       xref.description,"
    "       exDB.db_name, exDB.db_release, exDB.status," 
    "       oxr.object_xref_id,"
    "       es.synonym," 
    "       idt.xref_identity, idt.ensembl_identity"
    " FROM  (external_db exDB, object_xref oxr, xref xref)" 
    " LEFT JOIN external_synonym es on es.xref_id = xref.xref_id"
    " LEFT JOIN identity_xref idt on idt.object_xref_id = oxr.object_xref_id"
    " WHERE  xref.xref_id = oxr.xref_id"
    "  AND  xref.external_db_id = exDB.external_db_id"
    "  AND  oxr.ensembl_id = " IDFMTSTR
    "  AND  oxr.ensembl_object_type = '%s'",
    ensObj,
    ensType);

  if (qLen < 0 || (size_t)qLen >= sizeof(qStr)) {
    fprintf(stderr,"Error: Query too long for buffer in fetchByObjectType\n");
    exit(1);
  }
  
  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

  sth->execute(sth);

  // seen maps xref id -> the DBEntry already built for it
  seen = IDHash_new(IDHASH_SMALL);
  out = Vector_new();

  while ((row = sth->fetchRow(sth))) {
    DBEntry *exDB;
    IDType refID = row->getLongLongAt(row,0);

    // using an outer join on the synonyms as well as on identity_xref, we
    // now have to filter out the duplicates (see v.1.18 for
    // original). Since there is at most one identity_xref row per xref,
    // this is easy enough; all the 'extra' bits are synonyms

    if (!IDHash_contains(seen,refID))  {
      exDB = DBEntry_new();
      DBEntry_setAdaptor(exDB,(BaseAdaptor *)dbea);
      DBEntry_setDbID(exDB, refID);
      DBEntry_setPrimaryId(exDB, row->getStringAt(row,1));
      DBEntry_setDisplayId(exDB, row->getStringAt(row,2));
      DBEntry_setVersion(exDB, row->getStringAt(row,3));
      DBEntry_setDbName(exDB, row->getStringAt(row,5));
      DBEntry_setRelease(exDB, row->getStringAt(row,6));

      // Identity columns come from a LEFT JOIN, so they may be absent.
      if (row->col(row,10)) {
        IdentityXref *idx = IdentityXref_new();
        DBEntry_setIdentityXref(exDB,idx);
        IdentityXref_setQueryIdentity(idx, row->getDoubleAt(row,10));
        IdentityXref_setTargetIdentity(idx, row->getDoubleAt(row,11));
      }
      
      if (row->col(row,4)) DBEntry_setDescription(exDB, row->getStringAt(row,4));
      if (row->col(row,7)) DBEntry_setStatus(exDB, row->getStringAt(row,7));
      
      Vector_addElement(out, exDB);
      IDHash_add(seen, refID, exDB);
    } 

    // Whether new or already seen, this row may carry one more synonym.
    exDB = IDHash_getValue(seen, refID);

    if (row->col(row,9)) {
      DBEntry_addSynonym(exDB,row->getStringAt(row,9));
    }
  }

  // Only the lookup table is released here; the DBEntry objects it pointed at
  // are returned to the caller through out.
  IDHash_free(seen, NULL);

  sth->finish(sth);
  
  return out;
}
Esempio n. 12
0
// Quick mapping of a region that lies wholly inside one stored pair.
// The pairs for id are scanned in order; the first pair whose own range
// contains [start, end] yields a single MapperCoordinate and the scan stops.
// When no pair contains the region the returned set is empty.
// A request with start == end + 1 is handed to Mapper_mapInsert with the
// fastmap flag set.
// Exits with status 1 when there is no pair hash for the type or no entry for
// id.
//
// NIY: May need some reworking to handle mapInsert because I'd changed the way it returns data
// Change back to returning MapperRangeSet
MapperRangeSet *Mapper_fastMap(Mapper *m, IDType id, long start, long end, int strand, char *type) {
  if (start == end + 1) {
    return Mapper_mapInsert(m, id, start, end, strand, type, 1);
  }

  int from, to;
  CoordSystem *cs;

  // Mapper_compareType yields 0 when type equals the mapper's 'to' type; in
  // that case we map TO -> FROM, otherwise FROM -> TO.
  if (Mapper_compareType(type, Mapper_getTo(m))) {
    from = MAPPER_FROM_IND;
    to   = MAPPER_TO_IND;
    cs   = Mapper_getToCoordSystem(m);
  } else {
    from = MAPPER_TO_IND;
    to   = MAPPER_FROM_IND;
    cs   = Mapper_getFromCoordSystem(m);
  }

  IDHash *hash = Mapper_getPairHash(m, from);
  if (!hash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }

  if (Mapper_getIsSorted(m) == 0) {
    Mapper_sort(m);
  }

  if (!IDHash_contains(hash, id)) {
    fprintf(stderr,"ERROR: Fastmap expects to be able to find an id. It couldnt for " IDFMTSTR "\n",id);
    exit(1);
  }

  MapperPairSet *pairs = IDHash_getValue(hash, id);
  MapperRangeSet *mapped = MapperRangeSet_new();

  int idx;
  for (idx = 0; idx < MapperPairSet_getNumPair(pairs); idx++) {
    MapperPair *cur  = MapperPairSet_getPairAt(pairs, idx);
    MapperUnit *self = MapperPair_getUnit(cur, from);
    MapperUnit *dest = MapperPair_getUnit(cur, to);

    // Only super easy mapping is done: the region must sit inside this pair.
    if (start >= self->start && end <= self->end) {
      long mappedStart;
      long mappedEnd;
      int  mappedStrand;

      if (cur->ori == 1) {
        // Same orientation: offsets carry over unchanged.
        mappedStart  = dest->start + start - self->start;
        mappedEnd    = dest->start + end   - self->start;
        mappedStrand = strand;
      } else {
        // Opposite orientation: measure from the far end and flip the strand.
        mappedStart  = dest->end - (end - self->start);
        mappedEnd    = dest->end - (start - self->start);
        mappedStrand = -strand;
      }

      MapperCoordinate *hit = MapperCoordinate_new(dest->id,
                                                   mappedStart,
                                                   mappedEnd,
                                                   mappedStrand,
                                                   cs,
                                                   0); // Perl didn't set rank, so use 0

      MapperRangeSet_addRange(mapped, (MapperRange *)hit);
      break;
    }
  }

  // NIY: Here we return empty set, in mapInsert it returns NULL for empty fastmap - need to work out which is right
  return mapped;
}
Esempio n. 13
0
// Maps the region [start, end] of sequence id from the coordinate system named
// by type into the mapper's other coordinate system.
//
//   m       mapper to use (sorted first when it is not already)
//   id      id of the sequence the region lies on
//   start   first position of the region
//   end     last position of the region
//   strand  strand of the region; multiplied by each pair's orientation
//   type    must match the mapper's 'to' or 'from' type
//
// Returns a new MapperRangeSet.  Stretches covered by a pair become
// MapperCoordinate (or IndelCoordinate for indel pairs) entries; stretches not
// covered by any pair become MapperGap entries.  When strand is -1 the set is
// reversed before it is returned.
//
// start == end + 1 is treated as an insert and delegated to Mapper_mapInsert.
// Exits with status 1 when start > end + 1, when type matches neither side of
// the mapper, or when no pair hash exists for the chosen side.
MapperRangeSet *Mapper_mapCoordinates(Mapper *m, IDType id, long start, long end, int strand, char *type) {


  // special case for handling inserts:
  if ( start == end+1 ) {
    return Mapper_mapInsert(m, id, start, end, strand, type, 0 /*fastmap flag */);
  } else if (start > end+1) {
    fprintf(stderr,"ERROR: Start is greater than end for id " IDFMTSTR ", start %ld, end %ld\n",id,start,end);
    exit(1);
  }


  IDHash *hash;
  CoordSystem *cs;
  if( Mapper_getIsSorted(m) == 0 ) {
    Mapper_sort(m);
  }

  // Pick the direction of the mapping: 'from' indexes the side the request is
  // expressed in, 'to' the side results are expressed in, and cs is the
  // coordinate system of the result side.
  int from, to;
  if (!Mapper_compareType(type,Mapper_getTo(m))) {
    from = MAPPER_TO_IND;
    to   = MAPPER_FROM_IND;
    cs   = Mapper_getFromCoordSystem(m);
  } else if (!Mapper_compareType(type,Mapper_getFrom(m))) {
    from = MAPPER_FROM_IND;
    to   = MAPPER_TO_IND;
    cs   = Mapper_getToCoordSystem(m);
  } else {
    fprintf(stderr, "Invalid type [%s] in mapper (not from [%s] or to [%s])\n", type, Mapper_getFrom(m), Mapper_getTo(m));
    exit(1);
  }

  hash = Mapper_getPairHash(m, from);

  if (!hash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }


  MapperRangeSet *results = MapperRangeSet_new();


// Was upcasing the id - its a number in C, I haven't found a case yet where its a string
  if (!IDHash_contains(hash, id)) {
    // one big gap!
    MapperRange *gap = (MapperRange *)MapperGap_new(start,end,0); // Perl didn't set rank so use 0
    MapperRangeSet_addRange(results,gap);
    return results;
  }


  MapperPairSet *pairs = IDHash_getValue(hash,id); //my $lr = $hash->{ uc($id) };


  // Last pair that contributed a result; used after the loop to decide whether
  // a trailing gap is needed.
  MapperPair *lastUsedPair = NULL;

  int startIdx, endIdx, midIdx;
  MapperPair *pair;
  MapperUnit *selfCoord;


  startIdx = 0;
  endIdx   = MapperPairSet_getNumPair(pairs)-1;

  // binary search the relevant pairs
  // helps if the list is big
  // (only startIdx is used afterwards, as the index the scan below starts at)
  while ( ( endIdx - startIdx ) > 1 ) {
    midIdx = ( startIdx + endIdx ) >> 1;
    pair   = MapperPairSet_getPairAt(pairs, midIdx);

    selfCoord = MapperPair_getUnit(pair, from);

    if ( selfCoord->end < start ) {
      startIdx = midIdx;
    } else {
      endIdx = midIdx;
    }
  }

  int rank       = 0;
  // 'start' is advanced as pieces are emitted; origStart keeps the requested
  // start so it can be restored when the target sequence changes.
  long origStart = start;
  // Id of the first target sequence seen; only meaningful once
  // lastTargetCoordIsSet is non-zero.
  IDType lastTargetCoord;
  int lastTargetCoordIsSet = 0;

  int i;
  for (i=startIdx; i<MapperPairSet_getNumPair(pairs); i++) {
    // These loop-local declarations shadow the pair/selfCoord used by the
    // binary search above.
    MapperPair *pair = MapperPairSet_getPairAt(pairs,i);
    MapperUnit *selfCoord   = MapperPair_getUnit(pair, from);
    MapperUnit *targetCoord = MapperPair_getUnit(pair, to);

    //
    // But not the case for haplotypes!! need to test for this case???
    // so removing this till a better solution is found
    // 
    //
    //     if($self_coord->{'start'} < $start){
    //       $start = $orig_start;
    //       $rank++;
    //     }

    // When this pair maps to a different target than the recorded one and it
    // begins before the current position, rewind to the requested start.
    // Otherwise record this pair's target id.
    if ( lastTargetCoordIsSet && targetCoord->id != lastTargetCoord ) {
      if ( selfCoord->start < start ) {    // i.e. the same bit is being mapped to another assembled bit
        start = origStart;
      }
    } else {
      lastTargetCoord = targetCoord->id;
      lastTargetCoordIsSet = 1;
    }

    // if we haven't even reached the start, move on
    if (selfCoord->end < origStart) {
      continue;
    }

    // if we have over run, break
    if (selfCoord->start > end) {
      break;
    }

// Check is start not origStart
    if (start < selfCoord->start) {
      // gap detected: nothing maps between the current position and this pair
      MapperRange *gap = (MapperRange *)MapperGap_new(start, selfCoord->start-1, rank);
      MapperRangeSet_addRange(results,gap);
      start = gap->end+1;
    }

    long targetStart, targetEnd;

    MapperRange *res;

    if ( MapperPair_isIndel(pair) ) {
      // When next pair is an IndelPair and not a Coordinate, create the
      // new mapping Coordinate, the IndelCoordinate.
      // The target range of an indel pair is used whole, not clipped.
      targetStart = targetCoord->start;
      targetEnd   = targetCoord->end;

      // create a Gap object
      MapperGap *gap = MapperGap_new(start,
                                     selfCoord->end < end ? selfCoord->end : end,
                                     0); // Perl didn't set rank - don't know if need to

      // create the Coordinate object
      MapperCoordinate *coord = MapperCoordinate_new(targetCoord->id,
                                                     targetStart,
                                                     targetEnd,
                                                     pair->ori * strand, 
                                                     cs, 
                                                     0); // Perl didn't set rank - don't know if need to

      //and finally, the IndelCoordinate object with
      res = (MapperRange *)IndelCoordinate_new(gap, coord);
    } else {
      // start is somewhere inside the region
      // Each of the two if/else pairs below sets one of targetStart/targetEnd;
      // which one depends on the pair's orientation.
      if (pair->ori == 1) {
        targetStart = targetCoord->start + (start - selfCoord->start);
      } else {
        targetEnd = targetCoord->end - (start - selfCoord->start);
      }

      // Either we are enveloping this map or not.  If yes, then end
      // point (self perspective) is determined solely by target.  If
      // not we need to adjust.
      if (end > selfCoord->end) {
        // enveloped
        if( pair->ori == 1 ) {
          targetEnd = targetCoord->end;
        } else {
          targetStart = targetCoord->start;
        }
      } else {
        // need to adjust end
        if (pair->ori == 1) {
          targetEnd = targetCoord->start + (end - selfCoord->start);
        } else {
          targetStart = targetCoord->end - (end - selfCoord->start);
        }
      }

      res = (MapperRange *)MapperCoordinate_new(targetCoord->id,
                                                targetStart,
                                                targetEnd,
                                                pair->ori * strand, 
                                                cs, 
                                                rank);
    } // end else [ if ( exists $pair->{'indel'...})]

    MapperRangeSet_addRange(results, res);

    // Continue from the position just past this pair.
    lastUsedPair = pair;
    start = selfCoord->end+1;
  }

  if (lastUsedPair == NULL) {
    // no pair contributed anything: the remaining region is a single gap
    MapperRange *gap = (MapperRange *)MapperGap_new(start,end, 0); // Perl doesn't set rank, so use 0
    MapperRangeSet_addRange(results,gap);

  } else if (MapperPair_getUnit(lastUsedPair, from)->end < end) {
    // gap at the end
    MapperRange *gap = (MapperRange *)MapperGap_new(
                           MapperPair_getUnit(lastUsedPair,from)->end + 1,
                           end, 0 ); // Perl didn't set rank so use 0
    MapperRangeSet_addRange(results,gap);
  }

  if (strand == -1) {
    MapperRangeSet_reverse(results);
  }

  return results;
}
Esempio n. 14
0
// Reports whether one of the registered ranges that the merge loop of
// RangeRegistry_checkAndRegister would visit (starting at firstIdx and
// stopping at the first range ending at or after rEnd) covers [start, end]
// entirely.  Returns 1 if so, 0 otherwise.
static int RangeRegistry_isFullyRegistered(Vector *list, int firstIdx, long start, long end, long rEnd) {
  int len = Vector_getNumElement(list);
  int i;

  for (i = firstIdx; i < len; i++) {
    CoordPair *pRange = Vector_getElementAt(list, i);
    long pStart = CoordPair_getStart(pRange);
    long pEnd   = CoordPair_getEnd(pRange);

    if (pStart <= start && pEnd >= end) {
      return 1;
    }

    // The merge loop never looks past the first range reaching rEnd.
    if (pEnd >= rEnd) {
      break;
    }
  }

  return 0;
}

// Checks whether [start, end] on id is already registered and, if it is not,
// registers the (wider or equal) range [rStart, rEnd], merging it with any
// registered ranges it overlaps or touches.
//
//   registry      registry to consult and update
//   id            id whose range list is used (created on first use)
//   start, end    region that must be covered
//   rStart, rEnd  region to register; must enclose [start, end]
//   wantGaps      non-zero when the caller wants the gap list back
//
// Returns:
//   - first registration for id: a Vector_copy of the new one-element list
//   - [start, end] already covered by one registered range: a new empty Vector
//   - otherwise: the Vector of CoordPair gaps (parts of [rStart, rEnd] that
//     were not registered before), or NULL when wantGaps is 0
// NOTE(review): the first two cases hand back a Vector even when wantGaps is
// 0 - callers should confirm they release it.
//
// Exits with status 1 when the arguments are not ordered as described above.
//
// Also added a flag to indicate we actually want the gaps vector returned - quite often its not used in the caller and so would leak
// memory
Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, 
                                       long rStart, long rEnd, int wantGaps) {

  // The following was commented out due to Ensembl Genomes requirements
  // for bacterial genomes.
  // The following was uncommented because I'm not caring about those requirements
  if ( start > end ) {
    fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end);
    exit(1);
  }
  
  if ( rStart > rEnd ) {
    fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd);
    exit(1);
  }

  if ( rStart > start ) {
    fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start);
    exit(1);
  }

  if ( rEnd < end ) {
    fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end);
    exit(1);
  }

  IDHash *regReg = RangeRegistry_getRegistry(registry);
  Vector *list;
  if (IDHash_contains(regReg, id)) {
    list = IDHash_getValue(regReg, id);
  } else {
    list = Vector_new();
    IDHash_add(regReg, id, list);
  }

  int len = Vector_getNumElement(list);

  if (len == 0) {
    //this is the first request for this id, return a gap pair for the
    // entire range and register it as seen
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_addElement(list, cp);

    return Vector_copy(list);
  }

  //####
  // loop through the list of existing ranges recording any "gaps" where
  // the existing range does not cover part of the requested range
  // 

  int startIdx = 0;
  int endIdx   = len-1;
  int midIdx;
  CoordPair *range;

  // binary search the relevant pairs
  // helps if the list is big
  while ( ( endIdx - startIdx ) > 1 ) {
    midIdx = ( startIdx + endIdx ) >> 1;
    range  = Vector_getElementAt(list, midIdx);

    if ( CoordPair_getEnd(range) < rStart ) {
      startIdx = midIdx;
    } else {
      endIdx = midIdx;
    }
  }

  // no work needs to be done at all if we find a range pair that
  // entirely overlaps the requested region.  This is decided before the gap
  // vector is allocated so that nothing is leaked on this path.
  if (RangeRegistry_isFullyRegistered(list, startIdx, start, end, rEnd)) {
    return Vector_new(); // perl returns undef, but that causes me problems
  }

  // Allocated only now: every return below this point hands gapPairs back.
  Vector *gapPairs = NULL;
  if (wantGaps) {
    gapPairs = Vector_new();
  }

  long gapStart;
  long gapEnd;
  int rIdx = -1;       // index at which a non-merging range would be inserted
  int rStartIdx = -1;  // first registered range that touches [rStart, rEnd]
  int rEndIdx = -1;    // last registered range that touches [rStart, rEnd]

  gapStart = rStart;

  int i;
  for (i=startIdx; i < len ; i++ ) {
    CoordPair *pRange = Vector_getElementAt(list, i);
    long pStart = CoordPair_getStart(pRange);
    long pEnd   = CoordPair_getEnd(pRange);

    // find adjacent or overlapping regions already registered
    if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) {
      if ( rStartIdx < 0 ) { // Not yet been set
        rStartIdx = i;
      }
      rEndIdx = i;
    }

    // an unregistered stretch lies in front of this range
    if ( pStart > rStart ) {
      gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1;
      if (wantGaps) {
        CoordPair *cp = CoordPair_new(gapStart, gapEnd);
        Vector_addElement(gapPairs, cp);
      }
    }

    gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1;

    if ( pEnd >= rEnd && rIdx < 0 ) {
      rIdx = i;
      break;
    }
  }

  // do we have to make another gap?
  if ( gapStart <= rEnd ) {
    if (wantGaps) {
      CoordPair *cp = CoordPair_new(gapStart, rEnd);
      Vector_addElement(gapPairs, cp);
    }
  }

  // 
  // Merge the new range into the registered list
  // 
  if (rStartIdx >= 0 ) { // rStartIdx has been set to something 
    long newStart;
    long newEnd;
    CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); 
    CoordPair *rEndIdxRange   = Vector_getElementAt(list, rEndIdx); 

    if ( rStart < CoordPair_getStart(rStartIdxRange)) {
      newStart = rStart;
    } else {
      newStart = CoordPair_getStart(rStartIdxRange);
    }

    if ( rEnd > CoordPair_getEnd(rEndIdxRange)) {
      newEnd = rEnd;
    } else {
      newEnd = CoordPair_getEnd(rEndIdxRange);
    }

    CoordPair *cp = CoordPair_new(newStart, newEnd);

    // Replace ranges rStartIdx..rEndIdx (inclusive) with the merged one.
    // NOTE(review): the removed CoordPair objects are not released here -
    // confirm whether Vector_removeElementAt frees them.
    for (i=rStartIdx; i<=rEndIdx; i++) {
      Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called
    }
    Vector_insertElementAt(list, rStartIdx, cp);
    //splice( @$list, $rstart_idx,
    //        $rend_idx - $rstart_idx + 1,
    //        [ $new_start, $new_end ] );

  } else if (rIdx >= 0) {
    // nothing to merge with: insert in front of the range that ended the scan
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_insertElementAt(list, rIdx, cp);
    //splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
  } else {
    // nothing to merge with and every scanned range ends before rEnd: append
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_addElement(list, cp);
  }

  // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK
  return gapPairs;
}
Esempio n. 15
0
/*
 * calcCoverage
 *
 * Computes per-base read depth over a slice from an indexed BAM file and
 * prints it to stdout: first a summary line with the number of reads used and
 * skipped, then one "<1-based position> <depth>" line per base of the slice.
 *
 * Arguments:
 *   fName - name of the BAM file (not referenced by this function)
 *   slice - region to report on; Slice_getChrStart(slice) must be 1
 *   in    - open BAM file; the header is read from in->fp.bgzf
 *   idx   - index used to query the region
 *   flags - option bits; with M_UCSC_NAMING the sequence name is prefixed
 *           with "chr" before it is looked up in the header
 *
 * Returns 1 both when the slice does not start at 1 and on normal completion.
 * NOTE(review): callers cannot tell those two outcomes apart; the value is
 * kept as-is for compatibility.
 *
 * Exits the process when the header cannot be read, the sequence is not in
 * the header, the region string cannot be parsed, or memory runs out.
 */
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  int  nameLen;
  char region[1024];
  char region_name[512];

  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  }

  // Sequence name as it should appear in the BAM header
  if (flags & M_UCSC_NAMING) {
    nameLen = snprintf(region_name, sizeof(region_name), "chr%s", Slice_getSeqRegionName(slice));
  } else {
    nameLen = snprintf(region_name, sizeof(region_name), "%s", Slice_getSeqRegionName(slice));
  }
  if (nameLen < 0 || (size_t)nameLen >= sizeof(region_name)) {
    fprintf(stderr, "Sequence region name too long\n");
    exit(1);
  }

  // bam_hdr_read allocates the header itself, so no bam_hdr_init is needed first
  bam_hdr_t *header = bam_hdr_read(in->fp.bgzf);
  if (header == NULL) {
    fprintf(stderr, "Failed reading BAM header\n");
    exit(1);
  }
  ref = bam_name2id(header, region_name);
  bam_hdr_destroy(header);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region_name);
    exit(1);
  }

  // Full "name:start-end" region string, only used to obtain the query range
  snprintf(region, sizeof(region), "%s:%ld-%ld", region_name,
                                   Slice_getSeqRegionStart(slice),
                                   Slice_getSeqRegionEnd(slice));
  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);
    exit(2);
  }

  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  bam1_t *b = bam_init1();

  // One counter per base of the slice; index 0 is slice position 1
  long covLength = Slice_getLength(slice);
  Coverage *coverage = calloc(covLength, sizeof(Coverage));
  if (coverage == NULL) {
    fprintf(stderr, "Failed allocating coverage array\n");
    exit(1);
  }

  long counter = 0;
  long overlapping = 0;
  long bad = 0;
  int startIndex = 0;
  while (bam_itr_next(in, iter, b) >= 0) {
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {
      bad++;
      continue;
    }

    int end;
    end = bam_endpos(b);

    // Special case: a read whose end is begRange lies entirely before the
    // first base we're interested in, so it is neither counted nor tallied as bad.
    if (end == begRange) {
      continue;
    }
    counter++;

    if (!(counter%1000000)) {
      if (verbosity > 1) { printf("."); }
      fflush(stdout);
    }

    // Walk the CIGAR, tracking the reference position. Remember: b->core.pos is zero based!
    int cigInd;
    long refPos = b->core.pos;
    uint32_t *cigar = bam_get_cigar(b);
    for (cigInd = 0; cigInd < b->core.n_cigar; ++cigInd) {
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL) {
        // Aligned bases and deletions both count towards depth
        int k;
        for (k = 0; k < lenCigBlock; ++k) {
          long covInd = refPos + k;
          // A read can overhang either end of the slice - never write outside the array
          if (covInd >= 0 && covInd < covLength) {
            coverage[covInd].coverage++;
          }
        }
        refPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
        // Skipped reference (e.g. intron): advance without counting
        refPos += lenCigBlock;
      }
      // Soft clips, hard clips and insertions consume no reference bases
    }

#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;
    
    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j); 
      if (!gene) {
        continue;
      }
// Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;
     
            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  overlapping++;
                  hadOverlap = 1;
                }

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                gs->score++;
                
                doneGene = 1;
              }
            }
          }
        }
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                          Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 
                                          gs->score);

        if (verbosity > 1) { 
          printf("Removing gene %s (index %d) with extent %d to %d\n", 
                 Gene_getStableId(gene), 
                 gs->index,
                 Gene_getStart(gene),
                 Gene_getEnd(gene));
        }
        Vector_setElementAt(genes,j,NULL);

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
            break;
          }
          startIndex++;
        }
        if (verbosity > 1) { 
          printf("startIndex now %d\n",startIndex);
        }
      }
    }
#endif
  }
  if (verbosity > 1) { printf("\n"); }

#ifdef DONE
// Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                        Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 
                                        gs->score);
    }

  }
#endif

  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  long i;
  for (i=0; i< covLength; i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);
  }

  sam_itr_destroy(iter);
  bam_destroy1(b);
  free(coverage);

  return 1;
}
Esempio n. 16
0
/*
=head2 _objs_from_sth

  Arg [1]    : PredictionTranscriptAdaptor *pta
               The adaptor the created objects are attached to.
  Arg [2]    : StatementHandle *sth
               An executed statement handle. Columns are read in the order:
               prediction_transcript_id, seq_region_id, seq_region_start,
               seq_region_end, seq_region_strand, analysis_id, display_label.
  Arg [3]    : (optional) AssemblyMapper *assMapper
               A mapper used to convert the stored coordinates to another
               coordinate system. May be NULL.
  Arg [4]    : (optional) Slice *destSlice
               A slice to place the prediction transcripts on. May be NULL.
  Description: Creates a Vector of PredictionTranscripts from an executed
               statement handle. If destSlice is provided, the prediction
               transcripts are returned in the coordinate system of that
               slice and rows falling outside it (or on another seq region)
               are dropped. Otherwise they are returned on the slice of the
               seq region they were stored on (or mapped to).
  Returntype : Vector of PredictionTranscript
  Exceptions : exits the process if the mapper returns a gap as first range
  Caller     : superclass generic_fetch
  Status     : Stable

=cut
*/
Vector *PredictionTranscriptAdaptor_objectsFromStatementHandle(PredictionTranscriptAdaptor *pta, 
                                                               StatementHandle *sth, 
                                                               AssemblyMapper *assMapper, 
                                                               Slice *destSlice) {
  SliceAdaptor *sa     = DBAdaptor_getSliceAdaptor(pta->dba);
  AnalysisAdaptor *aa  = DBAdaptor_getAnalysisAdaptor(pta->dba);

  Vector *pTranscripts = Vector_new();
  // Cache of slices by seq region id so each region is only fetched once
  IDHash *sliceHash = IDHash_new(IDHASH_SMALL);

  // Only meaningful (and only read) when destSlice is non-NULL
  long         destSliceStart  = 0;
  long         destSliceEnd    = 0;
  int          destSliceStrand = 0;
  long         destSliceLength = 0;
  IDType       destSliceSrId   = 0;

  if (destSlice) {
    destSliceStart  = Slice_getStart(destSlice);
    destSliceEnd    = Slice_getEnd(destSlice);
    destSliceStrand = Slice_getStrand(destSlice);
    destSliceLength = Slice_getLength(destSlice);
    destSliceSrId   = Slice_getSeqRegionId(destSlice);
  }

  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType predictionTranscriptId = row->getLongLongAt(row,0);
    IDType seqRegionId            = row->getLongLongAt(row,1);
    long seqRegionStart           = row->getLongAt(row,2);
    long seqRegionEnd             = row->getLongAt(row,3);
    int seqRegionStrand           = row->getIntAt(row,4);
    IDType analysisId             = row->getLongLongAt(row,5);
    char *displayLabel            = row->getStringAt(row,6);

    // get the analysis object
    Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId);

    if (! IDHash_contains(sliceHash, seqRegionId)) {
      IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF));
    }
    Slice *slice = IDHash_getValue(sliceHash, seqRegionId);

    Slice *ptSlice = slice;

    char *srName      = Slice_getSeqRegionName(slice);
    CoordSystem *srCs = Slice_getCoordSystem(slice);

    //
    // remap the feature coordinates to another coord system
    // if a mapper was provided
    //
    if (assMapper != NULL) {
      MapperRangeSet *mrs;

      // Slightly suspicious about need for this if statement so left in perl statements for now
      if (destSlice != NULL &&
          assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) {
        mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice);
      } else {
        mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL);
      }

      // skip features that map to gaps or coord system boundaries
      if (MapperRangeSet_getNumRange(mrs) == 0) {
        // the (empty) range set still has to be released before moving on
        MapperRangeSet_free(mrs);
        continue;
      }
      MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0);
      if (range->rangeType == MAPPERRANGE_GAP) {
        fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n");
        exit(1);
      } else {
        MapperCoordinate *mc = (MapperCoordinate *)range;

        seqRegionId     = mc->id;
        seqRegionStart  = mc->start;
        seqRegionEnd    = mc->end;
        seqRegionStrand = mc->strand;
      }

      MapperRangeSet_free(mrs);

      // get a slice in the coord system we just mapped to
      if (! IDHash_contains(sliceHash, seqRegionId)) {
        IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF));
      }
      ptSlice = IDHash_getValue(sliceHash, seqRegionId);
    }

    //
    // If a destination slice was provided convert the coords
    // If the dest_slice starts at 1 and is forward strand, nothing needs doing
    //
    if (destSlice != NULL) {
      if (destSliceStart != 1 || destSliceStrand != 1) {
        if (destSliceStrand == 1) {
          seqRegionStart = seqRegionStart - destSliceStart + 1;
          seqRegionEnd   = seqRegionEnd - destSliceStart + 1;
        } else {
          // reverse strand slice: flip coordinates around the slice end
          long tmpSeqRegionStart = seqRegionStart;
          seqRegionStart = destSliceEnd - seqRegionEnd + 1;
          seqRegionEnd   = destSliceEnd - tmpSeqRegionStart + 1;

          seqRegionStrand = -seqRegionStrand;
        }
      }
      // throw away features off the end of the requested slice
      if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) {
        continue;
      }
      ptSlice = destSlice;
    }
    
    // Finally, create the new PredictionTranscript.
    PredictionTranscript *pt = PredictionTranscript_new();

    PredictionTranscript_setStart       (pt, seqRegionStart);
    PredictionTranscript_setEnd         (pt, seqRegionEnd);
    PredictionTranscript_setStrand      (pt, seqRegionStrand);
    PredictionTranscript_setSlice       (pt, ptSlice);
    PredictionTranscript_setAnalysis    (pt, analysis);
    PredictionTranscript_setAdaptor     (pt, (BaseAdaptor *)pta);
    PredictionTranscript_setDbID        (pt, predictionTranscriptId);
    PredictionTranscript_setDisplayLabel(pt, displayLabel);

    Vector_addElement(pTranscripts, pt);
  }

  // Release the cache only (NULL free func) - the slices are still referenced by the features
  IDHash_free(sliceHash, NULL);
  return pTranscripts;
}
Esempio n. 17
0
/*
=head2 fetch_all_by_Slice

  Arg [1]    : PredictionTranscriptAdaptor *pta
  Arg [2]    : Slice *slice
               The slice to fetch prediction transcripts on.
  Arg [3]    : char *logicName
               Passed through to BaseFeatureAdaptor_fetchAllBySlice.
  Arg [4]    : int loadExons
               If non-zero (and more than one transcript was found) all exons
               are fetched with one query and attached to their transcripts,
               instead of being lazy loaded later.
  Description: Fetches the prediction transcripts on a slice, optionally
               preloading their exons.
  Returntype : Vector of PredictionTranscript
  Exceptions : exits the process if an exon cannot be transferred onto slice.
               If the query buffer cannot be allocated the transcripts are
               returned without preloaded exons.

=cut
*/
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {

  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  long minStart =  2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    }
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);
    }
  }

  // Transcripts may stick out of the requested slice; if so fetch exons on a
  // slice covering their full extent
  Slice *extSlice;

  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice),
                                          minStart, maxEnd, Slice_getStrand(slice), CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0);
  }

  // associate exon identifiers with transcripts
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  int numIds = IDHash_getNumValues(trHash);
  IDType *uniqueIds = IDHash_getKeys(trHash);

  static const char queryHead[] =
    "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE  prediction_transcript_id IN (";

  char tmpStr[1024];
  int lenNum;

  // Size the query buffer from the actual id list rather than a fixed guess:
  // head + each id with its ", " separator + ')' + terminating NUL
  // (sizeof(queryHead) already counts the NUL)
  size_t qStrSize = sizeof(queryHead) + 1;
  for (i=0; i<numIds; i++) {
    lenNum = snprintf(tmpStr, sizeof(tmpStr), IDFMTSTR, uniqueIds[i]);
    if (lenNum > 0) {
      qStrSize += (size_t)lenNum + 2;
    }
  }

  char *qStr = NULL;
  if ((qStr = (char *)calloc(qStrSize,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    free(uniqueIds);
    IDHash_free(trHash, NULL);
    return transcripts;
  }

  int endPoint = (int)(sizeof(queryHead) - 1);
  memcpy(qStr, queryHead, endPoint);
  for (i=0; i<numIds; i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    lenNum = snprintf(tmpStr, sizeof(tmpStr), IDFMTSTR, uniqueIds[i]);
    memcpy(&(qStr[endPoint]), tmpStr, lenNum);
    endPoint+=lenNum;
  }
  qStr[endPoint++] = ')';
  qStr[endPoint] = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  // exon id -> Vector of (transcript, rank) pairs the exon belongs to
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  IDHash_free(trHash, NULL);

  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

    // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  // NOTE(review): the 'exons' Vector itself is not released here; whether it can
  // be freed without touching the exons now owned by the transcripts needs confirming.
  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
/*
=head2 _objs_from_sth

  Arg [1]    : IntronSupportingEvidenceAdaptor *isea
               The adaptor the created objects are attached to.
  Arg [2]    : StatementHandle *sth
               An executed statement handle. Columns are read in the order:
               id, analysis_id, seq_region_id, seq_region_start,
               seq_region_end, seq_region_strand, hit_name, score,
               score_type, is_splice_canonical.
  Arg [3]    : (optional) AssemblyMapper *assMapper
               A mapper used to convert the stored coordinates to another
               coordinate system. May be NULL.
  Arg [4]    : (optional) Slice *destSlice
               A slice to place the features on. May be NULL.
  Description: Creates a Vector of IntronSupportingEvidence objects from an
               executed statement handle. If destSlice is provided the
               features are returned relative to it and rows falling outside
               it (or on another seq region) are dropped.
  Returntype : Vector of IntronSupportingEvidence
  Exceptions : exits the process if the mapper returns a gap as first range

=cut
*/
Vector *IntronSupportingEvidenceAdaptor_objectsFromStatementHandle(IntronSupportingEvidenceAdaptor *isea, 
                                                                   StatementHandle *sth,
                                                                   AssemblyMapper *assMapper,
                                                                   Slice *destSlice) {
  SliceAdaptor *sa     = DBAdaptor_getSliceAdaptor(isea->dba);
  AnalysisAdaptor *aa  = DBAdaptor_getAnalysisAdaptor(isea->dba);

  Vector *features = Vector_new();
  // Cache of slices by seq region id so each region is only fetched once
  IDHash *sliceHash = IDHash_new(IDHASH_SMALL);

  // Only meaningful (and only read) when destSlice is non-NULL
  long         destSliceStart  = 0;
  long         destSliceEnd    = 0;
  int          destSliceStrand = 0;
  long         destSliceLength = 0;
  IDType       destSliceSrId   = 0;

  if (destSlice) {
    destSliceStart  = Slice_getStart(destSlice);
    destSliceEnd    = Slice_getEnd(destSlice);
    destSliceStrand = Slice_getStrand(destSlice);
    destSliceLength = Slice_getLength(destSlice);
    destSliceSrId   = Slice_getSeqRegionId(destSlice);
  }

  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType id =           row->getLongLongAt(row,0);
    IDType analysisId =   row->getLongLongAt(row,1);
    IDType seqRegionId =  row->getLongLongAt(row,2);
    long seqRegionStart = row->getLongAt(row,3);
    long seqRegionEnd =   row->getLongAt(row,4);
    int seqRegionStrand = row->getIntAt(row,5);
    char *hitName =       row->getStringAt(row,6);
    double score =        row->getDoubleAt(row,7);
    char *scoreType =     row->getStringAt(row,8);
    int spliceCanonical = row->getIntAt(row,9); 

    // get the analysis object
    Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId);

    // get the slice object for the stored seq region
    if (! IDHash_contains(sliceHash, seqRegionId)) {
      IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF));
    }
    Slice *slice = IDHash_getValue(sliceHash, seqRegionId);

    Slice *iseSlice = slice;
    
    char *srName      = Slice_getSeqRegionName(slice);
    CoordSystem *srCs = Slice_getCoordSystem(slice);

    // 
    // remap the feature coordinates to another coord system
    // if a mapper was provided
    //
    if (assMapper != NULL) {
      MapperRangeSet *mrs;

      // Slightly suspicious about need for this if statement so left in perl statements for now
      if (destSlice != NULL &&
          assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) {
        mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice);
      } else {
        mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL);
      }

      // skip features that map to gaps or coord system boundaries
      if (MapperRangeSet_getNumRange(mrs) == 0) {
        // the (empty) range set still has to be released before moving on
        MapperRangeSet_free(mrs);
        continue;
      }
      MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0);
      if (range->rangeType == MAPPERRANGE_GAP) {
        fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n");
        exit(1);
      } else {
        MapperCoordinate *mc = (MapperCoordinate *)range;

        seqRegionId     = mc->id;
        seqRegionStart  = mc->start;
        seqRegionEnd    = mc->end;
        seqRegionStrand = mc->strand;
      }

      MapperRangeSet_free(mrs);

      // get a slice in the coord system we just mapped to
      // (the perl tested the coord systems here, but both branches were identical)
      if (! IDHash_contains(sliceHash, seqRegionId)) {
        IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF));
      }
      iseSlice = IDHash_getValue(sliceHash, seqRegionId);
    }

    //
    // If a destination slice was provided convert the coords
    // If the dest_slice starts at 1 and is forward strand, nothing needs doing
    // 
    if (destSlice != NULL) {
      if (destSliceStart != 1 || destSliceStrand != 1) {
        if (destSliceStrand == 1) {
          seqRegionStart = seqRegionStart - destSliceStart + 1;
          seqRegionEnd   = seqRegionEnd - destSliceStart + 1;
        } else {
          // reverse strand slice: flip coordinates around the slice end
          long tmpSeqRegionStart = seqRegionStart;
          seqRegionStart = destSliceEnd - seqRegionEnd + 1;
          seqRegionEnd   = destSliceEnd - tmpSeqRegionStart + 1;

          seqRegionStrand = -seqRegionStrand;
        }
      }
       
      // throw away features off the end of the requested slice
      if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) {
        continue;
      }
      iseSlice = destSlice;
    }
    
    IntronSupportingEvidence *ise = IntronSupportingEvidence_new();

    IntronSupportingEvidence_setStart             (ise, seqRegionStart);
    IntronSupportingEvidence_setEnd               (ise, seqRegionEnd);
    IntronSupportingEvidence_setStrand            (ise, seqRegionStrand);
    IntronSupportingEvidence_setSlice             (ise, iseSlice);
    IntronSupportingEvidence_setAnalysis          (ise, analysis);
    IntronSupportingEvidence_setAdaptor           (ise, (BaseAdaptor *)isea);
    IntronSupportingEvidence_setDbID              (ise, id);
    IntronSupportingEvidence_setHitName           (ise, hitName);
    IntronSupportingEvidence_setScore             (ise, score);
    IntronSupportingEvidence_setScoreType         (ise, scoreType);
    IntronSupportingEvidence_setIsSpliceCanonical(ise, spliceCanonical);

    Vector_addElement(features, ise);
  }
  
  // Release the cache only (NULL free func) - the slices are still referenced by the features
  IDHash_free(sliceHash, NULL);
  return features;
}
Esempio n. 19
0
// Mapper_mergePairs
//
// Merges pairs that are adjacent into one, and drops duplicates, for every
// pair set in the mapper's 'to' hash. A pair that is removed from a 'to' set
// is also removed from the 'from' set stored under its 'from' unit id.
// The mapper's pair count is reset and recomputed from the surviving pairs.
//
// Within a set, neighbouring pairs (current, next) are handled as follows:
//   - if either is an indel, both are left alone;
//   - if they have the same 'to' start and the same 'from' id, next is a
//     duplicate and is removed;
//   - if they have the same 'from' id and orientation and next's 'to' start
//     directly follows current's 'to' end, they are merged when the 'from'
//     units are adjacent too (next after current when ori is 1, next before
//     current otherwise); current is extended and next is removed.
//
// NOTE(review): this only compares neighbours, so it presumably relies on the
// sets already being sorted by 'to' start - confirm against callers.
void Mapper_mergePairs(Mapper *m) {

  int to   = MAPPER_TO_IND;
  int from = MAPPER_FROM_IND;

  Mapper_setPairCount(m, 0);

  IDHash *toPairHash   = Mapper_getPairHash(m, MAPPER_TO_IND);
  IDHash *fromPairHash = Mapper_getPairHash(m, MAPPER_FROM_IND);
 
  // Assumes IDHash_getValues hands back a caller-owned array, like the
  // IDHash_getKeys result that callers elsewhere in this file free - it is
  // released at the end of this function. NOTE(review): confirm in IDHash.c
  MapperPairSet **toPairValues = (MapperPairSet **)IDHash_getValues(toPairHash);
  int numPairSets = IDHash_getNumValues(toPairHash);

  int pairInd;
  for (pairInd = 0; pairInd < numPairSets; pairInd++) {
    MapperPairSet *pairs = toPairValues[pairInd];

    int i = 0;
    int next = 1;
    int length = MapperPairSet_getNumPair(pairs)-1; // index of last pair ($#{$lr})

    while (next <= length) {
      MapperPair *currentPair = MapperPairSet_getPairAt(pairs, i);
      MapperPair *nextPair    = MapperPairSet_getPairAt(pairs, next);
      MapperPair *delPair     = NULL;

      if (MapperPair_isIndel(currentPair) || MapperPair_isIndel(nextPair)) {
        // indels are never merged - just step over them
        next++;
        i++;
        continue;
      }

      // duplicate filter
      if ( MapperPair_getUnit(currentPair,to)->start == MapperPair_getUnit(nextPair,to)->start  &&
           MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) {

        delPair = nextPair;

      } else if (( MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) &&
                 ( nextPair->ori == currentPair->ori ) &&
                 ( MapperPair_getUnit(nextPair,to)->start-1 == MapperPair_getUnit(currentPair,to)->end )) {

        if ( currentPair->ori == 1 ) {

          // check forward strand merge
          if ( MapperPair_getUnit(nextPair,from)->start-1 == MapperPair_getUnit(currentPair,from)->end) {
            // normal merge with previous element
            MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end;
            MapperPair_getUnit(currentPair,from)->end = MapperPair_getUnit(nextPair,from)->end;
            delPair = nextPair;
          }
        } else {

          // check backward strand merge
          if ( MapperPair_getUnit(nextPair,from)->end+1  == MapperPair_getUnit(currentPair,from)->start ) {

            // yes its a merge
            MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end;
            MapperPair_getUnit(currentPair,from)->start = MapperPair_getUnit(nextPair,from)->start;
            delPair = nextPair;
          }
        }
      }

      if (delPair == NULL) {
        // nothing to remove - move on to the following neighbours
        next++;
        i++;
        continue;
      }

      // Remove from the to pair set
      MapperPairSet_removePairAt(pairs, next);

      // ...and from the from pair set, located by pointer identity
      MapperPairSet *fromPairs = IDHash_getValue(fromPairHash, MapperPair_getUnit(delPair, from)->id);

      int j;
      for (j=0; j < MapperPairSet_getNumPair(fromPairs); j++) {
        MapperPair *fromPair = MapperPairSet_getPairAt(fromPairs, j);
        if ( fromPair == delPair) {
          MapperPairSet_removePairAt(fromPairs, j);
          break;
        }
      }

      // NIY: Do we need to free delPair???

      // The set shrank by one; i and next stay put so the (possibly extended)
      // current pair is compared with its new neighbour
      length--;
    }

    Mapper_addToPairCount(m, MapperPairSet_getNumPair(pairs));
  }

  free(toPairValues);
}