/*
=head2 add_map_coordinates

  Arg  1      IDType contigId   id of the 'source' (from) sequence
  Arg  2      int contigStart   start coordinate on the 'source' sequence
  Arg  3      int contigEnd     end coordinate on the 'source' sequence
  Arg  4      int contigOri     relative orientation of source and target (+/- 1)
  Arg  5      IDType chrId      id of the 'target' (to) sequence
  Arg  6      int chrStart      start coordinate on the 'target' sequence
  Arg  7      int chrEnd        end coordinate on the 'target' sequence
  Function    Records one ungapped mapping between a 'source' and a 'target'
              region. A single MapperPair is created and registered in both
              the from-hash (keyed on contigId) and the to-hash (keyed on
              chrId); the pair count is incremented and the mapper is marked
              as unsorted.
  Returntype  none
  Exceptions  Prints an error and calls exit(1) if the two regions differ in
              length.
  Caller      Bio::EnsEMBL::Mapper

=cut
*/
void Mapper_addMapCoordinates(Mapper *m, IDType contigId, int contigStart, int contigEnd,
                              int contigOri, IDType chrId, int chrStart, int chrEnd) {
  int sourceSpan = contigEnd - contigStart;
  int targetSpan = chrEnd - chrStart;

  // Only equal-length (ungapped) mappings are supported
  if (sourceSpan != targetSpan) {
    fprintf(stderr,"ERROR: Cannot deal with mis-lengthed mappings so far\n");
    fprintf(stderr,"Contig %d to %d and chromosome %d to %d\n",contigStart,contigEnd, chrStart,chrEnd);
    exit(1);
  }

  // Build the pair and its two end points
  MapperPair *newPair    = MapperPair_new();
  MapperUnit *sourceUnit = MapperUnit_new();
  sourceUnit->start = contigStart;
  sourceUnit->end   = contigEnd;
  sourceUnit->id    = contigId;

  MapperUnit *targetUnit = MapperUnit_new();
  targetUnit->start = chrStart;
  targetUnit->end   = chrEnd;
  targetUnit->id    = chrId;

  MapperPair_setUnit(newPair, MAPPER_TO_IND, targetUnit);
  MapperPair_setUnit(newPair, MAPPER_FROM_IND, sourceUnit);
  newPair->ori = contigOri;

  // The same pair object is filed under both ids
  IDHash *bySource = Mapper_getPairHash(m, MAPPER_FROM_IND);
  IDHash *byTarget = Mapper_getPairHash(m, MAPPER_TO_IND);
  MapperPairSet *bucket;

  if (!IDHash_contains(byTarget, chrId)) {
    IDHash_add(byTarget, chrId, MapperPairSet_new());
  }
  bucket = (MapperPairSet *)IDHash_getValue(byTarget, chrId);
  MapperPairSet_addPair(bucket, newPair);

  if (!IDHash_contains(bySource, contigId)) {
    IDHash_add(bySource, contigId, MapperPairSet_new());
  }
  bucket = (MapperPairSet *)IDHash_getValue(bySource, contigId);
  MapperPairSet_addPair(bucket, newPair);

  Mapper_incPairCount(m);
  // New pair appended, so any previous sort order is no longer valid
  Mapper_setIsSorted(m, 0);
}
// This is almost identical to Mapper_addCoordinates (just the isIndel line as far as I can see!) - I should refactor this int Mapper_addIndelCoordinates( Mapper *m, IDType contigId, long contigStart, long contigEnd, int contigOri, IDType chrId, long chrStart, long chrEnd) { MapperPair *pair; MapperUnit *from; MapperUnit *to; IDHash *fromHash; IDHash *toHash; MapperPairSet *mps; //we need to create the IndelPair object to add to both lists, to and from pair = MapperPair_new(); from = MapperUnit_new(); from->start = contigStart; from->end = contigEnd; from->id = contigId; to = MapperUnit_new(); to->start = chrStart; to->end = chrEnd; to->id = chrId; MapperPair_setUnit(pair,MAPPER_TO_IND,to); MapperPair_setUnit(pair,MAPPER_FROM_IND,from); pair->ori = contigOri; pair->isIndel = 1; // place into hash on both ids fromHash = Mapper_getPairHash(m, MAPPER_FROM_IND); toHash = Mapper_getPairHash(m, MAPPER_TO_IND); if (!IDHash_contains(toHash,chrId)) { IDHash_add(toHash,chrId,MapperPairSet_new()); } mps = (MapperPairSet *)IDHash_getValue(toHash,chrId); MapperPairSet_addPair(mps,pair); if (!IDHash_contains(fromHash,contigId)) { IDHash_add(fromHash,contigId,MapperPairSet_new()); } mps = (MapperPairSet *)IDHash_getValue(fromHash,contigId); MapperPairSet_addPair(mps,pair); Mapper_incPairCount(m); Mapper_setIsSorted(m,0); return 1; }
/*
 * Fetch a chromosome by its database identifier.
 *
 * If dbID is present in ca->chrCache the cached object is returned without
 * touching the database. Otherwise the chromosome table is queried and the
 * first row is turned into a Chromosome via
 * ChromosomeAdaptor_chromosomeFromRow.
 *
 * Returns NULL when the query yields no row.
 */
Chromosome *ChromosomeAdaptor_fetchByDbID(ChromosomeAdaptor *ca, IDType dbID) {
  // Fast path: already cached
  if (IDHash_contains(ca->chrCache,dbID)) {
    return IDHash_getValue(ca->chrCache, dbID);
  }

  char query[256];
  sprintf(query,"SELECT chromosome_id, name, length"
                " FROM chromosome"
                " WHERE chromosome_id = " IDFMTSTR, dbID);

  StatementHandle *handle = ca->prepare((BaseAdaptor *)ca,query,strlen(query));
  handle->execute(handle);

  Chromosome *result = NULL;
  ResultRow *firstRow = handle->fetchRow(handle);
  if (firstRow != NULL) {
    result = ChromosomeAdaptor_chromosomeFromRow(ca, firstRow);
  }

  // The statement handle is finished on both the hit and the miss path
  handle->finish(handle);

  return result;
}
/*
 * Debugging aid: write every pair held in one of the mapper's pair hashes to
 * fp (stderr is used when fp is NULL).
 *
 * For each id in the hash a header line with the number of pairs is printed,
 * followed by one line per pair giving the from start/end, the to start/end
 * and the to id.
 *
 * NOTE(review): the hash is fetched with MAPPER_TO_IND although the output is
 * labelled "From Hash" - confirm whether MAPPER_FROM_IND was intended.
 */
void Mapper_dump(Mapper *m, FILE *fp) {
  if (fp == NULL) {
    fp = stderr;
  }

  IDHash *pairHash = Mapper_getPairHash(m, MAPPER_TO_IND);
  IDType *ids      = IDHash_getKeys(pairHash);
  int     nIds     = IDHash_getNumValues(pairHash);

  int idInd;
  for (idInd = 0; idInd < nIds; idInd++) {
    IDType         curId   = ids[idInd];
    MapperPairSet *pairSet = IDHash_getValue(pairHash, curId);

    fprintf(fp, "From Hash " IDFMTSTR " with %d pairs\n",curId, MapperPairSet_getNumPair(pairSet));

    int pairInd;
    for (pairInd = 0; pairInd < MapperPairSet_getNumPair(pairSet); pairInd++) {
      MapperPair *curPair  = MapperPairSet_getPairAt(pairSet, pairInd);
      MapperUnit *fromUnit = MapperPair_getUnit(curPair, MAPPER_FROM_IND);
      MapperUnit *toUnit   = MapperPair_getUnit(curPair, MAPPER_TO_IND);

      fprintf(fp, " %ld %ld:%ld %ld " IDFMTSTR "\n",fromUnit->start,fromUnit->end,
              toUnit->start,toUnit->end,toUnit->id);
    }
  }

  // The key array returned by IDHash_getKeys is released here
  free(ids);
}
// low level function to access the ranges // only use for read access Vector *RangeRegistry_getRanges(RangeRegistry *registry, IDType id) { IDHash *regReg = RangeRegistry_getRegistry(registry); Vector *list = NULL; if (IDHash_contains(regReg, id)) { list = IDHash_getValue(regReg, id); } return list; }
/*
 * Replace the translation's start and end exons with their transformed
 * counterparts.
 *
 * exonTransforms is keyed on the exon pointer value cast to IDType. An exon
 * that has no entry in the hash (i.e. was not mapped) is left untouched.
 */
void Translation_transform(Translation *translation, IDHash *exonTransforms) {
  /* CHECK */ // pointer values are used as hash keys
  IDType startKey = (IDType)Translation_getStartExon(translation);
  IDType endKey   = (IDType)Translation_getEndExon(translation);

  if (IDHash_contains(exonTransforms, startKey)) {
    Translation_setStartExon(translation, IDHash_getValue(exonTransforms, startKey));
  }
  // otherwise the start exon wasn't mapped - nothing to do

  if (IDHash_contains(exonTransforms, endKey)) {
    Translation_setEndExon(translation, IDHash_getValue(exonTransforms, endKey));
  }
  // otherwise the end exon wasn't mapped - nothing to do
}
/*
=head2 register_assembled

  Arg [1]    : IDType asmSeqRegionId
               The dbID of the sequence region to register.
  Arg [2]    : int chunkId
               The chunk number of the provided seq_region to register.
  Description: Flags a given chunk of an assembled region as registered in
               this assembly mapper. The register is a two level hash:
               seq region id -> (chunk id -> marker). The second level hash
               is created on first use of a seq region id. Registering the
               same chunk twice is a no-op.
  Return type: None
  Caller     : Internal, AssemblyMapperAdaptor

=cut
*/
void AssemblyMapper_registerAssembled(AssemblyMapper *am, IDType asmSeqRegionId, int chunkId) {
  IDHash *regionRegister = AssemblyMapper_getAssembledRegister(am);
  IDType  chunkKey       = (IDType)chunkId;

  // Lazily create the per-region chunk hash
  if ( !IDHash_contains(regionRegister, asmSeqRegionId) ) {
    IDHash_add(regionRegister, asmSeqRegionId, IDHash_new(IDHASH_MEDIUM));
  }

  IDHash *chunks = IDHash_getValue(regionRegister, asmSeqRegionId);

  // Already registered chunks are left alone
  if (IDHash_contains(chunks, chunkKey)) {
    return;
  }

  // The stored value is just a marker; only the presence of the key is tested
  IDHash_add(chunks, chunkKey, &trueVal);
}
/*
 * Test whether a chunk of an assembled seq region has been registered.
 *
 * Returns 1 if the assembled register holds an entry for asmSeqRegionId whose
 * chunk hash contains chunkId, 0 otherwise.
 */
int AssemblyMapper_haveRegisteredAssembled(AssemblyMapper *am, IDType asmSeqRegionId, int chunkId) {
  IDHash *regionRegister = AssemblyMapper_getAssembledRegister(am);

  if (IDHash_contains(regionRegister, asmSeqRegionId)) {
    IDHash *chunks = IDHash_getValue(regionRegister, asmSeqRegionId);

    if (IDHash_contains(chunks, (IDType)chunkId)) {
      return 1;
    }
  }

  return 0;
}
/*
 * Return the number of positions in [start, end] that are covered by ranges
 * registered under id.
 *
 * Returns 0 when start > end, when no list exists for id, or when the list is
 * empty (the latter also prints a diagnostic to stderr).
 *
 * A binary search on range ends picks the index from which a linear scan
 * begins; the scan stops at the first range starting beyond end, or as soon
 * as a single range covers the whole query.
 */
long RangeRegistry_overlapSize(RangeRegistry *registry, IDType id, long start, long end) {
  if ( start > end ) {
    return 0;
  }

  IDHash *rangesById = RangeRegistry_getRegistry(registry);

  if (!IDHash_contains(rangesById, id)) {
    return 0; // No list for this id, so can't be any overlap
  }
  Vector *ranges = IDHash_getValue(rangesById, id);

  int nRange = Vector_getNumElement(ranges);
  if ( nRange == 0 ) {
    fprintf(stderr, "Odd have zero length list in RangeRegistry_overlapSize\n");
    return 0;
  }

  // Binary search for the first range that could overlap - helps if the list is big
  int lo = 0;
  int hi = Vector_getNumElement(ranges)-1;
  while ( ( hi - lo ) > 1 ) {
    int        mid      = ( lo + hi ) >> 1;
    CoordPair *midRange = Vector_getElementAt(ranges, mid);

    if ( CoordPair_getEnd(midRange) < start ) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  long total = 0;
  int  idx;
  for (idx = lo; idx < nRange; idx++) {
    CoordPair *cur      = Vector_getElementAt(ranges, idx);
    long       curStart = CoordPair_getStart(cur);
    long       curEnd   = CoordPair_getEnd(cur);

    // Past the query - nothing further can overlap
    if ( curStart > end ) {
      break;
    }

    // Query entirely inside this range - the answer is the query length
    if ( curStart <= start && curEnd >= end ) {
      total = end - start + 1;
      break;
    }

    // Clip the range to the query and accumulate whatever is left
    long clipStart = start;
    if ( !(start < curStart) ) {
      clipStart = start;
    } else {
      clipStart = curStart;
    }

    long clipEnd = curEnd;
    if ( end < curEnd ) {
      clipEnd = end;
    }

    if (clipEnd - clipStart >= 0) {
      total += ( clipEnd - clipStart + 1 );
    }
  }

  return total;
}
/*
=head2 list_pairs

  Arg  1      IDType id     id of the 'source' sequence
  Arg  2      long start    start coordinate on the 'source' sequence
  Arg  3      long end      end coordinate on the 'source' sequence
  Arg  4      char *type    nature of transform - gives the type of
                            coordinates to be transformed *from*
  Function    Lists all mapping pairs overlapping a region. Passing
              start == -1 and end == -1 lists every pair stored for id.
              The mapper is sorted first if it is not already sorted.
  Returntype  A newly created MapperPairSet holding the matching pairs, or
              NULL when id has no entry in the selected hash.
  Exceptions  Prints an error (but carries on) when start > end.
              Prints an error and calls exit(1) when no pair hash exists for
              the selected direction.
  Caller      Bio::EnsEMBL::Mapper

=cut
*/
MapperPairSet *Mapper_listPairs(Mapper *m, IDType id, long start, long end, char *type) {
  if (start > end) {
    fprintf(stderr,"ERROR: Start is greater than end for id " IDFMTSTR ", start %ld, end %ld\n",id,start,end);
  }

  if( Mapper_getIsSorted(m) == 0 ) {
    Mapper_sort(m);
  }

  // Pick the side of each pair that is in the coordinate system given by type
  int fromInd;
  if (!Mapper_compareType(type, Mapper_getTo(m))) {
    fromInd = MAPPER_TO_IND;
  } else {
    fromInd = MAPPER_FROM_IND;
  }

  IDHash *pairHash = Mapper_getPairHash(m, fromInd);
  if (!pairHash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }

  if (!IDHash_contains(pairHash, id)) {
    return NULL;
  }

  MapperPairSet *allPairs = IDHash_getValue(pairHash,id);
  MapperPairSet *selected = MapperPairSet_new();

  // -1/-1 is the "no region restriction" request
  int takeAll = (start == -1 && end == -1);

  int idx;
  for (idx = 0; idx < MapperPairSet_getNumPair(allPairs); idx++) {
    MapperPair *cur = MapperPairSet_getPairAt(allPairs, idx);

    if (!takeAll) {
      MapperUnit *unit = MapperPair_getUnit(cur, fromInd);

      // Entirely before the region - keep looking
      if( unit->end < start ) {
        continue;
      }
      // Starts after the region - stop scanning
      if( unit->start > end ) {
        break;
      }
    }

    MapperPairSet_addPair(selected, cur);
  }

  return selected;
}
/*
 * Fetch all external database entries (xrefs) attached to an Ensembl object.
 *
 *   dbea     adaptor used to prepare the query; also set as the adaptor of
 *            every DBEntry created
 *   ensObj   ensembl_id of the object (must be non-zero)
 *   ensType  ensembl_object_type string (must be non-NULL)
 *
 * Returns a new Vector of DBEntry objects, one per distinct xref_id. Because
 * the query outer-joins external_synonym, one xref may come back on several
 * rows; the extra rows only contribute synonyms to the existing DBEntry.
 *
 * Prints an error and calls exit(1) if ensObj is zero, ensType is NULL, or
 * the formatted query does not fit in the local query buffer.
 *
 * NOTE(review): ensType is interpolated into the SQL text inside single
 * quotes without escaping - callers must only pass trusted type names.
 */
Vector *DBEntryAdaptor_fetchByObjectType(DBEntryAdaptor *dbea, IDType ensObj, char *ensType) {
  Vector *out;
  char qStr[1024];
  int qLen;
  StatementHandle *sth;
  ResultRow *row;
  IDHash *seen;

  if (!ensObj) {
    fprintf(stderr,"Error: Can't fetchByObjectType without an object\n");
    exit(1);
  }

  if (!ensType) {
    fprintf(stderr,"Error: Can't fetchByObjectType without a type\n");
    exit(1);
  }

  // Not sure if idt identities are right way round
  // snprintf rather than sprintf: ensType has no length limit, so an
  // unbounded write could overrun qStr
  qLen = snprintf(qStr, sizeof(qStr),
    "SELECT xref.xref_id, xref.dbprimary_acc, xref.display_label, xref.version,"
    " xref.description,"
    " exDB.db_name, exDB.db_release, exDB.status,"
    " oxr.object_xref_id,"
    " es.synonym,"
    " idt.xref_identity, idt.ensembl_identity"
    " FROM (external_db exDB, object_xref oxr, xref xref)"
    " LEFT JOIN external_synonym es on es.xref_id = xref.xref_id"
    " LEFT JOIN identity_xref idt on idt.object_xref_id = oxr.object_xref_id"
    " WHERE xref.xref_id = oxr.xref_id"
    " AND xref.external_db_id = exDB.external_db_id"
    " AND oxr.ensembl_id = " IDFMTSTR
    " AND oxr.ensembl_object_type = '%s'",
    ensObj, ensType);

  if (qLen < 0 || (size_t)qLen >= sizeof(qStr)) {
    fprintf(stderr,"Error: Query too long in fetchByObjectType for type %s\n", ensType);
    exit(1);
  }

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);

  // seen maps xref_id -> DBEntry already created for it
  seen = IDHash_new(IDHASH_SMALL);
  out = Vector_new();

  while ((row = sth->fetchRow(sth))) {
    DBEntry *exDB;
    IDType refID = row->getLongLongAt(row,0);

    // using an outer join on the synonyms as well as on identity_xref, we
    // now have to filter out the duplicates (see v.1.18 for
    // original). Since there is at most one identity_xref row per xref,
    // this is easy enough; all the 'extra' bits are synonyms
    if (!IDHash_contains(seen,refID)) {
      exDB = DBEntry_new();
      DBEntry_setAdaptor(exDB,(BaseAdaptor *)dbea);
      DBEntry_setDbID(exDB, refID);
      DBEntry_setPrimaryId(exDB, row->getStringAt(row,1));
      DBEntry_setDisplayId(exDB, row->getStringAt(row,2));
      DBEntry_setVersion(exDB, row->getStringAt(row,3));
      DBEntry_setDbName(exDB, row->getStringAt(row,5));
      DBEntry_setRelease(exDB, row->getStringAt(row,6));

      // Column 10 (xref_identity) is only non-null when the identity_xref
      // left join matched
      if (row->col(row,10)) {
        IdentityXref *idx = IdentityXref_new();
        DBEntry_setIdentityXref(exDB,idx);
        IdentityXref_setQueryIdentity(idx, row->getDoubleAt(row,10));
        IdentityXref_setTargetIdentity(idx, row->getDoubleAt(row,11));
      }

      if (row->col(row,4)) DBEntry_setDescription(exDB, row->getStringAt(row,4));
      if (row->col(row,7)) DBEntry_setStatus(exDB, row->getStringAt(row,7));

      Vector_addElement(out, exDB);
      IDHash_add(seen, refID, exDB);
    }

    // Whether new or previously seen, attach this row's synonym (if any)
    exDB = IDHash_getValue(seen, refID);

    if (row->col(row,9)) {
      DBEntry_addSynonym(exDB,row->getStringAt(row,9));
    }
  }

  // The DBEntry objects are owned by 'out', so only the hash itself is freed
  IDHash_free(seen, NULL);

  sth->finish(sth);

  return out;
}
// NIY: May need some reworking to handle mapInsert because I'd changed the way it returns data // Change back to returning MapperRangeSet MapperRangeSet *Mapper_fastMap(Mapper *m, IDType id, long start, long end, int strand, char *type) { MapperPairSet *pairs; int i; IDHash *hash; int from, to; CoordSystem *cs; if(end+1 == start) { return Mapper_mapInsert(m, id, start, end, strand, type, 1); } if(!Mapper_compareType(type, Mapper_getTo(m))) { from = MAPPER_TO_IND; to = MAPPER_FROM_IND; cs = Mapper_getFromCoordSystem(m); } else { from = MAPPER_FROM_IND; to = MAPPER_TO_IND; cs = Mapper_getToCoordSystem(m); } hash = Mapper_getPairHash(m, from); if (!hash) { fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type); exit(1); } if (Mapper_getIsSorted(m) == 0) { Mapper_sort(m); } if (!IDHash_contains(hash, id)) { fprintf(stderr,"ERROR: Fastmap expects to be able to find an id. It couldnt for " IDFMTSTR "\n",id); exit(1); } pairs = IDHash_getValue(hash,id); MapperRangeSet *retSet = MapperRangeSet_new(); for (i=0;i<MapperPairSet_getNumPair(pairs);i++) { MapperPair *pair = MapperPairSet_getPairAt(pairs,i); MapperUnit *selfCoord = MapperPair_getUnit(pair, from); MapperUnit *targetCoord = MapperPair_getUnit(pair, to); // only super easy mapping is done if(start < selfCoord->start || end > selfCoord->end ) { continue; } if (pair->ori == 1) { MapperCoordinate *retRange = MapperCoordinate_new(targetCoord->id, targetCoord->start + start - selfCoord->start, targetCoord->start + end - selfCoord->start, strand, cs, 0); // Perl didn't set rank, so use 0 /* retRange->id = targetCoord->id; retRange->start = targetCoord->start + start - selfCoord->start; retRange->end = targetCoord->start + end - selfCoord->start; retRange->strand = strand; retRange->coordSystem = cs; */ MapperRangeSet_addRange(retSet, (MapperRange *)retRange); break; } else { MapperCoordinate *retRange = MapperCoordinate_new(targetCoord->id, targetCoord->end - (end - selfCoord->start), 
targetCoord->end - (start - selfCoord->start), -strand, cs, 0); // Perl didn't set rank, so use 0 /* retRange->id = targetCoord->id; retRange->start = targetCoord->end - (end - selfCoord->start); retRange->end = targetCoord->end - (start - selfCoord->start); retRange->strand = -strand; retRange->coordSystem = cs; */ MapperRangeSet_addRange(retSet, (MapperRange *)retRange); break; } } // NIY: Here we return empty set, in mapInsert it returns NULL for empty fastmap - need to work out which is right return retSet; }
/*
 * Map the region [start,end] on sequence 'id' (in the coordinate system named
 * by 'type') to the other side of the mapper.
 *
 * Returns a new MapperRangeSet made up of, in order of position on the input
 * sequence, MapperCoordinate objects for the mapped stretches, MapperGap
 * objects for stretches not covered by any pair, and IndelCoordinate objects
 * for pairs flagged as indels. When strand is -1 the set is reversed before
 * being returned.
 *
 * Special cases visible in the code:
 *   - start == end+1 is treated as an insert and delegated to
 *     Mapper_mapInsert (fastmap flag 0).
 *   - start > end+1 prints an error and calls exit(1).
 *   - a 'type' matching neither the to nor the from type, or a missing pair
 *     hash, prints an error and calls exit(1).
 *   - an id with no entry in the pair hash yields a single gap covering the
 *     whole request.
 *
 * NOTE(review): Mapper_compareType is used as if it returns 0 on a match
 * (strcmp-like) - confirm against its definition.
 * NOTE: 'rank' is initialised to 0 and never modified in this function.
 */
MapperRangeSet *Mapper_mapCoordinates(Mapper *m, IDType id, long start, long end, int strand, char *type) {
  // special case for handling inserts:
  if ( start == end+1 ) {
    return Mapper_mapInsert(m, id, start, end, strand, type, 0 /*fastmap flag */);
  } else if (start > end+1) {
    fprintf(stderr,"ERROR: Start is greater than end for id " IDFMTSTR ", start %ld, end %ld\n",id,start,end);
    exit(1);
  }

  IDHash *hash;
  CoordSystem *cs;

  // Pair lists must be sorted for the binary search and linear scan below
  if( Mapper_getIsSorted(m) == 0 ) {
    Mapper_sort(m);
  }

  // 'from' indexes the side of each pair the input coordinates are in,
  // 'to' the side being mapped onto; cs is the coordinate system of the result
  int from, to;
  if (!Mapper_compareType(type,Mapper_getTo(m))) {
    from = MAPPER_TO_IND;
    to = MAPPER_FROM_IND;
    cs = Mapper_getFromCoordSystem(m);
  } else if (!Mapper_compareType(type,Mapper_getFrom(m))) {
    from = MAPPER_FROM_IND;
    to = MAPPER_TO_IND;
    cs = Mapper_getToCoordSystem(m);
  } else {
    fprintf(stderr, "Invalid type [%s] in mapper (not from [%s] or to [%s])\n", type, Mapper_getFrom(m), Mapper_getTo(m));
    exit(1);
  }

  hash = Mapper_getPairHash(m, from);

  if (!hash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }

  MapperRangeSet *results = MapperRangeSet_new();

  // Was upcasing the id - its a number in C, I haven't found a case yet where its a string
  if (!IDHash_contains(hash, id)) {
    // one big gap!
    MapperRange *gap = (MapperRange *)MapperGap_new(start,end,0); // Perl didn't set rank so use 0
    MapperRangeSet_addRange(results,gap);
    return results;
  }

  MapperPairSet *pairs = IDHash_getValue(hash,id); //my $lr = $hash->{ uc($id) };

  // Last pair that contributed a result; NULL means nothing mapped at all
  MapperPair *lastUsedPair = NULL;

  int startIdx, endIdx, midIdx;
  MapperPair *pair;
  MapperUnit *selfCoord;

  startIdx = 0;
  endIdx = MapperPairSet_getNumPair(pairs)-1;

  // binary search the relevant pairs
  // helps if the list is big
  while ( ( endIdx - startIdx ) > 1 ) {
    midIdx = ( startIdx + endIdx ) >> 1;
    pair = MapperPairSet_getPairAt(pairs, midIdx);
    selfCoord = MapperPair_getUnit(pair, from);

    if ( selfCoord->end < start ) {
      startIdx = midIdx;
    } else {
      endIdx = midIdx;
    }
  }

  int rank = 0;
  // 'start' is advanced as pieces are consumed; origStart keeps the requested start
  long origStart = start;
  // lastTargetCoord is only read once lastTargetCoordIsSet is non-zero
  IDType lastTargetCoord;
  int lastTargetCoordIsSet = 0;

  int i;
  for (i=startIdx; i<MapperPairSet_getNumPair(pairs); i++) {
    // These declarations shadow the function-level 'pair' and 'selfCoord'
    MapperPair *pair = MapperPairSet_getPairAt(pairs,i);
    MapperUnit *selfCoord = MapperPair_getUnit(pair, from);
    MapperUnit *targetCoord = MapperPair_getUnit(pair, to);

    //
    // But not the case for haplotypes!! need to test for this case???
    // so removing this till a better solution is found
    //
    //
    //     if($self_coord->{'start'} < $start){
    //       $start = $orig_start;
    //       $rank++;
    //     }

    if ( lastTargetCoordIsSet && targetCoord->id != lastTargetCoord ) {
      if ( selfCoord->start < start ) { // i.e. the same bit is being mapped to another assembled bit
        start = origStart;
      }
    } else {
      lastTargetCoord = targetCoord->id;
      lastTargetCoordIsSet = 1;
    }

    // if we haven't even reached the start, move on
    if (selfCoord->end < origStart) {
      continue;
    }

    // if we have over run, break
    if (selfCoord->start > end) {
      break;
    }

    // Check is start not origStart
    if (start < selfCoord->start) {
      // gap detected
      MapperRange *gap = (MapperRange *)MapperGap_new(start, selfCoord->start-1, rank);

      MapperRangeSet_addRange(results,gap);
      // Continue from the first position after the gap
      start = gap->end+1;
    }

    long targetStart, targetEnd;
    MapperRange *res;

    if ( MapperPair_isIndel(pair) ) {
      // When next pair is an IndelPair and not a Coordinate, create the
      // new mapping Coordinate, the IndelCoordinate.
      targetStart = targetCoord->start;
      targetEnd = targetCoord->end;

      // create a Gap object
      MapperGap *gap = MapperGap_new(start, selfCoord->end < end ? selfCoord->end : end, 0); // Perl didn't set rank - don't know if need to

      // create the Coordinate object
      MapperCoordinate *coord = MapperCoordinate_new(targetCoord->id, targetStart, targetEnd, pair->ori * strand, cs, 0); // Perl didn't set rank - don't know if need to

      //and finally, the IndelCoordinate object with
      res = (MapperRange *)IndelCoordinate_new(gap, coord);
    } else {
      // start is somewhere inside the region
      // (for ori == 1 this fixes targetStart, otherwise it fixes targetEnd;
      //  the other bound is set by the enveloped/adjust step below)
      if (pair->ori == 1) {
        targetStart = targetCoord->start + (start - selfCoord->start);
      } else {
        targetEnd = targetCoord->end - (start - selfCoord->start);
      }

      // Either we are enveloping this map or not.  If yes, then end
      // point (self perspective) is determined solely by target.  If
      // not we need to adjust.
      if (end > selfCoord->end) { // enveloped
        if( pair->ori == 1 ) {
          targetEnd = targetCoord->end;
        } else {
          targetStart = targetCoord->start;
        }
      } else { // need to adjust end
        if (pair->ori == 1) {
          targetEnd = targetCoord->start + (end - selfCoord->start);
        } else {
          targetStart = targetCoord->end - (end - selfCoord->start);
        }
      }

      res = (MapperRange *)MapperCoordinate_new(targetCoord->id, targetStart, targetEnd, pair->ori * strand, cs, rank);
    } // end else [ if ( exists $pair->{'indel'...})]

    MapperRangeSet_addRange(results, res);

    lastUsedPair = pair;
    // Next piece (or trailing gap) begins right after this pair
    start = selfCoord->end+1;
  }

  if (lastUsedPair == NULL) {
    // No pair contributed: whatever remains of the request is one gap
    MapperRange *gap = (MapperRange *)MapperGap_new(start,end, 0); // Perl doesn't set rank, so use 0
    MapperRangeSet_addRange(results,gap);
  } else if (MapperPair_getUnit(lastUsedPair, from)->end < end) {
    // gap at the end
    MapperRange *gap = (MapperRange *)MapperGap_new( MapperPair_getUnit(lastUsedPair,from)->end + 1, end, 0 ); // Perl didn't set rank so use 0
    MapperRangeSet_addRange(results,gap);
  }

  if (strand == -1) {
    MapperRangeSet_reverse(results);
  }

  return results;
}
// Also added a flag to indicate we actually want the gaps vector returned - quite often its not used in the caller and so would leak // memory Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, long rStart, long rEnd, int wantGaps) { // The following was commented out due to Ensembl Genomes requirements // for bacterial genomes. // The following was uncommented because I'm not caring about those requirements if ( start > end ) { fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end); exit(1); } if ( rStart > rEnd ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd); exit(1); } if ( rStart > start ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start); exit(1); } if ( rEnd < end ) { fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end); exit(1); } IDHash *regReg = RangeRegistry_getRegistry(registry); Vector *list; if (IDHash_contains(regReg, id)) { list = IDHash_getValue(regReg, id); } else { list = Vector_new(); IDHash_add(regReg, id, list); } Vector *gapPairs = NULL; if (wantGaps) { gapPairs = Vector_new(); } int len = Vector_getNumElement(list); if (len == 0) { //this is the first request for this id, return a gap pair for the // entire range and register it as seen CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); return Vector_copy(list); } //#### // loop through the list of existing ranges recording any "gaps" where // the existing range does not cover part of the requested range // int startIdx = 0; int endIdx = Vector_getNumElement(list)-1; int midIdx; CoordPair *range; // binary search the relevant pairs // helps if the list is big while ( ( endIdx - startIdx ) > 1 ) { midIdx = ( startIdx + endIdx ) >> 1; range = Vector_getElementAt(list, midIdx); if ( CoordPair_getEnd(range) < rStart ) 
{ startIdx = midIdx; } else { endIdx = midIdx; } } long gapStart; long gapEnd; int rIdx = -1; int rStartIdx = -1; int rEndIdx; gapStart = rStart; int i; for (i=startIdx; i < len ; i++ ) { CoordPair *pRange = Vector_getElementAt(list, i); long pStart = CoordPair_getStart(pRange); long pEnd = CoordPair_getEnd(pRange); // no work needs to be done at all if we find a range pair that // entirely overlaps the requested region if ( pStart <= start && pEnd >= end ) { return Vector_new(); // perl returns undef, but that causes me problems } // find adjacent or overlapping regions already registered if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) { if ( rStartIdx < 0 ) { // Not yet been set rStartIdx = i; } rEndIdx = i; } if ( pStart > rStart ) { gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1; if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, gapEnd); Vector_addElement(gapPairs, cp); } } gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1; if ( pEnd >= rEnd && rIdx < 0 ) { rIdx = i; break; } } // do we have to make another gap? 
if ( gapStart <= rEnd ) { if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, rEnd); Vector_addElement(gapPairs, cp); } } // // Merge the new range into the registered list // if (rStartIdx >= 0 ) { // rStartIdx has been set to something long newStart; long newEnd; CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); CoordPair *rEndIdxRange = Vector_getElementAt(list, rEndIdx); if ( rStart < CoordPair_getStart(rStartIdxRange)) { newStart = rStart; } else { newStart = CoordPair_getStart(rStartIdxRange); } if ( rEnd > CoordPair_getEnd(rEndIdxRange)) { newEnd = rEnd; } else { newEnd = CoordPair_getEnd(rEndIdxRange); } CoordPair *cp = CoordPair_new(newStart, newEnd); // Think its <= for (i=rStartIdx; i<=rEndIdx; i++) { Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called } Vector_insertElementAt(list, rStartIdx, cp); //splice( @$list, $rstart_idx, // $rend_idx - $rstart_idx + 1, // [ $new_start, $new_end ] ); } else if (rIdx >= 0) { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_insertElementAt(list, rIdx, cp); //splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); } else { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); } // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK return gapPairs; }
/*
 * Compute and print per-base read coverage for a slice from an indexed BAM.
 *
 *   fName  name of the BAM file (used in diagnostics only)
 *   slice  region to process; its chr start must be 1 because coverage is
 *          indexed directly by zero-based reference position
 *   in     open BAM file
 *   idx    index for 'in'
 *   flags  M_UCSC_NAMING prefixes the seq region name with "chr"
 *
 * Reads flagged unmapped, secondary, QC-fail or duplicate are skipped and
 * counted as bad. For the remaining reads every reference position covered
 * by a match/equal/diff or deletion cigar block is incremented. One line
 * "<1-based position> <coverage>" is printed per slice position.
 *
 * Returns 1 both when the slice start is not 1 (after printing a message)
 * and on success - kept as is for compatibility with existing callers.
 * Calls exit() on header, region, iterator or allocation failures.
 *
 * Fixes relative to the previous version: region_name is now actually filled
 * in before use; the header from bam_hdr_init() is no longer allocated and
 * then leaked; writes to coverage[] are bounds checked; allocation and
 * iterator results are checked; coverage is freed; formatting is bounded.
 */
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  char region[1024];
  char region_name[512];

  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  }

  // Sequence name as it appears in the BAM header
  int nameLen;
  if (flags & M_UCSC_NAMING) {
    nameLen = snprintf(region_name, sizeof(region_name), "chr%s", Slice_getSeqRegionName(slice));
  } else {
    nameLen = snprintf(region_name, sizeof(region_name), "%s", Slice_getSeqRegionName(slice));
  }
  if (nameLen < 0 || (size_t)nameLen >= sizeof(region_name)) {
    fprintf(stderr, "Region name too long: %s\n", Slice_getSeqRegionName(slice));
    exit(1);
  }

  // bam_hdr_read allocates the header itself; no bam_hdr_init needed
  bam_hdr_t *header = bam_hdr_read(in->fp.bgzf);
  if (header == NULL) {
    fprintf(stderr, "Failed reading BAM header from %s\n", fName ? fName : "(unknown)");
    exit(1);
  }

  ref = bam_name2id(header, region_name);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region_name);
    exit(1);
  }

  snprintf(region, sizeof(region), "%s:%ld-%ld", region_name,
           Slice_getSeqRegionStart(slice), Slice_getSeqRegionEnd(slice));

  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);
    exit(2);
  }
  bam_hdr_destroy(header);

  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  if (iter == NULL) {
    fprintf(stderr, "Could not create iterator for %s\n", region);
    exit(1);
  }

  bam1_t *b = bam_init1();

  long sliceLength = Slice_getLength(slice);
  Coverage *coverage = calloc(sliceLength, sizeof(Coverage));
  if (coverage == NULL) {
    fprintf(stderr, "Failed allocating coverage array for %s\n", region);
    exit(1);
  }

  long counter = 0;
  long overlapping = 0;  // only used by the DONE section
  long bad = 0;
  int startIndex = 0;    // only used by the DONE section

  while (bam_itr_next(in, iter, b) >= 0) {
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {
      bad++;
      continue;
    }

    int end;
    //end = bam_calend(&b->core, bam1_cigar(b));
    end = bam_endpos(b);

    // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in).
    // That is the reason for the || end == begRange test
    if (end == begRange) {
      continue;
    }
    counter++;

    if (!(counter%1000000)) {
      if (verbosity > 1) {
        printf(".");
      }
      fflush(stdout);
    }

    // Remember: b->core.pos is zero based!
    int cigInd;
    int refPos;
    int readPos;
    uint32_t *cigar = bam_get_cigar(b);
    for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) {
      int k;
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
        for (k = 0; k < lenCigBlock; ++k) {
          long covInd = (long)refPos + k;
          // Reads may run past the end of the slice - never write outside the array
          if (covInd < 0 || covInd >= sliceLength) break; // out of boundary
          coverage[covInd].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
        readPos += lenCigBlock;
      } else if (op == BAM_CDEL) {
        // Deleted reference bases are counted as covered, as before
        for (k = 0; k < lenCigBlock; ++k) {
          long covInd = (long)refPos + k;
          if (covInd < 0 || covInd >= sliceLength) break; // out of boundary
          coverage[covInd].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
      } else if (op == BAM_CSOFT_CLIP) {
        readPos += lenCigBlock;
      } else if (op == BAM_CHARD_CLIP) {
      } else if (op == BAM_CINS) {
        readPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
        refPos += lenCigBlock;
      }
    }

#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;

    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j);
      if (!gene) {
        continue;
      }
      // Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;

            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  overlapping++;
                  hadOverlap = 1;
                }

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                gs->score++;

                doneGene = 1;
              }
            }
          }
        }
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene),
                                          Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "",
                                          gs->score);

        if (verbosity > 1) {
          printf("Removing gene %s (index %d) with extent %d to %d\n",
                 Gene_getStableId(gene),
                 gs->index,
                 Gene_getStart(gene),
                 Gene_getEnd(gene));
        }

        Vector_setElementAt(genes,j,NULL);

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
            break;
          }
          startIndex++;
        }
        if (verbosity > 1) {
          printf("startIndex now %d\n",startIndex);
        }
      }
    }
#endif
  }
  if (verbosity > 1) {
    printf("\n");
  }

#ifdef DONE
  // Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene),
                                        Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "",
                                        gs->score);
    }
  }
#endif

  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  long i;
  for (i=0; i< sliceLength; i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);
  }

  free(coverage);
  sam_itr_destroy(iter);
  bam_destroy1(b);

  return 1;
}
/* =head2 _objs_from_sth Arg [1] : DBI:st $sth An executed DBI statement handle Arg [2] : (optional) Bio::EnsEMBL::Mapper $mapper An mapper to be used to convert contig coordinates to assembly coordinates. Arg [3] : (optional) Bio::EnsEMBL::Slice $slice A slice to map the prediction transcript to. Example : $p_transcripts = $self->_objs_from_sth($sth); Description: Creates a list of Prediction transcripts from an executed DBI statement handle. The columns retrieved via the statement handle must be in the same order as the columns defined by the _columns method. If the slice argument is provided then the the prediction transcripts will be in returned in the coordinate system of the $slice argument. Otherwise the prediction transcripts will be returned in the RawContig coordinate system. Returntype : reference to a list of Bio::EnsEMBL::PredictionTranscripts Exceptions : none Caller : superclass generic_fetch Status : Stable =cut */ Vector *PredictionTranscriptAdaptor_objectsFromStatementHandle(PredictionTranscriptAdaptor *pta, StatementHandle *sth, AssemblyMapper *assMapper, Slice *destSlice) { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba); AnalysisAdaptor *aa = DBAdaptor_getAnalysisAdaptor(pta->dba); Vector *pTranscripts = Vector_new(); IDHash *sliceHash = IDHash_new(IDHASH_SMALL); long destSliceStart; long destSliceEnd; int destSliceStrand; long destSliceLength; char * destSliceSrName; IDType destSliceSrId = 0; if (destSlice) { destSliceStart = Slice_getStart(destSlice); destSliceEnd = Slice_getEnd(destSlice); destSliceStrand = Slice_getStrand(destSlice); destSliceLength = Slice_getLength(destSlice); destSliceSrName = Slice_getSeqRegionName(destSlice); destSliceSrId = Slice_getSeqRegionId(destSlice); } ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType predictionTranscriptId = row->getLongLongAt(row,0); IDType seqRegionId = row->getLongLongAt(row,1); long seqRegionStart = row->getLongAt(row,2); long seqRegionEnd = row->getLongAt(row,3); int 
seqRegionStrand = row->getIntAt(row,4); IDType analysisId = row->getLongLongAt(row,5); char *displayLabel = row->getStringAt(row,6); // get the analysis object Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId); if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } Slice *slice = IDHash_getValue(sliceHash, seqRegionId); Slice *ptSlice = slice; char *srName = Slice_getSeqRegionName(slice); CoordSystem *srCs = Slice_getCoordSystem(slice); // // remap the feature coordinates to another coord system // if a mapper was provided // if (assMapper != NULL) { MapperRangeSet *mrs; // Slightly suspicious about need for this if statement so left in perl statements for now if (destSlice != NULL && assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) { mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice); } else { mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL); } // skip features that map to gaps or coord system boundaries if (MapperRangeSet_getNumRange(mrs) == 0) { continue; } MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0); if (range->rangeType == MAPPERRANGE_GAP) { fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n"); exit(1); } else { MapperCoordinate *mc = (MapperCoordinate *)range; seqRegionId = mc->id; seqRegionStart = mc->start; seqRegionEnd = mc->end; seqRegionStrand = mc->strand; } MapperRangeSet_free(mrs); if (! 
IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } ptSlice = IDHash_getValue(sliceHash, seqRegionId); } // // If a destination slice was provided convert the coords // If the dest_slice starts at 1 and is foward strand, nothing needs doing // if (destSlice != NULL) { if (destSliceStart != 1 || destSliceStrand != 1) { if (destSliceStrand == 1) { seqRegionStart = seqRegionStart - destSliceStart + 1; seqRegionEnd = seqRegionEnd - destSliceStart + 1; } else { long tmpSeqRegionStart = seqRegionStart; seqRegionStart = destSliceEnd - seqRegionEnd + 1; seqRegionEnd = destSliceEnd - tmpSeqRegionStart + 1; seqRegionStrand = -seqRegionStrand; } } // throw away features off the end of the requested slice if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) { continue; } ptSlice = destSlice; } // Finally, create the new PredictionTranscript. PredictionTranscript *pt = PredictionTranscript_new(); PredictionTranscript_setStart (pt, seqRegionStart); PredictionTranscript_setEnd (pt, seqRegionEnd); PredictionTranscript_setStrand (pt, seqRegionStrand); PredictionTranscript_setSlice (pt, ptSlice); PredictionTranscript_setAnalysis (pt, analysis); PredictionTranscript_setAdaptor (pt, (BaseAdaptor *)pta); PredictionTranscript_setDbID (pt, predictionTranscriptId); PredictionTranscript_setDisplayLabel(pt, displayLabel); Vector_addElement(pTranscripts, pt); } IDHash_free(sliceHash, NULL); return pTranscripts; }
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) { //my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name); Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName); // if there are 0 or 1 transcripts still do lazy-loading if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) { return transcripts; } // preload all of the exons now, instead of lazy loading later // faster than 1 query per transcript // get extent of region spanned by transcripts long minStart = 2000000000; long maxEnd = -2000000000; int i; for (i=0; i<Vector_getNumElement(transcripts); i++) { PredictionTranscript *t = Vector_getElementAt(transcripts, i); if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) { minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t); } if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) { maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t); } } Slice *extSlice; if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) { extSlice = slice; } else { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba); extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice), minStart, maxEnd, Slice_getStrand(slice), CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0); } // associate exon identifiers with transcripts IDHash *trHash = IDHash_new(IDHASH_MEDIUM); for (i=0; i<Vector_getNumElement(transcripts); i++) { PredictionTranscript *t = Vector_getElementAt(transcripts, i); if ( ! 
IDHash_contains(trHash, PredictionTranscript_getDbID(t))) { IDHash_add(trHash, PredictionTranscript_getDbID(t), t); } } IDType *uniqueIds = IDHash_getKeys(trHash); char tmpStr[1024]; char *qStr = NULL; if ((qStr = (char *)calloc(655500,sizeof(char))) == NULL) { fprintf(stderr,"Failed allocating qStr\n"); return transcripts; } int lenNum; int endPoint = sprintf(qStr, "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE prediction_transcript_id IN ("); for (i=0; i<IDHash_getNumValues(trHash); i++) { if (i!=0) { qStr[endPoint++] = ','; qStr[endPoint++] = ' '; } lenNum = sprintf(tmpStr,IDFMTSTR,uniqueIds[i]); memcpy(&(qStr[endPoint]), tmpStr, lenNum); endPoint+=lenNum; } qStr[endPoint++] = ')'; qStr[endPoint] = '\0'; free(uniqueIds); StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr)); sth->execute(sth); IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM); ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType trId = row->getLongLongAt(row,0); IDType exId = row->getLongLongAt(row,1); int rank = row->getIntAt(row,2); if (! 
IDHash_contains(exTrHash, exId)) { Vector *vec = Vector_new(); Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free); IDHash_add(exTrHash, exId, vec); } Vector *exVec = IDHash_getValue(exTrHash, exId); PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank); Vector_addElement(exVec, trp); } IDHash_free(trHash, NULL); sth->finish(sth); PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba); Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice); // move exons onto transcript slice, and add them to transcripts for (i=0; i<Vector_getNumElement(exons); i++) { PredictionExon *ex = Vector_getElementAt(exons, i); // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue; PredictionExon *newEx; if (slice != extSlice) { newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice); if (newEx == NULL) { fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n"); exit(1); } } else { newEx = ex; } Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx)); int j; for (j=0; j<Vector_getNumElement(exVec); j++) { PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j); PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank); } } IDHash_free(exTrHash, Vector_free); free(qStr); return transcripts; }
Vector *IntronSupportingEvidenceAdaptor_objectsFromStatementHandle(IntronSupportingEvidenceAdaptor *isea, StatementHandle *sth, AssemblyMapper *assMapper, Slice *destSlice) { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(isea->dba); AnalysisAdaptor *aa = DBAdaptor_getAnalysisAdaptor(isea->dba); Vector *features = Vector_new(); IDHash *sliceHash = IDHash_new(IDHASH_SMALL); /* Unneccesary my %analysis_hash; my %sr_name_hash; my %sr_cs_hash; */ /* Unused my $asm_cs; my $cmp_cs; my $asm_cs_vers; my $asm_cs_name; my $cmp_cs_vers; my $cmp_cs_name; if($mapper) { $asm_cs = $mapper->assembled_CoordSystem(); $cmp_cs = $mapper->component_CoordSystem(); $asm_cs_name = $asm_cs->name(); $asm_cs_vers = $asm_cs->version(); $cmp_cs_name = $cmp_cs->name(); $cmp_cs_vers = $cmp_cs->version(); } */ long destSliceStart; long destSliceEnd; int destSliceStrand; long destSliceLength; //CoordSystem *destSliceCs; char * destSliceSrName; IDType destSliceSrId = 0; //AssemblyMapperAdaptor *asma; if (destSlice) { destSliceStart = Slice_getStart(destSlice); destSliceEnd = Slice_getEnd(destSlice); destSliceStrand = Slice_getStrand(destSlice); destSliceLength = Slice_getLength(destSlice); //??destSliceCs = Slice_getCoordSystem(destSlice); destSliceSrName = Slice_getSeqRegionName(destSlice); destSliceSrId = Slice_getSeqRegionId(destSlice); //??asma = DBAdaptor_getAssemblyMapperAdaptor(ea->dba); } ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType id = row->getLongLongAt(row,0); IDType analysisId = row->getLongLongAt(row,1); IDType seqRegionId = row->getLongLongAt(row,2); long seqRegionStart = row->getLongAt(row,3); long seqRegionEnd = row->getLongAt(row,4); int seqRegionStrand = row->getIntAt(row,5); char *hitName = row->getStringAt(row,6); double score = row->getDoubleAt(row,7); char *scoreType = row->getStringAt(row,8); int spliceCanonical = row->getIntAt(row,9); // get the analysis object Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId); /* // need to get the 
internal_seq_region, if present $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); #get the slice object my $slice = $slice_hash{"ID:".$seq_region_id}; if(!$slice) { $slice = $sa->fetch_by_seq_region_id($seq_region_id); $slice_hash{"ID:".$seq_region_id} = $slice; $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); $sr_cs_hash{$seq_region_id} = $slice->coord_system(); } my $sr_name = $sr_name_hash{$seq_region_id}; my $sr_cs = $sr_cs_hash{$seq_region_id}; */ if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } Slice *slice = IDHash_getValue(sliceHash, seqRegionId); Slice *iseSlice = slice; char *srName = Slice_getSeqRegionName(slice); CoordSystem *srCs = Slice_getCoordSystem(slice); // // remap the feature coordinates to another coord system // if a mapper was provided // if (assMapper != NULL) { MapperRangeSet *mrs; // Slightly suspicious about need for this if statement so left in perl statements for now if (destSlice != NULL && assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) { mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice); } else { mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL); } // skip features that map to gaps or coord system boundaries //next FEATURE if (!defined($seq_region_id)); if (MapperRangeSet_getNumRange(mrs) == 0) { continue; } MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0); if (range->rangeType == MAPPERRANGE_GAP) { fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n"); exit(1); } else { MapperCoordinate *mc = (MapperCoordinate *)range; seqRegionId = mc->id; seqRegionStart = mc->start; seqRegionEnd = mc->end; seqRegionStrand = mc->strand; } MapperRangeSet_free(mrs); /* Was - but identical if and else so why test??? 
#get a slice in the coord system we just mapped to if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { $slice = $slice_hash{"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id); } else { $slice = $slice_hash{"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id); } */ // Instead... if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } iseSlice = IDHash_getValue(sliceHash, seqRegionId); } // // If a destination slice was provided convert the coords // If the dest_slice starts at 1 and is foward strand, nothing needs doing // if (destSlice != NULL) { if (destSliceStart != 1 || destSliceStrand != 1) { if (destSliceStrand == 1) { seqRegionStart = seqRegionStart - destSliceStart + 1; seqRegionEnd = seqRegionEnd - destSliceStart + 1; } else { long tmpSeqRegionStart = seqRegionStart; seqRegionStart = destSliceEnd - seqRegionEnd + 1; seqRegionEnd = destSliceEnd - tmpSeqRegionStart + 1; seqRegionStrand = -seqRegionStrand; } } // throw away features off the end of the requested slice if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) { continue; } iseSlice = destSlice; } IntronSupportingEvidence *ise = IntronSupportingEvidence_new(); IntronSupportingEvidence_setStart (ise, seqRegionStart); IntronSupportingEvidence_setEnd (ise, seqRegionEnd); IntronSupportingEvidence_setStrand (ise, seqRegionStrand); IntronSupportingEvidence_setSlice (ise, iseSlice); IntronSupportingEvidence_setAnalysis (ise, analysis); IntronSupportingEvidence_setAdaptor (ise, (BaseAdaptor *)isea); IntronSupportingEvidence_setDbID (ise, id); IntronSupportingEvidence_setHitName (ise, hitName); IntronSupportingEvidence_setScore (ise, score); IntronSupportingEvidence_setScoreType (ise, scoreType); IntronSupportingEvidence_setIsSpliceCanonical(ise, spliceCanonical); Vector_addElement(features, ise); } 
return features; }
// this function merges pairs that are adjacent into one // This function is a pain in the arse to implement in C void Mapper_mergePairs(Mapper *m) { int to = MAPPER_TO_IND; int from = MAPPER_FROM_IND; Mapper_setPairCount(m, 0); IDHash *toPairHash = Mapper_getPairHash(m, MAPPER_TO_IND); IDHash *fromPairHash = Mapper_getPairHash(m, MAPPER_FROM_IND); MapperPairSet **toPairValues = (MapperPairSet **)IDHash_getValues(toPairHash); int pairInd; for (pairInd = 0; pairInd<IDHash_getNumValues(toPairHash); pairInd++) { MapperPairSet *pairs = toPairValues[pairInd]; int i = 0; int next = 1; int length = MapperPairSet_getNumPair(pairs)-1; //$#{$lr}; while (next <= length) { MapperPair *currentPair = MapperPairSet_getPairAt(pairs, i); MapperPair *nextPair = MapperPairSet_getPairAt(pairs, next); MapperPair *delPair = NULL; if (MapperPair_isIndel(currentPair) || MapperPair_isIndel(nextPair)) { //necessary to modify the merge function to not merge indels next++; i++; } else { // duplicate filter if ( MapperPair_getUnit(currentPair,to)->start == MapperPair_getUnit(nextPair,to)->start && MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) { delPair = nextPair; } else if (( MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) && ( nextPair->ori == currentPair->ori ) && ( MapperPair_getUnit(nextPair,to)->start-1 == MapperPair_getUnit(currentPair,to)->end )) { if ( currentPair->ori == 1 ) { // check forward strand merge if ( MapperPair_getUnit(nextPair,from)->start-1 == MapperPair_getUnit(currentPair,from)->end) { // normal merge with previous element MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end; MapperPair_getUnit(currentPair,from)->end = MapperPair_getUnit(nextPair,from)->end; delPair = nextPair; } } else { // check backward strand merge if ( MapperPair_getUnit(nextPair,from)->end+1 == MapperPair_getUnit(currentPair,from)->start ) { // yes its a merge 
MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end; MapperPair_getUnit(currentPair,from)->start = MapperPair_getUnit(nextPair,from)->start; delPair = nextPair; } } } if (delPair != NULL) { // Have a pair to delete // Remove from the to pair set MapperPairSet_removePairAt(pairs, next); //splice( @$lr, $next, 1 ); MapperPairSet *fromPairs = IDHash_getValue(fromPairHash, MapperPair_getUnit(delPair, from)->id); //$self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})}; int j; for (j=0; j < MapperPairSet_getNumPair(fromPairs); j++) { MapperPair *fromPair = MapperPairSet_getPairAt(fromPairs, j); if ( fromPair == delPair) { // Is this really going to be an equality ??? //$lr_from->[$j] == $del_pair ) MapperPairSet_removePairAt(fromPairs, j); //splice( @$lr_from, $j, 1 ); break; } } // NIY: Do we need to free delPair??? length--; if ( length < next ) break; } else { next++; i++; } } } Mapper_addToPairCount(m, MapperPairSet_getNumPair(pairs)); // $self->{'pair_count'} += scalar( @$lr ); } }