/* =head2 add_map_coordinates Arg 1 int $id id of 'source' sequence Arg 2 int $start start coordinate of 'source' sequence Arg 3 int $end end coordinate of 'source' sequence Arg 4 int $strand relative orientation of source and target (+/- 1) Arg 5 int $id id of 'target' sequence Arg 6 int $start start coordinate of 'target' sequence Arg 7 int $end end coordinate of 'target' sequence Function Stores details of mapping between 'source' and 'target' regions. Returntype none Exceptions none Caller Bio::EnsEMBL::Mapper =cut */ void Mapper_addMapCoordinates(Mapper *m, IDType contigId, int contigStart, int contigEnd, int contigOri, IDType chrId, int chrStart, int chrEnd) { MapperPair *pair; MapperUnit *from; MapperUnit *to; IDHash *fromHash; IDHash *toHash; MapperPairSet *mps; if ((contigEnd - contigStart) != (chrEnd - chrStart)) { fprintf(stderr,"ERROR: Cannot deal with mis-lengthed mappings so far\n"); fprintf(stderr,"Contig %d to %d and chromosome %d to %d\n",contigStart,contigEnd, chrStart,chrEnd); exit(1); } pair = MapperPair_new(); from = MapperUnit_new(); from->start = contigStart; from->end = contigEnd; from->id = contigId; to = MapperUnit_new(); to->start = chrStart; to->end = chrEnd; to->id = chrId; MapperPair_setUnit(pair,MAPPER_TO_IND,to); MapperPair_setUnit(pair,MAPPER_FROM_IND,from); pair->ori = contigOri; // place into hash on both ids fromHash = Mapper_getPairHash(m, MAPPER_FROM_IND); toHash = Mapper_getPairHash(m, MAPPER_TO_IND); if (!IDHash_contains(toHash,chrId)) { IDHash_add(toHash,chrId,MapperPairSet_new()); } mps = (MapperPairSet *)IDHash_getValue(toHash,chrId); MapperPairSet_addPair(mps,pair); if (!IDHash_contains(fromHash,contigId)) { IDHash_add(fromHash,contigId,MapperPairSet_new()); } mps = (MapperPairSet *)IDHash_getValue(fromHash,contigId); MapperPairSet_addPair(mps,pair); Mapper_incPairCount(m); Mapper_setIsSorted(m,0); }
// This is almost identical to Mapper_addCoordinates (just the isIndel line as far as I can see!) - I should refactor this int Mapper_addIndelCoordinates( Mapper *m, IDType contigId, long contigStart, long contigEnd, int contigOri, IDType chrId, long chrStart, long chrEnd) { MapperPair *pair; MapperUnit *from; MapperUnit *to; IDHash *fromHash; IDHash *toHash; MapperPairSet *mps; //we need to create the IndelPair object to add to both lists, to and from pair = MapperPair_new(); from = MapperUnit_new(); from->start = contigStart; from->end = contigEnd; from->id = contigId; to = MapperUnit_new(); to->start = chrStart; to->end = chrEnd; to->id = chrId; MapperPair_setUnit(pair,MAPPER_TO_IND,to); MapperPair_setUnit(pair,MAPPER_FROM_IND,from); pair->ori = contigOri; pair->isIndel = 1; // place into hash on both ids fromHash = Mapper_getPairHash(m, MAPPER_FROM_IND); toHash = Mapper_getPairHash(m, MAPPER_TO_IND); if (!IDHash_contains(toHash,chrId)) { IDHash_add(toHash,chrId,MapperPairSet_new()); } mps = (MapperPairSet *)IDHash_getValue(toHash,chrId); MapperPairSet_addPair(mps,pair); if (!IDHash_contains(fromHash,contigId)) { IDHash_add(fromHash,contigId,MapperPairSet_new()); } mps = (MapperPairSet *)IDHash_getValue(fromHash,contigId); MapperPairSet_addPair(mps,pair); Mapper_incPairCount(m); Mapper_setIsSorted(m,0); return 1; }
/*
 * AssemblyMapper_registerAssembled  (Perl: register_assembled)
 *
 *   asmSeqRegionId  dbID of the sequence region to register
 *   chunkId         chunk number of that seq_region to register
 *
 * Flags the given chunk of an assembled region as registered in this
 * assembly mapper. The assembled register is a two-level hash:
 * seq region id -> (chunk id -> &trueVal). Registering a chunk that is
 * already present is a no-op.
 *
 * Per the original Perl notes this is intended for internal use by the
 * mapper itself and the AssemblyMapperAdaptor.
 */
void AssemblyMapper_registerAssembled(AssemblyMapper *am, IDType asmSeqRegionId, int chunkId) {
  IDHash *regionRegister = AssemblyMapper_getAssembledRegister(am);
  IDType  chunkKey       = (IDType)chunkId;
  IDHash *chunks;

  if (IDHash_contains(regionRegister, asmSeqRegionId)) {
    chunks = IDHash_getValue(regionRegister, asmSeqRegionId);
  } else {
    // first chunk seen for this region: create its per-chunk hash
    chunks = IDHash_new(IDHASH_MEDIUM);
    IDHash_add(regionRegister, asmSeqRegionId, chunks);
  }

  if (IDHash_contains(chunks, chunkKey)) {
    return;
  }

  IDHash_add(chunks, chunkKey, &trueVal);
}
/*
 * Returns 1 if the given chunk of the given assembled seq region is present
 * in the mapper's assembled register, 0 otherwise (including when the region
 * itself has never been registered).
 */
int AssemblyMapper_haveRegisteredAssembled(AssemblyMapper *am, IDType asmSeqRegionId, int chunkId) {
  IDHash *regionRegister = AssemblyMapper_getAssembledRegister(am);

  if (IDHash_contains(regionRegister, asmSeqRegionId)) {
    IDHash *chunks = IDHash_getValue(regionRegister, asmSeqRegionId);

    if (IDHash_contains(chunks, (IDType)chunkId)) {
      return 1;
    }
  }

  return 0;
}
/*
 * Flags a component seq region as registered in this assembly mapper by
 * storing &trueVal under its id in the component register.
 * Does nothing if the id is already present.
 */
void AssemblyMapper_registerComponent(AssemblyMapper *am, IDType cmpSeqRegionId) {
  IDHash *registered = AssemblyMapper_getComponentRegister(am);

  if (IDHash_contains(registered, cmpSeqRegionId)) {
    return;
  }

  IDHash_add(registered, cmpSeqRegionId, &trueVal);
}
/*
 * Fetch a chromosome by its database id.
 *
 * If the id is already in ca->chrCache the cached object is returned without
 * touching the database. Otherwise the chromosome table is queried and the
 * first row (if any) is converted with ChromosomeAdaptor_chromosomeFromRow.
 *
 * Returns NULL when the query yields no row. The statement handle is always
 * finished before returning.
 */
Chromosome *ChromosomeAdaptor_fetchByDbID(ChromosomeAdaptor *ca, IDType dbID) {
  if (IDHash_contains(ca->chrCache,dbID)) {
    return IDHash_getValue(ca->chrCache, dbID);
  }

  char qStr[256];

  // Bounded write: never run past qStr whatever IDFMTSTR expands to.
  snprintf(qStr, sizeof(qStr),
           "SELECT chromosome_id, name, length"
           " FROM chromosome"
           " WHERE chromosome_id = " IDFMTSTR, dbID);

  StatementHandle *sth = ca->prepare((BaseAdaptor *)ca,qStr,strlen(qStr));
  sth->execute(sth);

  Chromosome *chromosome = NULL;
  ResultRow  *row        = sth->fetchRow(sth);

  if (row != NULL) {
    chromosome = ChromosomeAdaptor_chromosomeFromRow(ca, row);
  }

  sth->finish(sth);

  return chromosome;
}
/*
 * Returns 1 if the component seq region id is present in the mapper's
 * component register, 0 otherwise.
 */
int AssemblyMapper_haveRegisteredComponent(AssemblyMapper *am, IDType cmpSeqRegionId) {
  IDHash *registered = AssemblyMapper_getComponentRegister(am);

  return IDHash_contains(registered, cmpSeqRegionId) ? 1 : 0;
}
// low level function to access the ranges // only use for read access Vector *RangeRegistry_getRanges(RangeRegistry *registry, IDType id) { IDHash *regReg = RangeRegistry_getRegistry(registry); Vector *list = NULL; if (IDHash_contains(regReg, id)) { list = IDHash_getValue(regReg, id); } return list; }
/*
 * Re-point a translation's start and end exons at their transformed
 * counterparts.
 *
 * exonTransforms is keyed by the exon pointer value cast to IDType. For each
 * of the start and end exon, if the hash has an entry for it the translation
 * is updated to the mapped exon; an exon with no entry is left untouched.
 */
void Translation_transform(Translation *translation, IDHash *exonTransforms) {
  // Both keys are captured up front, before either exon is replaced.
  IDType startKey = (IDType)Translation_getStartExon(translation);
  IDType endKey   = (IDType)Translation_getEndExon(translation);

  if (IDHash_contains(exonTransforms, startKey)) {
    Translation_setStartExon(translation, IDHash_getValue(exonTransforms, startKey));
  }
  // else: the start exon wasn't mapped - nothing to do

  if (IDHash_contains(exonTransforms, endKey)) {
    Translation_setEndExon(translation, IDHash_getValue(exonTransforms, endKey));
  }
  // else: the end exon wasn't mapped - nothing to do
}
/*
 * Fast-map a region through this assembly mapper.
 *
 *   frmSeqRegionName  name of the seq region the coordinates are on
 *   frmStart, frmEnd  region to map
 *   frmStrand         strand of the region
 *   frmCs             coordinate system of the source region; must be either
 *                     the component or the assembled coordinate system of
 *                     this mapper
 *   toSlice           not used by this implementation
 *
 * The source seq region is registered with the mapper (component side or
 * assembled side as appropriate) before Mapper_fastMap is called with the
 * matching type string ("component" or "assembled").
 *
 * If frmCs matches neither coordinate system an error is written to stderr
 * and the process exits with status 1. (Previously execution fell through
 * and passed an uninitialised type pointer to Mapper_fastMap.)
 */
MapperRangeSet *AssemblyMapper_fastMapImpl(AssemblyMapper *am, char *frmSeqRegionName, long frmStart, long frmEnd, int frmStrand,
                                           CoordSystem *frmCs, Slice *toSlice) {
  Mapper *mapper  = AssemblyMapper_getMapper(am);
  CoordSystem *asmCs  = AssemblyMapper_getAssembledCoordSystem(am);
  CoordSystem *cmpCs  = AssemblyMapper_getComponentCoordSystem(am);
  AssemblyMapperAdaptor *adaptor = AssemblyMapper_getAdaptor(am);
  char *frm = NULL;

  IDType seqRegionId = AssemblyMapper_getSeqRegionId(am, frmSeqRegionName, frmCs);

  // Speed critical section:
  // Try to do simple pointer equality comparisons of the coord system
  // objects first since this is likely to work most of the time and is
  // much faster than a function call.
  if ( frmCs == cmpCs || ( frmCs != asmCs && !CoordSystem_compare(frmCs, cmpCs)) ) {
    if ( !IDHash_contains(AssemblyMapper_getComponentRegister(am), seqRegionId) ) {
      AssemblyMapperAdaptor_registerComponent( adaptor, am, seqRegionId);
    }
    frm = "component";

  } else if ( frmCs == asmCs || !CoordSystem_compare(frmCs, asmCs) ) {
    // This can be probably be sped up some by only calling registered
    // assembled if needed.
    AssemblyMapperAdaptor_registerAssembled( adaptor, am, seqRegionId, frmStart, frmEnd);
    frm = "assembled";

  } else {
    fprintf(stderr,"Coordinate system %s %s is neither the assembled nor the component coordinate system of this AssemblyMapper\n",
            CoordSystem_getName(frmCs), CoordSystem_getVersion(frmCs) );
    // Fatal: there is no valid mapping direction to hand to Mapper_fastMap.
    exit(1);
  }

  return Mapper_fastMap( mapper, seqRegionId, frmStart, frmEnd, frmStrand, frm );
}
/*
 * Fetch all prediction transcripts on a slice, optionally preloading exons.
 *
 *   slice      slice to fetch on
 *   logicName  passed through to BaseFeatureAdaptor_fetchAllBySlice
 *   loadExons  when zero, or when fewer than two transcripts are found, the
 *              transcripts are returned as fetched (exons stay lazy-loaded)
 *
 * With loadExons set, all exons for the fetched transcripts are loaded with
 * one id query plus one exon fetch on a slice covering every transcript,
 * instead of one query per transcript, and attached via
 * PredictionTranscript_addExon.
 *
 * Returns the transcript Vector. If the query buffer cannot be built the
 * transcripts are returned without preloaded exons. If an exon cannot be
 * transferred onto the requested slice the process exits with status 1.
 *
 * The query buffer is sized from the number of transcript ids (it used to be
 * a fixed 655500 bytes, which enough ids could overrun), and the temporary
 * hash and key array are now released on the allocation-failure path.
 */
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {
  static const char queryHead[] =
    "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE prediction_transcript_id IN (";
  // Room reserved per id: a 64-bit decimal needs at most 20 characters, plus
  // the ", " separator, plus a little slack.
  enum { ID_TEXT_MAX = 24 };

  //my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name);
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  long minStart = 2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    }
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);
    }
  }

  // Use the requested slice if it already covers every transcript,
  // otherwise fetch a wider one over the same seq region.
  Slice *extSlice;
  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice),
                                          Slice_getSeqRegionName(slice),
                                          minStart,
                                          maxEnd,
                                          Slice_getStrand(slice),
                                          CoordSystem_getVersion(Slice_getCoordSystem(slice)),
                                          0);
  }

  // associate exon identifiers with transcripts
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(trHash);
  int     nId       = IDHash_getNumValues(trHash);

  // head + ids + closing ')' and terminator (sizeof(queryHead) counts the NUL)
  size_t qCap = sizeof(queryHead) + (size_t)nId * ID_TEXT_MAX + 2;
  char  *qStr = calloc(qCap, sizeof(char));

  if (qStr == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    free(uniqueIds);
    IDHash_free(trHash, NULL);
    return transcripts;
  }

  size_t endPoint = sizeof(queryHead) - 1;
  memcpy(qStr, queryHead, endPoint);

  for (i=0; i<nId; i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    int lenNum = snprintf(qStr + endPoint, qCap - endPoint, IDFMTSTR, uniqueIds[i]);
    if (lenNum < 0 || (size_t)lenNum >= qCap - endPoint) {
      // Cannot happen for 64-bit ids given the sizing above; bail out the
      // same way as an allocation failure rather than write out of bounds.
      fprintf(stderr,"Failed allocating qStr\n");
      free(qStr);
      free(uniqueIds);
      IDHash_free(trHash, NULL);
      return transcripts;
    }
    endPoint += (size_t)lenNum;
  }
  qStr[endPoint++] = ')';
  qStr[endPoint]   = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  // exon id -> Vector of (transcript, rank) pairs that use the exon
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  IDHash_free(trHash, NULL);

  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

    // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
/*
 * Number of positions in [start, end] covered by the ranges registered
 * under 'id'.
 *
 * Returns 0 when start > end, when the id has no entry, or when its range
 * list is empty (the empty-list case also writes a warning to stderr).
 *
 * A binary search on range end positions picks the index to start scanning
 * from; the scan then walks forward until a range starts beyond 'end'.
 */
long RangeRegistry_overlapSize(RangeRegistry *registry, IDType id, long start, long end) {
  if ( start > end ) {
    return 0;
  }

  IDHash *byId = RangeRegistry_getRegistry(registry);
  if (!IDHash_contains(byId, id)) {
    return 0; // No list for this id, so can't be any overlap
  }

  Vector *ranges  = IDHash_getValue(byId, id);
  int     nRanges = Vector_getNumElement(ranges);

  if ( nRanges == 0 ) {
    fprintf(stderr, "Odd have zero length list in RangeRegistry_overlapSize\n");
    return 0;
  }

  // binary search the relevant pairs
  // helps if the list is big
  int lo = 0;
  int hi = nRanges - 1;
  while ( hi - lo > 1 ) {
    int mid = ( lo + hi ) / 2;
    CoordPair *probe = Vector_getElementAt(ranges, mid);

    if ( CoordPair_getEnd(probe) < start ) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  long total = 0;
  int  idx;
  for (idx = lo; idx < nRanges; idx++) {
    CoordPair *cur      = Vector_getElementAt(ranges, idx);
    long       curStart = CoordPair_getStart(cur);
    long       curEnd   = CoordPair_getEnd(cur);

    if ( curStart > end ) {
      break;
    }

    // A single range swallowing the whole query decides the answer outright.
    if ( curStart <= start && curEnd >= end ) {
      return end - start + 1;
    }

    long clipStart = ( start < curStart ) ? curStart : start;
    long clipEnd   = ( end < curEnd ) ? end : curEnd;

    if ( clipEnd >= clipStart ) {
      total += clipEnd - clipStart + 1;
    }
  }

  return total;
}
/*
 * Mapper_listPairs  (Perl: Bio::EnsEMBL::Mapper::list_pairs)
 *
 *   id     id of 'source' sequence
 *   start  start coordinate of 'source' sequence
 *   end    end coordinate of 'source' sequence
 *   type   nature of transform - the type of coordinates to be
 *          transformed *from*
 *
 * Lists all mapping pairs for 'id' that touch the region. Passing -1 for
 * both start and end returns every pair stored for the id.
 *
 * Returns NULL when the id has no pairs on the chosen side; otherwise a
 * newly created MapperPairSet holding the selected pairs.
 *
 * The mapper is sorted first if it is not already. A start greater than end
 * is reported on stderr but processing continues. If no pair hash exists for
 * the chosen side the process exits with status 1.
 */
MapperPairSet *Mapper_listPairs(Mapper *m, IDType id, long start, long end, char *type) {
  if (start > end) {
    fprintf(stderr,"ERROR: Start is greater than end for id " IDFMTSTR ", start %ld, end %ld\n",id,start,end);
  }

  if (Mapper_getIsSorted(m) == 0) {
    Mapper_sort(m);
  }

  // Which side of each pair do the query coordinates live on?
  int from = Mapper_compareType(type, Mapper_getTo(m)) ? MAPPER_FROM_IND : MAPPER_TO_IND;

  IDHash *pairHash = Mapper_getPairHash(m, from);
  if (!pairHash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }

  if (!IDHash_contains(pairHash, id)) {
    return NULL;
  }

  MapperPairSet *stored  = IDHash_getValue(pairHash, id);
  MapperPairSet *matches = MapperPairSet_new();

  int wantAll = (start == -1 && end == -1);
  int nPair   = MapperPairSet_getNumPair(stored);
  int idx;

  for (idx = 0; idx < nPair; idx++) {
    MapperPair *candidate = MapperPairSet_getPairAt(stored, idx);

    if (!wantAll) {
      MapperUnit *fromCoord = MapperPair_getUnit(candidate, from);

      if (fromCoord->end < start) {
        continue;           // entirely before the region
      }
      if (fromCoord->start > end) {
        break;              // past the region: stop scanning
      }
    }

    MapperPairSet_addPair(matches, candidate);
  }

  return matches;
}
/*
 * Fetch all external database entries (xrefs) attached to an Ensembl object.
 *
 *   ensObj   database id of the Ensembl object (must be non-zero)
 *   ensType  object type string matched against oxr.ensembl_object_type
 *            (must be non-NULL)
 *
 * Returns a Vector of DBEntry, one per distinct xref_id. Because the query
 * outer-joins synonyms, an xref can appear on several rows; the first row
 * creates the DBEntry and every row contributes its synonym (if any).
 *
 * Exits with status 1 if either argument is missing, or if the assembled
 * query does not fit the query buffer (the query was previously built with
 * an unbounded sprintf, so a long ensType could overrun the buffer).
 *
 * NOTE(review): ensType is interpolated directly into the SQL text; confirm
 * callers only pass trusted type names.
 */
Vector *DBEntryAdaptor_fetchByObjectType(DBEntryAdaptor *dbea, IDType ensObj, char *ensType) {
  Vector *out;
  char qStr[1024];
  StatementHandle *sth;
  ResultRow *row;
  IDHash *seen;

  if (!ensObj) {
    fprintf(stderr,"Error: Can't fetchByObjectType without an object\n");
    exit(1);
  }

  if (!ensType) {
    fprintf(stderr,"Error: Can't fetchByObjectType without a type\n");
    exit(1);
  }

  // Not sure if idt identities are right way round
  int qLen = snprintf(qStr, sizeof(qStr),
    "SELECT xref.xref_id, xref.dbprimary_acc, xref.display_label, xref.version,"
    " xref.description,"
    " exDB.db_name, exDB.db_release, exDB.status,"
    " oxr.object_xref_id,"
    " es.synonym,"
    " idt.xref_identity, idt.ensembl_identity"
    " FROM (external_db exDB, object_xref oxr, xref xref)"
    " LEFT JOIN external_synonym es on es.xref_id = xref.xref_id"
    " LEFT JOIN identity_xref idt on idt.object_xref_id = oxr.object_xref_id"
    " WHERE xref.xref_id = oxr.xref_id"
    " AND xref.external_db_id = exDB.external_db_id"
    " AND oxr.ensembl_id = " IDFMTSTR
    " AND oxr.ensembl_object_type = '%s'",
    ensObj,
    ensType);

  // A truncated query would silently select the wrong rows - refuse to run it.
  if (qLen < 0 || (size_t)qLen >= sizeof(qStr)) {
    fprintf(stderr,"Error: Query too long in fetchByObjectType\n");
    exit(1);
  }

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);

  seen = IDHash_new(IDHASH_SMALL);
  out  = Vector_new();

  while ((row = sth->fetchRow(sth))) {
    DBEntry *exDB;
    IDType refID = row->getLongLongAt(row,0);

    // using an outer join on the synonyms as well as on identity_xref, we
    // now have to filter out the duplicates (see v.1.18 for
    // original). Since there is at most one identity_xref row per xref,
    // this is easy enough; all the 'extra' bits are synonyms
    if (!IDHash_contains(seen,refID)) {
      exDB = DBEntry_new();
      DBEntry_setAdaptor(exDB,(BaseAdaptor *)dbea);
      DBEntry_setDbID(exDB, refID);
      DBEntry_setPrimaryId(exDB, row->getStringAt(row,1));
      DBEntry_setDisplayId(exDB, row->getStringAt(row,2));
      DBEntry_setVersion(exDB, row->getStringAt(row,3));
      DBEntry_setDbName(exDB, row->getStringAt(row,5));
      DBEntry_setRelease(exDB, row->getStringAt(row,6));

      // identity columns are only present when the identity_xref join matched
      if (row->col(row,10)) {
        IdentityXref *idx = IdentityXref_new();
        DBEntry_setIdentityXref(exDB,idx);
        IdentityXref_setQueryIdentity(idx, row->getDoubleAt(row,10));
        IdentityXref_setTargetIdentity(idx, row->getDoubleAt(row,11));
      }

      if (row->col(row,4)) DBEntry_setDescription(exDB, row->getStringAt(row,4));
      if (row->col(row,7)) DBEntry_setStatus(exDB, row->getStringAt(row,7));

      Vector_addElement(out, exDB);
      IDHash_add(seen, refID, exDB);
    }

    exDB = IDHash_getValue(seen, refID);

    if (row->col(row,9)) {
      DBEntry_addSynonym(exDB,row->getStringAt(row,9));
    }
  }

  // 'seen' only indexes entries owned by 'out', so free the hash alone.
  IDHash_free(seen, NULL);

  sth->finish(sth);

  return out;
}
// NIY: May need some reworking to handle mapInsert because I'd changed the way it returns data // Change back to returning MapperRangeSet MapperRangeSet *Mapper_fastMap(Mapper *m, IDType id, long start, long end, int strand, char *type) { MapperPairSet *pairs; int i; IDHash *hash; int from, to; CoordSystem *cs; if(end+1 == start) { return Mapper_mapInsert(m, id, start, end, strand, type, 1); } if(!Mapper_compareType(type, Mapper_getTo(m))) { from = MAPPER_TO_IND; to = MAPPER_FROM_IND; cs = Mapper_getFromCoordSystem(m); } else { from = MAPPER_FROM_IND; to = MAPPER_TO_IND; cs = Mapper_getToCoordSystem(m); } hash = Mapper_getPairHash(m, from); if (!hash) { fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type); exit(1); } if (Mapper_getIsSorted(m) == 0) { Mapper_sort(m); } if (!IDHash_contains(hash, id)) { fprintf(stderr,"ERROR: Fastmap expects to be able to find an id. It couldnt for " IDFMTSTR "\n",id); exit(1); } pairs = IDHash_getValue(hash,id); MapperRangeSet *retSet = MapperRangeSet_new(); for (i=0;i<MapperPairSet_getNumPair(pairs);i++) { MapperPair *pair = MapperPairSet_getPairAt(pairs,i); MapperUnit *selfCoord = MapperPair_getUnit(pair, from); MapperUnit *targetCoord = MapperPair_getUnit(pair, to); // only super easy mapping is done if(start < selfCoord->start || end > selfCoord->end ) { continue; } if (pair->ori == 1) { MapperCoordinate *retRange = MapperCoordinate_new(targetCoord->id, targetCoord->start + start - selfCoord->start, targetCoord->start + end - selfCoord->start, strand, cs, 0); // Perl didn't set rank, so use 0 /* retRange->id = targetCoord->id; retRange->start = targetCoord->start + start - selfCoord->start; retRange->end = targetCoord->start + end - selfCoord->start; retRange->strand = strand; retRange->coordSystem = cs; */ MapperRangeSet_addRange(retSet, (MapperRange *)retRange); break; } else { MapperCoordinate *retRange = MapperCoordinate_new(targetCoord->id, targetCoord->end - (end - selfCoord->start), 
targetCoord->end - (start - selfCoord->start), -strand, cs, 0); // Perl didn't set rank, so use 0 /* retRange->id = targetCoord->id; retRange->start = targetCoord->end - (end - selfCoord->start); retRange->end = targetCoord->end - (start - selfCoord->start); retRange->strand = -strand; retRange->coordSystem = cs; */ MapperRangeSet_addRange(retSet, (MapperRange *)retRange); break; } } // NIY: Here we return empty set, in mapInsert it returns NULL for empty fastmap - need to work out which is right return retSet; }
/*
 * Mapper_mapCoordinates
 *
 * Map the region [start, end] of sequence 'id' (expressed in coordinate type
 * 'type') through the mapper.
 *
 * Returns a MapperRangeSet covering the whole request: a MapperCoordinate
 * (or IndelCoordinate for indel pairs) for every stored pair that overlaps
 * the region, and a MapperGap for every stretch not covered by a pair. When
 * strand is -1 the resulting set is reversed.
 *
 * Special cases:
 *   start == end + 1  the request is an insert and is delegated to
 *                     Mapper_mapInsert (fastmap flag 0)
 *   start >  end + 1  error on stderr, exit(1)
 *   unknown type / no pair hash for the chosen side: error on stderr, exit(1)
 *   id without pairs: the result is one gap over [start, end]
 */
MapperRangeSet *Mapper_mapCoordinates(Mapper *m, IDType id, long start, long end, int strand, char *type) {
  // special case for handling inserts:
  if (start == end + 1) {
    return Mapper_mapInsert(m, id, start, end, strand, type, 0 /*fastmap flag */);
  }
  if (start > end + 1) {
    fprintf(stderr,"ERROR: Start is greater than end for id " IDFMTSTR ", start %ld, end %ld\n",id,start,end);
    exit(1);
  }

  if (Mapper_getIsSorted(m) == 0) {
    Mapper_sort(m);
  }

  int from;
  int to;
  CoordSystem *cs;

  if (!Mapper_compareType(type, Mapper_getTo(m))) {
    from = MAPPER_TO_IND;
    to   = MAPPER_FROM_IND;
    cs   = Mapper_getFromCoordSystem(m);
  } else if (!Mapper_compareType(type, Mapper_getFrom(m))) {
    from = MAPPER_FROM_IND;
    to   = MAPPER_TO_IND;
    cs   = Mapper_getToCoordSystem(m);
  } else {
    fprintf(stderr, "Invalid type [%s] in mapper (not from [%s] or to [%s])\n", type, Mapper_getFrom(m), Mapper_getTo(m));
    exit(1);
  }

  IDHash *pairHash = Mapper_getPairHash(m, from);
  if (!pairHash) {
    fprintf(stderr,"ERROR: Type %s is neither to or from coordinate system\n",type);
    exit(1);
  }

  MapperRangeSet *results = MapperRangeSet_new();

  // Was upcasing the id - its a number in C, I haven't found a case yet where its a string
  if (!IDHash_contains(pairHash, id)) {
    // one big gap!
    MapperRange *wholeGap = (MapperRange *)MapperGap_new(start, end, 0); // Perl didn't set rank so use 0
    MapperRangeSet_addRange(results, wholeGap);
    return results;
  }

  MapperPairSet *pairs = IDHash_getValue(pairHash, id);
  int nPair = MapperPairSet_getNumPair(pairs);

  // binary search the relevant pairs
  // helps if the list is big
  int lo = 0;
  int hi = nPair - 1;
  while (hi - lo > 1) {
    int mid = (lo + hi) / 2;
    MapperUnit *probe = MapperPair_getUnit(MapperPairSet_getPairAt(pairs, mid), from);

    if (probe->end < start) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  MapperPair *lastUsedPair = NULL;
  int    rank                 = 0;
  long   origStart            = start;
  IDType lastTargetCoord      = 0;   // only read once lastTargetCoordIsSet is true
  int    lastTargetCoordIsSet = 0;
  int    idx;

  for (idx = lo; idx < nPair; idx++) {
    MapperPair *pair        = MapperPairSet_getPairAt(pairs, idx);
    MapperUnit *selfCoord   = MapperPair_getUnit(pair, from);
    MapperUnit *targetCoord = MapperPair_getUnit(pair, to);

    //
    // But not the case for haplotypes!! need to test for this case???
    // so removing this till a better solution is found
    //
    //     if($self_coord->{'start'} < $start){
    //       $start = $orig_start;
    //       $rank++;
    //     }
    if (lastTargetCoordIsSet && targetCoord->id != lastTargetCoord) {
      if (selfCoord->start < start) {
        // i.e. the same bit is being mapped to another assembled bit
        start = origStart;
      }
    } else {
      lastTargetCoord      = targetCoord->id;
      lastTargetCoordIsSet = 1;
    }

    // if we haven't even reached the start, move on
    if (selfCoord->end < origStart) {
      continue;
    }

    // if we have over run, break
    if (selfCoord->start > end) {
      break;
    }

    // Check is start not origStart
    if (start < selfCoord->start) {
      // gap detected
      MapperRange *leadGap = (MapperRange *)MapperGap_new(start, selfCoord->start - 1, rank);
      MapperRangeSet_addRange(results, leadGap);
      start = leadGap->end + 1;
    }

    int          mappedStrand = pair->ori * strand;
    MapperRange *res;

    if (MapperPair_isIndel(pair)) {
      // When next pair is an IndelPair and not a Coordinate, create the
      // new mapping Coordinate, the IndelCoordinate.
      long gapEnd = (selfCoord->end < end) ? selfCoord->end : end;

      // create a Gap object
      MapperGap *indelGap = MapperGap_new(start, gapEnd, 0); // Perl didn't set rank - don't know if need to

      // create the Coordinate object
      MapperCoordinate *indelCoord = MapperCoordinate_new(targetCoord->id,
                                                          targetCoord->start,
                                                          targetCoord->end,
                                                          mappedStrand,
                                                          cs,
                                                          0); // Perl didn't set rank - don't know if need to

      // and finally, the IndelCoordinate object wrapping both
      res = (MapperRange *)IndelCoordinate_new(indelGap, indelCoord);
    } else {
      long targetStart;
      long targetEnd;

      // Either we are enveloping this map or not. If yes, then the end
      // point (self perspective) is determined solely by target. If
      // not we need to adjust.
      int enveloped = (end > selfCoord->end);

      if (pair->ori == 1) {
        // start is somewhere inside the region
        targetStart = targetCoord->start + (start - selfCoord->start);
        targetEnd   = enveloped ? targetCoord->end
                                : targetCoord->start + (end - selfCoord->start);
      } else {
        targetEnd   = targetCoord->end - (start - selfCoord->start);
        targetStart = enveloped ? targetCoord->start
                                : targetCoord->end - (end - selfCoord->start);
      }

      res = (MapperRange *)MapperCoordinate_new(targetCoord->id, targetStart, targetEnd, mappedStrand, cs, rank);
    }

    MapperRangeSet_addRange(results, res);

    lastUsedPair = pair;
    start        = selfCoord->end + 1;
  }

  if (lastUsedPair == NULL) {
    // nothing overlapped: whatever is left of the request is a gap
    MapperRange *restGap = (MapperRange *)MapperGap_new(start, end, 0); // Perl doesn't set rank, so use 0
    MapperRangeSet_addRange(results, restGap);
  } else if (MapperPair_getUnit(lastUsedPair, from)->end < end) {
    // gap at the end
    MapperRange *tailGap = (MapperRange *)MapperGap_new(MapperPair_getUnit(lastUsedPair, from)->end + 1,
                                                        end,
                                                        0); // Perl didn't set rank so use 0
    MapperRangeSet_addRange(results, tailGap);
  }

  if (strand == -1) {
    MapperRangeSet_reverse(results);
  }

  return results;
}
/* =head2 _objs_from_sth Arg [1] : DBI:st $sth An executed DBI statement handle Arg [2] : (optional) Bio::EnsEMBL::Mapper $mapper An mapper to be used to convert contig coordinates to assembly coordinates. Arg [3] : (optional) Bio::EnsEMBL::Slice $slice A slice to map the prediction transcript to. Example : $p_transcripts = $self->_objs_from_sth($sth); Description: Creates a list of Prediction transcripts from an executed DBI statement handle. The columns retrieved via the statement handle must be in the same order as the columns defined by the _columns method. If the slice argument is provided then the the prediction transcripts will be in returned in the coordinate system of the $slice argument. Otherwise the prediction transcripts will be returned in the RawContig coordinate system. Returntype : reference to a list of Bio::EnsEMBL::PredictionTranscripts Exceptions : none Caller : superclass generic_fetch Status : Stable =cut */ Vector *PredictionTranscriptAdaptor_objectsFromStatementHandle(PredictionTranscriptAdaptor *pta, StatementHandle *sth, AssemblyMapper *assMapper, Slice *destSlice) { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba); AnalysisAdaptor *aa = DBAdaptor_getAnalysisAdaptor(pta->dba); Vector *pTranscripts = Vector_new(); IDHash *sliceHash = IDHash_new(IDHASH_SMALL); long destSliceStart; long destSliceEnd; int destSliceStrand; long destSliceLength; char * destSliceSrName; IDType destSliceSrId = 0; if (destSlice) { destSliceStart = Slice_getStart(destSlice); destSliceEnd = Slice_getEnd(destSlice); destSliceStrand = Slice_getStrand(destSlice); destSliceLength = Slice_getLength(destSlice); destSliceSrName = Slice_getSeqRegionName(destSlice); destSliceSrId = Slice_getSeqRegionId(destSlice); } ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType predictionTranscriptId = row->getLongLongAt(row,0); IDType seqRegionId = row->getLongLongAt(row,1); long seqRegionStart = row->getLongAt(row,2); long seqRegionEnd = row->getLongAt(row,3); int 
seqRegionStrand = row->getIntAt(row,4); IDType analysisId = row->getLongLongAt(row,5); char *displayLabel = row->getStringAt(row,6); // get the analysis object Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId); if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } Slice *slice = IDHash_getValue(sliceHash, seqRegionId); Slice *ptSlice = slice; char *srName = Slice_getSeqRegionName(slice); CoordSystem *srCs = Slice_getCoordSystem(slice); // // remap the feature coordinates to another coord system // if a mapper was provided // if (assMapper != NULL) { MapperRangeSet *mrs; // Slightly suspicious about need for this if statement so left in perl statements for now if (destSlice != NULL && assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) { mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice); } else { mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL); } // skip features that map to gaps or coord system boundaries if (MapperRangeSet_getNumRange(mrs) == 0) { continue; } MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0); if (range->rangeType == MAPPERRANGE_GAP) { fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n"); exit(1); } else { MapperCoordinate *mc = (MapperCoordinate *)range; seqRegionId = mc->id; seqRegionStart = mc->start; seqRegionEnd = mc->end; seqRegionStrand = mc->strand; } MapperRangeSet_free(mrs); if (! 
IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } ptSlice = IDHash_getValue(sliceHash, seqRegionId); } // // If a destination slice was provided convert the coords // If the dest_slice starts at 1 and is foward strand, nothing needs doing // if (destSlice != NULL) { if (destSliceStart != 1 || destSliceStrand != 1) { if (destSliceStrand == 1) { seqRegionStart = seqRegionStart - destSliceStart + 1; seqRegionEnd = seqRegionEnd - destSliceStart + 1; } else { long tmpSeqRegionStart = seqRegionStart; seqRegionStart = destSliceEnd - seqRegionEnd + 1; seqRegionEnd = destSliceEnd - tmpSeqRegionStart + 1; seqRegionStrand = -seqRegionStrand; } } // throw away features off the end of the requested slice if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) { continue; } ptSlice = destSlice; } // Finally, create the new PredictionTranscript. PredictionTranscript *pt = PredictionTranscript_new(); PredictionTranscript_setStart (pt, seqRegionStart); PredictionTranscript_setEnd (pt, seqRegionEnd); PredictionTranscript_setStrand (pt, seqRegionStrand); PredictionTranscript_setSlice (pt, ptSlice); PredictionTranscript_setAnalysis (pt, analysis); PredictionTranscript_setAdaptor (pt, (BaseAdaptor *)pta); PredictionTranscript_setDbID (pt, predictionTranscriptId); PredictionTranscript_setDisplayLabel(pt, displayLabel); Vector_addElement(pTranscripts, pt); } IDHash_free(sliceHash, NULL); return pTranscripts; }
/*
 * Merge two sets of genomic alignments that share a middle sequence.
 *
 *   alignSet1  alignments whose QUERY side lies on the shared sequence
 *   alignSet2  alignments whose CONSENSUS side lies on the shared sequence
 *
 * Every alignment contributes two events to one list: its start, and its
 * end + 0.5 (so that an end sorts after a start at the same position). The
 * list is sorted with GenomicAlignListElem_compFunc and swept once while
 * tracking, per input set, which alignments are currently "open". Whenever
 * an alignment opens, it is paired with every currently open alignment of
 * the other set via GenomicAlignAdaptor_addDerivedAlignments (set 1 member
 * first, set 2 member second).
 *
 * Returns a new Vector of derived alignments.
 *
 * The two tracking hashes and the event vector are released before
 * returning (they used to leak on every call).
 */
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  int i;
  Vector *bigList = Vector_new();
  IDHash *overlappingSets[2];
  Vector *mergedAligns;

  for (i=0;i<Vector_getNumElement(alignSet1); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet1, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryStart(align), align, 0));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryEnd(align)+0.5, align, 0));
  }

  for (i=0;i<Vector_getNumElement(alignSet2); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet2, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusStart(align), align, 1));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusEnd(align)+0.5, align, 1));
  }

  Vector_sort(bigList, GenomicAlignListElem_compFunc);

  // walking from start to end through sortlist and keep track of the
  // currently overlapping set of Alignments
  overlappingSets[0] = IDHash_new(IDHASH_SMALL);
  overlappingSets[1] = IDHash_new(IDHASH_SMALL);

  mergedAligns = Vector_new();

  for (i=0; i<Vector_getNumElement(bigList); i++) {
    GenomicAlignListElem *gale  = Vector_getElementAt(bigList,i);
    GenomicAlign         *align = gale->align;
    IDType alignID = GenomicAlign_getDbID(align);
    int    setNo   = gale->setNum;

    if (IDHash_contains(overlappingSets[setNo], alignID)) {
      // second event for this alignment (its end):
      // remove from current overlapping set
      IDHash_remove(overlappingSets[setNo], alignID, NULL);
    } else {
      int j;
      // Snapshot of the other set, taken before this alignment is inserted.
      int    nOther = IDHash_getNumValues(overlappingSets[1-setNo]);
      void **values = IDHash_getValues(overlappingSets[1-setNo]);

      // insert into the set and do all the overlap business
      IDHash_add(overlappingSets[setNo], alignID, align);

      // the other set contains everything this align overlaps with
      for (j=0; j<nOther; j++) {
        GenomicAlign *align2 = values[j];
        if (setNo == 0) {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align, align2);
        } else {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align2, align);
        }
      }
      free(values);
    }
  }

  // The hashes only reference alignments owned by the callers' input sets.
  IDHash_free(overlappingSets[0], NULL);
  IDHash_free(overlappingSets[1], NULL);

  // NOTE(review): no free function is registered on bigList here, so whether
  // the GenomicAlignListElem entries are released depends on Vector_free's
  // default behaviour - confirm, and register an element destructor if one
  // exists (the original "NIY Free gale" note).
  Vector_free(bigList);

  return mergedAligns;
}
// Also added a flag to indicate we actually want the gaps vector returned - quite often its not used in the caller and so would leak // memory Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, long rStart, long rEnd, int wantGaps) { // The following was commented out due to Ensembl Genomes requirements // for bacterial genomes. // The following was uncommented because I'm not caring about those requirements if ( start > end ) { fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end); exit(1); } if ( rStart > rEnd ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd); exit(1); } if ( rStart > start ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start); exit(1); } if ( rEnd < end ) { fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end); exit(1); } IDHash *regReg = RangeRegistry_getRegistry(registry); Vector *list; if (IDHash_contains(regReg, id)) { list = IDHash_getValue(regReg, id); } else { list = Vector_new(); IDHash_add(regReg, id, list); } Vector *gapPairs = NULL; if (wantGaps) { gapPairs = Vector_new(); } int len = Vector_getNumElement(list); if (len == 0) { //this is the first request for this id, return a gap pair for the // entire range and register it as seen CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); return Vector_copy(list); } //#### // loop through the list of existing ranges recording any "gaps" where // the existing range does not cover part of the requested range // int startIdx = 0; int endIdx = Vector_getNumElement(list)-1; int midIdx; CoordPair *range; // binary search the relevant pairs // helps if the list is big while ( ( endIdx - startIdx ) > 1 ) { midIdx = ( startIdx + endIdx ) >> 1; range = Vector_getElementAt(list, midIdx); if ( CoordPair_getEnd(range) < rStart ) 
{ startIdx = midIdx; } else { endIdx = midIdx; } } long gapStart; long gapEnd; int rIdx = -1; int rStartIdx = -1; int rEndIdx; gapStart = rStart; int i; for (i=startIdx; i < len ; i++ ) { CoordPair *pRange = Vector_getElementAt(list, i); long pStart = CoordPair_getStart(pRange); long pEnd = CoordPair_getEnd(pRange); // no work needs to be done at all if we find a range pair that // entirely overlaps the requested region if ( pStart <= start && pEnd >= end ) { return Vector_new(); // perl returns undef, but that causes me problems } // find adjacent or overlapping regions already registered if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) { if ( rStartIdx < 0 ) { // Not yet been set rStartIdx = i; } rEndIdx = i; } if ( pStart > rStart ) { gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1; if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, gapEnd); Vector_addElement(gapPairs, cp); } } gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1; if ( pEnd >= rEnd && rIdx < 0 ) { rIdx = i; break; } } // do we have to make another gap? 
if ( gapStart <= rEnd ) { if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, rEnd); Vector_addElement(gapPairs, cp); } } // // Merge the new range into the registered list // if (rStartIdx >= 0 ) { // rStartIdx has been set to something long newStart; long newEnd; CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); CoordPair *rEndIdxRange = Vector_getElementAt(list, rEndIdx); if ( rStart < CoordPair_getStart(rStartIdxRange)) { newStart = rStart; } else { newStart = CoordPair_getStart(rStartIdxRange); } if ( rEnd > CoordPair_getEnd(rEndIdxRange)) { newEnd = rEnd; } else { newEnd = CoordPair_getEnd(rEndIdxRange); } CoordPair *cp = CoordPair_new(newStart, newEnd); // Think its <= for (i=rStartIdx; i<=rEndIdx; i++) { Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called } Vector_insertElementAt(list, rStartIdx, cp); //splice( @$list, $rstart_idx, // $rend_idx - $rstart_idx + 1, // [ $new_start, $new_end ] ); } else if (rIdx >= 0) { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_insertElementAt(list, rIdx, cp); //splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); } else { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); } // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK return gapPairs; }
Vector *IntronSupportingEvidenceAdaptor_objectsFromStatementHandle(IntronSupportingEvidenceAdaptor *isea, StatementHandle *sth, AssemblyMapper *assMapper, Slice *destSlice) { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(isea->dba); AnalysisAdaptor *aa = DBAdaptor_getAnalysisAdaptor(isea->dba); Vector *features = Vector_new(); IDHash *sliceHash = IDHash_new(IDHASH_SMALL); /* Unneccesary my %analysis_hash; my %sr_name_hash; my %sr_cs_hash; */ /* Unused my $asm_cs; my $cmp_cs; my $asm_cs_vers; my $asm_cs_name; my $cmp_cs_vers; my $cmp_cs_name; if($mapper) { $asm_cs = $mapper->assembled_CoordSystem(); $cmp_cs = $mapper->component_CoordSystem(); $asm_cs_name = $asm_cs->name(); $asm_cs_vers = $asm_cs->version(); $cmp_cs_name = $cmp_cs->name(); $cmp_cs_vers = $cmp_cs->version(); } */ long destSliceStart; long destSliceEnd; int destSliceStrand; long destSliceLength; //CoordSystem *destSliceCs; char * destSliceSrName; IDType destSliceSrId = 0; //AssemblyMapperAdaptor *asma; if (destSlice) { destSliceStart = Slice_getStart(destSlice); destSliceEnd = Slice_getEnd(destSlice); destSliceStrand = Slice_getStrand(destSlice); destSliceLength = Slice_getLength(destSlice); //??destSliceCs = Slice_getCoordSystem(destSlice); destSliceSrName = Slice_getSeqRegionName(destSlice); destSliceSrId = Slice_getSeqRegionId(destSlice); //??asma = DBAdaptor_getAssemblyMapperAdaptor(ea->dba); } ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType id = row->getLongLongAt(row,0); IDType analysisId = row->getLongLongAt(row,1); IDType seqRegionId = row->getLongLongAt(row,2); long seqRegionStart = row->getLongAt(row,3); long seqRegionEnd = row->getLongAt(row,4); int seqRegionStrand = row->getIntAt(row,5); char *hitName = row->getStringAt(row,6); double score = row->getDoubleAt(row,7); char *scoreType = row->getStringAt(row,8); int spliceCanonical = row->getIntAt(row,9); // get the analysis object Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId); /* // need to get the 
internal_seq_region, if present $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); #get the slice object my $slice = $slice_hash{"ID:".$seq_region_id}; if(!$slice) { $slice = $sa->fetch_by_seq_region_id($seq_region_id); $slice_hash{"ID:".$seq_region_id} = $slice; $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); $sr_cs_hash{$seq_region_id} = $slice->coord_system(); } my $sr_name = $sr_name_hash{$seq_region_id}; my $sr_cs = $sr_cs_hash{$seq_region_id}; */ if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } Slice *slice = IDHash_getValue(sliceHash, seqRegionId); Slice *iseSlice = slice; char *srName = Slice_getSeqRegionName(slice); CoordSystem *srCs = Slice_getCoordSystem(slice); // // remap the feature coordinates to another coord system // if a mapper was provided // if (assMapper != NULL) { MapperRangeSet *mrs; // Slightly suspicious about need for this if statement so left in perl statements for now if (destSlice != NULL && assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) { mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice); } else { mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL); } // skip features that map to gaps or coord system boundaries //next FEATURE if (!defined($seq_region_id)); if (MapperRangeSet_getNumRange(mrs) == 0) { continue; } MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0); if (range->rangeType == MAPPERRANGE_GAP) { fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n"); exit(1); } else { MapperCoordinate *mc = (MapperCoordinate *)range; seqRegionId = mc->id; seqRegionStart = mc->start; seqRegionEnd = mc->end; seqRegionStrand = mc->strand; } MapperRangeSet_free(mrs); /* Was - but identical if and else so why test??? 
#get a slice in the coord system we just mapped to if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { $slice = $slice_hash{"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id); } else { $slice = $slice_hash{"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id); } */ // Instead... if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } iseSlice = IDHash_getValue(sliceHash, seqRegionId); } // // If a destination slice was provided convert the coords // If the dest_slice starts at 1 and is foward strand, nothing needs doing // if (destSlice != NULL) { if (destSliceStart != 1 || destSliceStrand != 1) { if (destSliceStrand == 1) { seqRegionStart = seqRegionStart - destSliceStart + 1; seqRegionEnd = seqRegionEnd - destSliceStart + 1; } else { long tmpSeqRegionStart = seqRegionStart; seqRegionStart = destSliceEnd - seqRegionEnd + 1; seqRegionEnd = destSliceEnd - tmpSeqRegionStart + 1; seqRegionStrand = -seqRegionStrand; } } // throw away features off the end of the requested slice if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) { continue; } iseSlice = destSlice; } IntronSupportingEvidence *ise = IntronSupportingEvidence_new(); IntronSupportingEvidence_setStart (ise, seqRegionStart); IntronSupportingEvidence_setEnd (ise, seqRegionEnd); IntronSupportingEvidence_setStrand (ise, seqRegionStrand); IntronSupportingEvidence_setSlice (ise, iseSlice); IntronSupportingEvidence_setAnalysis (ise, analysis); IntronSupportingEvidence_setAdaptor (ise, (BaseAdaptor *)isea); IntronSupportingEvidence_setDbID (ise, id); IntronSupportingEvidence_setHitName (ise, hitName); IntronSupportingEvidence_setScore (ise, score); IntronSupportingEvidence_setScoreType (ise, scoreType); IntronSupportingEvidence_setIsSpliceCanonical(ise, spliceCanonical); Vector_addElement(features, ise); } 
return features; }
// Note I didn't implement the stable id fetching uggliness here. I'll probably make a separate method for that // if necessary Vector *BaseAdaptor_uncachedFetchAllByDbIDList(BaseAdaptor *ba, Vector *idList, Slice *slice) { if ( idList == NULL) { fprintf(stderr, "id_list list reference argument is required - bye!"); return NULL; } char constraintPref[1024]; if (!Vector_getNumElement(idList)) { return Vector_new(); } NameTableType *tables = ba->getTables(); char **t = (*tables)[0]; sprintf(constraintPref, "%s.%s_id ", t[SYN], t[NAME] ); // Ensure that we do not exceed MySQL's max_allowed_packet (defaults to // 1 MB) splitting large queries into smaller queries of at most 256 KB. // Assuming a (generous) average dbID string // length of 16, this means 16384 dbIDs in each query. int maxSize = 16384; // Uniquify the list IDHash *idListHash = IDHash_new(IDHASH_MEDIUM); int i; for (i=0; i<Vector_getNumElement(idList); i++) { IDType id = *(IDType *)(Vector_getElementAt(idList, i)); if (!IDHash_contains(idListHash, id)) { IDHash_add(idListHash, id, &trueVal); } } IDType *uniqueIds = IDHash_getKeys(idListHash); int nUniqueId = IDHash_getNumValues(idListHash); IDHash_free(idListHash, NULL); Vector *out = Vector_new(); int lenNum; for (i=0; i<nUniqueId; i+=maxSize) { char *constraint = NULL; if ((constraint = (char *)calloc(655500,sizeof(char))) == NULL) { fprintf(stderr,"Failed allocating constraint\n"); return out; } strcpy(constraint, constraintPref); // Special case for one remaining Id if (i == nUniqueId-1) { sprintf(constraint, "%s = "IDFMTSTR, constraint, uniqueIds[i]); } else { char tmpStr[1024]; int endPoint = sprintf(constraint, "%s IN (", constraint); int j; for (j=0; j<maxSize && j+i<nUniqueId; j++) { if (j!=0) { constraint[endPoint++] = ','; constraint[endPoint++] = ' '; } lenNum = sprintf(tmpStr, IDFMTSTR, uniqueIds[i+j]); memcpy(&(constraint[endPoint]), tmpStr, lenNum); endPoint+=lenNum; } constraint[endPoint++] = ')'; constraint[endPoint] = '\0'; } Vector 
*resChunk = BaseAdaptor_genericFetch(ba, constraint, NULL, slice); Vector_append(out, resChunk); Vector_free(resChunk); free(constraint); } free(uniqueIds); return out; }