/*
 * Sort every MapperPairSet held in one of the mapper's pair hashes on the
 * given coordinate index ('from' or 'to'), then release the temporary
 * values array returned by IDHash_getValues.
 */
static void Mapper_sortPairSetsByIndex(Mapper *m, int ind) {
  IDHash *hash = Mapper_getPairHash(m, ind);
  MapperPairSet **pairSets = (MapperPairSet **)IDHash_getValues(hash);
  int nPairSet = IDHash_getNumValues(hash);
  int setInd;

  for (setInd = 0; setInd < nPairSet; setInd++) {
    MapperPairSet_sort(pairSets[setInd], ind);
  }
  free(pairSets);
}

/*
 * Sort all pair sets in both the 'from' and 'to' hashes, merge adjacent
 * pairs, and mark the mapper as sorted.
 */
void Mapper_sort(Mapper *m) {
  Mapper_sortPairSetsByIndex(m, MAPPER_FROM_IND);
  Mapper_sortPairSetsByIndex(m, MAPPER_TO_IND);

  // MergePairs yet to do
  Mapper_mergePairs(m);

  Mapper_setIsSorted(m, 1);
}
/*
 * Debug helper: print every MapperPair in one of the mapper's pair hashes
 * to fp (stderr when fp is NULL). For each pair set it prints the set's id
 * and pair count, then one line per pair with the from/to coordinates.
 *
 * NOTE(review): the variable is named fromHash and the label printed is
 * "From Hash", but the hash fetched is MAPPER_TO_IND — one of the two is
 * probably wrong; confirm which side this was meant to dump.
 */
void Mapper_dump(Mapper *m, FILE *fp) {
  IDHash *fromHash;
  IDType *keys;
  int nKey;
  int i;

  // Default to stderr so the function is safe to call with NULL.
  if (fp==NULL) {
    fp = stderr;
  }

  fromHash = Mapper_getPairHash(m, MAPPER_TO_IND);

  keys = IDHash_getKeys(fromHash);
  // assumes the number of values equals the number of keys — TODO confirm
  // IDHash guarantees one value per key
  nKey = IDHash_getNumValues(fromHash);

  for (i=0;i<nKey;i++) {
    IDType id = keys[i];
    MapperPairSet *set = IDHash_getValue(fromHash,id);
    int j;

    fprintf(fp, "From Hash " IDFMTSTR " with %d pairs\n",id, MapperPairSet_getNumPair(set));

    for (j=0; j<MapperPairSet_getNumPair(set); j++) {
      MapperPair *pair = MapperPairSet_getPairAt(set,j);
      MapperUnit *fromCoord = MapperPair_getUnit(pair, MAPPER_FROM_IND);
      MapperUnit *toCoord = MapperPair_getUnit(pair, MAPPER_TO_IND);

      fprintf(fp, " %ld %ld:%ld %ld " IDFMTSTR "\n",fromCoord->start,fromCoord->end, toCoord->start,toCoord->end,toCoord->id);
    }
  }

  // keys array is owned by the caller of IDHash_getKeys.
  free(keys);
}
// This function merges pairs that are adjacent into one.
// (Ported from Perl Bio::EnsEMBL::Mapper::_merge_pairs — hence the
// splice()-style index bookkeeping below. "A pain in the arse to
// implement in C", as the original author noted.)
//
// Walks each 'to'-side pair set with two cursors (i = current, next = i+1):
//  - indel pairs are never merged, just skipped over;
//  - exact duplicates (same to-start and same from-id) are deleted;
//  - pairs that abut on the 'to' side, share a from-id and orientation, and
//    abut on the 'from' side (direction depends on ori) are merged into the
//    current pair and the next pair is deleted.
// A deleted pair is also removed from the corresponding 'from'-side set so
// the two hashes stay consistent. Finally the mapper's pair count is rebuilt.
void Mapper_mergePairs(Mapper *m) {
  int to = MAPPER_TO_IND;
  int from = MAPPER_FROM_IND;

  // Pair count is recomputed from scratch at the end of each set.
  Mapper_setPairCount(m, 0);

  IDHash *toPairHash = Mapper_getPairHash(m, MAPPER_TO_IND);
  IDHash *fromPairHash = Mapper_getPairHash(m, MAPPER_FROM_IND);

  MapperPairSet **toPairValues = (MapperPairSet **)IDHash_getValues(toPairHash);

  int pairInd;
  for (pairInd = 0; pairInd<IDHash_getNumValues(toPairHash); pairInd++) {
    MapperPairSet *pairs = toPairValues[pairInd];

    int i = 0;
    int next = 1;
    int length = MapperPairSet_getNumPair(pairs)-1; //$#{$lr};  (last valid index)

    while (next <= length) {
      MapperPair *currentPair = MapperPairSet_getPairAt(pairs, i);
      MapperPair *nextPair = MapperPairSet_getPairAt(pairs, next);
      MapperPair *delPair = NULL;

      if (MapperPair_isIndel(currentPair) || MapperPair_isIndel(nextPair)) {
        //necessary to modify the merge function to not merge indels
        next++;
        i++;
      } else {
        // duplicate filter: same to-start and same from-id means nextPair
        // is redundant and can be dropped without merging coordinates
        if ( MapperPair_getUnit(currentPair,to)->start == MapperPair_getUnit(nextPair,to)->start
             && MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) {
          delPair = nextPair;
        } else if (( MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) &&
                   ( nextPair->ori == currentPair->ori ) &&
                   ( MapperPair_getUnit(nextPair,to)->start-1 == MapperPair_getUnit(currentPair,to)->end )) {
          // Pairs abut on the 'to' side; whether they also abut on the
          // 'from' side depends on the orientation.
          if ( currentPair->ori == 1 ) {
            // check forward strand merge
            if ( MapperPair_getUnit(nextPair,from)->start-1 == MapperPair_getUnit(currentPair,from)->end) {
              // normal merge with previous element: extend ends of current
              MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end;
              MapperPair_getUnit(currentPair,from)->end = MapperPair_getUnit(nextPair,from)->end;
              delPair = nextPair;
            }
          } else {
            // check backward strand merge
            if ( MapperPair_getUnit(nextPair,from)->end+1 == MapperPair_getUnit(currentPair,from)->start ) {
              // yes its a merge: extend to-end forwards, from-start backwards
              MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end;
              MapperPair_getUnit(currentPair,from)->start = MapperPair_getUnit(nextPair,from)->start;
              delPair = nextPair;
            }
          }
        }

        if (delPair != NULL) {
          // Have a pair to delete
          // Remove from the to pair set
          MapperPairSet_removePairAt(pairs, next); //splice( @$lr, $next, 1 );

          // Also remove the same pair from the 'from'-side set it lives in,
          // found by pointer identity.
          MapperPairSet *fromPairs = IDHash_getValue(fromPairHash, MapperPair_getUnit(delPair, from)->id); //$self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};

          int j;
          for (j=0; j < MapperPairSet_getNumPair(fromPairs); j++) {
            MapperPair *fromPair = MapperPairSet_getPairAt(fromPairs, j);

            if ( fromPair == delPair) { // Is this really going to be an equality ??? //$lr_from->[$j] == $del_pair )
              MapperPairSet_removePairAt(fromPairs, j); //splice( @$lr_from, $j, 1 );
              break;
            }
          }

          // NIY: Do we need to free delPair??? (NOTE(review): removePairAt's
          // ownership semantics not visible here — possible leak; confirm.)
          length--;
          if ( length < next ) break;
        } else {
          next++;
          i++;
        }
      }
    }
    Mapper_addToPairCount(m, MapperPairSet_getNumPair(pairs)); // $self->{'pair_count'} += scalar( @$lr );
  }
}
/*
 * Fetch all PredictionTranscripts overlapping a slice, optionally bulk
 * pre-loading their exons with a single SQL query instead of one lazy
 * query per transcript.
 *
 * pta       - the adaptor
 * slice     - slice to fetch features on
 * logicName - analysis logic name filter (passed through to the base fetch)
 * loadExons - when non-zero and >1 transcript was found, pre-load exons
 *
 * Returns the Vector of transcripts from the base-class fetch (caller owns).
 */
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {

  //my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name);
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  // (sentinels: any real seq-region coordinate will replace them)
  long minStart = 2000000000;
  long maxEnd = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    }
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);
    }
  }

  // Use the requested slice if it already covers the full transcript extent,
  // otherwise fetch an expanded slice covering [minStart, maxEnd].
  Slice *extSlice;
  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice), minStart, maxEnd, Slice_getStrand(slice), CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0);
    // NOTE(review): extSlice obtained here is never freed below — confirm
    // whether the slice adaptor caches/owns it.
  }

  // associate exon identifiers with transcripts
  // trHash: transcript dbID -> PredictionTranscript (values NOT owned here)
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(trHash);

  // Build "SELECT ... WHERE prediction_transcript_id IN (id, id, ...)"
  // by hand into a fixed 655500-byte buffer.
  char tmpStr[1024];
  char *qStr = NULL;
  if ((qStr = (char *)calloc(655500,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    return transcripts;
  }

  int lenNum;
  int endPoint = sprintf(qStr, "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE prediction_transcript_id IN (");
  for (i=0; i<IDHash_getNumValues(trHash); i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    lenNum = sprintf(tmpStr,IDFMTSTR,uniqueIds[i]);
    memcpy(&(qStr[endPoint]), tmpStr, lenNum);
    endPoint+=lenNum;
  }
  qStr[endPoint++] = ')';
  qStr[endPoint] = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  // exTrHash: exon dbID -> Vector of (transcript, rank) pairs; the vectors
  // own their PredictionTranscriptRankPair elements (free func set below).
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  // Transcripts themselves are owned by the 'transcripts' vector, so free
  // only the hash structure.
  IDHash_free(trHash, NULL);
  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

    // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    // Attach this exon to every transcript that references it, at the
    // rank recorded from the prediction_exon table.
    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  // Vector_free here frees each per-exon vector (and, via the free func set
  // above, its rank-pair elements).
  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
/*
 * Merge two sets of GenomicAligns using a sweep-line over interval
 * endpoints: each align contributes a start event and an end event
 * (end + 0.5, so ends sort after coincident starts). Set 1 is keyed on
 * query coordinates, set 2 on consensus coordinates. While sweeping, each
 * newly opened align is combined with every align currently open in the
 * OTHER set via GenomicAlignAdaptor_addDerivedAlignments.
 *
 * Returns a newly allocated Vector of derived alignments (caller owns).
 */
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  int i;
  Vector *bigList = Vector_new();
  IDHash *overlappingSets[2];
  Vector *mergedAligns;

  // Set 1: events on the query frag; setNum 0.
  for (i=0;i<Vector_getNumElement(alignSet1); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet1, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)), GenomicAlign_getQueryStart(align), align, 0));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)), GenomicAlign_getQueryEnd(align)+0.5, align, 0));
  }

  // Set 2: events on the consensus frag; setNum 1.
  for (i=0;i<Vector_getNumElement(alignSet2); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet2, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)), GenomicAlign_getConsensusStart(align), align, 1));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)), GenomicAlign_getConsensusEnd(align)+0.5, align, 1));
  }

  Vector_sort(bigList, GenomicAlignListElem_compFunc);

  // walking from start to end through sortlist and keep track of the
  // currently overlapping set of Alignments
  overlappingSets[0] = IDHash_new(IDHASH_SMALL);
  overlappingSets[1] = IDHash_new(IDHASH_SMALL);

  mergedAligns = Vector_new();

  for (i=0; i<Vector_getNumElement(bigList); i++) {
    GenomicAlignListElem *gale = Vector_getElementAt(bigList,i);

    GenomicAlign *align = gale->align;
    IDType alignID = GenomicAlign_getDbID(align);
    int setNo = gale->setNum;

    if (IDHash_contains(overlappingSets[setNo], alignID)) {
      // Second (end) event for this align:
      // remove from current overlapping set
      IDHash_remove(overlappingSets[setNo], alignID, NULL);
    } else {
      int j;
      void **values = IDHash_getValues(overlappingSets[1-setNo]);

      // insert into the set and do all the overlap business
      IDHash_add(overlappingSets[setNo], alignID, align);

      // the other set contains everything this align overlaps with;
      // argument order keeps set-1 aligns first in the derived call
      for (j=0; j<IDHash_getNumValues(overlappingSets[1-setNo]); j++) {
        GenomicAlign *align2 = values[j];
        if (setNo == 0) {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align, align2);
        } else {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align2, align);
        }
      }
      free(values);
    }
  }

  // NIY Free gale
  // NOTE(review): bigList (and its GenomicAlignListElem elements) and both
  // overlappingSets hashes are leaked here — needs the project's free
  // functions to fix; confirm ownership before adding them.
  return mergedAligns;
}
// Note I didn't implement the stable id fetching uggliness here. I'll probably make a separate method for that // if necessary Vector *BaseAdaptor_uncachedFetchAllByDbIDList(BaseAdaptor *ba, Vector *idList, Slice *slice) { if ( idList == NULL) { fprintf(stderr, "id_list list reference argument is required - bye!"); return NULL; } char constraintPref[1024]; if (!Vector_getNumElement(idList)) { return Vector_new(); } NameTableType *tables = ba->getTables(); char **t = (*tables)[0]; sprintf(constraintPref, "%s.%s_id ", t[SYN], t[NAME] ); // Ensure that we do not exceed MySQL's max_allowed_packet (defaults to // 1 MB) splitting large queries into smaller queries of at most 256 KB. // Assuming a (generous) average dbID string // length of 16, this means 16384 dbIDs in each query. int maxSize = 16384; // Uniquify the list IDHash *idListHash = IDHash_new(IDHASH_MEDIUM); int i; for (i=0; i<Vector_getNumElement(idList); i++) { IDType id = *(IDType *)(Vector_getElementAt(idList, i)); if (!IDHash_contains(idListHash, id)) { IDHash_add(idListHash, id, &trueVal); } } IDType *uniqueIds = IDHash_getKeys(idListHash); int nUniqueId = IDHash_getNumValues(idListHash); IDHash_free(idListHash, NULL); Vector *out = Vector_new(); int lenNum; for (i=0; i<nUniqueId; i+=maxSize) { char *constraint = NULL; if ((constraint = (char *)calloc(655500,sizeof(char))) == NULL) { fprintf(stderr,"Failed allocating constraint\n"); return out; } strcpy(constraint, constraintPref); // Special case for one remaining Id if (i == nUniqueId-1) { sprintf(constraint, "%s = "IDFMTSTR, constraint, uniqueIds[i]); } else { char tmpStr[1024]; int endPoint = sprintf(constraint, "%s IN (", constraint); int j; for (j=0; j<maxSize && j+i<nUniqueId; j++) { if (j!=0) { constraint[endPoint++] = ','; constraint[endPoint++] = ' '; } lenNum = sprintf(tmpStr, IDFMTSTR, uniqueIds[i+j]); memcpy(&(constraint[endPoint]), tmpStr, lenNum); endPoint+=lenNum; } constraint[endPoint++] = ')'; constraint[endPoint] = '\0'; } Vector 
*resChunk = BaseAdaptor_genericFetch(ba, constraint, NULL, slice); Vector_append(out, resChunk); Vector_free(resChunk); free(constraint); } free(uniqueIds); return out; }