// New Vector *Translation_getAllSeqEdits(Translation *translation) { char *edits[] = { "initial_met", "_selenocysteine", "amino_acid_sub", NULL }; Vector *seqEds = Vector_new(); char **editP = edits; while (*editP) { char *edit = *editP; Vector *attribs = Translation_getAllAttributes(translation, edit); // convert attributes to SeqEdit objects int i; for (i=0; i<Vector_getNumElement(attribs); i++) { Attribute *attrib = Vector_getElementAt(attribs, i); SeqEdit *seqEd = SeqEdit_newFromAttribute(attrib); Vector_addElement(seqEds, seqEd); } Vector_free(attribs); editP++; } return seqEds; }
// NIY: // Because this can filter the results the vector that gets returned must be freeable - so for now // make a copy of the translation->attributes vector if returning unfiltered so behaviour is // consistent. Long term probably want reference count incremented Vector *Translation_getAllAttributes(Translation *translation, char *attribCode) { if (translation->attributes == NULL) { TranslationAdaptor *tlna = (TranslationAdaptor *)Translation_getAdaptor(translation); if (tlna == NULL) { // No adaptor // Perl comments out the warning, I'll put it back for now, just in case //fprintf(stderr,"Warning: Cannot get attributes without an adaptor.\n"); return Vector_new(); } AttributeAdaptor *ata = DBAdaptor_getAttributeAdaptor(tlna->dba); translation->attributes = AttributeAdaptor_fetchAllByTranslation(ata, translation, NULL); } if (attribCode != NULL) { Vector *results = Vector_new(); int i; for (i=0; i<Vector_getNumElement(translation->attributes); i++) { Attribute *attrib = Vector_getElementAt(translation->attributes, i); if (!strcasecmp(attrib->code, attribCode)) { Vector_addElement(results, attrib); } } return results; } else { // See NIY note above for why I'm making a copy return Vector_copy(translation->attributes); } }
/*
 * Lists the distinct member stable ids recorded for one species.
 *
 * sp - species name; underscores are replaced by spaces on a private copy
 *      before it is matched against genome_db.name.
 *
 * Returns a new Vector of newly allocated strings (one per stable id). If the
 * species name is too long for the query buffer an error is printed and an
 * empty Vector is returned instead of overrunning the buffer.
 *
 * NOTE(review): the species name is interpolated directly into the SQL text;
 * a name containing a quote character would break the statement. Confirm that
 * callers only pass trusted names.
 */
Vector *HomologyAdaptor_listStableIdsFromSpecies(HomologyAdaptor *ha, char *sp) {
  StatementHandle *sth;
  ResultRow *row;
  Vector *genes;
  char qStr[1024];
  char *species;
  int qLen;

  species = StrUtil_copyString(&species,sp,0);
  species = StrUtil_strReplChr(species,'_',' ');

  // Bounded formatting: the species name has caller-controlled length
  qLen = snprintf(qStr, sizeof(qStr),
                  "select distinct grm.member_stable_id "
                  " from gene_relationship_member grm,"
                  " genome_db gd "
                  " where gd.genome_db_id = grm.genome_db_id "
                  " and gd.name = '%s'", species);

  if (qLen < 0 || (size_t)qLen >= sizeof(qStr)) {
    fprintf(stderr, "Error: species name too long to build stable id query\n");
    free(species);
    return Vector_new();
  }

  sth = ha->prepare((BaseAdaptor *)ha, qStr, strlen(qStr));
  sth->execute(sth);

  genes = Vector_new();

  while ((row = sth->fetchRow(sth))) {
    char *tmpStr;
    // Copy the string so the vector's contents do not depend on the row
    Vector_addElement(genes,StrUtil_copyString(&tmpStr, row->getStringAt(row,0),0));
  }

  sth->finish(sth);
  free(species);

  return genes;
}
/*
 * Lists the ids of all transcripts linked to an IntronSupportingEvidence.
 *
 * Queries transcript_intron_supporting_evidence for the dbID of 'ise'.
 *
 * Returns a new Vector of heap-allocated IDType values. The vector's free
 * function is set to free(), so the ids are released along with the vector.
 * Exits the program if an id cannot be allocated.
 */
Vector *IntronSupportingEvidenceAdaptor_listLinkedTranscriptIds(IntronSupportingEvidenceAdaptor *isea, IntronSupportingEvidence *ise) {
  char query[1024];

  sprintf(query,
          "SELECT transcript_id from transcript_intron_supporting_evidence "
          "WHERE intron_supporting_evidence_id = "IDFMTSTR,
          IntronSupportingEvidence_getDbID(ise));

  StatementHandle *sth = isea->prepare((BaseAdaptor *)isea, query, strlen(query));
  sth->execute(sth);

  Vector *transcriptIds = Vector_new();
  ResultRow *row;

  for (row = sth->fetchRow(sth); row != NULL; row = sth->fetchRow(sth)) {
    IDType transcriptId = row->getLongLongAt(row, 0);
    IDType *idCopy = calloc(1, sizeof(IDType));

    if (idCopy == NULL) {
      fprintf(stderr, "Failed allocating space for a id\n");
      exit(1);
    }

    *idCopy = transcriptId;
    Vector_addElement(transcriptIds, idCopy);
  }

  sth->finish(sth);

  // The vector owns the id copies
  Vector_setFreeFunc(transcriptIds, free);

  return transcriptIds;
}
/*
 * Returns the translateable exons of a prediction transcript.
 *
 * The list is built on the first call from every non-NULL exon of the
 * transcript, stored on trans->translateableExons, and the same vector is
 * returned on later calls.
 */
Vector *PredictionTranscript_getAllTranslateableExons(PredictionTranscript *trans) {
  int exonIdx;

  // Already built on an earlier call
  if (trans->translateableExons) {
    return trans->translateableExons;
  }

  trans->translateableExons = Vector_new();

  exonIdx = 0;
  while (exonIdx < PredictionTranscript_getExonCount(trans)) {
    PredictionExon *exon = PredictionTranscript_getExonAt(trans, exonIdx);

    // Empty exon slots are skipped
    if (exon != NULL) {
      Vector_addElement(trans->translateableExons, exon);
    }
    exonIdx++;
  }

  return trans->translateableExons;
}
/*
 * Adds an exon to a prediction transcript and widens the transcript's
 * start/end to include it.
 *
 * positionP - when non-NULL the exon is stored at index *positionP of the
 *             exon vector; when NULL the exon is appended.
 *
 * A NULL exon is still stored in the vector but leaves start/end untouched.
 */
void PredictionTranscript_addExon(PredictionTranscript *trans, PredictionExon *exon, int *positionP) {
  if (positionP == NULL) {
    Vector_addElement(trans->exons, exon);
  } else {
    Vector_setElementAt(trans->exons, *positionP, exon);
  }

  // Nothing to extend the transcript with
  if (!exon) {
    return;
  }

  // Pull the start leftwards if unset or if the exon begins earlier
  if (!PredictionTranscript_getStartIsSet(trans) ||
      PredictionExon_getStart(exon) < PredictionTranscript_getStart(trans)) {
    PredictionTranscript_setStart(trans, PredictionExon_getStart(exon));
  }

  // Push the end rightwards if unset or if the exon finishes later
  if (!PredictionTranscript_getEndIsSet(trans) ||
      PredictionExon_getEnd(exon) > PredictionTranscript_getEnd(trans)) {
    PredictionTranscript_setEnd(trans, PredictionExon_getEnd(exon));
  }
}
/*
 * Builds Attribute objects from an executed statement handle.
 *
 * Each row is read as four string columns, in order: code, name,
 * description, value.
 *
 * Note: in the Perl API the corresponding method name is misspelled compared
 * to the usual _objs_from_sth (deliberately or not), and it is not reached
 * through the normal generic-fetch path; it is called directly from within
 * the fetch method.
 *
 * Returns a new Vector of newly created Attribute objects, one per row.
 */
Vector *AttributeAdaptor_objectsFromStatementHandle(AttributeAdaptor *ata, StatementHandle *sth) {
  Vector *attributes = Vector_new();
  ResultRow *row;

  for (row = sth->fetchRow(sth); row != NULL; row = sth->fetchRow(sth)) {
    char *codeCol  = row->getStringAt(row, 0);
    char *nameCol  = row->getStringAt(row, 1);
    char *descCol  = row->getStringAt(row, 2);
    char *valueCol = row->getStringAt(row, 3);

    Attribute *attribute = Attribute_new();

    Attribute_setCode(attribute, codeCol);
    Attribute_setName(attribute, nameCol);
    Attribute_setDescription(attribute, descCol);
    Attribute_setValue(attribute, valueCol);

    Vector_addElement(attributes, attribute);
  }

  return attributes;
}
/*
=head2 fetch_all_by_Transcript

  Arg[1]      : Transcript to search with
  Description : Looks up the ids of every IntronSupportingEvidence linked to
                the transcript in transcript_intron_supporting_evidence and
                fetches the corresponding objects by dbID list.
  Returntype  : Vector of IntronSupportingEvidence objects; a new empty Vector
                when the transcript has no linked evidence.
  Exceptions  : Exits the program if an id cannot be allocated.

=cut
*/
Vector *IntronSupportingEvidenceAdaptor_fetchAllByTranscript(IntronSupportingEvidenceAdaptor *isea, Transcript *transcript) {
  char query[1024];

  sprintf(query,
          "SELECT intron_supporting_evidence_id "
          "FROM transcript_intron_supporting_evidence "
          "WHERE transcript_id = "IDFMTSTR,
          Transcript_getDbID(transcript));

  StatementHandle *sth = isea->prepare((BaseAdaptor *)isea, query, strlen(query));
  sth->execute(sth);

  // Collect the evidence ids as heap-allocated copies
  Vector *evidenceIds = Vector_new();
  ResultRow *row;

  for (row = sth->fetchRow(sth); row != NULL; row = sth->fetchRow(sth)) {
    IDType evidenceId = row->getLongLongAt(row, 0);
    IDType *idCopy = calloc(1, sizeof(IDType));

    if (idCopy == NULL) {
      fprintf(stderr, "Failed allocating space for a id\n");
      exit(1);
    }

    *idCopy = evidenceId;
    Vector_addElement(evidenceIds, idCopy);
  }

  sth->finish(sth);

  // Only hit the database again when there is something to fetch
  Vector *evidence = (Vector_getNumElement(evidenceIds) > 0)
                       ? IntronSupportingEvidenceAdaptor_fetchAllByDbIDList(isea, evidenceIds, NULL)
                       : Vector_new();

  // The id list was only needed for the lookup; release it with its contents
  Vector_setFreeFunc(evidenceIds, free);
  Vector_free(evidenceIds);

  return evidence;
}
/*
 * Runs a caller-supplied homology query and wraps each row in a Homology.
 *
 * The query is expected to yield, in column order: stable id (0), species (1),
 * chromosome (2), chromosome start (3), chromosome end (4).
 *
 * Returns a new Vector of newly created Homology objects, one per row.
 */
Vector *HomologyAdaptor_getHomologues(HomologyAdaptor *ha, char *qStr) {
  StatementHandle *sth = ha->prepare((BaseAdaptor *)ha, qStr, strlen(qStr));
  Vector *homologues;
  ResultRow *row;

  sth->execute(sth);

  homologues = Vector_new();

  for (row = sth->fetchRow(sth); row != NULL; row = sth->fetchRow(sth)) {
    Homology *hit = Homology_new();

    Homology_setSpecies(hit, row->getStringAt(row,1));
    Homology_setStableId(hit, row->getStringAt(row,0));
    Homology_setChromosome(hit, row->getStringAt(row,2));
    Homology_setChrStart(hit, row->getIntAt(row,3));
    Homology_setChrEnd(hit, row->getIntAt(row,4));

    Vector_addElement(homologues, hit);
  }

  sth->finish(sth);

  return homologues;
}
/*
 * Fetches every synteny region that belongs to a synteny cluster.
 *
 * clusterId - id of the cluster; must be non-zero.
 *
 * Returns a new Vector of SyntenyRegion objects (possibly empty), or NULL
 * after printing an error when clusterId is zero.
 */
Vector *SyntenyRegionAdaptor_fetchByClusterId(SyntenyRegionAdaptor *sra, IDType clusterId) {
  char query[256];
  StatementHandle *sth;
  ResultRow *row;
  Vector *regions;

  if (!clusterId) {
    fprintf(stderr, "Error: fetch_by_cluster_id with no cluster_id!\n");
    return NULL;
  }

  sprintf(query,
          "select synteny_region_id,"
          " dnafrag_id,"
          " seq_start,"
          " seq_end "
          " from synteny_region "
          " where synteny_cluster_id = " IDFMTSTR, clusterId);

  sth = sra->prepare((BaseAdaptor *)sra, query, strlen(query));
  sth->execute(sth);

  regions = Vector_new();

  for (row = sth->fetchRow(sth); row != NULL; row = sth->fetchRow(sth)) {
    // Columns: region id, dnafrag id, start, end
    SyntenyRegion *region =
        SyntenyRegionAdaptor_newRegionFromArray(sra,
                                                row->getLongLongAt(row,0),
                                                clusterId,
                                                row->getLongLongAt(row,1),
                                                row->getIntAt(row,2),
                                                row->getIntAt(row,3));

    Vector_addElement(regions, region);
  }

  sth->finish(sth);

  return regions;
}
/*
 * Merges two alignment sets that share a common genome.
 *
 * Every alignment contributes two list elements: one at its start and one
 * at its end + 0.5. alignSet1 uses query coordinates (set number 0) and
 * alignSet2 uses consensus coordinates (set number 1). After sorting with
 * GenomicAlignListElem_compFunc the list is walked once while tracking, per
 * set, which alignments are currently "open". When an alignment opens, it is
 * combined with every open alignment of the other set through
 * GenomicAlignAdaptor_addDerivedAlignments().
 *
 * Returns a new Vector holding the derived alignments.
 *
 * The two tracking hashes and the list container are released before
 * returning; the hashes only reference alignments owned by the input sets,
 * so no value free function is used.
 * NIY: the GenomicAlignListElem objects themselves are still not freed (their
 * destructor is not visible from this function).
 */
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  int i;
  Vector *bigList = Vector_new();
  IDHash *overlappingSets[2];
  Vector *mergedAligns;

  // Set 0: start/end markers in query coordinates
  for (i=0;i<Vector_getNumElement(alignSet1); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet1, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryStart(align), align, 0));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryEnd(align)+0.5, align, 0));
  }

  // Set 1: start/end markers in consensus coordinates
  for (i=0;i<Vector_getNumElement(alignSet2); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet2, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusStart(align), align, 1));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusEnd(align)+0.5, align, 1));
  }

  Vector_sort(bigList, GenomicAlignListElem_compFunc);

  // walking from start to end through sortlist and keep track of the
  // currently overlapping set of Alignments
  overlappingSets[0] = IDHash_new(IDHASH_SMALL);
  overlappingSets[1] = IDHash_new(IDHASH_SMALL);

  mergedAligns = Vector_new();

  for (i=0; i<Vector_getNumElement(bigList); i++) {
    GenomicAlignListElem *gale = Vector_getElementAt(bigList,i);
    GenomicAlign *align = gale->align;
    IDType alignID = GenomicAlign_getDbID(align);
    int setNo = gale->setNum;

    if (IDHash_contains(overlappingSets[setNo], alignID)) {
      // second marker of this alignment: remove from current overlapping set
      IDHash_remove(overlappingSets[setNo], alignID, NULL);
    } else {
      int j;
      // Snapshot of the other set: everything this align overlaps with
      void **values = IDHash_getValues(overlappingSets[1-setNo]);
      int nValues = IDHash_getNumValues(overlappingSets[1-setNo]);

      // insert into the set and do all the overlap business
      IDHash_add(overlappingSets[setNo], alignID, align);

      for (j=0; j<nValues; j++) {
        GenomicAlign *align2 = values[j];
        // The set-0 alignment is always passed first
        if (setNo == 0) {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align, align2);
        } else {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align2, align);
        }
      }
      free(values);
    }
  }

  // Release the bookkeeping structures (values are not owned by the hashes)
  IDHash_free(overlappingSets[0], NULL);
  IDHash_free(overlappingSets[1], NULL);

  // NIY Free gale (only the list container is released here)
  Vector_free(bigList);

  return mergedAligns;
}
// Also added a flag to indicate we actually want the gaps vector returned - quite often its not used in the caller and so would leak // memory Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, long rStart, long rEnd, int wantGaps) { // The following was commented out due to Ensembl Genomes requirements // for bacterial genomes. // The following was uncommented because I'm not caring about those requirements if ( start > end ) { fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end); exit(1); } if ( rStart > rEnd ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd); exit(1); } if ( rStart > start ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start); exit(1); } if ( rEnd < end ) { fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end); exit(1); } IDHash *regReg = RangeRegistry_getRegistry(registry); Vector *list; if (IDHash_contains(regReg, id)) { list = IDHash_getValue(regReg, id); } else { list = Vector_new(); IDHash_add(regReg, id, list); } Vector *gapPairs = NULL; if (wantGaps) { gapPairs = Vector_new(); } int len = Vector_getNumElement(list); if (len == 0) { //this is the first request for this id, return a gap pair for the // entire range and register it as seen CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); return Vector_copy(list); } //#### // loop through the list of existing ranges recording any "gaps" where // the existing range does not cover part of the requested range // int startIdx = 0; int endIdx = Vector_getNumElement(list)-1; int midIdx; CoordPair *range; // binary search the relevant pairs // helps if the list is big while ( ( endIdx - startIdx ) > 1 ) { midIdx = ( startIdx + endIdx ) >> 1; range = Vector_getElementAt(list, midIdx); if ( CoordPair_getEnd(range) < rStart ) 
{ startIdx = midIdx; } else { endIdx = midIdx; } } long gapStart; long gapEnd; int rIdx = -1; int rStartIdx = -1; int rEndIdx; gapStart = rStart; int i; for (i=startIdx; i < len ; i++ ) { CoordPair *pRange = Vector_getElementAt(list, i); long pStart = CoordPair_getStart(pRange); long pEnd = CoordPair_getEnd(pRange); // no work needs to be done at all if we find a range pair that // entirely overlaps the requested region if ( pStart <= start && pEnd >= end ) { return Vector_new(); // perl returns undef, but that causes me problems } // find adjacent or overlapping regions already registered if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) { if ( rStartIdx < 0 ) { // Not yet been set rStartIdx = i; } rEndIdx = i; } if ( pStart > rStart ) { gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1; if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, gapEnd); Vector_addElement(gapPairs, cp); } } gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1; if ( pEnd >= rEnd && rIdx < 0 ) { rIdx = i; break; } } // do we have to make another gap? 
if ( gapStart <= rEnd ) { if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, rEnd); Vector_addElement(gapPairs, cp); } } // // Merge the new range into the registered list // if (rStartIdx >= 0 ) { // rStartIdx has been set to something long newStart; long newEnd; CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); CoordPair *rEndIdxRange = Vector_getElementAt(list, rEndIdx); if ( rStart < CoordPair_getStart(rStartIdxRange)) { newStart = rStart; } else { newStart = CoordPair_getStart(rStartIdxRange); } if ( rEnd > CoordPair_getEnd(rEndIdxRange)) { newEnd = rEnd; } else { newEnd = CoordPair_getEnd(rEndIdxRange); } CoordPair *cp = CoordPair_new(newStart, newEnd); // Think its <= for (i=rStartIdx; i<=rEndIdx; i++) { Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called } Vector_insertElementAt(list, rStartIdx, cp); //splice( @$list, $rstart_idx, // $rend_idx - $rstart_idx + 1, // [ $new_start, $new_end ] ); } else if (rIdx >= 0) { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_insertElementAt(list, rIdx, cp); //splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); } else { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); } // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK return gapPairs; }
int main(int argc, char *argv[]) { DBAdaptor * dba; StatementHandle *sth; ResultRow * row; Vector * slices; int nSlices; htsFile * out; int argNum = 1; char *inFName = NULL; char *outFName = NULL; char *dbUser = "******"; char *dbPass = NULL; int dbPort = 3306; char *dbHost = "ens-staging.internal.sanger.ac.uk"; char *dbName = "homo_sapiens_core_71_37"; char *assName = "GRCh37"; char *chrName = "1"; int flags = 0; int threads = 1; initEnsC(argc, argv); while (argNum < argc) { char *arg = argv[argNum]; char *val; // Ones without a val go here if (!strcmp(arg, "-U") || !strcmp(arg,"--ucsc_naming")) { flags |= M_UCSC_NAMING; } else { // Ones with a val go in this block if (argNum == argc-1) { Bamcov_usage(); } val = argv[++argNum]; if (!strcmp(arg, "-i") || !strcmp(arg,"--in_file")) { StrUtil_copyString(&inFName,val,0); } else if (!strcmp(arg, "-o") || !strcmp(arg,"--out_file")) { StrUtil_copyString(&outFName,val,0); } else if (!strcmp(arg, "-h") || !strcmp(arg,"--host")) { StrUtil_copyString(&dbHost,val,0); } else if (!strcmp(arg, "-p") || !strcmp(arg,"--password")) { StrUtil_copyString(&dbPass,val,0); } else if (!strcmp(arg, "-P") || !strcmp(arg,"--port")) { dbPort = atoi(val); } else if (!strcmp(arg, "-n") || !strcmp(arg,"--name")) { StrUtil_copyString(&dbName,val,0); } else if (!strcmp(arg, "-u") || !strcmp(arg,"--user")) { StrUtil_copyString(&dbUser,val,0); } else if (!strcmp(arg, "-t") || !strcmp(arg,"--threads")) { threads = atoi(val); } else if (!strcmp(arg, "-a") || !strcmp(arg,"--assembly")) { StrUtil_copyString(&assName,val,0); } else if (!strcmp(arg, "-v") || !strcmp(arg,"--verbosity")) { verbosity = atoi(val); // Temporary } else if (!strcmp(arg, "-c") || !strcmp(arg,"--chromosome")) { StrUtil_copyString(&chrName,val,0); } else { fprintf(stderr,"Error in command line at %s\n\n",arg); Bamcov_usage(); } } argNum++; } if (verbosity > 0) { printf("Program for calculating read coverage in a BAM file \n" "Steve M.J. Searle. 
[email protected] Last update April 2013.\n"); } if (!inFName || !outFName) { Bamcov_usage(); } dba = DBAdaptor_new(dbHost,dbUser,dbPass,dbName,dbPort,NULL); //nSlices = getSlices(dba, destName); nSlices = 1; slices = Vector_new(); SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba); Slice *slice = SliceAdaptor_fetchByRegion(sa,NULL,chrName,POS_UNDEF,POS_UNDEF,1,NULL, 0); Vector_addElement(slices,slice); if (Vector_getNumElement(slices) == 0) { fprintf(stderr, "Error: No slices.\n"); exit(1); } htsFile *in = hts_open(inFName, "rb"); if (in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", inFName); return 1; } hts_set_threads(in, threads); hts_idx_t *idx; idx = bam_index_load(inFName); // load BAM index if (idx == 0) { fprintf(stderr, "BAM index file is not available.\n"); return 1; } int i; for (i=0; i<Vector_getNumElement(slices); i++) { Slice *slice = Vector_getElementAt(slices,i); if (verbosity > 0) printf("Working on '%s'\n",Slice_getName(slice)); // if (verbosity > 0) printf("Stage 1 - retrieving annotation from database\n"); // Vector *genes = getGenes(slice, flags); if (verbosity > 0) printf("Stage 1 - calculating coverage\n"); calcCoverage(inFName, slice, in, idx, flags); } hts_idx_destroy(idx); hts_close(in); if (verbosity > 0) printf("Done\n"); return 0; }
/* =head2 _objs_from_sth Arg [1] : DBI:st $sth An executed DBI statement handle Arg [2] : (optional) Bio::EnsEMBL::Mapper $mapper An mapper to be used to convert contig coordinates to assembly coordinates. Arg [3] : (optional) Bio::EnsEMBL::Slice $slice A slice to map the prediction transcript to. Example : $p_transcripts = $self->_objs_from_sth($sth); Description: Creates a list of Prediction transcripts from an executed DBI statement handle. The columns retrieved via the statement handle must be in the same order as the columns defined by the _columns method. If the slice argument is provided then the the prediction transcripts will be in returned in the coordinate system of the $slice argument. Otherwise the prediction transcripts will be returned in the RawContig coordinate system. Returntype : reference to a list of Bio::EnsEMBL::PredictionTranscripts Exceptions : none Caller : superclass generic_fetch Status : Stable =cut */ Vector *PredictionTranscriptAdaptor_objectsFromStatementHandle(PredictionTranscriptAdaptor *pta, StatementHandle *sth, AssemblyMapper *assMapper, Slice *destSlice) { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba); AnalysisAdaptor *aa = DBAdaptor_getAnalysisAdaptor(pta->dba); Vector *pTranscripts = Vector_new(); IDHash *sliceHash = IDHash_new(IDHASH_SMALL); long destSliceStart; long destSliceEnd; int destSliceStrand; long destSliceLength; char * destSliceSrName; IDType destSliceSrId = 0; if (destSlice) { destSliceStart = Slice_getStart(destSlice); destSliceEnd = Slice_getEnd(destSlice); destSliceStrand = Slice_getStrand(destSlice); destSliceLength = Slice_getLength(destSlice); destSliceSrName = Slice_getSeqRegionName(destSlice); destSliceSrId = Slice_getSeqRegionId(destSlice); } ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType predictionTranscriptId = row->getLongLongAt(row,0); IDType seqRegionId = row->getLongLongAt(row,1); long seqRegionStart = row->getLongAt(row,2); long seqRegionEnd = row->getLongAt(row,3); int 
seqRegionStrand = row->getIntAt(row,4); IDType analysisId = row->getLongLongAt(row,5); char *displayLabel = row->getStringAt(row,6); // get the analysis object Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId); if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } Slice *slice = IDHash_getValue(sliceHash, seqRegionId); Slice *ptSlice = slice; char *srName = Slice_getSeqRegionName(slice); CoordSystem *srCs = Slice_getCoordSystem(slice); // // remap the feature coordinates to another coord system // if a mapper was provided // if (assMapper != NULL) { MapperRangeSet *mrs; // Slightly suspicious about need for this if statement so left in perl statements for now if (destSlice != NULL && assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) { mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice); } else { mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL); } // skip features that map to gaps or coord system boundaries if (MapperRangeSet_getNumRange(mrs) == 0) { continue; } MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0); if (range->rangeType == MAPPERRANGE_GAP) { fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n"); exit(1); } else { MapperCoordinate *mc = (MapperCoordinate *)range; seqRegionId = mc->id; seqRegionStart = mc->start; seqRegionEnd = mc->end; seqRegionStrand = mc->strand; } MapperRangeSet_free(mrs); if (! 
IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } ptSlice = IDHash_getValue(sliceHash, seqRegionId); } // // If a destination slice was provided convert the coords // If the dest_slice starts at 1 and is foward strand, nothing needs doing // if (destSlice != NULL) { if (destSliceStart != 1 || destSliceStrand != 1) { if (destSliceStrand == 1) { seqRegionStart = seqRegionStart - destSliceStart + 1; seqRegionEnd = seqRegionEnd - destSliceStart + 1; } else { long tmpSeqRegionStart = seqRegionStart; seqRegionStart = destSliceEnd - seqRegionEnd + 1; seqRegionEnd = destSliceEnd - tmpSeqRegionStart + 1; seqRegionStrand = -seqRegionStrand; } } // throw away features off the end of the requested slice if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) { continue; } ptSlice = destSlice; } // Finally, create the new PredictionTranscript. PredictionTranscript *pt = PredictionTranscript_new(); PredictionTranscript_setStart (pt, seqRegionStart); PredictionTranscript_setEnd (pt, seqRegionEnd); PredictionTranscript_setStrand (pt, seqRegionStrand); PredictionTranscript_setSlice (pt, ptSlice); PredictionTranscript_setAnalysis (pt, analysis); PredictionTranscript_setAdaptor (pt, (BaseAdaptor *)pta); PredictionTranscript_setDbID (pt, predictionTranscriptId); PredictionTranscript_setDisplayLabel(pt, displayLabel); Vector_addElement(pTranscripts, pt); } IDHash_free(sliceHash, NULL); return pTranscripts; }
/*
 * Fetches all prediction transcripts on a slice, optionally preloading their
 * exons.
 *
 * logicName - passed through to BaseFeatureAdaptor_fetchAllBySlice().
 * loadExons - when non-zero and at least two transcripts were found, all
 *             exons are loaded with one query plus one slice fetch instead of
 *             lazily, one query per transcript.
 *
 * Returns the Vector of PredictionTranscripts from the base fetch. If the
 * query buffer cannot be allocated an error is printed and the transcripts
 * are returned without preloaded exons.
 *
 * The query buffer is sized from the actual id list, so an arbitrarily large
 * number of transcripts cannot overrun it.
 */
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  long minStart = 2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    }
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);
    }
  }

  // Use a wider slice when transcripts stick out of the requested one
  Slice *extSlice;

  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice),
                                          Slice_getSeqRegionName(slice),
                                          minStart,
                                          maxEnd,
                                          Slice_getStrand(slice),
                                          CoordSystem_getVersion(Slice_getCoordSystem(slice)),
                                          0);
  }

  // associate exon identifiers with transcripts
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(trHash);
  int nIds = IDHash_getNumValues(trHash);

  static const char queryStart[] =
    "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE prediction_transcript_id IN (";

  char tmpStr[1024];
  int lenNum;

  // First pass: exact space needed = prefix + each id + ", " + ")" + NUL
  size_t qCap = strlen(queryStart) + 2;
  for (i=0; i<nIds; i++) {
    lenNum = sprintf(tmpStr,IDFMTSTR,uniqueIds[i]);
    qCap += (size_t)lenNum + 2;
  }

  char *qStr = NULL;
  if ((qStr = (char *)calloc(qCap,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    free(uniqueIds);
    IDHash_free(trHash, NULL);
    return transcripts;
  }

  // Second pass: build the comma separated id list
  int endPoint = sprintf(qStr, "%s", queryStart);
  for (i=0; i<nIds; i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    lenNum = sprintf(tmpStr,IDFMTSTR,uniqueIds[i]);
    memcpy(&(qStr[endPoint]), tmpStr, lenNum);
    endPoint+=lenNum;
  }
  qStr[endPoint++] = ')';
  qStr[endPoint] = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  // exon id -> vector of (transcript, rank) pairs
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  IDHash_free(trHash, NULL);

  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

    // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    // Attach the exon to every transcript that lists it, at its rank
    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
/*
 * Fetches the alignments between a DNA fragment and a target genome.
 *
 * dnaFrag       - fragment to align from; must be non-NULL.
 * targetGenome  - genome to align to.
 * startP, endP  - passed through to the direct fetch as the region bounds.
 * alignmentType - resolved to a method link id.
 *
 * If the two genomes are directly linked (either one as consensus and the
 * other as query) the direct fetch is returned. Otherwise the alignments are
 * derived through every genome linked to both: alignments from dnaFrag to
 * each intermediate genome are collected, then for each of those the
 * alignments from its query fragment to the target genome are fetched and
 * combined with GenomicAlignAdaptor_addDerivedAlignments().
 *
 * Returns a Vector of GenomicAligns, or NULL (after printing an error) when
 * dnaFrag is NULL.
 *
 * NIY: the intermediate GenomicAlign objects are not freed; only the
 * temporary containers are released.
 */
Vector *GenomicAlignAdaptor_fetchAllByDNAFragGenomeDB(GenomicAlignAdaptor *gaa,
               DNAFrag *dnaFrag, GenomeDB *targetGenome, int *startP, int *endP,
               char *alignmentType) {
  if (!dnaFrag) {
    fprintf(stderr, "Error: dnaFrag argument must be non NULL\n");
    return NULL;
  }

  IDType methodLinkId   = GenomicAlignAdaptor_methodLinkIdByAlignmentType(gaa, alignmentType);
  GenomeDB *genomeCons  = DNAFrag_getGenomeDB(dnaFrag);
  GenomeDB *genomeQuery = targetGenome;

  // direct or indirect ??
  if (GenomeDB_hasConsensus(genomeCons, genomeQuery, methodLinkId) ||
      GenomeDB_hasQuery(genomeCons, genomeQuery, methodLinkId)) {
    return GenomicAlignAdaptor_fetchAllByDNAFragGenomeDBDirect(gaa,
                     dnaFrag, targetGenome, startP, endP, methodLinkId);
  }

  // indirect checks
  Vector *linkedCons  = GenomeDB_linkedGenomesByMethodLinkId(genomeCons, methodLinkId);
  Vector *linkedQuery = GenomeDB_linkedGenomesByMethodLinkId(genomeQuery, methodLinkId);

  // there are not many genomes, square effort is cheap
  Vector *linked = Vector_new();
  Vector *set1 = Vector_new();
  Vector *mergedAligns = Vector_new();
  int i;
  int j;

  // Intersection of the two linked-genome lists
  for (i=0; i<Vector_getNumElement(linkedCons); i++) {
    GenomeDB *g1 = Vector_getElementAt(linkedCons, i);

    for (j=0; j<Vector_getNumElement(linkedQuery); j++) {
      // Inner list must be indexed with j, not i
      GenomeDB *g2 = Vector_getElementAt(linkedQuery, j);
      if (g1 == g2) {
        Vector_addElement(linked, g1);
      }
    }
  }
  Vector_free(linkedCons);
  Vector_free(linkedQuery);

  // collect GenomicAligns from all linked genomes
  for (i=0; i<Vector_getNumElement(linked); i++) {
    GenomeDB *g = Vector_getElementAt(linked, i);

    Vector *gres = GenomicAlignAdaptor_fetchAllByDNAFragGenomeDBDirect(gaa,
                         dnaFrag, g, startP, endP, methodLinkId);
    Vector_append(set1, gres);

    Vector_free(gres);
  }

  // go from each dnafrag in the result set to target_genome
  // there is room for improvement here: create start end
  // my %frags = map { $_->query_dnafrag->dbID => $_->query_dnafrag } @$set1;

  for (i=0; i<Vector_getNumElement(set1); i++) {
    GenomicAlign *alignA = Vector_getElementAt(set1,i);
    DNAFrag *frag = GenomicAlign_getQueryDNAFrag(alignA);
    int qStart = GenomicAlign_getQueryStart(alignA);
    int qEnd   = GenomicAlign_getQueryEnd(alignA);

    Vector *dres = GenomicAlignAdaptor_fetchAllByDNAFragGenomeDBDirect(gaa,
                       frag, genomeQuery, &qStart, &qEnd, methodLinkId);

    for (j=0; j<Vector_getNumElement(dres); j++) {
      GenomicAlign *alignB = Vector_getElementAt(dres,j);
      GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, alignA, alignB);
    }
    Vector_free(dres);
  }

  // Release the temporary containers (their elements are not freed here)
  Vector_free(linked);
  Vector_free(set1);

  return mergedAligns;
}
// Helper for GenomicAlignAdaptor_addDerivedAlignments: build one derived GenomicAlign
// from the accumulated result coordinates and cigar string and add it to mergedAligns.
//
// The consensus side is taken from alignA, the query side from alignB. The score is the
// smaller of the two scores, and percent identity is the truncated product of both / 100.
// NOTE(review): callers clear or free resultCig after this call, so this relies on
// GenomicAlign_setCigarString taking its own copy of the string - confirm.
static void GenomicAlignAdaptor_storeDerivedAlign(GenomicAlignAdaptor *gaa, Vector *mergedAligns, GenomicAlign *alignA, GenomicAlign *alignB, char *resultCig, int rcs, int rce, int rqs, int rqe) {
  int queryStrand = GenomicAlign_getQueryStrand(alignA) * GenomicAlign_getQueryStrand(alignB);
  int queryStart;
  int queryEnd;
  double score;
  double percId;
  GenomicAlign *ga;

  // rqs/rqe are relative to alignB's query; convert back to real coordinates
  if (queryStrand == 1) {
    queryStart = rqs + GenomicAlign_getQueryStart(alignB) - 1;
    queryEnd   = rqe + GenomicAlign_getQueryStart(alignB) - 1;
  } else {
    queryEnd   = GenomicAlign_getQueryEnd(alignB) - rqs + 1;
    queryStart = GenomicAlign_getQueryEnd(alignB) - rqe + 1;
  }

  score = (GenomicAlign_getScore(alignA) < GenomicAlign_getScore(alignB)) ?
           GenomicAlign_getScore(alignA) : GenomicAlign_getScore(alignB);
  percId = (int)(GenomicAlign_getPercentId(alignA)*GenomicAlign_getPercentId(alignB)/100.0);

  ga = GenomicAlign_new();
  GenomicAlign_setConsensusDNAFrag(ga, GenomicAlign_getConsensusDNAFrag(alignA));
  GenomicAlign_setQueryDNAFrag(ga, GenomicAlign_getQueryDNAFrag(alignB));
  GenomicAlign_setCigarString(ga, resultCig);
  GenomicAlign_setConsensusStart(ga, rcs);
  GenomicAlign_setConsensusEnd(ga, rce);
  GenomicAlign_setQueryStrand(ga, queryStrand);
  GenomicAlign_setQueryStart(ga, queryStart);
  GenomicAlign_setQueryEnd(ga, queryEnd);
  GenomicAlign_setAdaptor(ga, (BaseAdaptor *)gaa);
  GenomicAlign_setPercentId(ga, percId);
  GenomicAlign_setScore(ga, score);

  Vector_addElement(mergedAligns, ga);
}

// Combine alignA (consensus -> intermediate) and alignB (intermediate -> query) into
// derived alignments (alignA consensus -> alignB query) and append them to mergedAligns.
//
// The cigar pieces of both alignments are walked in parallel. Wherever their match
// regions overlap on the shared intermediate sequence, the overlap is added to the
// result. A gap on only one side is written as an insertion or deletion; a gap on both
// sides closes the current derived alignment and starts a new one.
//
// Nothing is returned; all results go into mergedAligns.
void GenomicAlignAdaptor_addDerivedAlignments(GenomicAlignAdaptor *gaa, Vector *mergedAligns, GenomicAlign *alignA, GenomicAlign *alignB) {
  // variable name explanation
  // q - query c - consensus s - start e - end l - last
  // o, ov overlap j - jump_in_
  // r - result
  int qs, qe, lqe, cs, ce, lce, ocs, oce, oqs, oqe, jc, jq, ovs, ove, rcs, rce, rqs, rqe;
  int currentMatch = 0;
  int newMatch;
  int cigAPos = 0, cigBPos = 0;
  char *resultCig;
  char tmpStr[128];

  // initialization phase
  Vector *cigA = CigarStrUtil_getPieces(GenomicAlign_getCigarString(alignA));
  Vector *cigB = CigarStrUtil_getPieces(GenomicAlign_getCigarString(alignB));

  if (GenomicAlign_getQueryStrand(alignA) == -1 ) {
    Vector_reverse(cigB);
  }

  // need a 'normalized' start for qs, qe, oxs so I dont
  // have to check strandedness all the time

  // consensus is strand 1 and is not compared to anything,
  // can keep its original coordinate system
  lce = GenomicAlign_getConsensusStart(alignA) - 1;
  ce = lce;
  cs = ce + 1;

  // alignBs query can be + or - just keep relative coords for now
  lqe = 0;
  qe = 0;
  qs = 1;

  // ocs will be found relative to oce and has to be comparable
  // to oqs. But it could be that we have to move downwards if we
  // are not - strand. That's why coordinates are transformed here
  if (GenomicAlign_getQueryStrand(alignA) == -1 ) {
    // query_end is first basepair of alignment
    if (GenomicAlign_getQueryEnd(alignA) < GenomicAlign_getConsensusEnd(alignB)) {
      oce = 0;
      ocs = 1;
      oqe = GenomicAlign_getConsensusEnd(alignB) - GenomicAlign_getQueryEnd(alignA);
      oqs = oqe + 1;
    } else {
      oqe = 0;
      oqs = 1;
      oce = GenomicAlign_getQueryEnd(alignA) - GenomicAlign_getConsensusEnd(alignB);
      ocs = oce + 1;
    }
  } else {
    // in theory no coordinate magic necessary :-)
    oqs = GenomicAlign_getQueryStart(alignA);
    oqe = oqs - 1;
    ocs = GenomicAlign_getConsensusStart(alignB);
    oce = ocs - 1;
  }

  // initializing result
  rcs = rce = rqs = rqe = 0;
  resultCig = StrUtil_copyString(&resultCig,"",0);

  while (1) {
    int bEndsFirstOrTie;
    int aEndsFirstOrTie;

    // exit if you request a new piece of alignment and the cig list is
    // empty
    if (oce < ocs || oce < oqs) {
      // next M area in cigB
      if (cigBPos == Vector_getNumElement(cigB)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigB, &cigBPos, &ocs, &oce, &qs, &qe );
      continue;
    }
    if (oqe < oqs || oqe < ocs) {
      // next M area in cigA
      if (cigAPos == Vector_getNumElement(cigA)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigA, &cigAPos, &cs, &ce, &oqs, &oqe );
      continue;
    }

    // now matching region overlap in reference genome
    ovs = ocs < oqs ? oqs : ocs;
    ove = oce < oqe ? oce : oqe;

    // jumps (gaps) on consensus and query since the last stored overlap
    if (currentMatch) {
      jc = cs + (ovs - oqs) - lce - 1;
      jq = qs + (ovs - ocs) - lqe - 1;
    } else {
      jc = jq = 0;
    }

    newMatch = ove - ovs + 1;

    if (jc==0) {
      if (jq==0) {
        // contiguous on both sides - extend the running match
        currentMatch += newMatch;
      } else {
        // store current match;
        sprintf(tmpStr,"%dM",currentMatch);
        resultCig = StrUtil_appendString(resultCig,tmpStr);

        // jq deletions;
        if (jq == 1) {
          resultCig = StrUtil_appendString(resultCig,"D");
        } else {
          sprintf(tmpStr,"%dD",jq);
          resultCig = StrUtil_appendString(resultCig,tmpStr);
        }
        currentMatch = newMatch;
      }
    } else {
      if (jq==0) {
        // store current match;
        sprintf(tmpStr,"%dM",currentMatch);
        resultCig = StrUtil_appendString(resultCig,tmpStr);

        // jc insertions;
        if (jc==1) {
          resultCig = StrUtil_appendString(resultCig,"I");
        } else {
          sprintf(tmpStr,"%dI",jc);
          resultCig = StrUtil_appendString(resultCig,tmpStr);
        }
        currentMatch = newMatch;
      } else {
        // gap on both sides: close the current derived alignment and start a new one
        sprintf(tmpStr,"%dM",currentMatch);
        resultCig = StrUtil_appendString(resultCig,tmpStr);

        GenomicAlignAdaptor_storeDerivedAlign(gaa, mergedAligns, alignA, alignB, resultCig, rcs, rce, rqs, rqe);

        rcs = rce = rqs = rqe = 0;
        resultCig[0] = '\0';

        currentMatch = newMatch;
      }
    }

    if (!rcs) rcs = cs+(ovs-oqs);
    rce = cs+(ove-oqs);
    if (!rqs) rqs = qs+(ovs-ocs);
    rqe = qs+(ove-ocs);

    // update the last positions
    lce = rce;
    lqe = rqe;

    // next piece on the one that end earlier (both on a tie).
    // The comparison is taken BEFORE either side is advanced: advancing cigB
    // rewrites oce, so testing oce against oqe afterwards would no longer say
    // which region ended earlier.
    bEndsFirstOrTie = (oce <= oqe);
    aEndsFirstOrTie = (oce >= oqe);

    if (bEndsFirstOrTie) {
      // next M area in cigB
      if (cigBPos == Vector_getNumElement(cigB)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigB, &cigBPos, &ocs, &oce, &qs, &qe );
    }
    if (aEndsFirstOrTie) {
      // next M area in cigA
      if (cigAPos == Vector_getNumElement(cigA)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigA, &cigAPos, &cs, &ce, &oqs, &oqe );
    }
  } // end of while loop

  // if there is a last floating current match
  if (currentMatch) {
    sprintf(tmpStr,"%dM",currentMatch);
    resultCig = StrUtil_appendString(resultCig, tmpStr);

    GenomicAlignAdaptor_storeDerivedAlign(gaa, mergedAligns, alignA, alignB, resultCig, rcs, rce, rqs, rqe);
  }

  free(resultCig);

  Vector_free(cigA);
  Vector_free(cigB);

  // nothing to return all in merged_aligns
}
// Turn every row of an executed statement handle into a GenomicAlign.
//
// Expected column layout: 0-2 and 3-5 are two (dnafrag id, start, end) triples,
// 6 query strand, 7 method link id, 8 score, 9 percent identity, 10 cigar string.
// With reverse == 0 the first triple is the consensus and the second the query.
// With reverse != 0 the roles are swapped, the I and D cigar characters are exchanged,
// and for a -1 query strand the cigar string is additionally reversed.
//
// Returns a new Vector of GenomicAlign objects (empty when there are no rows).
Vector *GenomicAlignAdaptor_objectsFromStatementHandle(GenomicAlignAdaptor *gaa, StatementHandle *sth, int reverse) {
  Vector *aligns = Vector_new();
  DNAFragAdaptor *fragAdaptor = ComparaDBAdaptor_getDNAFragAdaptor(gaa->dba);
  ResultRow *row;

  for (row = sth->fetchRow(sth); row; row = sth->fetchRow(sth)) {
    // Read the two coordinate triples in column order, then decide which is which
    IDType firstFragId  = row->getLongLongAt(row,0);
    int    firstStart   = row->getIntAt(row,1);
    int    firstEnd     = row->getIntAt(row,2);
    IDType secondFragId = row->getLongLongAt(row,3);
    int    secondStart  = row->getIntAt(row,4);
    int    secondEnd    = row->getIntAt(row,5);

    int    strand   = row->getIntAt(row,6);
    IDType linkId   = row->getLongLongAt(row,7);
    double alnScore = row->getDoubleAt(row,8);
    double identity = row->getDoubleAt(row,9);
    char  *cigar    = row->getStringAt(row,10);

    char *alnType = GenomicAlignAdaptor_alignmentTypeByMethodLinkId(gaa, linkId);

    IDType consFragId = reverse ? secondFragId : firstFragId;
    int    consStart  = reverse ? secondStart  : firstStart;
    int    consEnd    = reverse ? secondEnd    : firstEnd;
    IDType qryFragId  = reverse ? firstFragId  : secondFragId;
    int    qryStart   = reverse ? firstStart   : secondStart;
    int    qryEnd     = reverse ? firstEnd     : secondEnd;

    if (reverse) {
      // swapping consensus and query turns insertions into deletions and vice versa
      StrUtil_strReplChrs(cigar,"DI","ID");

      // alignment of the opposite strand
      if (strand == -1) {
        cigar = CigarStrUtil_reverse(cigar, strlen(cigar));
      }
    }

    GenomicAlign *align = GenomicAlign_new();

    GenomicAlign_setAdaptor(align, (BaseAdaptor *)gaa);
    GenomicAlign_setConsensusDNAFrag(align, DNAFragAdaptor_fetchByDbID(fragAdaptor,consFragId));
    GenomicAlign_setConsensusStart(align, consStart);
    GenomicAlign_setConsensusEnd(align, consEnd);
    GenomicAlign_setQueryDNAFrag(align, DNAFragAdaptor_fetchByDbID(fragAdaptor,qryFragId));
    GenomicAlign_setQueryStart(align, qryStart);
    GenomicAlign_setQueryEnd(align, qryEnd);
    GenomicAlign_setQueryStrand(align, strand);
    GenomicAlign_setAlignmentType(align, alnType);
    GenomicAlign_setScore(align, alnScore);
    GenomicAlign_setPercentId(align, identity);
    GenomicAlign_setCigarString(align, cigar);

    Vector_addElement(aligns, align);
  }

  return aligns;
}
// Fetch the external references (DBEntry objects) attached to one Ensembl object.
//
// ensObj  - internal id of the Ensembl object (must be non zero)
// ensType - value matched against object_xref.ensembl_object_type (must be non NULL)
//
// The query left-joins synonyms and identity_xref, so one xref can come back on
// several rows. A DBEntry is created the first time an xref id is seen; later rows for
// the same id only contribute synonyms.
//
// Returns a new Vector of DBEntry. Exits the program if an argument is missing or the
// query does not fit in the local buffer.
// NOTE(review): ensType is interpolated into the SQL text unescaped; callers must only
// pass trusted values.
Vector *DBEntryAdaptor_fetchByObjectType(DBEntryAdaptor *dbea, IDType ensObj, char *ensType) {
  Vector *out;
  char qStr[1024];
  int qLen;
  StatementHandle *sth;
  ResultRow *row;
  IDHash *seen;

  if (!ensObj) {
    fprintf(stderr,"Error: Can't fetchByObjectType without an object\n");
    exit(1);
  }

  if (!ensType) {
    fprintf(stderr,"Error: Can't fetchByObjectType without a type\n");
    exit(1);
  }

  // Not sure if idt identities are right way round
  // Bounded formatting: ensType has caller-controlled length, so an unchecked
  // sprintf could write past the end of qStr.
  qLen = snprintf(qStr, sizeof(qStr),
    "SELECT xref.xref_id, xref.dbprimary_acc, xref.display_label, xref.version,"
    "       xref.description,"
    "       exDB.db_name, exDB.db_release, exDB.status,"
    "       oxr.object_xref_id,"
    "       es.synonym,"
    "       idt.xref_identity, idt.ensembl_identity"
    " FROM  (external_db exDB, object_xref oxr, xref xref)"
    " LEFT JOIN external_synonym es on es.xref_id = xref.xref_id"
    " LEFT JOIN identity_xref idt on idt.object_xref_id = oxr.object_xref_id"
    " WHERE xref.xref_id = oxr.xref_id"
    "  AND  xref.external_db_id = exDB.external_db_id"
    "  AND  oxr.ensembl_id = " IDFMTSTR
    "  AND  oxr.ensembl_object_type = '%s'",
    ensObj,
    ensType);

  if (qLen < 0 || (size_t)qLen >= sizeof(qStr)) {
    fprintf(stderr,"Error: Query too long in fetchByObjectType\n");
    exit(1);
  }

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

  sth->execute(sth);

  seen = IDHash_new(IDHASH_SMALL);
  out = Vector_new();

  while ((row = sth->fetchRow(sth))) {
    DBEntry *exDB;
    IDType refID = row->getLongLongAt(row,0);

    // using an outer join on the synonyms as well as on identity_xref, we
    // now have to filter out the duplicates (see v.1.18 for
    // original). Since there is at most one identity_xref row per xref,
    // this is easy enough; all the 'extra' bits are synonyms
    if (!IDHash_contains(seen,refID)) {
      exDB = DBEntry_new();
      DBEntry_setAdaptor(exDB,(BaseAdaptor *)dbea);
      DBEntry_setDbID(exDB, refID);
      DBEntry_setPrimaryId(exDB, row->getStringAt(row,1));
      DBEntry_setDisplayId(exDB, row->getStringAt(row,2));
      DBEntry_setVersion(exDB, row->getStringAt(row,3));
      DBEntry_setDbName(exDB, row->getStringAt(row,5));
      DBEntry_setRelease(exDB, row->getStringAt(row,6));

      // identity columns are only present when the LEFT JOIN found a row
      if (row->col(row,10)) {
        IdentityXref *idx = IdentityXref_new();
        DBEntry_setIdentityXref(exDB,idx);
        IdentityXref_setQueryIdentity(idx, row->getDoubleAt(row,10));
        IdentityXref_setTargetIdentity(idx, row->getDoubleAt(row,11));
      }

      if (row->col(row,4)) DBEntry_setDescription(exDB, row->getStringAt(row,4));
      if (row->col(row,7)) DBEntry_setStatus(exDB, row->getStringAt(row,7));

      Vector_addElement(out, exDB);
      IDHash_add(seen, refID, exDB);
    }

    exDB = IDHash_getValue(seen, refID);

    if (row->col(row,9)) {
      DBEntry_addSynonym(exDB,row->getStringAt(row,9));
    }
  }

  // The DBEntry values are owned by 'out', so only the hash itself is released
  IDHash_free(seen, NULL);

  sth->finish(sth);

  return out;
}
Vector *IntronSupportingEvidenceAdaptor_objectsFromStatementHandle(IntronSupportingEvidenceAdaptor *isea, StatementHandle *sth, AssemblyMapper *assMapper, Slice *destSlice) { SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(isea->dba); AnalysisAdaptor *aa = DBAdaptor_getAnalysisAdaptor(isea->dba); Vector *features = Vector_new(); IDHash *sliceHash = IDHash_new(IDHASH_SMALL); /* Unneccesary my %analysis_hash; my %sr_name_hash; my %sr_cs_hash; */ /* Unused my $asm_cs; my $cmp_cs; my $asm_cs_vers; my $asm_cs_name; my $cmp_cs_vers; my $cmp_cs_name; if($mapper) { $asm_cs = $mapper->assembled_CoordSystem(); $cmp_cs = $mapper->component_CoordSystem(); $asm_cs_name = $asm_cs->name(); $asm_cs_vers = $asm_cs->version(); $cmp_cs_name = $cmp_cs->name(); $cmp_cs_vers = $cmp_cs->version(); } */ long destSliceStart; long destSliceEnd; int destSliceStrand; long destSliceLength; //CoordSystem *destSliceCs; char * destSliceSrName; IDType destSliceSrId = 0; //AssemblyMapperAdaptor *asma; if (destSlice) { destSliceStart = Slice_getStart(destSlice); destSliceEnd = Slice_getEnd(destSlice); destSliceStrand = Slice_getStrand(destSlice); destSliceLength = Slice_getLength(destSlice); //??destSliceCs = Slice_getCoordSystem(destSlice); destSliceSrName = Slice_getSeqRegionName(destSlice); destSliceSrId = Slice_getSeqRegionId(destSlice); //??asma = DBAdaptor_getAssemblyMapperAdaptor(ea->dba); } ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType id = row->getLongLongAt(row,0); IDType analysisId = row->getLongLongAt(row,1); IDType seqRegionId = row->getLongLongAt(row,2); long seqRegionStart = row->getLongAt(row,3); long seqRegionEnd = row->getLongAt(row,4); int seqRegionStrand = row->getIntAt(row,5); char *hitName = row->getStringAt(row,6); double score = row->getDoubleAt(row,7); char *scoreType = row->getStringAt(row,8); int spliceCanonical = row->getIntAt(row,9); // get the analysis object Analysis *analysis = AnalysisAdaptor_fetchByDbID(aa, analysisId); /* // need to get the 
internal_seq_region, if present $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); #get the slice object my $slice = $slice_hash{"ID:".$seq_region_id}; if(!$slice) { $slice = $sa->fetch_by_seq_region_id($seq_region_id); $slice_hash{"ID:".$seq_region_id} = $slice; $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); $sr_cs_hash{$seq_region_id} = $slice->coord_system(); } my $sr_name = $sr_name_hash{$seq_region_id}; my $sr_cs = $sr_cs_hash{$seq_region_id}; */ if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } Slice *slice = IDHash_getValue(sliceHash, seqRegionId); Slice *iseSlice = slice; char *srName = Slice_getSeqRegionName(slice); CoordSystem *srCs = Slice_getCoordSystem(slice); // // remap the feature coordinates to another coord system // if a mapper was provided // if (assMapper != NULL) { MapperRangeSet *mrs; // Slightly suspicious about need for this if statement so left in perl statements for now if (destSlice != NULL && assMapper->objectType == CLASS_CHAINEDASSEMBLYMAPPER) { mrs = ChainedAssemblyMapper_map(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, 1, destSlice); } else { mrs = AssemblyMapper_fastMap(assMapper, srName, seqRegionStart, seqRegionEnd, seqRegionStrand, srCs, NULL); } // skip features that map to gaps or coord system boundaries //next FEATURE if (!defined($seq_region_id)); if (MapperRangeSet_getNumRange(mrs) == 0) { continue; } MapperRange *range = MapperRangeSet_getRangeAt(mrs, 0); if (range->rangeType == MAPPERRANGE_GAP) { fprintf(stderr,"Got a mapper gap in gene obj_from_sth - not sure if this is allowed\n"); exit(1); } else { MapperCoordinate *mc = (MapperCoordinate *)range; seqRegionId = mc->id; seqRegionStart = mc->start; seqRegionEnd = mc->end; seqRegionStrand = mc->strand; } MapperRangeSet_free(mrs); /* Was - but identical if and else so why test??? 
#get a slice in the coord system we just mapped to if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { $slice = $slice_hash{"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id); } else { $slice = $slice_hash{"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id); } */ // Instead... if (! IDHash_contains(sliceHash, seqRegionId)) { IDHash_add(sliceHash, seqRegionId, SliceAdaptor_fetchBySeqRegionId(sa, seqRegionId, POS_UNDEF, POS_UNDEF, STRAND_UNDEF)); } iseSlice = IDHash_getValue(sliceHash, seqRegionId); } // // If a destination slice was provided convert the coords // If the dest_slice starts at 1 and is foward strand, nothing needs doing // if (destSlice != NULL) { if (destSliceStart != 1 || destSliceStrand != 1) { if (destSliceStrand == 1) { seqRegionStart = seqRegionStart - destSliceStart + 1; seqRegionEnd = seqRegionEnd - destSliceStart + 1; } else { long tmpSeqRegionStart = seqRegionStart; seqRegionStart = destSliceEnd - seqRegionEnd + 1; seqRegionEnd = destSliceEnd - tmpSeqRegionStart + 1; seqRegionStrand = -seqRegionStrand; } } // throw away features off the end of the requested slice if (seqRegionEnd < 1 || seqRegionStart > destSliceLength || (destSliceSrId != seqRegionId)) { continue; } iseSlice = destSlice; } IntronSupportingEvidence *ise = IntronSupportingEvidence_new(); IntronSupportingEvidence_setStart (ise, seqRegionStart); IntronSupportingEvidence_setEnd (ise, seqRegionEnd); IntronSupportingEvidence_setStrand (ise, seqRegionStrand); IntronSupportingEvidence_setSlice (ise, iseSlice); IntronSupportingEvidence_setAnalysis (ise, analysis); IntronSupportingEvidence_setAdaptor (ise, (BaseAdaptor *)isea); IntronSupportingEvidence_setDbID (ise, id); IntronSupportingEvidence_setHitName (ise, hitName); IntronSupportingEvidence_setScore (ise, score); IntronSupportingEvidence_setScoreType (ise, scoreType); IntronSupportingEvidence_setIsSpliceCanonical(ise, spliceCanonical); Vector_addElement(features, ise); } 
return features; }
// For ordered, the default should be 0 (if you just need to fill out the args) // Note ONLY stable_id can be char, all other pk's must be IDType (see code) Vector *BaseAdaptor_listDbIDs(BaseAdaptor *ba, char *table, char *pk, int ordered) { int ok = 1; char colName[1024]; if (pk == NULL) { sprintf(colName, "%s_id", table); } else { strcpy(colName, pk); } char qStr[1024]; sprintf(qStr,"SELECT `%s` FROM `%s`", colName, table ); if ( BaseAdaptor_isMultiSpecies(BaseAdaptor *ba) // For now just the multi species because I don't have adaptors in the Class hierarchy // && $self->isa('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor') // && !$self->isa('Bio::EnsEMBL::DBSQL::UnmappedObjectAdaptor') ) { char tmpStr[1024]; sprintf(tmpStr, "JOIN seq_region USING (seq_region_id) " "JOIN coord_system cs USING (coord_system_id) " "WHERE cs.species_id = "IDFMTSTR, BaseAdaptor_getSpeciesId(ba)); sprintf(qStr, "%s %s", qStr, tmpStr); } if (ordered) { sprintf(qStr, "%s ORDER BY seq_region_id, seq_region_start", qStr); } StatementHandle *sth = ba->prepare(ba,qStr,strlen(qStr)); sth->execute(sth); Vector *out = Vector_new(); if (strcmp(pk, "stable_id")) { ResultRow *row; while ((row = sth->fetchRow(sth))) { char *stableId = row->getStringCopyAt(row, 0); Vector_addElement(out, stableId); } } else { IDType *idP; ResultRow *row; while ((row = sth->fetchRow(sth))) { IDType id = row->getLongLongAt(row, 0); if ((idP = calloc(1,sizeof(IDType))) == NULL) { fprintf(stderr, "Failed allocating space for a id\n"); ok = 0; } else { *idP = id; Vector_addElement(out, idP); } } } if (!ok) { Vector_free(out); out = NULL; } return out; }