/*
 * Test driver for the cigar string utilities.
 *
 * Splits cigar1 (defined outside this function) into pieces and checks the
 * piece count and the first piece, then copies the string, reverses the copy
 * and repeats the checks on the reversed form.
 *
 * Returns the sum of the values returned by the ok() calls.
 */
int main(int argc, char *argv[]) {
  int     testResult = 0;
  Vector *cigarPieces;
  char   *firstPiece;
  char   *cigarCopy = NULL;
  char   *reversed;

  initEnsC(argc, argv);

  // Forward direction: piece count and leading piece
  cigarPieces = CigarStrUtil_getPieces(cigar1);
  testResult += ok(1, Vector_getNumElement(cigarPieces) == 6);

  firstPiece = Vector_getElementAt(cigarPieces, 0);
  testResult += ok(2, strcmp(firstPiece, "6M") == 0);

  Vector_free(cigarPieces);

  // A copy of the cigar string must compare equal to the source
  StrUtil_copyString(&cigarCopy, cigar1, 0);
  testResult += ok(3, strcmp(cigarCopy, cigar1) == 0);

  // Reverse the copy; the copy itself is no longer needed afterwards
  reversed = CigarStrUtil_reverse(cigarCopy, strlen(cigarCopy));
  free(cigarCopy);

  // Reversed direction: same number of pieces, different leading piece
  cigarPieces = CigarStrUtil_getPieces(reversed);
  testResult += ok(4, Vector_getNumElement(cigarPieces) == 6);

  firstPiece = Vector_getElementAt(cigarPieces, 0);
  testResult += ok(5, strcmp(firstPiece, "M") == 0);

  return testResult;
}
/*
 * Applies every sequence edit attached to the translation to seq.
 *
 * translation - translation whose SeqEdits are fetched via
 *               Translation_getAllSeqEdits
 * seq         - sequence string handed to SeqEdit_applyEdit; the value
 *               returned by each call replaces it for the next edit
 *
 * Returns the sequence pointer produced by the last applied edit, or seq
 * unchanged when there are no edits.
 */
char *Translation_modifyTranslation(Translation *translation, char *seq) {
  Vector *edits  = Translation_getAllSeqEdits(translation);
  int     nEdits = Vector_getNumElement(edits);

  if (nEdits > 0) {
    int editIdx;

    // Sort in reverse start order to avoid the complication of adjusting
    // downstream edits.
    // (Perl equivalent: @seqeds = sort { $b->{'start'} <=> $a->{'start'} } @seqeds;
    //  the Perl code carried a HACK note that translation ENSP00000420939, which has
    //  10 Selenocysteine seqedits, broke the sort when the start() method was used.)
    Vector_sort(edits, SeqEdit_reverseStartCompFunc);

    // Apply all edits.
    // Not particularly efficient currently - could precalculate the maximum size of
    // the new sequence before applying the edits.
    for (editIdx = 0; editIdx < nEdits; editIdx++) {
      SeqEdit *edit = Vector_getElementAt(edits, editIdx);

      seq = SeqEdit_applyEdit(edit, seq);
    }
  }

  Vector_free(edits);

  return seq;
}
/*
 * Fetches a single feature by its database identifier.
 *
 * ba - adaptor whose first table (as reported by ba->getTables()) supplies
 *      the table synonym and name used to build the constraint
 * id - database identifier to look up
 *
 * Returns the feature with its reference count incremented, or NULL when no
 * feature, or more than one feature, matched (an error is printed in the
 * latter case), or when the constraint does not fit its buffer.
 */
SeqFeature *BaseAdaptor_uncachedFetchByDbID(BaseAdaptor *ba, IDType id) {
  SeqFeature *feat = NULL;
  char constraint[1024];

  // construct a constraint like 't1.table1_id = 123'
  NameTableType *tables = ba->getTables();
  char **t = (*tables)[0];

  // Bounded formatting: the table synonym and name come from the adaptor, so
  // do not assume they fit in the fixed-size buffer.
  int written = snprintf(constraint, sizeof(constraint), "%s.%s_id = " IDFMTSTR, t[SYN], t[NAME], id);
  if (written < 0 || (size_t)written >= sizeof(constraint)) {
    fprintf(stderr, "Error: Constraint too long in fetch ID call\n");
    return NULL;
  }

  // Should only be one
  Vector *vec = BaseAdaptor_genericFetch(ba, constraint, NULL, NULL);

  if (Vector_getNumElement(vec) > 1) {
    fprintf(stderr, "Error: Got more than one feature back in fetch ID call\n");
  } else if (Vector_getNumElement(vec) == 1) {
    feat = Vector_getElementAt(vec, 0);
    // Take a reference before the result vector is released below
    Object_incRefCount(feat);
  }

  // NIY May want to set a free func???
  Vector_free(vec);

  return feat;
}
int main(int argc, char *argv[]) { DBAdaptor *dba; RepeatFeatureAdaptor *rfa; Slice *slice; Vector *features; int i; int failed; initEnsC(argc, argv); dba = Test_initROEnsDB(); slice = Test_getStandardSlice(dba); ok(1, slice!=NULL); rfa = DBAdaptor_getRepeatFeatureAdaptor(dba); ok(2, rfa!=NULL); features = Slice_getAllRepeatFeatures(slice,NULL,NULL, NULL); ok(3, features!=NULL); ok(4, Vector_getNumElement(features)!=0); failed = 0; for (i=0;i<Vector_getNumElement(features) && !failed;i++) { RepeatFeature *rf = Vector_getElementAt(features,i); int start = RepeatFeature_getStart(rf); int end = RepeatFeature_getEnd(rf); Vector *rrfVector; RepeatFeature *rrf; printf("slice start = %d end = %d\n",start,end); /* rrfVector = RepeatFeature_transformToRawContig(rf); if (Vector_getNumElement(rrfVector) > 1) { printf("Feature mapped to more than one rawcontig\n"); failed=1; } rrf = Vector_getElementAt(rrfVector,0); //printf("rc start = %d end = %d\n",RepeatFeature_getStart(rrf),RepeatFeature_getEnd(rrf)); rf = RepeatFeature_transformToSlice(rrf, slice); if (RepeatFeature_getStart(rf) != start || RepeatFeature_getEnd(rf) != end) { printf("Remapping to slice produced different coords\n"); failed =1; } */ } ok(5, !failed); return 0; }
/*
 * Attaches to a gene the DBEntry links of all its transcripts and of their
 * canonical translations.
 *
 * dbea - adaptor used to prepare the query and fetch the links
 * gene - gene whose dbID selects the transcripts; the links are added to it
 *        with Gene_addDBLink
 *
 * Always returns 1.
 */
int DBEntryAdaptor_fetchAllByGene(DBEntryAdaptor *dbea, Gene *gene) {
  char qStr[512];
  StatementHandle *sth;
  ResultRow *row;

  snprintf(qStr, sizeof(qStr),
           "SELECT t.transcript_id, t.canonical_translation_id"
           " FROM transcript t"
           " WHERE t.gene_id = " IDFMTSTR,
           Gene_getDbID(gene));

  sth = dbea->prepare((BaseAdaptor *)dbea, qStr, strlen(qStr));
  sth->execute(sth);

  while ((row = sth->fetchRow(sth))) {
    IDType transcriptId = row->getLongLongAt(row, 0);
    int i;
    Vector *transLinks;

    // Only look up translation links when the translation id column is set
    if (row->col(row, 1)) {
      IDType translationId = row->getLongLongAt(row, 1);
      Vector *translatLinks = DBEntryAdaptor_fetchByObjectType(dbea, translationId, "Translation");

      for (i = 0; i < Vector_getNumElement(translatLinks); i++) {
        Gene_addDBLink(gene, Vector_getElementAt(translatLinks, i));
      }
      Vector_free(translatLinks);
    }

    transLinks = DBEntryAdaptor_fetchByObjectType(dbea, transcriptId, "Transcript");
    for (i = 0; i < Vector_getNumElement(transLinks); i++) {
      Gene_addDBLink(gene, Vector_getElementAt(transLinks, i));
    }
    Vector_free(transLinks);
  }

  // Release the statement handle, as DBEntryAdaptor_fetchAllByTranscript does;
  // previously it was never finished here.
  sth->finish(sth);

  /* NIY This is wrong so I'm not going to implement it!
  if($gene->stable_id){
    my $genelinks = $self->_fetch_by_object_type( $gene->stable_id, 'Gene' );
    foreach my $genelink ( @$genelinks ) {
      $gene->add_DBLink( $genelink );
    }
  }
  */

  return 1;
}
/*
 * Builds the cDNA string of a prediction transcript by concatenating the
 * sequence strings of its exons.
 *
 * NULL entries in the exon vector are skipped while the cDNA is still empty;
 * once the cDNA is non-empty, a NULL entry ends the concatenation. Before the
 * first exon's sequence is appended, the string is passed through
 * SeqUtil_addNs together with that exon's phase.
 *
 * Returns the assembled string (started via StrUtil_copyString and extended
 * by StrUtil_appendString).
 */
char *PredictionTranscript_getcDNA(PredictionTranscript *trans) {
  Vector *exons = PredictionTranscript_getAllExons(trans, 0);
  char   *cdna;
  int     needStartPadding = 1;
  int     exonIdx;

  cdna = StrUtil_copyString(&cdna, "", 0);

  for (exonIdx = 0; exonIdx < Vector_getNumElement(exons); exonIdx++) {
    PredictionExon *exon = Vector_getElementAt(exons, exonIdx);
    int phase;

    if (exon == NULL) {
      // A hole after sequence has been collected ends the cDNA; a hole before
      // any sequence is simply skipped
      if (cdna[0] != '\0') {
        break;
      }
      continue;
    }

    // NIY: the Perl code only read the phase when it was defined
    phase = PredictionExon_getPhase(exon);

    if (needStartPadding) {
      cdna = SeqUtil_addNs(cdna, phase);
      needStartPadding = 0;
    }

    // Disabled "hack for now - should never happen" logic used to live here:
    // when an exon's phase differed from the previous phase it padded the
    // previous exon with "NN"/"N" and added start padding for this exon.

    cdna = StrUtil_appendString(cdna, PredictionExon_getSeqString(exon));
  }

  // NIY Freeing exons vector?
  return cdna;
}
// New Vector *Translation_getAllSeqEdits(Translation *translation) { char *edits[] = { "initial_met", "_selenocysteine", "amino_acid_sub", NULL }; Vector *seqEds = Vector_new(); char **editP = edits; while (*editP) { char *edit = *editP; Vector *attribs = Translation_getAllAttributes(translation, edit); // convert attributes to SeqEdit objects int i; for (i=0; i<Vector_getNumElement(attribs); i++) { Attribute *attrib = Vector_getElementAt(attribs, i); SeqEdit *seqEd = SeqEdit_newFromAttribute(attrib); Vector_addElement(seqEds, seqEd); } Vector_free(attribs); editP++; } return seqEds; }
// NIY: // Because this can filter the results the vector that gets returned must be freeable - so for now // make a copy of the translation->attributes vector if returning unfiltered so behaviour is // consistent. Long term probably want reference count incremented Vector *Translation_getAllAttributes(Translation *translation, char *attribCode) { if (translation->attributes == NULL) { TranslationAdaptor *tlna = (TranslationAdaptor *)Translation_getAdaptor(translation); if (tlna == NULL) { // No adaptor // Perl comments out the warning, I'll put it back for now, just in case //fprintf(stderr,"Warning: Cannot get attributes without an adaptor.\n"); return Vector_new(); } AttributeAdaptor *ata = DBAdaptor_getAttributeAdaptor(tlna->dba); translation->attributes = AttributeAdaptor_fetchAllByTranslation(ata, translation, NULL); } if (attribCode != NULL) { Vector *results = Vector_new(); int i; for (i=0; i<Vector_getNumElement(translation->attributes); i++) { Attribute *attrib = Vector_getElementAt(translation->attributes, i); if (!strcasecmp(attrib->code, attribCode)) { Vector_addElement(results, attrib); } } return results; } else { // See NIY note above for why I'm making a copy return Vector_copy(translation->attributes); } }
/*
 * Returns the cDNA <-> genomic mapper of a prediction transcript, building
 * and caching it in trans->exonCoordMapper on first use.
 *
 * Each translateable exon is loaded into the mapper at a running cDNA start
 * position that begins at 1 and advances by the exon's length.
 */
Mapper *PredictionTranscript_getcDNACoordMapper(PredictionTranscript *trans) {
  Mapper *cdnaMapper;
  Vector *codingExons;
  int     cdnaPos = 1;
  int     exonIdx;

  // Already built - reuse it
  if (trans->exonCoordMapper != NULL) {
    return trans->exonCoordMapper;
  }

  //
  // the mapper is loaded with OBJECTS in place of the IDs !!!!
  //  the objects are the contigs in the exons
  //
  // NIY: What should coordsystems be?
  cdnaMapper = Mapper_new("cdna", "genomic", NULL, NULL);

  codingExons = PredictionTranscript_getAllTranslateableExons(trans);

  exonIdx = 0;
  while (exonIdx < Vector_getNumElement(codingExons)) {
    PredictionExon *exon = Vector_getElementAt(codingExons, exonIdx);

    PredictionExon_loadGenomicMapper((Exon*)exon, cdnaMapper, (IDType)trans, cdnaPos);
    cdnaPos += PredictionExon_getLength(exon);

    exonIdx++;
  }

  trans->exonCoordMapper = cdnaMapper;

  Vector_free(codingExons);

  return cdnaMapper;
}
// Removed the circular stuff void AttributeAdaptor_doStoreAllByTypeAndTableAndID(AttributeAdaptor *ata, char *type, char *table, IDType objectId, Vector *attributes) { int ok = 1; char qStr[1024]; sprintf(qStr, "INSERT into %s_attrib SET %s_id = %"IDFMTSTR", attrib_type_id = %"IDFMTSTR", value = '%%s'", table, type); StatementHandle *sth = ata->prepare((BaseAdaptor *)ata,qStr,strlen(qStr)); int i; for (i=0; i<Vector_getNumElement(attributes); i++) { Attribute *attrib = Vector_getElementAt(attributes, i); if (attrib == NULL ) { fprintf(stderr, "Reference to list of Bio::EnsEMBL::Attribute objects argument expected.\n"); ok = 0; break; } Class_assertType(CLASS_ATTRIBUTE, attrib->objectType); IDType atId = AttributeAdaptor_storeType(ata, attrib); sth->execute(sth, objectId, atId, Attribute_getValue(attrib)); } if (ok) { sth->finish(sth); } return; }
/*
 * Sets the number of elements of the transcript's exon vector to count.
 *
 * An error is printed when count is smaller than the current number of
 * exons, but the new count is applied regardless.
 *
 * Always returns 1.
 */
int PredictionTranscript_setExonCount(PredictionTranscript *trans, int count) {
  int currentCount = Vector_getNumElement(trans->exons);

  if (count < currentCount) {
    fprintf(stderr, "Error: Trying to shrink exon vector\n");
  }

  Vector_setNumElement(trans->exons, count);

  return 1;
}
/*
 * Attaches to a transcript the DBEntry links of its canonical translation
 * and of the transcript itself.
 *
 * dbea  - adaptor used to prepare the query and fetch the links
 * trans - transcript whose dbID drives the lookups; the links are added to
 *         it with Transcript_addDBLink
 *
 * Always returns 1.
 */
int DBEntryAdaptor_fetchAllByTranscript(DBEntryAdaptor *dbea, Transcript *trans) {
  char qStr[512];
  StatementHandle *sth;
  ResultRow *row;
  Vector *transLinks;
  int i;

  snprintf(qStr, sizeof(qStr),
           "SELECT t.canonical_translation_id"
           " FROM transcript t"
           " WHERE t.transcript_id = " IDFMTSTR,
           Transcript_getDbID(trans));

  sth = dbea->prepare((BaseAdaptor *)dbea, qStr, strlen(qStr));
  sth->execute(sth);

  //
  // Did this to be consistent with fetch_by_Gene, but don't like
  // it (filling in the object). I think returning the array would
  // be better. Oh well. EB
  //
  while ((row = sth->fetchRow(sth))) {
    IDType translationId;
    Vector *translatLinks;

    // Skip rows whose translation id column is unset, mirroring the guard
    // used in DBEntryAdaptor_fetchAllByGene, so no lookup is made with a
    // meaningless id.
    if (!row->col(row, 0)) {
      continue;
    }

    translationId = row->getLongLongAt(row, 0);
    translatLinks = DBEntryAdaptor_fetchByObjectType(dbea, translationId, "Translation");

    for (i = 0; i < Vector_getNumElement(translatLinks); i++) {
      Transcript_addDBLink(trans, Vector_getElementAt(translatLinks, i));
    }
    Vector_free(translatLinks);
  }

  sth->finish(sth);

  // Links attached directly to the transcript
  transLinks = DBEntryAdaptor_fetchByObjectType(dbea, Transcript_getDbID(trans), "Transcript");
  for (i = 0; i < Vector_getNumElement(transLinks); i++) {
    Transcript_addDBLink(trans, Vector_getElementAt(transLinks, i));
  }
  Vector_free(transLinks);

  return 1;
}
/*
 * Consumes cigar pieces from cigList, starting at *cigListPos, until a match
 * ('M') piece has been processed or the list is exhausted, updating the
 * consensus (cs/ce) and query (qs/qe) coordinates as it goes.
 *
 *   'D' - advances *qe by the piece's count
 *   'I' - advances *ce by the piece's count
 *   'M' - sets *cs/*ce and *qs/*qe to the span of the match, which starts
 *         one past the previous end on both sequences
 *
 * A piece consisting of only the type letter has a count of 1. A piece that
 * is NULL, empty, or has an unknown type letter is reported as a format
 * error and stops processing (its position has already been consumed).
 *
 * Note: the type letter of each counted piece is overwritten with '\0' in
 * place, so the pieces in cigList are modified.
 *
 * gaa is not used. The caller must ensure *cigListPos is a valid index on
 * entry.
 */
void GenomicAlignAdaptor_nextCig(GenomicAlignAdaptor *gaa, Vector *cigList, int *cigListPos, int *cs, int *ce, int *qs, int *qe) {
  int   count;
  char  type;
  char *cigElem;
  int   lenElem;

  do {
    cigElem = Vector_getElementAt(cigList, *cigListPos);
    (*cigListPos)++;

    // Guard against a missing or empty piece: indexing [lenElem-1] below
    // would otherwise read before the start of the string.
    lenElem = (cigElem != NULL) ? (int)strlen(cigElem) : 0;
    if (lenElem == 0) {
      fprintf(stderr, "Error: Cigar string format error for %s\n", cigElem ? cigElem : "(null)");
      break;
    }

    type = cigElem[lenElem-1];

    if (type != 'M' && type != 'I' && type != 'D') {
      fprintf(stderr, "Error: Cigar string format error for %s\n", cigElem);
      break;
    }

    if (lenElem > 1) {
      // Chop off the type letter so the remainder parses as the count
      cigElem[lenElem-1] = '\0';
      count = (int)atol(cigElem);
    } else {
      count = 1;
    }

    switch (type) {
      case 'D':
        *qe += count;
        break;
      case 'I':
        *ce += count;
        break;
      case 'M':
        *cs = *ce + 1;
        *ce = *cs + count - 1;
        *qs = *qe + 1;
        *qe = *qs + count - 1;
        break;
      default:
        // unreachable: type was validated above
        break;
    }
  } while (type != 'M' && *cigListPos != Vector_getNumElement(cigList));
}
/*
 * Maps a genomic region onto the cDNA coordinates of a prediction transcript.
 *
 * trans              - transcript whose cDNA coordinate mapper is used
 * start, end, strand - genomic region to map
 * contig             - object used as the mapper id; when NULL, the slice of
 *                      the first translateable exon is used instead
 *
 * Returns the result of Mapper_mapCoordinates, or a new empty
 * MapperRangeSet when contig is NULL and the transcript has no translateable
 * exons.
 */
MapperRangeSet *PredictionTranscript_genomic2cDNA(PredictionTranscript *trans, int start, int end, int strand, BaseContig *contig) {
  Mapper *mapper;

  // "ids" in mapper are contigs of exons, so use the same contig that should
  // be attached to all of the exons...
  if (!contig) {
    Vector *translateable = PredictionTranscript_getAllTranslateableExons(trans);
    PredictionExon *firstExon;

    if (!Vector_getNumElement(translateable)) {
      // Release the exon vector on this path too - it used to leak here
      Vector_free(translateable);
      return MapperRangeSet_new();
    }

    firstExon = Vector_getElementAt(translateable, 0);
    contig = (BaseContig*)PredictionExon_getSlice(firstExon);

    Vector_free(translateable);
  }

  mapper = PredictionTranscript_getcDNACoordMapper(trans);

  return Mapper_mapCoordinates(mapper, (IDType)contig, start, end, strand, "genomic");
}
/*
 * Allocates and initialises an AssemblyMapper.
 *
 * adaptor      - adaptor stored on the new mapper; its
 *                cacheSeqIdsWithMultAssemblies routine is invoked as part of
 *                construction
 * coordSystems - must hold exactly two elements: element 0 becomes the
 *                assembled coordinate system, element 1 the component one
 *
 * Returns the new mapper, or NULL when allocation fails. Exits the program
 * when coordSystems does not hold exactly two elements.
 */
AssemblyMapper *AssemblyMapper_new(AssemblyMapperAdaptor *adaptor, Vector *coordSystems) {
  AssemblyMapper *am = calloc(1, sizeof(AssemblyMapper));
  Mapper *mapper;

  if (am == NULL) {
    fprintf(stderr, "ERROR: Failed allocating space for AssemblyMapper\n");
    return NULL;
  }

  am->objectType = CLASS_ASSEMBLYMAPPER;
  am->funcs      = &assemblyMapperFuncs;

  Object_incRefCount(am);

  AssemblyMapper_setAdaptor(am, adaptor);

  AssemblyMapperAdaptor_cacheSeqIdsWithMultAssemblies(adaptor);

  if (Vector_getNumElement(coordSystems) != 2) {
    fprintf(stderr, "Can only map between two coordinate systems %d were provided\n", Vector_getNumElement(coordSystems));
    exit(1);
  }

  // Set the component and assembled coordinate systems
  AssemblyMapper_setAssembledCoordSystem(am, Vector_getElementAt(coordSystems, 0));
  AssemblyMapper_setComponentCoordSystem(am, Vector_getElementAt(coordSystems, 1));

  AssemblyMapper_setAssembledRegister(am, IDHash_new(IDHASH_MEDIUM));
  AssemblyMapper_setComponentRegister(am, IDHash_new(IDHASH_MEDIUM));

  // We load the mapper calling the 'ASSEMBLED' the 'from' coord system
  // and the 'COMPONENT' the 'to' coord system.
  mapper = Mapper_new("assembled", "component",
                      AssemblyMapper_getAssembledCoordSystem(am),
                      AssemblyMapper_getComponentCoordSystem(am));
  AssemblyMapper_setMapper(am, mapper);

  AssemblyMapper_setMaxPairCount(am, AM_DEFAULT_MAX_PAIR_COUNT);

  return am;
}
/*
=head2 fetch_all_by_Transcript

  Arg[1]      : Bio::EnsEMBL::Transcript Transcript to search with
  Example     : my $ises = $isea->fetch_all_by_Transcript($transcript);
  Description : Uses the given Transcript to search for all instances of
                IntronSupportingEvidence linked to the transcript in the
                database
  Returntype  : ArrayRef of IntronSupportingEvidence objects
  Exceptions  : Thrown if arguments are not as stated and for DB errors

=cut

  C version: returns a Vector (empty when no evidence is linked to the
  transcript). Exits the program if an id cannot be allocated.
*/
Vector *IntronSupportingEvidenceAdaptor_fetchAllByTranscript(IntronSupportingEvidenceAdaptor *isea, Transcript *transcript) {
  char       qStr[1024];
  ResultRow *row;
  Vector    *evidence;
  Vector    *evidenceIds = Vector_new();

  sprintf(qStr,"SELECT intron_supporting_evidence_id "
               "FROM transcript_intron_supporting_evidence "
               "WHERE transcript_id = "IDFMTSTR, Transcript_getDbID(transcript));

  StatementHandle *sth = isea->prepare((BaseAdaptor *)isea, qStr, strlen(qStr));
  sth->execute(sth);

  // Collect the evidence ids as individually allocated IDType values
  while ((row = sth->fetchRow(sth))) {
    IDType *idCopy = calloc(1, sizeof(IDType));

    if (idCopy == NULL) {
      fprintf(stderr, "Failed allocating space for a id\n");
      exit(1);
    }

    *idCopy = row->getLongLongAt(row, 0);
    Vector_addElement(evidenceIds, idCopy);
  }
  sth->finish(sth);

  // Only query by id list when there is something to look up
  if (Vector_getNumElement(evidenceIds) > 0) {
    evidence = IntronSupportingEvidenceAdaptor_fetchAllByDbIDList(isea, evidenceIds, NULL);
  } else {
    evidence = Vector_new();
  }

  // Free ids vector
  Vector_setFreeFunc(evidenceIds, free);
  Vector_free(evidenceIds);

  return evidence;
}
int main(int argc, char *argv[]) { DBAdaptor *dba; DBAdaptor *writeDba; ProteinAlignFeatureAdaptor *pafa; Slice *slice; Vector *features; int i; int failed; initEnsC(argc, argv); dba = Test_initROEnsDB(); writeDba = Test_initRWEnsDB(); slice = Test_getStandardSlice(dba); ok(1, slice!=NULL); pafa = DBAdaptor_getProteinAlignFeatureAdaptor(writeDba); SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba); ok(2, pafa!=NULL); //features = Slice_getAllDNAPepAlignFeatures(slice,NULL,NULL, NULL,NULL); //Slice *slice3 = SliceAdaptor_fetchByRegion(sa,"chromosome","1",2,260000000,1,NULL,0); Slice *slice2 = SliceAdaptor_fetchByRegion(sa,"chromosome","1",1000000,4000000,1,NULL,0); features = Slice_getAllProteinAlignFeatures(slice2,NULL,NULL, NULL,NULL); ok(3, features!=NULL); ok(4, Vector_getNumElement(features)!=0); ProteinAlignFeatureAdaptor_store((BaseFeatureAdaptor*)pafa, features); return 0; }
/*
 * Returns the number of elements in the transcript's exon vector.
 */
int PredictionTranscript_getExonCount(PredictionTranscript *trans) {
  int nExon = Vector_getNumElement(trans->exons);

  return nExon;
}
int main(int argc, char *argv[]) { DBAdaptor *dba; AssemblyMapperAdaptor *asma; int testNum = 1; initEnsC(argc, argv); dba = Test_initROEnsDB(); // // 1 Test AssemblyMapperAdaptor constructor // asma = DBAdaptor_getAssemblyMapperAdaptor(dba); ok(testNum++, asma!=NULL); // // Test fetch_by_CoordSystems // CoordSystemAdaptor *csa = DBAdaptor_getCoordSystemAdaptor(dba); CoordSystemAdaptor_dumpCachedMappings(csa); CoordSystem *toplevelCs = CoordSystemAdaptor_fetchByName(csa, "toplevel", NULL); CoordSystem *clnCs = CoordSystemAdaptor_fetchByName(csa, "clone", NULL); CoordSystem *superctgCs = CoordSystemAdaptor_fetchByName(csa, "supercontig", NULL); TopLevelAssemblyMapper *clnToplevelMapper = (TopLevelAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, toplevelCs, clnCs); TopLevelAssemblyMapper *superctgToplevelMapper = (TopLevelAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, toplevelCs, superctgCs); ok(testNum++, clnToplevelMapper!=NULL); // && $cln_toplevel_mapper->isa('Bio::EnsEMBL::TopLevelAssemblyMapper')); ok(testNum++, superctgToplevelMapper!=NULL); // && $cln_toplevel_mapper->isa('Bio::EnsEMBL::TopLevelAssemblyMapper')); // // test db has chr 20 (50KB -> 62MB) // // // Test map // MapperRangeSet *coords = NULL; if (clnToplevelMapper) { fprintf(stderr, "MAP 'AL359765.6'->toplevel\n"); coords = TopLevelAssemblyMapper_map(clnToplevelMapper,"AL359765.6", 1, 13780, 1, clnCs, 0, NULL); printCoords(coords); ok(testNum++, coords!=NULL); } if (superctgToplevelMapper) { fprintf(stderr, "MAP NT_028392->toplevel\n"); coords = TopLevelAssemblyMapper_map(superctgToplevelMapper, "NT_028392", 600000, 1000000, 1, superctgCs, 0, NULL); printCoords(coords); ok(testNum++, coords!=NULL); } // // Test list_seq_regions // Vector *seqRegions; int i; if (clnToplevelMapper) { seqRegions = TopLevelAssemblyMapper_listSeqRegions(clnToplevelMapper, "AL359765.6", 1, 13780, clnCs); ok(testNum++, seqRegions!=NULL && Vector_getNumElement(seqRegions) == 1 && 
!strcmp("20", Vector_getElementAt(seqRegions,0))); for (i=0;i<Vector_getNumElement(seqRegions); i++) { char *regionName = Vector_getElementAt(seqRegions, i); fprintf(stderr, "%s\n",regionName); } } if (superctgToplevelMapper) { seqRegions = TopLevelAssemblyMapper_listSeqRegions(superctgToplevelMapper, "NT_028392", 600000, 1000000, superctgCs); ok(testNum++, seqRegions!=NULL && Vector_getNumElement(seqRegions) == 1 && !strcmp("20", Vector_getElementAt(seqRegions,0))); for (i=0;i<Vector_getNumElement(seqRegions); i++) { char *regionName = Vector_getElementAt(seqRegions, i); fprintf(stderr, "%s\n",regionName); } } // // Test list_seq_ids // Vector *ids; if (clnToplevelMapper) { ids = TopLevelAssemblyMapper_listIds(clnToplevelMapper, "AL359765.6", 1, 13780, clnCs); ok(testNum++, ids!=NULL && Vector_getNumElement(ids) == 1 && *((IDType *)Vector_getElementAt(ids,0)) == 469283 ); for (i=0;i<Vector_getNumElement(ids); i++) { IDType id = *((IDType *)Vector_getElementAt(ids, i)); fprintf(stderr, IDFMTSTR"\n",id); } } if (superctgToplevelMapper) { ids = TopLevelAssemblyMapper_listIds(superctgToplevelMapper, "NT_028392", 600000, 1000000, superctgCs); ok(testNum++, ids!=NULL && Vector_getNumElement(ids) == 1 && *((IDType *)Vector_getElementAt(ids,0)) == 469283 ); for (i=0;i<Vector_getNumElement(ids); i++) { IDType id = *((IDType *)Vector_getElementAt(ids, i)); fprintf(stderr, IDFMTSTR"\n",id); } } // Test for a not implemented method // seqRegions = TopLevelAssemblyMapper_listContigIds(clnToplevelMapper, "AL359765.6", 1, 13780, 1); return 0; }
// Appends "<matchLen>M" to cig and returns the (possibly moved) string.
static char *GenomicAlignAdaptor_appendMatchRun(char *cig, int matchLen) {
  char piece[128];

  sprintf(piece, "%dM", matchLen);
  return StrUtil_appendString(cig, piece);
}

// Appends a gap run to cig: the bare op letter when gapLen is 1,
// "<gapLen><op>" otherwise. Returns the (possibly moved) string.
static char *GenomicAlignAdaptor_appendGapRun(char *cig, int gapLen, char op) {
  char piece[128];

  if (gapLen == 1) {
    piece[0] = op;
    piece[1] = '\0';
  } else {
    sprintf(piece, "%d%c", gapLen, op);
  }
  return StrUtil_appendString(cig, piece);
}

// Builds the GenomicAlign derived from alignA and alignB for one result
// region: consensus rcs..rce, query rqs..rqe (relative to alignB's query)
// and the given cigar string.
static GenomicAlign *GenomicAlignAdaptor_newDerivedAlign(GenomicAlignAdaptor *gaa, GenomicAlign *alignA, GenomicAlign *alignB,
                                                         char *cigar, int rcs, int rce, int rqs, int rqe) {
  int queryStrand = GenomicAlign_getQueryStrand(alignA) * GenomicAlign_getQueryStrand(alignB);
  int queryStart;
  int queryEnd;
  double score;
  double percId;
  GenomicAlign *ga;

  // Turn the relative query coordinates into coordinates on alignB's query
  if (queryStrand == 1) {
    queryStart = rqs + GenomicAlign_getQueryStart(alignB) - 1;
    queryEnd   = rqe + GenomicAlign_getQueryStart(alignB) - 1;
  } else {
    queryEnd   = GenomicAlign_getQueryEnd(alignB) - rqs + 1;
    queryStart = GenomicAlign_getQueryEnd(alignB) - rqe + 1;
  }

  // Score is the smaller of the two; percent id is the truncated product
  if (GenomicAlign_getScore(alignA) < GenomicAlign_getScore(alignB)) {
    score = GenomicAlign_getScore(alignA);
  } else {
    score = GenomicAlign_getScore(alignB);
  }
  percId = (int)(GenomicAlign_getPercentId(alignA)*GenomicAlign_getPercentId(alignB)/100.0);

  ga = GenomicAlign_new();
  GenomicAlign_setConsensusDNAFrag(ga, GenomicAlign_getConsensusDNAFrag(alignA));
  GenomicAlign_setQueryDNAFrag(ga, GenomicAlign_getQueryDNAFrag(alignB));
  GenomicAlign_setCigarString(ga, cigar);
  GenomicAlign_setConsensusStart(ga, rcs);
  GenomicAlign_setConsensusEnd(ga, rce);
  GenomicAlign_setQueryStrand(ga, queryStrand);
  GenomicAlign_setQueryStart(ga, queryStart);
  GenomicAlign_setQueryEnd(ga, queryEnd);
  GenomicAlign_setAdaptor(ga, (BaseAdaptor *)gaa);
  GenomicAlign_setPercentId(ga, percId);
  GenomicAlign_setScore(ga, score);

  return ga;
}

/*
 * Derives alignments from alignA and alignB and appends them to
 * mergedAligns. Each derived alignment takes its consensus DNA fragment from
 * alignA and its query DNA fragment from alignB.
 *
 * The cigar strings of both alignments are walked in step (via
 * GenomicAlignAdaptor_nextCig), and the overlap of their match regions is
 * accumulated into a result cigar. A jump on only one side becomes an
 * insertion or deletion in the result cigar; a jump on both sides closes the
 * current derived alignment and starts a new one.
 *
 * Nothing is returned - all results are added to mergedAligns.
 *
 * Variable name explanation:
 *   q - query    c - consensus   s - start   e - end   l - last
 *   o, ov - overlap   j - jump_in_   r - result
 */
void GenomicAlignAdaptor_addDerivedAlignments(GenomicAlignAdaptor *gaa, Vector *mergedAligns, GenomicAlign *alignA, GenomicAlign *alignB) {
  int qs, qe, lqe;
  int cs, ce, lce;
  int ocs, oce, oqs, oqe;
  int jc, jq;
  int ovs, ove;
  int rcs, rce, rqs, rqe;
  int currentMatch = 0;
  int newMatch;
  int cigAPos = 0;
  int cigBPos = 0;
  char *resultCig;

  // initialization phase
  Vector *cigA = CigarStrUtil_getPieces(GenomicAlign_getCigarString(alignA));
  Vector *cigB = CigarStrUtil_getPieces(GenomicAlign_getCigarString(alignB));

  if (GenomicAlign_getQueryStrand(alignA) == -1) {
    Vector_reverse(cigB);
  }

  // need a 'normalized' start for qs, qe, oxs so I dont
  // have to check strandedness all the time

  // consensus is strand 1 and is not compared to anything,
  // can keep its original coordinate system
  lce = GenomicAlign_getConsensusStart(alignA) - 1;
  ce  = lce;
  cs  = ce + 1;

  // alignBs query can be + or - just keep relative coords for now
  lqe = 0;
  qe  = 0;
  qs  = 1;

  // ocs will be found relative to oce and has to be comparable
  // to oqs. But it could be that we have to move downwards if we
  // are not - strand. thats why coordinates are transformed here
  if (GenomicAlign_getQueryStrand(alignA) == -1) {
    // query_end is first basepair of alignment
    if (GenomicAlign_getQueryEnd(alignA) < GenomicAlign_getConsensusEnd(alignB)) {
      oce = 0;
      ocs = 1;
      oqe = GenomicAlign_getConsensusEnd(alignB) - GenomicAlign_getQueryEnd(alignA);
      oqs = oqe + 1;
    } else {
      oqe = 0;
      oqs = 1;
      oce = GenomicAlign_getQueryEnd(alignA) - GenomicAlign_getConsensusEnd(alignB);
      ocs = oce + 1;
    }
  } else {
    // in theory no coordinate magic necessary :-)
    oqs = GenomicAlign_getQueryStart(alignA);
    oqe = oqs - 1;
    ocs = GenomicAlign_getConsensusStart(alignB);
    oce = ocs - 1;
  }

  // initializing result
  rcs = 0;
  rce = 0;
  rqs = 0;
  rqe = 0;
  resultCig = StrUtil_copyString(&resultCig, "", 0);

  for (;;) {
    // exit if you request a new piece of alignment and the cig list is
    // empty
    if (oce < ocs || oce < oqs) {
      // next M area in cigB
      if (cigBPos == Vector_getNumElement(cigB)) {
        break;
      }
      GenomicAlignAdaptor_nextCig(gaa, cigB, &cigBPos, &ocs, &oce, &qs, &qe);
      continue;
    }

    if (oqe < oqs || oqe < ocs) {
      // next M area in cigA
      if (cigAPos == Vector_getNumElement(cigA)) {
        break;
      }
      GenomicAlignAdaptor_nextCig(gaa, cigA, &cigAPos, &cs, &ce, &oqs, &oqe);
      continue;
    }

    // now matching region overlap in reference genome
    ovs = (ocs < oqs) ? oqs : ocs;
    ove = (oce < oqe) ? oce : oqe;

    // jumps relative to the last recorded positions (only meaningful once a
    // match is in progress)
    if (currentMatch) {
      jc = cs + (ovs - oqs) - lce - 1;
      jq = qs + (ovs - ocs) - lqe - 1;
    } else {
      jc = 0;
      jq = 0;
    }

    newMatch = ove - ovs + 1;

    if (jc == 0 && jq == 0) {
      // contiguous on both sides: just extend the current match
      currentMatch += newMatch;
    } else {
      // store current match
      resultCig = GenomicAlignAdaptor_appendMatchRun(resultCig, currentMatch);

      if (jc == 0) {
        // jq deletions
        resultCig = GenomicAlignAdaptor_appendGapRun(resultCig, jq, 'D');
      } else if (jq == 0) {
        // jc insertions
        resultCig = GenomicAlignAdaptor_appendGapRun(resultCig, jc, 'I');
      } else {
        // jump on both sides: new GA
        GenomicAlign *ga = GenomicAlignAdaptor_newDerivedAlign(gaa, alignA, alignB, resultCig, rcs, rce, rqs, rqe);

        Vector_addElement(mergedAligns, ga);

        rcs = 0;
        rce = 0;
        rqs = 0;
        rqe = 0;
        resultCig[0] = '\0';
      }

      currentMatch = newMatch;
    }

    if (!rcs) {
      rcs = cs + (ovs - oqs);
    }
    rce = cs + (ove - oqs);
    if (!rqs) {
      rqs = qs + (ovs - ocs);
    }
    rqe = qs + (ove - ocs);

    // update the last positions
    lce = rce;
    lqe = rqe;

    // next piece on the one that end earlier
    // (the second test deliberately sees the oce updated by the first step)
    if (oce <= oqe) {
      // next M area in cigB
      if (cigBPos == Vector_getNumElement(cigB)) {
        break;
      }
      GenomicAlignAdaptor_nextCig(gaa, cigB, &cigBPos, &ocs, &oce, &qs, &qe);
    }
    if (oce >= oqe) {
      // next M area in cigA
      if (cigAPos == Vector_getNumElement(cigA)) {
        break;
      }
      GenomicAlignAdaptor_nextCig(gaa, cigA, &cigAPos, &cs, &ce, &oqs, &oqe);
    }
  } // end of loop

  // if there is a last floating current match
  if (currentMatch) {
    GenomicAlign *ga;

    resultCig = GenomicAlignAdaptor_appendMatchRun(resultCig, currentMatch);

    // new GA
    ga = GenomicAlignAdaptor_newDerivedAlign(gaa, alignA, alignB, resultCig, rcs, rce, rqs, rqe);
    Vector_addElement(mergedAligns, ga);
  }

  free(resultCig);

  Vector_free(cigA);
  Vector_free(cigB);

  // nothing to return all in merged_aligns
}
// Also added a flag to indicate we actually want the gaps vector returned - quite often its not used in the caller and so would leak // memory Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, long rStart, long rEnd, int wantGaps) { // The following was commented out due to Ensembl Genomes requirements // for bacterial genomes. // The following was uncommented because I'm not caring about those requirements if ( start > end ) { fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end); exit(1); } if ( rStart > rEnd ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd); exit(1); } if ( rStart > start ) { fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start); exit(1); } if ( rEnd < end ) { fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end); exit(1); } IDHash *regReg = RangeRegistry_getRegistry(registry); Vector *list; if (IDHash_contains(regReg, id)) { list = IDHash_getValue(regReg, id); } else { list = Vector_new(); IDHash_add(regReg, id, list); } Vector *gapPairs = NULL; if (wantGaps) { gapPairs = Vector_new(); } int len = Vector_getNumElement(list); if (len == 0) { //this is the first request for this id, return a gap pair for the // entire range and register it as seen CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); return Vector_copy(list); } //#### // loop through the list of existing ranges recording any "gaps" where // the existing range does not cover part of the requested range // int startIdx = 0; int endIdx = Vector_getNumElement(list)-1; int midIdx; CoordPair *range; // binary search the relevant pairs // helps if the list is big while ( ( endIdx - startIdx ) > 1 ) { midIdx = ( startIdx + endIdx ) >> 1; range = Vector_getElementAt(list, midIdx); if ( CoordPair_getEnd(range) < rStart ) 
{ startIdx = midIdx; } else { endIdx = midIdx; } } long gapStart; long gapEnd; int rIdx = -1; int rStartIdx = -1; int rEndIdx; gapStart = rStart; int i; for (i=startIdx; i < len ; i++ ) { CoordPair *pRange = Vector_getElementAt(list, i); long pStart = CoordPair_getStart(pRange); long pEnd = CoordPair_getEnd(pRange); // no work needs to be done at all if we find a range pair that // entirely overlaps the requested region if ( pStart <= start && pEnd >= end ) { return Vector_new(); // perl returns undef, but that causes me problems } // find adjacent or overlapping regions already registered if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) { if ( rStartIdx < 0 ) { // Not yet been set rStartIdx = i; } rEndIdx = i; } if ( pStart > rStart ) { gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1; if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, gapEnd); Vector_addElement(gapPairs, cp); } } gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1; if ( pEnd >= rEnd && rIdx < 0 ) { rIdx = i; break; } } // do we have to make another gap? 
if ( gapStart <= rEnd ) { if (wantGaps) { CoordPair *cp = CoordPair_new(gapStart, rEnd); Vector_addElement(gapPairs, cp); } } // // Merge the new range into the registered list // if (rStartIdx >= 0 ) { // rStartIdx has been set to something long newStart; long newEnd; CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); CoordPair *rEndIdxRange = Vector_getElementAt(list, rEndIdx); if ( rStart < CoordPair_getStart(rStartIdxRange)) { newStart = rStart; } else { newStart = CoordPair_getStart(rStartIdxRange); } if ( rEnd > CoordPair_getEnd(rEndIdxRange)) { newEnd = rEnd; } else { newEnd = CoordPair_getEnd(rEndIdxRange); } CoordPair *cp = CoordPair_new(newStart, newEnd); // Think its <= for (i=rStartIdx; i<=rEndIdx; i++) { Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called } Vector_insertElementAt(list, rStartIdx, cp); //splice( @$list, $rstart_idx, // $rend_idx - $rstart_idx + 1, // [ $new_start, $new_end ] ); } else if (rIdx >= 0) { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_insertElementAt(list, rIdx, cp); //splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); } else { CoordPair *cp = CoordPair_new(rStart, rEnd); Vector_addElement(list, cp); } // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK return gapPairs; }
/*
 * Returns the number of positions of start..end that are covered by the
 * ranges registered for id.
 *
 * Returns 0 when start > end, when nothing is registered for id, or when the
 * registered list is unexpectedly empty (a message is printed in that case).
 */
long RangeRegistry_overlapSize(RangeRegistry *registry, IDType id, long start, long end) {
  long    covered = 0;
  IDHash *regReg;
  Vector *ranges;
  int     nRange;
  int     lo;
  int     hi;
  int     idx;

  if (start > end) {
    return 0;
  }

  regReg = RangeRegistry_getRegistry(registry);

  // No list for this id, so can't be any overlap
  if (!IDHash_contains(regReg, id)) {
    return 0;
  }
  ranges = IDHash_getValue(regReg, id);

  nRange = Vector_getNumElement(ranges);
  if (nRange == 0) {
    fprintf(stderr, "Odd have zero length list in RangeRegistry_overlapSize\n");
    return 0;
  }

  // binary search the relevant pairs
  // helps if the list is big
  lo = 0;
  hi = Vector_getNumElement(ranges) - 1;
  while (hi - lo > 1) {
    int        mid   = (lo + hi) >> 1;
    CoordPair *probe = Vector_getElementAt(ranges, mid);

    if (CoordPair_getEnd(probe) < start) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  for (idx = lo; idx < nRange; idx++) {
    CoordPair *pair   = Vector_getElementAt(ranges, idx);
    long       pStart = CoordPair_getStart(pair);
    long       pEnd   = CoordPair_getEnd(pair);
    long       mStart;
    long       mEnd;

    // This range starts after the region of interest - stop scanning
    if (pStart > end) {
      break;
    }

    // A single range covers the whole region
    if (pStart <= start && pEnd >= end) {
      covered = end - start + 1;
      break;
    }

    // Clip the range to the region and count what is left, if anything
    mStart = (pStart > start) ? pStart : start;
    mEnd   = (pEnd < end) ? pEnd : end;

    if (mEnd >= mStart) {
      covered += mEnd - mStart + 1;
    }
  }

  return covered;
}
/*
 * Stores all GenomicAligns in genomicAligns into genomic_align_block using a
 * single multi-row INSERT statement.
 *
 * Nothing is written to the database when:
 *   - the scratch buffer cannot be allocated,
 *   - any consensus or query DNAFrag has no dbID,
 *   - any alignment type has no method_link id,
 *   - a formatted row does not fit in the scratch buffer, or
 *   - genomicAligns is empty.
 * Each of the error cases is reported on stderr.
 */
void GenomicAlignAdaptor_store(GenomicAlignAdaptor *gaa, Vector *genomicAligns) {
  // Size of the scratch buffer holding one formatted VALUES tuple
  const size_t tmpStrSize = 65556;
  char *qStr   = NULL;
  char *tmpStr = NULL;
  // Empty for the first tuple, "," for every following one
  char  commaStr[2] = {'\0','\0'};
  int   ok = 1;
  int   i;

  if ((tmpStr = calloc(tmpStrSize, sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating tmpStr\n");
    return;
  }

  int nAlign = Vector_getNumElement(genomicAligns);

  // check that everything has dbIDs
  for (i=0; i<nAlign; i++) {
    GenomicAlign *ga = Vector_getElementAt(genomicAligns,i);
    DNAFrag *consDNAFrag  = GenomicAlign_getConsensusDNAFrag(ga);
    DNAFrag *queryDNAFrag = GenomicAlign_getQueryDNAFrag(ga);

    if (!DNAFrag_getDbID(consDNAFrag) || !DNAFrag_getDbID(queryDNAFrag)) {
      fprintf(stderr, "Error: dna_fragment in GenomicAlign is not in DB\n");
      ok = 0;
      break;
    }
  }

  if (ok) {
    StrUtil_copyString(&qStr, "INSERT INTO genomic_align_block"
                       " (consensus_dnafrag_id, consensus_start, consensus_end,"
                       " query_dnafrag_id, query_start, query_end, query_strand, method_link_id,"
                       " score, perc_id, cigar_line) VALUES ",0);

    // all clear for storing - append one VALUES tuple per align
    for (i=0; i<nAlign; i++) {
      GenomicAlign *ga = Vector_getElementAt(genomicAligns,i);
      DNAFrag *consDNAFrag  = GenomicAlign_getConsensusDNAFrag(ga);
      DNAFrag *queryDNAFrag = GenomicAlign_getQueryDNAFrag(ga);
      IDType methodLinkId   = GenomicAlignAdaptor_methodLinkIdByAlignmentType(gaa, GenomicAlign_getAlignmentType(ga));

      if (!methodLinkId) {
        fprintf(stderr, "Error: There is no method_link with this type [%s] in the DB.\n",
                GenomicAlign_getAlignmentType(ga));
        ok = 0;
        break;
      }

      int len = snprintf(tmpStr, tmpStrSize,
                         " %s(" IDFMTSTR ", %d, %d, " IDFMTSTR ", %d, %d, %d, " IDFMTSTR ", %f, %f, '%s')",
                         commaStr,
                         DNAFrag_getDbID(consDNAFrag),
                         GenomicAlign_getConsensusStart(ga),
                         GenomicAlign_getConsensusEnd(ga),
                         DNAFrag_getDbID(queryDNAFrag),
                         GenomicAlign_getQueryStart(ga),
                         GenomicAlign_getQueryEnd(ga),
                         GenomicAlign_getQueryStrand(ga),
                         methodLinkId,
                         GenomicAlign_getScore(ga),
                         GenomicAlign_getPercentId(ga),
                         GenomicAlign_getCigarString(ga));

      // A truncated tuple would produce broken SQL, so refuse to store at all
      if (len < 0 || (size_t)len >= tmpStrSize) {
        fprintf(stderr, "Error: GenomicAlign row too long to store\n");
        ok = 0;
        break;
      }

      qStr = StrUtil_appendString(qStr, tmpStr);
      commaStr[0] = ',';
    }
  }

  // Only execute a complete statement that has at least one VALUES tuple
  if (ok && nAlign > 0) {
    StatementHandle *sth = gaa->prepare((BaseAdaptor *)gaa, qStr, strlen(qStr));

    sth->execute(sth);
    sth->finish(sth);
  }

  free(qStr);
  free(tmpStr);
}
int main(int argc, char *argv[]) { DBAdaptor * dba; StatementHandle *sth; ResultRow * row; Vector * slices; int nSlices; htsFile * out; int argNum = 1; char *inFName = NULL; char *outFName = NULL; char *dbUser = "******"; char *dbPass = NULL; int dbPort = 3306; char *dbHost = "ens-staging.internal.sanger.ac.uk"; char *dbName = "homo_sapiens_core_71_37"; char *assName = "GRCh37"; char *chrName = "1"; int flags = 0; int threads = 1; initEnsC(argc, argv); while (argNum < argc) { char *arg = argv[argNum]; char *val; // Ones without a val go here if (!strcmp(arg, "-U") || !strcmp(arg,"--ucsc_naming")) { flags |= M_UCSC_NAMING; } else { // Ones with a val go in this block if (argNum == argc-1) { Bamcov_usage(); } val = argv[++argNum]; if (!strcmp(arg, "-i") || !strcmp(arg,"--in_file")) { StrUtil_copyString(&inFName,val,0); } else if (!strcmp(arg, "-o") || !strcmp(arg,"--out_file")) { StrUtil_copyString(&outFName,val,0); } else if (!strcmp(arg, "-h") || !strcmp(arg,"--host")) { StrUtil_copyString(&dbHost,val,0); } else if (!strcmp(arg, "-p") || !strcmp(arg,"--password")) { StrUtil_copyString(&dbPass,val,0); } else if (!strcmp(arg, "-P") || !strcmp(arg,"--port")) { dbPort = atoi(val); } else if (!strcmp(arg, "-n") || !strcmp(arg,"--name")) { StrUtil_copyString(&dbName,val,0); } else if (!strcmp(arg, "-u") || !strcmp(arg,"--user")) { StrUtil_copyString(&dbUser,val,0); } else if (!strcmp(arg, "-t") || !strcmp(arg,"--threads")) { threads = atoi(val); } else if (!strcmp(arg, "-a") || !strcmp(arg,"--assembly")) { StrUtil_copyString(&assName,val,0); } else if (!strcmp(arg, "-v") || !strcmp(arg,"--verbosity")) { verbosity = atoi(val); // Temporary } else if (!strcmp(arg, "-c") || !strcmp(arg,"--chromosome")) { StrUtil_copyString(&chrName,val,0); } else { fprintf(stderr,"Error in command line at %s\n\n",arg); Bamcov_usage(); } } argNum++; } if (verbosity > 0) { printf("Program for calculating read coverage in a BAM file \n" "Steve M.J. Searle. 
[email protected] Last update April 2013.\n"); } if (!inFName || !outFName) { Bamcov_usage(); } dba = DBAdaptor_new(dbHost,dbUser,dbPass,dbName,dbPort,NULL); //nSlices = getSlices(dba, destName); nSlices = 1; slices = Vector_new(); SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba); Slice *slice = SliceAdaptor_fetchByRegion(sa,NULL,chrName,POS_UNDEF,POS_UNDEF,1,NULL, 0); Vector_addElement(slices,slice); if (Vector_getNumElement(slices) == 0) { fprintf(stderr, "Error: No slices.\n"); exit(1); } htsFile *in = hts_open(inFName, "rb"); if (in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", inFName); return 1; } hts_set_threads(in, threads); hts_idx_t *idx; idx = bam_index_load(inFName); // load BAM index if (idx == 0) { fprintf(stderr, "BAM index file is not available.\n"); return 1; } int i; for (i=0; i<Vector_getNumElement(slices); i++) { Slice *slice = Vector_getElementAt(slices,i); if (verbosity > 0) printf("Working on '%s'\n",Slice_getName(slice)); // if (verbosity > 0) printf("Stage 1 - retrieving annotation from database\n"); // Vector *genes = getGenes(slice, flags); if (verbosity > 0) printf("Stage 1 - calculating coverage\n"); calcCoverage(inFName, slice, in, idx, flags); } hts_idx_destroy(idx); hts_close(in); if (verbosity > 0) printf("Done\n"); return 0; }
/*
 * Calculates per-base read coverage over slice from an opened, indexed BAM
 * file and prints it to stdout as "<1-based position> <coverage>" lines,
 * preceded by a summary line with the read counts.
 *
 *   fName  name of the BAM file (only used in error messages)
 *   slice  region to calculate coverage for; must start at position 1
 *   in     opened BAM file
 *   idx    index loaded for the BAM file
 *   flags  if M_UCSC_NAMING is set the region name is prefixed with "chr"
 *
 * Reads flagged unmapped, secondary, QC-fail or duplicate are counted as bad
 * and skipped. Match (M, =, X) and deletion blocks add to the coverage.
 *
 * Returns 1 both when the slice does not start at 1 and after a normal run
 * (the original return values are kept). Exits the program if the BAM header
 * cannot be read, the region is unknown or unparsable, the iterator cannot
 * be created, or the coverage array cannot be allocated.
 *
 * The sections guarded by DONE are only compiled when DONE is defined; they
 * use genes, gs and geneCountsHash, which are not declared in this function.
 */
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  char region[1024];
  char region_name[512];

  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  }

  // Sequence name as it should appear in the BAM header
  if (flags & M_UCSC_NAMING) {
    snprintf(region_name, sizeof(region_name), "chr%s", Slice_getSeqRegionName(slice));
  } else {
    snprintf(region_name, sizeof(region_name), "%s", Slice_getSeqRegionName(slice));
  }

  bam_hdr_t *header = bam_hdr_read(in->fp.bgzf);
  if (header == NULL) {
    fprintf(stderr, "Failed reading BAM header from %s\n", fName);
    exit(1);
  }

  ref = bam_name2id(header, region_name);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region_name);
    exit(1);
  }

  snprintf(region, sizeof(region), "%s:%ld-%ld", region_name,
           Slice_getSeqRegionStart(slice),
           Slice_getSeqRegionEnd(slice));

  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);
    exit(2);
  }
  bam_hdr_destroy(header);

  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  if (iter == NULL) {
    fprintf(stderr, "Failed creating iterator for %s\n", region);
    exit(1);
  }

  bam1_t *b = bam_init1();

  // One counter per base of the slice; index 0 is slice position 1
  long sliceLen = Slice_getLength(slice);
  Coverage *coverage = calloc(sliceLen, sizeof(Coverage));
  if (coverage == NULL) {
    fprintf(stderr, "Failed allocating coverage array\n");
    exit(1);
  }

  long counter = 0;
  long overlapping = 0;
  long bad = 0;
  int startIndex = 0;

  while (bam_itr_next(in, iter, b) >= 0) {
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {
      bad++;
      continue;
    }

    int end;
    //end = bam_calend(&b->core, bam1_cigar(b));
    end = bam_endpos(b);

    // There is a special case for reads which have zero length and start at begRange
    // (so end at begRange ie. before the first base we're interested in).
    // Those reads are skipped here.
    if (end == begRange) {
      continue;
    }
    counter++;

    if (!(counter%1000000)) {
      if (verbosity > 1) {
        printf(".");
      }
      fflush(stdout);
    }

    // Remember: b->core.pos is zero based!
    int cigInd;
    int refPos;
    int readPos;
    uint32_t *cigar = bam_get_cigar(b);

    for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) {
      int k;
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
        for (k = 0; k < lenCigBlock; ++k) {
          if (refPos+k >= sliceLen) break; // out of boundary
          coverage[refPos+k].coverage++;
        }
        // Stop processing this read once it runs off the end of the slice
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
        readPos += lenCigBlock;
      } else if (op == BAM_CDEL) {
        for (k = 0; k < lenCigBlock; ++k) {
          if (refPos+k >= sliceLen) break; // out of boundary
          coverage[refPos+k].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
      } else if (op == BAM_CSOFT_CLIP) {
        readPos += lenCigBlock;
      } else if (op == BAM_CHARD_CLIP) {
      } else if (op == BAM_CINS) {
        readPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
        refPos += lenCigBlock;
      }
    }

#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;

    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j);
      if (!gene) {
        continue;
      }
      // Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;

            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  overlapping++;
                  hadOverlap = 1;
                }

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                gs->score++;

                doneGene = 1;
              }
            }
          }
        }
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene),
                                   Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "",
                                   gs->score);

        if (verbosity > 1) {
          printf("Removing gene %s (index %d) with extent %d to %d\n",
                 Gene_getStableId(gene),
                 gs->index,
                 Gene_getStart(gene),
                 Gene_getEnd(gene));
        }

        Vector_setElementAt(genes,j,NULL);

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
            break;
          }
          startIndex++;
        }
        if (verbosity > 1) {
          printf("startIndex now %d\n",startIndex);
        }
      }
    }
#endif
  }
  if (verbosity > 1) {
    printf("\n");
  }

#ifdef DONE
  // Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene),
                                 Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "",
                                 gs->score);
    }
  }
#endif

  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  long i;
  for (i=0; i< sliceLen; i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);
  }

  free(coverage);
  sam_itr_destroy(iter);
  bam_destroy1(b);

  return 1;
}
/*
 * Fetches all prediction transcripts on slice (optionally restricted to
 * logicName) and, when loadExons is set and there are at least two
 * transcripts, preloads their exons with one query instead of leaving them
 * to be lazy loaded one transcript at a time.
 *
 * Returns the Vector produced by BaseFeatureAdaptor_fetchAllBySlice. If the
 * query buffer cannot be allocated the transcripts are returned without
 * preloaded exons. Exits the program if an exon cannot be transferred onto
 * the requested slice.
 */
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {
  //my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name);
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  long minStart =  2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    long tStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    long tEnd   = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);

    if (tStart < minStart) {
      minStart = tStart;
    }
    if (tEnd > maxEnd) {
      maxEnd = tEnd;
    }
  }

  // Use the original slice if it spans all transcripts, otherwise fetch one that does
  Slice *extSlice;

  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice),
                                          minStart, maxEnd, Slice_getStrand(slice),
                                          CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0);
  }

  // associate exon identifiers with transcripts
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(trHash);
  int     nIds      = IDHash_getNumValues(trHash);

  static const char selectStr[] = "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE prediction_transcript_id IN (";

  // Size the query for the actual number of ids: per id allow for a 64 bit
  // value with sign (21 chars) plus the ", " separator, and add room for the
  // closing bracket and terminator (sizeof already counts one terminator).
  size_t qStrSize = sizeof(selectStr) + (size_t)nIds * 24 + 2;
  char  *qStr     = calloc(qStrSize, sizeof(char));

  if (qStr == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    free(uniqueIds);
    IDHash_free(trHash, NULL);
    return transcripts;
  }

  int endPoint = sprintf(qStr, "%s", selectStr);
  for (i=0; i<nIds; i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    endPoint += sprintf(&(qStr[endPoint]), IDFMTSTR, uniqueIds[i]);
  }
  qStr[endPoint++] = ')';
  qStr[endPoint]   = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  // Map each exon id to the list of (transcript, rank) pairs it belongs to
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  IDHash_free(trHash, NULL);

  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

    // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
// Aborts the program if formatting a query failed or the query did not fit
// in its buffer, so that a truncated statement is never sent to the database
static void DBEntryAdaptor_checkQueryLen(int len, size_t bufSize) {
  if (len < 0 || (size_t)len >= bufSize) {
    fprintf(stderr,"Error: query too long for buffer in DBEntryAdaptor_store\n");
    exit(1);
  }
}

/*
 * Stores the external reference exObj and links it to the ensembl object
 * ensObject of type ensType.
 *
 * Steps:
 *   - look up the external_db row for the entry's db name and release
 *     (the program exits if it does not exist),
 *   - look up the xref, inserting it and its synonyms if it is not present,
 *   - insert the object_xref row if that mapping is not present, setting the
 *     dbID and adaptor of exObj, and insert the identity_xref row if exObj
 *     has an IdentityXref.
 *
 * ignoreRelease is not implemented yet (a message saying so is printed on
 * every call).
 *
 * Returns the xref id. The program exits if a query does not fit in the
 * query buffer.
 */
IDType DBEntryAdaptor_store(DBEntryAdaptor *dbea, DBEntry *exObj, IDType ensObject, char *ensType, int ignoreRelease) {
  fprintf(stderr,"DBEntryAdaptor_store does not implement ignoreRelease functionality yet\n");
  char qStr[4096];
  int  qLen;
  StatementHandle *sth;
  ResultRow *row;
  IDType dbRef;
  IDType dbX;

  //
  // Check for the existance of the external_db, throw if it does not exist
  //
  qLen = snprintf(qStr, sizeof(qStr),
                  "SELECT external_db_id"
                  "  FROM external_db"
                  " WHERE db_name = '%s'"
                  "   AND db_release = %s",
                  DBEntry_getDbName(exObj),
                  DBEntry_getRelease(exObj));
  DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);

  row = sth->fetchRow(sth);
  if( row == NULL ) {
    sth->finish(sth);
    fprintf(stderr,"Error: external_db [%s] release [%s] does not exist\n",
            DBEntry_getDbName(exObj), DBEntry_getRelease(exObj));
    exit(1);
  }

  dbRef =  row->getLongLongAt(row,0);
  sth->finish(sth);

  //
  // Check for the existance of the external reference, add it if not present
  //
  qLen = snprintf(qStr, sizeof(qStr),
                  "SELECT xref_id"
                  "  FROM xref"
                  " WHERE external_db_id = " IDFMTSTR
                  "   AND dbprimary_acc = '%s'"
                  "   AND version = %s",
                  dbRef,
                  DBEntry_getPrimaryId(exObj),
                  DBEntry_getVersion(exObj));
  DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);

  row = sth->fetchRow(sth);

  if (row != NULL) {
    dbX =  row->getLongLongAt(row,0);
    sth->finish(sth);
  } else {
    //
    // store the new xref
    //

    // First finish the old sth
    sth->finish(sth);

    // NIY Handling NULL values
    qLen = snprintf(qStr, sizeof(qStr),
                    "INSERT ignore INTO xref"
                    " SET dbprimary_acc = '%s',"
                    "    display_label = '%s',"
                    "    version = %s,"
                    "    description = '%s',"
                    "    external_db_id = " IDFMTSTR,
                    DBEntry_getPrimaryId(exObj),
                    DBEntry_getDisplayId(exObj),
                    DBEntry_getVersion(exObj),
                    DBEntry_getDescription(exObj),
                    dbRef
                   );
    DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

    sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
    sth->execute(sth);

    dbX = sth->getInsertId(sth);
    sth->finish(sth);

    //
    // store the synonyms for the new xref
    //
    if (DBEntry_getAllSynonyms(exObj)) {
      StatementHandle *checkSth;
      StatementHandle *storeSth;
      int i;
      Vector *synonyms;

      // The doubled % signs leave printf style placeholders in the prepared
      // text; the values (dbX, syn) are passed to execute() below
      qLen = snprintf(qStr, sizeof(qStr),
                      "SELECT xref_id, synonym"
                      " FROM external_synonym"
                      " WHERE xref_id = %" IDFMTSTR
                      " AND synonym = '%%s'");
      DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

      checkSth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

      qLen = snprintf(qStr, sizeof(qStr),
                      "INSERT ignore INTO external_synonym"
                      " SET xref_id = %" IDFMTSTR ", synonym = '%%s'");
      DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

      storeSth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

      synonyms = DBEntry_getAllSynonyms(exObj);

      for (i=0;i<Vector_getNumElement(synonyms); i++) {
        char *syn = Vector_getElementAt(synonyms,i);
        checkSth->execute(checkSth, dbX, syn);
        row = checkSth->fetchRow(checkSth);
        if (!row) {
          storeSth->execute(storeSth, dbX, syn);
        }
      }

      checkSth->finish(checkSth);
      storeSth->finish(storeSth);
    }
  }

  //
  // check if the object mapping was already stored
  //
  qLen = snprintf(qStr, sizeof(qStr),
                  "SELECT xref_id"
                  " FROM object_xref"
                  " WHERE xref_id = " IDFMTSTR
                  " AND   ensembl_object_type = '%s'"
                  " AND   ensembl_id = " IDFMTSTR,
                  dbX, ensType, ensObject);
  DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);

  row = sth->fetchRow(sth);
  // NOTE row will be invalid after this call but will still
  //      indicate whether something was found
  sth->finish(sth);

  if (!row) {
    IDType Xidt;

    //
    // Store the reference to the internal ensembl object
    //
    qLen = snprintf(qStr, sizeof(qStr),
                    "INSERT ignore INTO object_xref"
                    " SET xref_id = " IDFMTSTR ","
                    "     ensembl_object_type = '%s',"
                    "     ensembl_id = " IDFMTSTR,
                    dbX, ensType, ensObject);
    DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

    sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
    sth->execute(sth);

    DBEntry_setDbID(exObj, dbX);
    DBEntry_setAdaptor(exObj, (BaseAdaptor *)dbea);

    Xidt = sth->getInsertId(sth);

    // Done with the object_xref insert; finish it before sth is reused
    sth->finish(sth);

    //
    // If this is an IdentityXref need to store in that table too
    //
    if (DBEntry_getIdentityXref(exObj)) {
      IdentityXref *idx = DBEntry_getIdentityXref(exObj);
      qLen = snprintf(qStr, sizeof(qStr),
                      "INSERT ignore INTO identity_xref"
                      " SET object_xref_id = " IDFMTSTR ","
                      "     query_identity = %f,"
                      "     target_identity = %f",
                      Xidt,
                      IdentityXref_getQueryIdentity(idx),
                      IdentityXref_getTargetIdentity(idx));
      DBEntryAdaptor_checkQueryLen(qLen, sizeof(qStr));

      sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
      sth->execute(sth);
      sth->finish(sth);
    }
  }

  return dbX;
}
int main(int argc, char *argv[]) { DBAdaptor *dba; GeneAdaptor *ga; Slice *slice = NULL; Vector *genes = NULL; int i = 0; int failed = 0; initEnsC(argc, argv); // ProcUtil_showBacktrace(EnsC_progName); dba = Test_initROEnsDB(); slice = Test_getStandardSlice(dba); // DBAdaptor *seqdba = DBAdaptor_new("genebuild6.internal.sanger.ac.uk","ensadmin","ensembl","steve_chicken_rnaseq_missing_reference",3306,NULL); // dba = DBAdaptor_new("genebuild1.internal.sanger.ac.uk","ensadmin","ensembl","steve_chicken_rnaseq_missing_refined",3306,seqdba); ok(1, slice!=NULL); ga = DBAdaptor_getGeneAdaptor(dba); SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba); ok(2, ga!=NULL); slice = SliceAdaptor_fetchByRegion(sa,"chromosome","20",10000000,50000000,1,NULL,0); // slice = SliceAdaptor_fetchByRegion(sa,"chromosome","17",1000000,5000000,1,NULL,0); // slice = SliceAdaptor_fetchByRegion(sa,"chromosome","17",1,5000000,1,NULL,0); // Has a seleno // slice = SliceAdaptor_fetchByRegion(sa,"chromosome","1",1000000,27000000,1,NULL,0); // slice = SliceAdaptor_fetchByRegion(sa,"chromosome","MT",1,17000,1,NULL,0); genes = Slice_getAllGenes(slice, NULL, NULL, 1, NULL, NULL); fprintf(stdout, "Have %d genes\n", Vector_getNumElement(genes)); ok(3, genes!=NULL); ok(4, Vector_getNumElement(genes)!=0); failed = dumpGenes(genes, 1); ok(5, !failed); //Vector *toplevelSlices = SliceAdaptor_fetchAll(sa, "toplevel", NULL, 0); Vector *toplevelSlices = SliceAdaptor_fetchAll(sa, "chromosome", NULL, 0); for (i=0;i<Vector_getNumElement(toplevelSlices) && !failed;i++) { Slice *tlSlice = Vector_getElementAt(toplevelSlices, i); fprintf(stderr, "Slice %s\n", Slice_getName(tlSlice)); genes = Slice_getAllGenes(tlSlice, NULL, NULL, 1, NULL, NULL); fprintf(stderr, "Got %d genes on %s\n", Vector_getNumElement(genes), Slice_getName(tlSlice)); failed = dumpGenes(genes, 0); } //tc_malloc_stats(); fprintf(stderr,"\nEcostring table stats:\n"); EcoString_getInfo(ecoSTable); fprintf(stderr,"\n"); ProcUtil_timeInfo("at end of 
GeneTest"); return 0; }
/*
 * Writes a textual dump of genes to stderr: for every gene its transcripts,
 * their exons and, for transcripts with a translation, the translation
 * coordinates, the translated sequence and the translation attributes.
 *
 * If withSupport is non zero the supporting features and intron supporting
 * evidence of each transcript and the supporting features of each exon are
 * dumped as well.
 *
 * Returns the failed flag, which nothing in this function sets, so the
 * result is always 0.
 */
int dumpGenes(Vector *genes, int withSupport) {
  FILE *fp = stderr;
  int i;
  int failed = 0;

  for (i=0;i<Vector_getNumElement(genes) && !failed;i++) {
    Gene *g = Vector_getElementAt(genes,i);

    fprintf(fp,"Gene %s (%s) coords: %ld %ld %d\n",
            Gene_getStableId(g),
            (Gene_getDisplayXref(g) ? DBEntry_getDisplayId(Gene_getDisplayXref(g)) : ""),
            Gene_getStart(g),
            Gene_getEnd(g),
            Gene_getStrand(g));

    int j;
    for (j=0;j<Gene_getTranscriptCount(g);j++) {
      Transcript *t = Gene_getTranscriptAt(g,j);
      int k;

      fprintf(fp," Trans %s coords: %ld %ld %d biotype: %s\n",
              Transcript_getStableId(t),
              Transcript_getStart(t),
              Transcript_getEnd(t),
              Transcript_getStrand(t),
              Transcript_getBiotype(t));

      if (withSupport) {
        Vector *support = Transcript_getAllSupportingFeatures(t);
        for (k=0; k<Vector_getNumElement(support); k++) {
          BaseAlignFeature *baf = Vector_getElementAt(support, k);
          fprintf(fp," support %s coords: %ld %ld %d\n",
                  BaseAlignFeature_getHitSeqName(baf),
                  BaseAlignFeature_getStart(baf),
                  BaseAlignFeature_getEnd(baf),
                  BaseAlignFeature_getStrand(baf));
        }

        Vector *intronSupport = Transcript_getAllIntronSupportingEvidence(t);
        for (k=0; k<Vector_getNumElement(intronSupport); k++) {
          IntronSupportingEvidence *ise = Vector_getElementAt(intronSupport, k);
          fprintf(fp," intron support %s coords: %ld %ld %d\n",
                  IntronSupportingEvidence_getHitName(ise),
                  IntronSupportingEvidence_getStart(ise),
                  IntronSupportingEvidence_getEnd(ise),
                  IntronSupportingEvidence_getStrand(ise));
        }
      }

      for (k=0;k<Transcript_getExonCount(t);k++) {
        Exon *e = Transcript_getExonAt(t,k);

        // %p requires a void pointer argument
        fprintf(fp," exon %s (%p) coords: %ld %ld %d\n",
                Exon_getStableId(e),
                (void *)e,
                Exon_getStart(e),
                Exon_getEnd(e),
                Exon_getStrand(e));

        if (withSupport) {
          Vector *support = Exon_getAllSupportingFeatures(e);
          int m;
          for (m=0; m<Vector_getNumElement(support); m++) {
            BaseAlignFeature *baf = Vector_getElementAt(support, m);
            fprintf(fp," support %s coords: %ld %ld %d\n",
                    BaseAlignFeature_getHitSeqName(baf),
                    BaseAlignFeature_getStart(baf),
                    BaseAlignFeature_getEnd(baf),
                    BaseAlignFeature_getStrand(baf));
          }
        }
      }

      Translation *tln = Transcript_getTranslation(t);
      if (tln) {
        fprintf(fp," translation id: %s %s %d %s %d\n",
                Translation_getStableId(tln),
                Exon_getStableId(Translation_getStartExon(tln)),
                Translation_getStart(tln),
                Exon_getStableId(Translation_getEndExon(tln)),
                Translation_getEnd(tln));

        // The translated sequence is freed here once it has been printed
        char *tSeq = Transcript_translate(t);
        fprintf(fp," translation: %s\n",tSeq);
        free(tSeq);

        Vector *tlnAttribs = Translation_getAllAttributes(tln, NULL);
        if (Vector_getNumElement(tlnAttribs)) {
          fprintf(fp, " translation attributes:\n");
          int n;
          for (n=0; n<Vector_getNumElement(tlnAttribs); n++) {
            Attribute *attrib = Vector_getElementAt(tlnAttribs, n);
            fprintf(fp, " code %s name %s desc %s value %s\n",
                    Attribute_getCode(attrib),
                    Attribute_getName(attrib),
                    Attribute_getDescription(attrib),
                    Attribute_getValue(attrib));
          }
        }
      }
    }
  }
  return failed;
}
/*
 * Merges two sets of GenomicAligns.
 *
 * The query coordinates of alignSet1 and the consensus coordinates of
 * alignSet2 are put into one list as start and end points (ends are entered
 * as end+0.5), which is sorted with GenomicAlignListElem_compFunc. The list
 * is then walked while tracking, per set, which aligns are currently open;
 * whenever an align is opened, GenomicAlignAdaptor_addDerivedAlignments is
 * called for it with every align currently open in the other set. The first
 * align passed is always the one from alignSet1.
 *
 * Returns a new Vector holding whatever addDerivedAlignments put into it.
 */
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  int i;
  Vector *bigList = Vector_new();
  IDHash *overlappingSets[2];
  Vector *mergedAligns;

  // Start and end points of set 1 in query coordinates
  for (i=0;i<Vector_getNumElement(alignSet1); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet1, i);
    IDType fragId = DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align));

    Vector_addElement(bigList, GenomicAlignListElem_new(fragId,
                                                        GenomicAlign_getQueryStart(align), align, 0));
    Vector_addElement(bigList, GenomicAlignListElem_new(fragId,
                                                        GenomicAlign_getQueryEnd(align)+0.5, align, 0));
  }

  // Start and end points of set 2 in consensus coordinates
  for (i=0;i<Vector_getNumElement(alignSet2); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet2, i);
    IDType fragId = DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align));

    Vector_addElement(bigList, GenomicAlignListElem_new(fragId,
                                                        GenomicAlign_getConsensusStart(align), align, 1));
    Vector_addElement(bigList, GenomicAlignListElem_new(fragId,
                                                        GenomicAlign_getConsensusEnd(align)+0.5, align, 1));
  }

  Vector_sort(bigList, GenomicAlignListElem_compFunc);

  // walking from start to end through sortlist and keep track of the
  // currently overlapping set of Alignments
  overlappingSets[0] = IDHash_new(IDHASH_SMALL);
  overlappingSets[1] = IDHash_new(IDHASH_SMALL);

  mergedAligns = Vector_new();

  for (i=0; i<Vector_getNumElement(bigList); i++) {
    GenomicAlignListElem *gale  = Vector_getElementAt(bigList,i);
    GenomicAlign *align = gale->align;
    IDType alignID      = GenomicAlign_getDbID(align);
    int setNo           = gale->setNum;

    if (IDHash_contains(overlappingSets[setNo], alignID)) {
      // Second time this align is seen (its end point):
      // remove from current overlapping set
      IDHash_remove(overlappingSets[setNo], alignID, NULL);
    } else {
      int j;
      IDHash *otherSet = overlappingSets[1-setNo];
      int     nOther   = IDHash_getNumValues(otherSet);
      void  **values   = IDHash_getValues(otherSet);

      // insert into the set and do all the overlap business
      IDHash_add(overlappingSets[setNo], alignID, align);

      // the other set contains everything this align overlaps with
      for (j=0; j<nOther; j++) {
        GenomicAlign *align2 = values[j];
        if (setNo == 0) {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align, align2);
        } else {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align2, align);
        }
      }
      free(values);
    }
  }

  // The tracking hashes only refer to aligns owned by the input sets, so
  // they are freed without a value free function
  IDHash_free(overlappingSets[0], NULL);
  IDHash_free(overlappingSets[1], NULL);

  // NIY Free gale
  // NOTE(review): no free function is set on bigList here, so this is assumed
  // to release only the list itself; the GenomicAlignListElems still need
  // freeing with whatever matches GenomicAlignListElem_new - TODO confirm
  Vector_free(bigList);

  return mergedAligns;
}