Ejemplo n.º 1
// Test driver for the CigarStrUtil helpers. Splits cigar1 into pieces, then
// passes a string through CigarStrUtil_reverse and splits the result again,
// checking each step with ok(). The process exit status is the sum of the
// values returned by the ok() calls.
// NOTE(review): cigar1 is not declared in this excerpt - presumably a
// file-level string; confirm.
int main(int argc, char *argv[]) {
  Vector *pieces;
  char *str;
  char *reverse;
  int testResult = 0;

  initEnsC(argc, argv);

  pieces = CigarStrUtil_getPieces(cigar1);

  // The test cigar is expected to split into six pieces.
  testResult += ok(1, Vector_getNumElement(pieces) == 6);
  str = Vector_getElementAt(pieces,0);

  testResult += ok(2, !strcmp(str,"6M"));


  // NOTE(review): str still points at piece 0 here (expected to be "6M" by
  // test 2), yet test 3 compares it against the whole of cigar1 - confirm
  // that no reassignment of str is missing before this line.
  testResult += ok(3, !strcmp(str,cigar1));
  reverse = CigarStrUtil_reverse(str,strlen(str));

  // Split the string returned by CigarStrUtil_reverse and re-check it.
  pieces = CigarStrUtil_getPieces(reverse);

  testResult += ok(4, Vector_getNumElement(pieces) == 6);

  str = Vector_getElementAt(pieces,0);

  testResult += ok(5, !strcmp(str,"M"));

  return testResult;
Ejemplo n.º 2
// Applies every sequence edit attached to `translation` to `seq`.
//
// translation: the Translation whose edits are fetched through
//              Translation_getAllSeqEdits.
// seq:         the sequence string to edit; it is replaced on every iteration
//              by whatever SeqEdit_applyEdit returns.
// Returns the final value of seq (the pointer passed in when the edit vector
// is empty).
// NOTE(review): seqEds is not released in the visible code - confirm who owns
// the vector returned by Translation_getAllSeqEdits.
char *Translation_modifyTranslation(Translation *translation, char *seq) {
  Vector *seqEds = Translation_getAllSeqEdits(translation);

  // Sort in reverse order to avoid complication of adjusting
  // downstream edits.
  // HACK:   The translation ENSP00000420939 somehow makes the next line
  //         bomb out ($a or $b becomes undef) if the start() method
  //         is used.  I haven't been able to find out why.  It has 10
  //         Selenocysteine seqedits that looks correct.
  //         /Andreas (release 59)
  if (Vector_getNumElement(seqEds)) {
    Vector_sort(seqEds, SeqEdit_reverseStartCompFunc);
  //  @seqeds = sort { $b->{'start'} <=> $a->{'start'} } @seqeds;
    // Apply all edits.
    // Not particularly efficient currently, could improve by precalculating maximum size of new seq prior to applying edits
    int i;
    for (i=0; i<Vector_getNumElement(seqEds); i++) {
      SeqEdit *se = Vector_getElementAt(seqEds, i);
      // seq is reassigned because SeqEdit_applyEdit hands back the string to
      // use from here on.
      seq = SeqEdit_applyEdit(se, seq);


  return seq;
Ejemplo n.º 3
// Fetches a single feature by database id, bypassing any cache.
//
// ba: adaptor supplying the table list (ba->getTables()) and used for the
//     generic fetch.
// id: database identifier placed in the generated constraint.
// Returns the only element of the fetched vector, or NULL when the fetch
// yields zero elements or more than one (the latter also prints an error to
// stderr).
// NOTE(review): vec is not freed in the visible code (see the NIY remark
// below) - confirm ownership of the vector and of the returned feature.
SeqFeature *BaseAdaptor_uncachedFetchByDbID(BaseAdaptor *ba, IDType id) {
  SeqFeature *feat = NULL;
  char constraint[1024];

  //construct a constraint like 't1.table1_id = 123'
  NameTableType *tables = ba->getTables();
  // Only the first table entry is used to build the constraint.
  char **t = (*tables)[0];
  // NOTE(review): sprintf is not bounded by sizeof(constraint); presumably the
  // table synonym and name are short - confirm.
  sprintf(constraint, "%s.%s_id = "IDFMTSTR, t[SYN], t[NAME], id); 

  //Should only be one
  Vector *vec = BaseAdaptor_genericFetch(ba, constraint, NULL, NULL);

  if (Vector_getNumElement(vec) > 1) {
    fprintf(stderr, "Error: Got more than one feature back in fetch ID call\n");
  } else {
    if (Vector_getNumElement(vec) == 1) {
      feat = Vector_getElementAt(vec, 0);
// NIY May want to set a free func???

  return feat;
Ejemplo n.º 4
// Test driver for repeat features: fetches all repeat features on the
// standard test slice, transforms each one to raw-contig coordinates and back
// to the slice, and checks that start and end are unchanged by the round trip.
// Always returns 0; results are reported through ok().
int main(int argc, char *argv[]) {
  DBAdaptor *dba;
  RepeatFeatureAdaptor *rfa;
  Slice *slice;
  Vector *features;
  int i;
  int failed;
  initEnsC(argc, argv);

  dba = Test_initROEnsDB();

  slice = Test_getStandardSlice(dba);

  ok(1, slice!=NULL);

  rfa = DBAdaptor_getRepeatFeatureAdaptor(dba);

  ok(2, rfa!=NULL);

  features =  Slice_getAllRepeatFeatures(slice,NULL,NULL, NULL);

  ok(3, features!=NULL);
  ok(4, Vector_getNumElement(features)!=0);

  // The loop stops at the first feature whose coordinates do not survive the
  // round trip (failed becomes non-zero).
  failed = 0;
  for (i=0;i<Vector_getNumElement(features) && !failed;i++) {
    RepeatFeature *rf = Vector_getElementAt(features,i);
    // Remember the slice coordinates so they can be compared after remapping.
    int start = RepeatFeature_getStart(rf);
    int end   = RepeatFeature_getEnd(rf);
    Vector *rrfVector;
    RepeatFeature *rrf;

    printf("slice start = %d end = %d\n",start,end);
    rrfVector = RepeatFeature_transformToRawContig(rf);
    if (Vector_getNumElement(rrfVector) > 1) {
      printf("Feature mapped to more than one rawcontig\n");
    // Only the first raw-contig feature is mapped back to the slice.
    rrf = Vector_getElementAt(rrfVector,0);

    //printf("rc start = %d end = %d\n",RepeatFeature_getStart(rrf),RepeatFeature_getEnd(rrf));
    rf = RepeatFeature_transformToSlice(rrf, slice);
    if (RepeatFeature_getStart(rf) != start ||
        RepeatFeature_getEnd(rf) != end) {
      printf("Remapping to slice produced different coords\n");
      failed =1;
  ok(5, !failed);
  return 0;
Ejemplo n.º 5
// Attaches database cross-reference links to `gene`.
//
// For each row returned by the transcript query, links are fetched with
// DBEntryAdaptor_fetchByObjectType for the translation id (only when column 1
// of the row is set) and for the transcript id; the transcript links are
// added to the gene with Gene_addDBLink.
//
// dbea: adaptor used to prepare the statement and to fetch the links.
// gene: gene that receives the links.
// Returns 1.
// NOTE(review): the statement that formats the SQL literals below into qStr,
// and the body of the translation-link loop, are not visible in this excerpt.
int DBEntryAdaptor_fetchAllByGene(DBEntryAdaptor *dbea, Gene *gene) {
  char qStr[512];
  StatementHandle *sth;
  ResultRow *row;
      "SELECT t.transcript_id, t.canonical_translation_id"
      " FROM   transcript t"
      " WHERE  t.gene_id = " IDFMTSTR,

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  while ((row = sth->fetchRow(sth))) {
    // Column 0 is t.transcript_id, column 1 is t.canonical_translation_id.
    IDType transcriptId = row->getLongLongAt(row,0);
    int i;
    Vector *transLinks;

    // Translation links are only looked up when column 1 is set.
    if (row->col(row,1)) {
      IDType translationId = row->getLongLongAt(row,1);
      Vector *translatLinks = DBEntryAdaptor_fetchByObjectType(dbea, translationId,"Translation");

      for (i=0;i<Vector_getNumElement(translatLinks); i++) {

    transLinks = DBEntryAdaptor_fetchByObjectType(dbea, transcriptId,"Transcript");
    for (i=0;i<Vector_getNumElement(transLinks); i++) {
      Gene_addDBLink(gene, Vector_getElementAt(transLinks,i));

/* NIY This is wrong so I'm not going to implement it!
    my $genelinks = $self->_fetch_by_object_type( $gene->stable_id, 'Gene' );
    foreach my $genelink ( @$genelinks ) {
      $gene->add_DBLink( $genelink );
  return 1;
Ejemplo n.º 6
// Builds the cDNA string of a prediction transcript by concatenating the
// sequence string of each exon, with 'N' padding added through SeqUtil_addNs
// and StrUtil_appendString depending on exon phase.
//
// trans: transcript whose exons are read via PredictionTranscript_getAllExons.
// Returns the accumulated string (cdna); the caller presumably owns it -
// confirm against StrUtil_copyString / StrUtil_appendString.
// NOTE(review): the declaration of lastPhase is commented out below, yet
// lastPhase is still referenced further down - confirm against the full file.
char *PredictionTranscript_getcDNA(PredictionTranscript *trans) {
    Vector *exons = PredictionTranscript_getAllExons(trans,0);
    // Start from an empty string that the append calls below extend.
    char *cdna = StrUtil_copyString(&cdna, "", 0);
    //int lastPhase = 0;
    int i;
    int first = 1;

    int cdnaStart;
    int pepStart;

    // NOTE(review): cdnaStart and pepStart are assigned but not read in the
    // visible code.
    cdnaStart = 1;
    pepStart = 1;

    for (i=0; i<Vector_getNumElement(exons); i++) {
        PredictionExon *exon = Vector_getElementAt(exons, i);
        int phase;
        // A NULL entry in the exon vector is handled separately; the bodies of
        // both branches are not visible in this excerpt.
        if (!exon) {
            if (cdna[0] == '\0') {
            } else {

        phase = 0;

// NIY    if (defined($exon->phase)) {
        phase = PredictionExon_getPhase(exon);
//    }

        //fprintf(stderr, " phase for exon %d is %d\n", i, phase);

        // The first exon gets start padding according to its phase.
        if (first) {
            cdna = SeqUtil_addNs(cdna,phase);
            first = 0;

        // Hack for now - should never happen
            if (phase != lastPhase ) {

              if (lastPhase == 1) {
        	cdna = StrUtil_appendString(cdna,"NN");
              } else if (lastPhase == 2) {
        	cdna = StrUtil_appendString(cdna,"N");

              // startpadding for this exon
              cdna = SeqUtil_addNs(cdna,phase);

        cdna = StrUtil_appendString(cdna, PredictionExon_getSeqString(exon));
        //lastPhase = PredictionExon_getEndPhase(exon);
        //lastPhase = phase;

// NIY Freeing exons vector?
    return cdna;
Ejemplo n.º 7
// New
// Collects the SeqEdit objects of a translation.
//
// For each attribute code in the NULL-terminated `edits` table, the matching
// attributes are fetched with Translation_getAllAttributes and each one is
// converted with SeqEdit_newFromAttribute.
//
// translation: translation whose attributes are scanned.
// Returns a newly created Vector holding the SeqEdit objects (empty when no
// attribute matches).
// NOTE(review): the per-code attribs vectors are not released in the visible
// code, and the statement advancing editP is not visible - confirm against
// the full file.
Vector *Translation_getAllSeqEdits(Translation *translation) {
  char *edits[] = { "initial_met", "_selenocysteine", "amino_acid_sub", NULL };

  Vector *seqEds = Vector_new();

  // Walk the table until its NULL sentinel.
  char **editP = edits;
  while (*editP) {
    char *edit = *editP;

    Vector *attribs = Translation_getAllAttributes(translation, edit);

    // convert attributes to SeqEdit objects
    int i;
    for (i=0; i<Vector_getNumElement(attribs); i++) {
      Attribute *attrib = Vector_getElementAt(attribs, i);
      SeqEdit *seqEd = SeqEdit_newFromAttribute(attrib);

      Vector_addElement(seqEds, seqEd);


  return seqEds;
Ejemplo n.º 8
// NIY:
// Because this can filter the results the vector that gets returned must be freeable - so for now
// make a copy of the translation->attributes vector if returning unfiltered so behaviour is 
// consistent. Long term probably want reference count incremented
//
// Returns the attributes of `translation`, optionally filtered by code.
//
// translation: translation whose attributes are wanted; its `attributes`
//              field is filled from the AttributeAdaptor the first time it
//              is found to be NULL.
// attribCode:  attribute code to match with strcasecmp (case-insensitive), or
//              NULL to get all attributes.
// Returns a new Vector in every visible path: an empty one when no adaptor is
// available, a filtered one when attribCode is given, otherwise a copy of
// translation->attributes.
Vector *Translation_getAllAttributes(Translation *translation, char *attribCode) {
  // Load the attributes on first use.
  if (translation->attributes == NULL) {
    TranslationAdaptor *tlna = (TranslationAdaptor *)Translation_getAdaptor(translation);
    if (tlna == NULL) { // No adaptor
// Perl comments out the warning, I'll put it back for now, just in case
      //fprintf(stderr,"Warning: Cannot get attributes without an adaptor.\n");
      return Vector_new();

    AttributeAdaptor *ata = DBAdaptor_getAttributeAdaptor(tlna->dba);
    translation->attributes = AttributeAdaptor_fetchAllByTranslation(ata, translation, NULL);

  if (attribCode != NULL) {
    // The result vector shares the Attribute pointers with
    // translation->attributes; only the container is new.
    Vector *results = Vector_new();
    int i;
    for (i=0; i<Vector_getNumElement(translation->attributes); i++) {
      Attribute *attrib = Vector_getElementAt(translation->attributes, i);
      if (!strcasecmp(attrib->code, attribCode)) {
        Vector_addElement(results, attrib);
    return results;
  } else {
// See NIY note above for why I'm making a copy 
    return Vector_copy(translation->attributes);
Ejemplo n.º 9
// Returns the cDNA <-> genomic Mapper of a prediction transcript, building
// and caching it in trans->exonCoordMapper on first use.
//
// trans: transcript whose translateable exons are loaded into the mapper.
// Returns the cached mapper when one exists, otherwise the newly built one.
Mapper *PredictionTranscript_getcDNACoordMapper(PredictionTranscript *trans) {
    Mapper *mapper;
    int start = 1;
    int i;
    Vector *translateable;

    // Reuse the mapper built by an earlier call.
    if (trans->exonCoordMapper) {
        return trans->exonCoordMapper;

    // the mapper is loaded with OBJECTS in place of the IDs !!!!
    //  the objects are the contigs in the exons
// NIY: What should coordsystems be?
    mapper = Mapper_new( "cdna", "genomic", NULL, NULL );

    translateable = PredictionTranscript_getAllTranslateableExons(trans);
    for (i=0; i<Vector_getNumElement(translateable); i++) {
        PredictionExon *exon = Vector_getElementAt(translateable,i);

        // The transcript pointer itself is cast to IDType and passed as an
        // argument; `start` is advanced by each exon's length so successive
        // exons are given consecutive start positions.
        PredictionExon_loadGenomicMapper((Exon*)exon, mapper, (IDType)trans, start);
        start += PredictionExon_getLength(exon);
    trans->exonCoordMapper = mapper;
    return mapper;
Ejemplo n.º 10
// Removed the circular stuff 
// Stores a list of attributes for one object.
//
// ata:        adaptor used to prepare the INSERT statement and to store the
//             attribute types.
// type:       used as the prefix of the "<type>_id" column in the statement.
// table:      used as the prefix of the "<table>_attrib" table name.
// objectId:   id passed as the first execute argument for every attribute.
// attributes: Vector of Attribute pointers to store.
// NOTE(review): only two arguments (table, type) follow the sprintf format,
// while the format also contains two %"IDFMTSTR" sequences; whether those are
// meant to survive as placeholders for execute depends on how IDFMTSTR is
// defined - confirm.
// NOTE(review): a NULL attribute prints a message and clears `ok`, but no
// statement skipping the attrib->objectType access is visible in this excerpt.
void AttributeAdaptor_doStoreAllByTypeAndTableAndID(AttributeAdaptor *ata, char *type, char *table, IDType objectId, Vector *attributes) {
  int ok = 1;
  char qStr[1024];
  sprintf(qStr, "INSERT into %s_attrib SET %s_id = %"IDFMTSTR", attrib_type_id = %"IDFMTSTR", value = '%%s'", table, type);

  StatementHandle *sth = ata->prepare((BaseAdaptor *)ata,qStr,strlen(qStr));

  int i;
  for (i=0; i<Vector_getNumElement(attributes); i++) {
    Attribute *attrib = Vector_getElementAt(attributes, i);

    if (attrib == NULL ) {
      fprintf(stderr, "Reference to list of Bio::EnsEMBL::Attribute objects argument expected.\n");
      ok = 0;

    Class_assertType(CLASS_ATTRIBUTE, attrib->objectType);

    // The attribute type is stored first; the id returned for it is used as
    // the second execute argument.
    IDType atId = AttributeAdaptor_storeType(ata, attrib);

    sth->execute(sth, objectId, atId, Attribute_getValue(attrib));

  if (ok) {

Ejemplo n.º 11
// Sets the number of elements of the transcript's exon vector to `count`.
//
// trans: transcript whose `exons` vector is resized.
// count: requested element count.
// Returns 1. An error is printed to stderr when `count` is smaller than the
// current element count.
// NOTE(review): no early exit after the error message is visible in this
// excerpt - confirm whether shrinking is actually prevented.
int PredictionTranscript_setExonCount(PredictionTranscript *trans, int count) {

    if (Vector_getNumElement(trans->exons) > count) {
        fprintf(stderr, "Error: Trying to shrink exon vector\n");
    Vector_setNumElement(trans->exons, count);

    return 1;
Ejemplo n.º 12
// Fetches the database cross-reference links belonging to a transcript.
//
// Links are fetched with DBEntryAdaptor_fetchByObjectType for every
// translation id returned by the query and for the transcript's own dbID.
//
// dbea:  adaptor used to prepare the statement and to fetch the links.
// trans: transcript whose links are wanted.
// Returns 1.
// NOTE(review): the statement that formats the SQL literals below into qStr,
// and the bodies of both link loops, are not visible in this excerpt.
int DBEntryAdaptor_fetchAllByTranscript(DBEntryAdaptor *dbea, Transcript *trans) {
  char qStr[512];
  StatementHandle *sth;
  ResultRow *row;
  Vector *transLinks;
  int i;

    "SELECT t.canonical_translation_id" 
    " FROM transcript t"
    " WHERE t.transcript_id = " IDFMTSTR,
  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));


  // Did this to be consistent with fetch_by_Gene, but don't like
  // it (filling in the object). I think returning the array would
  // be better. Oh well. EB
  while ((row = sth->fetchRow(sth))) {
    // Column 0 is t.canonical_translation_id.
    IDType translationId = row->getLongLongAt(row,0);
    Vector *translatLinks = DBEntryAdaptor_fetchByObjectType(dbea, translationId,"Translation");
    for (i=0;i<Vector_getNumElement(translatLinks); i++) {


  transLinks = DBEntryAdaptor_fetchByObjectType(dbea, Transcript_getDbID(trans),"Transcript");
  for (i=0;i<Vector_getNumElement(transLinks); i++) {

  return 1;
Ejemplo n.º 13
// Consumes cigar elements from cigList, updating the consensus (cs, ce) and
// query (qs, qe) coordinate pairs, and loops until an 'M' element has been
// processed or *cigListPos equals the number of elements.
//
// gaa:        adaptor (not referenced in the visible code).
// cigList:    Vector of cigar element strings such as "6M" or "D".
// cigListPos: in/out index of the current element in cigList.
// cs, ce:     in/out consensus start/end.
// qs, qe:     in/out query start/end.
// NOTE(review): no increment of *cigListPos and no break statements between
// the switch cases are visible in this excerpt - confirm against the full
// file before relying on the loop termination or on case isolation.
void GenomicAlignAdaptor_nextCig(GenomicAlignAdaptor *gaa,
    Vector *cigList, int *cigListPos, int *cs, int *ce, int *qs, int *qe)  {
  int count;
  char type;
  char *cigElem;
  int lenElem;
  do {
    cigElem = Vector_getElementAt(cigList, *cigListPos);
    lenElem = strlen(cigElem);
    // The last character of the element is its type.
    type = cigElem[lenElem-1];

    if (type!='M' && type!='I' && type!='D') {
      fprintf(stderr,"Error: Cigar string format error for %s\n",cigElem);
    if (lenElem > 1) {
      // The type character is overwritten with a terminator so the leading
      // digits can be parsed; this modifies the string stored in cigList.
      cigElem[lenElem-1] = '\0';
      count = atol(cigElem);
    } else {
      // A bare type character counts as one.
      count = 1;

    switch (type) {
      case 'D':
        *qe += count;
      case 'I':
        *ce += count;
      case 'M':
        *cs = *ce + 1;
        *ce = *cs + count - 1;
        *qs = *qe + 1;
        *qe = *qs + count - 1;
  } while (type != 'M' && *cigListPos!=Vector_getNumElement(cigList));
Ejemplo n.º 14
// Maps a genomic range onto cDNA coordinates of a prediction transcript.
//
// trans:      transcript providing the cDNA coordinate mapper.
// start, end: genomic range to map.
// strand:     strand of the range.
// contig:     contig used as the mapper id; when NULL, the slice of the first
//             translateable exon is used instead.
// Returns the MapperRangeSet produced by Mapper_mapCoordinates, or a new empty
// set when contig is NULL and there are no translateable exons.
MapperRangeSet *PredictionTranscript_genomic2cDNA(PredictionTranscript *trans, int start, int end, int strand, BaseContig *contig) {
    Mapper *mapper;

    // "ids" in mapper are contigs of exons, so use the same contig that should
    // be attached to all of the exons...
    if (!contig) {
        Vector *translateable = PredictionTranscript_getAllTranslateableExons(trans);
        PredictionExon *firstExon;
        if (!Vector_getNumElement(translateable)) {
            return MapperRangeSet_new();
        firstExon = Vector_getElementAt(translateable, 0);
        contig = (BaseContig*)PredictionExon_getSlice(firstExon);

    mapper = PredictionTranscript_getcDNACoordMapper(trans);

    // The contig pointer itself is cast to IDType and used as the id.
    return Mapper_mapCoordinates(mapper,(IDType)contig, start, end, strand, "genomic");
Ejemplo n.º 15
// Allocates and initialises an AssemblyMapper.
//
// adaptor:      adaptor stored on the new mapper.
// coordSystems: Vector expected to hold exactly two coordinate systems;
//               element 0 is set as the assembled one, element 1 as the
//               component one.
// Returns the new mapper, or NULL when the allocation fails.
// NOTE(review): after the "Can only map between two coordinate systems"
// message no early exit is visible in this excerpt - confirm whether elements
// 0 and 1 are still read in that case.
AssemblyMapper *AssemblyMapper_new(AssemblyMapperAdaptor *adaptor, Vector *coordSystems) {
  AssemblyMapper *am;

  // calloc gives a zero-filled structure.
  if ((am = (AssemblyMapper *)calloc(1, sizeof(AssemblyMapper))) == NULL) {
    fprintf(stderr, "ERROR: Failed allocating space for AssemblyMapper\n");
    return NULL;

  am->objectType = CLASS_ASSEMBLYMAPPER;

  am->funcs = &assemblyMapperFuncs;


  AssemblyMapper_setAdaptor(am, adaptor);


  if ( Vector_getNumElement(coordSystems) != 2 ) {
    fprintf(stderr, "Can only map between two coordinate systems %d were provided\n", Vector_getNumElement(coordSystems));

  // Set the component and assembled coordinate systems
  AssemblyMapper_setAssembledCoordSystem(am, Vector_getElementAt(coordSystems, 0));
  AssemblyMapper_setComponentCoordSystem(am, Vector_getElementAt(coordSystems, 1));

  // Each register starts as a new, empty IDHash.
  AssemblyMapper_setAssembledRegister(am, IDHash_new(IDHASH_MEDIUM));
  AssemblyMapper_setComponentRegister(am, IDHash_new(IDHASH_MEDIUM));

  // We load the mapper calling the 'ASSEMBLED' the 'from' coord system
  // and the 'COMPONENT' the 'to' coord system.
  AssemblyMapper_setMapper(am, Mapper_new("assembled", "component", AssemblyMapper_getAssembledCoordSystem(am), 

  AssemblyMapper_setMaxPairCount(am, AM_DEFAULT_MAX_PAIR_COUNT);

  return am;
=head2 fetch_all_by_Transcript

  Arg[1]      : Bio::EnsEMBL::Transcript Transcript to search with
  Example     : my $ises = $isea->fetch_all_by_Transcript($transcript);
  Description : Uses the given Transcript to search for all instances of
                IntronSupportingEvidence linked to the transcript in the
                database
  Returntype  : ArrayRef of IntronSupportingEvidence objects
  Exceptions  : Thrown if arguments are not as stated and for DB errors 

// Fetches the IntronSupportingEvidence objects linked to a transcript.
//
// The ids are read from transcript_intron_supporting_evidence for the
// transcript's dbID, copied into heap-allocated IDType cells, and passed to
// IntronSupportingEvidenceAdaptor_fetchAllByDbIDList.
//
// isea:       adaptor used to prepare the query and to fetch by id list.
// transcript: transcript whose dbID is placed in the query.
// Returns the Vector from the id-list fetch, or a new empty Vector when the
// query yields no ids.
// NOTE(review): after the allocation-failure message no early exit is visible
// before *idP is written, and no call freeing idVec follows
// Vector_setFreeFunc - confirm against the full file.
Vector *IntronSupportingEvidenceAdaptor_fetchAllByTranscript(IntronSupportingEvidenceAdaptor *isea, Transcript *transcript) {
  char qStr[1024];

  sprintf(qStr,"SELECT intron_supporting_evidence_id "
                 "FROM transcript_intron_supporting_evidence "
                "WHERE transcript_id = "IDFMTSTR, Transcript_getDbID(transcript));

  StatementHandle *sth = isea->prepare((BaseAdaptor *)isea,qStr,strlen(qStr));

  Vector *idVec = Vector_new();
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType id = row->getLongLongAt(row, 0);
    IDType *idP;

    // Each id gets its own heap cell because the vector stores pointers.
    if ((idP = calloc(1,sizeof(IDType))) == NULL) {
      fprintf(stderr, "Failed allocating space for a id\n");

    *idP = id;
    Vector_addElement(idVec, idP);

  Vector *out;
  if (Vector_getNumElement(idVec) > 0) {
    out = IntronSupportingEvidenceAdaptor_fetchAllByDbIDList(isea, idVec, NULL); 
  } else {
    out = Vector_new();
  // Free ids vector
  Vector_setFreeFunc(idVec, free);

  return out;
Ejemplo n.º 17
// Test driver for protein align features: fetches the features on a fixed
// chromosome 1 region through the read-only database and stores them through
// the ProteinAlignFeatureAdaptor obtained from the read-write database.
// Always returns 0; results are reported through ok().
// NOTE(review): `i` and `failed` are declared but not used in the visible
// code.
int main(int argc, char *argv[]) {
  DBAdaptor *dba;
  DBAdaptor *writeDba;
  ProteinAlignFeatureAdaptor *pafa;
  Slice *slice;
  Vector *features;
  int i;
  int failed;
  initEnsC(argc, argv);

  dba = Test_initROEnsDB();

  writeDba = Test_initRWEnsDB();

  slice = Test_getStandardSlice(dba);

  ok(1, slice!=NULL);

  // The feature adaptor comes from the writable database, the slice adaptor
  // from the read-only one.
  pafa = DBAdaptor_getProteinAlignFeatureAdaptor(writeDba);
  SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba);

  ok(2, pafa!=NULL);

  //features =  Slice_getAllDNAPepAlignFeatures(slice,NULL,NULL, NULL,NULL);

  //Slice *slice3 = SliceAdaptor_fetchByRegion(sa,"chromosome","1",2,260000000,1,NULL,0);
  Slice *slice2 = SliceAdaptor_fetchByRegion(sa,"chromosome","1",1000000,4000000,1,NULL,0);
  features =  Slice_getAllProteinAlignFeatures(slice2,NULL,NULL, NULL,NULL);

  ok(3, features!=NULL);
  ok(4, Vector_getNumElement(features)!=0);

  ProteinAlignFeatureAdaptor_store((BaseFeatureAdaptor*)pafa, features);

  return 0;
Ejemplo n.º 18
// Returns the number of elements in the transcript's exon vector.
//
// trans: transcript whose `exons` vector is queried.
int PredictionTranscript_getExonCount(PredictionTranscript *trans) {
    return Vector_getNumElement(trans->exons);
Ejemplo n.º 19
// Test driver for TopLevelAssemblyMapper: obtains clone->toplevel and
// supercontig->toplevel mappers and exercises map, listSeqRegions and listIds
// on fixed test regions. Always returns 0; results are reported through ok(),
// with testNum incremented for each check.
int main(int argc, char *argv[]) {
  DBAdaptor *dba;
  AssemblyMapperAdaptor *asma;
  int testNum = 1;

  initEnsC(argc, argv);

  dba = Test_initROEnsDB();

  // 1 Test AssemblyMapperAdaptor constructor
  asma = DBAdaptor_getAssemblyMapperAdaptor(dba);

  ok(testNum++, asma!=NULL);

  // Test fetch_by_CoordSystems

  CoordSystemAdaptor *csa = DBAdaptor_getCoordSystemAdaptor(dba);


  CoordSystem *toplevelCs  = CoordSystemAdaptor_fetchByName(csa, "toplevel", NULL);
  CoordSystem *clnCs  = CoordSystemAdaptor_fetchByName(csa, "clone", NULL);
  CoordSystem *superctgCs = CoordSystemAdaptor_fetchByName(csa, "supercontig", NULL);

  // The adaptor's return value is cast to TopLevelAssemblyMapper for both
  // coordinate system pairs.
  TopLevelAssemblyMapper *clnToplevelMapper = (TopLevelAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, toplevelCs, clnCs);
  TopLevelAssemblyMapper *superctgToplevelMapper = (TopLevelAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, toplevelCs, superctgCs);

  ok(testNum++, clnToplevelMapper!=NULL); //  && $cln_toplevel_mapper->isa('Bio::EnsEMBL::TopLevelAssemblyMapper'));
  ok(testNum++, superctgToplevelMapper!=NULL); //  && $cln_toplevel_mapper->isa('Bio::EnsEMBL::TopLevelAssemblyMapper'));

// test db has chr 20  (50KB -> 62MB)

// Test map
  MapperRangeSet *coords = NULL;

  // Each mapper is only exercised when it was obtained successfully.
  if (clnToplevelMapper) {
    fprintf(stderr, "MAP 'AL359765.6'->toplevel\n");
    coords = TopLevelAssemblyMapper_map(clnToplevelMapper,"AL359765.6", 1, 13780, 1, clnCs,  0, NULL);
    ok(testNum++, coords!=NULL);

  if (superctgToplevelMapper) {
      fprintf(stderr, "MAP NT_028392->toplevel\n");
      coords = TopLevelAssemblyMapper_map(superctgToplevelMapper, "NT_028392", 600000, 1000000, 1, superctgCs, 0, NULL);
      ok(testNum++, coords!=NULL);

// Test list_seq_regions
  Vector *seqRegions;
  int i;
  if (clnToplevelMapper) {
    seqRegions = TopLevelAssemblyMapper_listSeqRegions(clnToplevelMapper, "AL359765.6", 1, 13780, clnCs);
    // Exactly one region named "20" is expected.
    ok(testNum++, seqRegions!=NULL && Vector_getNumElement(seqRegions) == 1 && !strcmp("20", Vector_getElementAt(seqRegions,0)));
    for (i=0;i<Vector_getNumElement(seqRegions); i++) {
      char *regionName = Vector_getElementAt(seqRegions, i);
      fprintf(stderr, "%s\n",regionName);

  if (superctgToplevelMapper) {
    seqRegions = TopLevelAssemblyMapper_listSeqRegions(superctgToplevelMapper, "NT_028392", 600000, 1000000, superctgCs);
    ok(testNum++, seqRegions!=NULL && Vector_getNumElement(seqRegions) == 1 && !strcmp("20", Vector_getElementAt(seqRegions,0)));
    for (i=0;i<Vector_getNumElement(seqRegions); i++) {
      char *regionName = Vector_getElementAt(seqRegions, i);
      fprintf(stderr, "%s\n",regionName);

// Test list_seq_ids
  Vector *ids;

  if (clnToplevelMapper) {
    ids = TopLevelAssemblyMapper_listIds(clnToplevelMapper, "AL359765.6", 1, 13780, clnCs);
    // The vector elements are read as pointers to IDType; exactly one id,
    // 469283, is expected.
    ok(testNum++, ids!=NULL && Vector_getNumElement(ids) == 1 && *((IDType *)Vector_getElementAt(ids,0)) == 469283 );
    for (i=0;i<Vector_getNumElement(ids); i++) {
      IDType id = *((IDType *)Vector_getElementAt(ids, i));
      fprintf(stderr, IDFMTSTR"\n",id);

  if (superctgToplevelMapper) {
    ids = TopLevelAssemblyMapper_listIds(superctgToplevelMapper, "NT_028392", 600000, 1000000, superctgCs);
    ok(testNum++, ids!=NULL && Vector_getNumElement(ids) == 1 && *((IDType *)Vector_getElementAt(ids,0)) == 469283 );
    for (i=0;i<Vector_getNumElement(ids); i++) {
      IDType id = *((IDType *)Vector_getElementAt(ids, i));
      fprintf(stderr, IDFMTSTR"\n",id);

// Test for a not implemented method
//  seqRegions = TopLevelAssemblyMapper_listContigIds(clnToplevelMapper, "AL359765.6", 1, 13780, 1);

  return 0;
Ejemplo n.º 20
// Derives new GenomicAlign objects from two alignments by walking their cigar
// strings in parallel, and appends every derived alignment to mergedAligns.
//
// Each derived alignment takes its consensus DNA fragment from alignA and its
// query DNA fragment from alignB; its score is the smaller of the two input
// scores and its percent id is the product of the two percent ids divided by
// 100 and truncated to an integer.
//
// gaa:          adaptor set on every derived alignment and passed to
//               GenomicAlignAdaptor_nextCig.
// mergedAligns: output Vector receiving the derived GenomicAlign objects.
// alignA:       first input alignment.
// alignB:       second input alignment.
// NOTE(review): tmpStr is appended to resultCig in several places, but no
// statement writing to tmpStr is visible in this excerpt - confirm against
// the full file.
// NOTE(review): resultCig is handed to GenomicAlign_setCigarString and later
// truncated in place (resultCig[0] = '\0'); whether the setter copies the
// string is not visible here - confirm.
void GenomicAlignAdaptor_addDerivedAlignments(GenomicAlignAdaptor *gaa, 
                     Vector *mergedAligns, GenomicAlign *alignA, GenomicAlign *alignB) {

  // variable name explanation
  // q - query c - consensus s - start e - end l - last
  // o, ov overlap j - jump_in_
  // r - result

  int  qs, qe, lqs, lqe, cs, ce, lce,
       ocs, oce, oqs, oqe, jc, jq, ovs, ove,
       rcs, rce, rqs, rqe;
  int currentMatch = 0;
  int newMatch;
  int cigAPos = 0, cigBPos = 0;
  char *resultCig;
  char tmpStr[128];

  // initialization phase
  Vector *cigA = CigarStrUtil_getPieces(GenomicAlign_getCigarString(alignA));
  Vector *cigB = CigarStrUtil_getPieces(GenomicAlign_getCigarString(alignB));

  // NOTE(review): the body of this branch is not visible in this excerpt.
  if (GenomicAlign_getQueryStrand(alignA) == -1 ) {

  // need a 'normalized' start for qs, qe, oxs so I dont 
  // have to check strandedness all the time  

  // consensus is strand 1 and is not compared to anything,
  // can keep its original coordinate system
  lce = GenomicAlign_getConsensusStart(alignA) - 1;
  ce = lce;
  cs = ce + 1;
  // alignBs query can be + or - just keep relative coords for now
  lqe = 0; lqs = 1;
  qe = 0; qs = 1;

  // ocs will be found relative to oce and has to be comparable
  // to oqs. But it could be that we have to move downwards if we
  // are not - strand. thats why coordinates are transformed here

  if (GenomicAlign_getQueryStrand(alignA) == -1 ) {
    // query_end is first basepair of alignment
    if (GenomicAlign_getQueryEnd(alignA) < GenomicAlign_getConsensusEnd(alignB)) {
      oce = 0; ocs = 1;
      oqe = GenomicAlign_getConsensusEnd(alignB) - GenomicAlign_getQueryEnd(alignA);
      oqs = oqe + 1;
    } else {
      oqe = 0; oqs = 1;
      oce = GenomicAlign_getQueryEnd(alignA) - GenomicAlign_getConsensusEnd(alignB);
      ocs = oce + 1;
  } else {
    // in theory no coordinate magic necessary :-)
    oqs = GenomicAlign_getQueryStart(alignA);
    oqe = oqs - 1; 
    ocs = GenomicAlign_getConsensusStart(alignB);
    oce = ocs - 1;

  // initializing result
  rcs = rce = rqs = rqe = 0;
  resultCig= StrUtil_copyString(&resultCig,"",0);

  while (1) {
    int newGa;
    // exit if you request a new piece of alignment and the cig list is 
    // empty

    if (oce < ocs || oce < oqs) {
      // next M area in cigB
      if (cigBPos == Vector_getNumElement(cigB)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigB, &cigBPos, &ocs, &oce, &qs, &qe ); 
    if (oqe < oqs || oqe < ocs) {
      // next M area in cigA
      if (cigAPos == Vector_getNumElement(cigA)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigA, &cigAPos, &cs, &ce, &oqs, &oqe );

    // now matching region overlap in reference genome
    // ovs is the larger of the two starts, ove the smaller of the two ends.
    ovs = ocs < oqs ? oqs : ocs;
    ove = oce < oqe ? oce : oqe;
    // jc / jq are the jumps in consensus / query since the last stored
    // position; they are zero while no match is in progress.
    if (currentMatch) {
      jc = cs + (ovs - oqs) - lce - 1;
      jq = qs + (ovs - ocs) - lqe - 1;
    } else {
      jc = jq = 0;

    newMatch = ove - ovs + 1;
    newGa = 0;

    if (jc==0) {
      if (jq==0) {
	currentMatch += newMatch;
      } else {
        // store current match;
        resultCig = StrUtil_appendString(resultCig,tmpStr);

	// jq deletions;
	if (jq == 1) {
          resultCig = StrUtil_appendString(resultCig,"D");
        } else {
          resultCig = StrUtil_appendString(resultCig,tmpStr);
	currentMatch = newMatch;
    } else {
      if (jq==0) {
        // store current match;
        resultCig = StrUtil_appendString(resultCig,tmpStr);

	// jc insertions;
	if (jc==1) {
          resultCig = StrUtil_appendString(resultCig,"I");
        } else {
          resultCig = StrUtil_appendString(resultCig,tmpStr);
	currentMatch = newMatch;
      } else {
        // Both coordinates jumped: close the current alignment and emit it.
        double percId;
        double score;
        GenomicAlign *ga;

        resultCig = StrUtil_appendString(resultCig,tmpStr);

	// new GA
	int queryStrand = GenomicAlign_getQueryStrand(alignA) * GenomicAlign_getQueryStrand(alignB);
	int queryStart, queryEnd;
	if (queryStrand == 1) {
	  queryStart = rqs + GenomicAlign_getQueryStart(alignB) - 1;
	  queryEnd = rqe + GenomicAlign_getQueryStart(alignB) - 1;
	} else {
	  queryEnd = GenomicAlign_getQueryEnd(alignB) - rqs + 1;
	  queryStart = GenomicAlign_getQueryEnd(alignB) - rqe + 1;
        score = (GenomicAlign_getScore(alignA) < GenomicAlign_getScore(alignB)) ? 
          GenomicAlign_getScore(alignA) : GenomicAlign_getScore(alignB);
        percId =  (int)(GenomicAlign_getPercentId(alignA)*GenomicAlign_getPercentId(alignB)/100.0);
        ga = GenomicAlign_new();
        GenomicAlign_setConsensusDNAFrag(ga, GenomicAlign_getConsensusDNAFrag(alignA));
        GenomicAlign_setQueryDNAFrag(ga, GenomicAlign_getQueryDNAFrag(alignB));
        GenomicAlign_setCigarString(ga, resultCig);
        GenomicAlign_setConsensusStart(ga, rcs);
        GenomicAlign_setConsensusEnd(ga, rce);
        GenomicAlign_setQueryStrand(ga, queryStrand);
        GenomicAlign_setQueryStart(ga, queryStart);
        GenomicAlign_setQueryEnd(ga, queryEnd);
        GenomicAlign_setAdaptor(ga, (BaseAdaptor *)gaa);
        GenomicAlign_setPercentId(ga, percId);
        GenomicAlign_setScore(ga, score);

	Vector_addElement(mergedAligns, ga);

        rcs = rce = rqs = rqe = 0;
	resultCig[0] = '\0';
	currentMatch = newMatch;

    // A zero rcs / rqs means "not yet set for the current result".
    if (!rcs) rcs = cs+(ovs-oqs);
    rce = cs+(ove-oqs);
    if (!rqs) rqs = qs+(ovs-ocs);
    rqe = qs+(ove-ocs);

    // update the last positions
    lce = rce; 
    lqe = rqe;

    // next piece on the one that end earlier
    if (oce <= oqe) {
      // next M area in cigB
      if (cigBPos == Vector_getNumElement(cigB)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigB, &cigBPos, &ocs, &oce, &qs, &qe ); 
    if (oce >= oqe) {
      // next M area in cigA
      if (cigAPos == Vector_getNumElement(cigA)) break;
      GenomicAlignAdaptor_nextCig(gaa, cigA, &cigAPos, &cs, &ce, &oqs, &oqe );
  } // end of while loop

  // if there is a last floating current match
  if (currentMatch) {
    // new GA
    int queryStrand = GenomicAlign_getQueryStrand(alignA) * GenomicAlign_getQueryStrand(alignB);
    int queryStart, queryEnd;
    double percId;
    double score;
    GenomicAlign *ga;

    resultCig = StrUtil_appendString(resultCig, tmpStr);

    if (queryStrand == 1) {
      queryStart = rqs + GenomicAlign_getQueryStart(alignB) - 1;
      queryEnd = rqe + GenomicAlign_getQueryStart(alignB) - 1;
    } else {
      queryEnd = GenomicAlign_getQueryEnd(alignB) - rqs + 1;
      queryStart = GenomicAlign_getQueryEnd(alignB) - rqe + 1;
    score = (GenomicAlign_getScore(alignA) < GenomicAlign_getScore(alignB)) ? 
      GenomicAlign_getScore(alignA) : GenomicAlign_getScore(alignB);
    percId =  (int)(GenomicAlign_getPercentId(alignA)*GenomicAlign_getPercentId(alignB)/100.0);
    ga = GenomicAlign_new();

    GenomicAlign_setConsensusDNAFrag(ga, GenomicAlign_getConsensusDNAFrag(alignA));
    GenomicAlign_setQueryDNAFrag(ga, GenomicAlign_getQueryDNAFrag(alignB));
    GenomicAlign_setCigarString(ga, resultCig);
    GenomicAlign_setConsensusStart(ga, rcs);
    GenomicAlign_setConsensusEnd(ga, rce);
    GenomicAlign_setQueryStrand(ga, queryStrand);
    GenomicAlign_setQueryStart(ga, queryStart);
    GenomicAlign_setQueryEnd(ga, queryEnd);
    GenomicAlign_setAdaptor(ga, (BaseAdaptor *)gaa);
    GenomicAlign_setPercentId(ga, percId);
    GenomicAlign_setScore(ga, score);

    Vector_addElement(mergedAligns, ga);



  // nothing to return all in merged_aligns
Ejemplo n.º 21
// Also added a flag to indicate we actually want the gaps vector returned - quite often its not used in the caller and so would leak
// memory
//
// Registers the range [rStart, rEnd] for `id` and reports which parts of it
// were not already registered.
//
// registry:     registry holding one list of CoordPair ranges per id.
// id:           key of the range list; a new empty list is created and added
//               when the id is not yet present.
// start, end:   the range actually required; must lie inside [rStart, rEnd].
// rStart, rEnd: the range to register.
// wantGaps:     when non-zero, the gaps found are collected as CoordPair
//               objects in a new Vector.
// Returns, in the visible code: a copy of the id's list on the first request
// for that id, a new empty Vector when one existing range already covers
// [start, end], otherwise gapPairs (NULL when wantGaps is zero).
// NOTE(review): the four argument checks only print to stderr; no early exit
// is visible after them in this excerpt.
// NOTE(review): on the two early-return paths the gapPairs vector allocated
// for wantGaps is neither returned nor freed in the visible code - confirm.
Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, 
                                       long rStart, long rEnd, int wantGaps) {

  // The following was commented out due to Ensembl Genomes requirements
  // for bacterial genomes.
  // The following was uncommented because I'm not caring about those requirements
  if ( start > end ) {
    fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end);
  if ( rStart > rEnd ) {
    fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd);

  if ( rStart > start ) {
    fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start);

  if ( rEnd < end ) {
    fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end);

  IDHash *regReg = RangeRegistry_getRegistry(registry);
  Vector *list;
  if (IDHash_contains(regReg, id)) {
    list = IDHash_getValue(regReg, id);
  } else {
    list = Vector_new();
    IDHash_add(regReg, id, list);

  Vector *gapPairs = NULL;
  if (wantGaps) {
    gapPairs = Vector_new();

  int len = Vector_getNumElement(list);

  if (len == 0) {
    //this is the first request for this id, return a gap pair for the
    // entire range and register it as seen
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_addElement(list, cp);

    return Vector_copy(list);

  // loop through the list of existing ranges recording any "gaps" where
  // the existing range does not cover part of the requested range

  int startIdx = 0;
  int endIdx   = Vector_getNumElement(list)-1;
  int midIdx;
  CoordPair *range;

  // binary search the relevant pairs
  // helps if the list is big
  while ( ( endIdx - startIdx ) > 1 ) {
    midIdx = ( startIdx + endIdx ) >> 1;
    range  = Vector_getElementAt(list, midIdx);

    if ( CoordPair_getEnd(range) < rStart ) {
      startIdx = midIdx;
    } else {
      endIdx = midIdx;

  long gapStart;
  long gapEnd;
  // rIdx: index of the first range whose end reaches rEnd (insertion point).
  // rStartIdx / rEndIdx: first and last index of the ranges that touch or
  // overlap [rStart, rEnd]; -1 in rStartIdx means none found so far.
  int rIdx = -1;
  int rStartIdx = -1;
  int rEndIdx;

  gapStart = rStart;

  int i;
  for (i=startIdx; i < len ; i++ ) {
    CoordPair *pRange = Vector_getElementAt(list, i);
    long pStart = CoordPair_getStart(pRange);
    long pEnd   = CoordPair_getEnd(pRange);
    // no work needs to be done at all if we find a range pair that
    // entirely overlaps the requested region
    if ( pStart <= start && pEnd >= end ) {
      return Vector_new(); // perl returns undef, but that causes me problems

    // find adjacent or overlapping regions already registered
    if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) {
      if ( rStartIdx < 0 ) { // Not yet been set
        rStartIdx = i;
      rEndIdx = i;

    if ( pStart > rStart ) {
      // The gap ends just before this range, or at rEnd if that comes first.
      gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1;
      if (wantGaps) {
        CoordPair *cp = CoordPair_new(gapStart, gapEnd);
        Vector_addElement(gapPairs, cp);

    // The next gap can start no earlier than just after this range.
    gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1;

    if ( pEnd >= rEnd && rIdx < 0 ) {
      rIdx = i;

  // do we have to make another gap?
  if ( gapStart <= rEnd ) {
    if (wantGaps) {
      CoordPair *cp = CoordPair_new(gapStart, rEnd);
      Vector_addElement(gapPairs, cp);

  // Merge the new range into the registered list
  if (rStartIdx >= 0 ) { // rStartIdx has been set to something 
    long newStart;
    long newEnd;
    CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); 
    CoordPair *rEndIdxRange   = Vector_getElementAt(list, rEndIdx); 

    // The merged range spans from the smaller start to the larger end.
    if ( rStart < CoordPair_getStart(rStartIdxRange)) {
      newStart = rStart;
    } else {
      newStart = CoordPair_getStart(rStartIdxRange);

    if ( rEnd > CoordPair_getEnd(rEndIdxRange)) {
      newEnd = rEnd;
    } else {
      newEnd = CoordPair_getEnd(rEndIdxRange);

    CoordPair *cp = CoordPair_new(newStart, newEnd);

    // Think its <=
    for (i=rStartIdx; i<=rEndIdx; i++) {
      Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called
    Vector_insertElementAt(list, rStartIdx, cp);
    //splice( @$list, $rstart_idx,
    //        $rend_idx - $rstart_idx + 1,
    //        [ $new_start, $new_end ] );

  } else if (rIdx >= 0) {
    // Nothing touches the new range: insert it at rIdx.
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_insertElementAt(list, rIdx, cp);
    //splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
  } else {
    // No range reaches rEnd: append the new range at the end of the list.
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_addElement(list, cp);

  // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK
  return gapPairs;
Ejemplo n.º 22
// Computes how many positions of [start, end] are covered by the ranges
// registered for `id`.
//
// registry:   registry holding one list of CoordPair ranges per id.
// id:         key of the range list to inspect.
// start, end: inclusive range to measure.
// Returns the accumulated overlap; 0 when start > end, when the id has no
// list, or when its list is empty (the last case also prints a message).
// NOTE(review): the body of the `pStart > end` branch and the statement that
// follows the full-cover assignment are not visible in this excerpt - confirm
// how the loop exits in those cases.
long RangeRegistry_overlapSize(RangeRegistry *registry, IDType id, long start, long end)  {
  long overlap = 0;

  if ( start > end ) return 0;

  IDHash *regReg = RangeRegistry_getRegistry(registry);
  Vector *list;
  if (IDHash_contains(regReg, id)) {
    list = IDHash_getValue(regReg, id);
  } else {
    return 0; // No list for this id, so can't be any overlap 

  int len = Vector_getNumElement(list);

  if ( len == 0 ) {
    fprintf(stderr, "Odd have zero length list in RangeRegistry_overlapSize\n");
    return 0;

  int startIdx = 0;
  int endIdx   = Vector_getNumElement(list)-1;
  int midIdx;
  CoordPair *range;

  // binary search the relevant pairs
  // helps if the list is big
  while ( ( endIdx - startIdx ) > 1 ) {
    midIdx = ( startIdx + endIdx ) >> 1;
    range   = Vector_getElementAt(list, midIdx);
    if ( CoordPair_getEnd(range) < start ) {
      startIdx = midIdx;
    } else {
      endIdx = midIdx;

  int i;
  for (i=startIdx; i < len ; i++ ) {
    CoordPair *pRange = Vector_getElementAt(list, i);
    long pStart = CoordPair_getStart(pRange);
    long pEnd   = CoordPair_getEnd(pRange);

    if ( pStart > end ) {

    // One registered range covers the whole request.
    if ( pStart <= start && pEnd >= end ) {
      overlap = end - start + 1;

    // Intersection of the request with this range.
    long mStart = ( start < pStart ? pStart : start );
    long mEnd   = ( end   < pEnd   ? end    : pEnd );

    // A non-negative length means the intersection is not empty.
    if (mEnd - mStart >= 0) {
      overlap += ( mEnd - mStart + 1 );

  return overlap;
Ejemplo n.º 23
// Builds a single multi-row INSERT into genomic_align_block for every
// GenomicAlign in `genomicAligns` and prepares it through gaa->prepare.
//
// `ok` acts as a running success flag: it is cleared when the scratch buffer
// cannot be allocated, when a GenomicAlign has a DNAFrag without a dbID, or
// when no method_link id is found for an alignment type. Later stages are
// guarded by `if (ok)`.
//
// NOTE(review): this listing is fragmentary - closing braces, the argument
// lists of the fprintf/sprintf calls, and the bodies of the final
// `if (qStr)` / `if (tmpStr)` checks are not visible.
void GenomicAlignAdaptor_store(GenomicAlignAdaptor *gaa, Vector *genomicAligns) {
  int ok = 1;
  char *qStr = NULL;
  StatementHandle *sth;
  // Separator placed before each VALUES tuple; empty for the first tuple, "," afterwards
  char commaStr[2] = {'\0','\0'};
  int i;
  char *tmpStr = NULL;
  // Scratch buffer for formatting one VALUES tuple.
  // NOTE(review): 65556 looks like it may have been meant as 65536 - confirm.
  if ((tmpStr = (char *)calloc(65556,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating tmpStr\n");
    ok = 0;

  if (ok) {
    StrUtil_copyString(&qStr, "INSERT INTO genomic_align_block"
                       " (consensus_dnafrag_id, consensus_start, consensus_end,"
                       "  query_dnafrag_id, query_start, query_end, query_strand, method_link_id,"
                       "  score, perc_id, cigar_line) VALUES ",0);
    // First pass: validate every alignment before anything is formatted
    for (i=0; i<Vector_getNumElement(genomicAligns); i++) {
      GenomicAlign *ga = Vector_getElementAt(genomicAligns,i);
      DNAFrag *consDNAFrag  = GenomicAlign_getConsensusDNAFrag(ga);
      DNAFrag *queryDNAFrag = GenomicAlign_getQueryDNAFrag(ga);

      // check that everything has dbIDs
      if (!DNAFrag_getDbID(consDNAFrag) || !DNAFrag_getDbID(queryDNAFrag)) {
        fprintf(stderr, "Error: dna_fragment in GenomicAlign is not in DB\n");
        ok = 0;

  GenomicAlign *ga = NULL;
  DNAFrag *consDNAFrag = NULL;
  DNAFrag *queryDNAFrag = NULL;
  IDType methodLinkId = 0;

  if (ok) {
    // all clear for storing
    // Second pass: append one VALUES tuple per alignment to qStr
    for (i=0; i<Vector_getNumElement(genomicAligns); i++) {
      ga = Vector_getElementAt(genomicAligns,i);
      consDNAFrag  = GenomicAlign_getConsensusDNAFrag(ga);
      queryDNAFrag = GenomicAlign_getQueryDNAFrag(ga);

      methodLinkId = GenomicAlignAdaptor_methodLinkIdByAlignmentType(gaa, GenomicAlign_getAlignmentType(ga));

      if (!methodLinkId) {
        fprintf(stderr, "Error: There is no method_link with this type [%s] in the DB.\n",
        ok = 0;
    if (ok) {
      // NOTE(review): unbounded sprintf into the fixed-size tmpStr; the last
      // conversion is a '%s' (cigar_line column), whose length is not limited
      // anywhere in the visible code - worth confirming it cannot overflow.
      sprintf(tmpStr," %s(" IDFMTSTR ", %d, %d, " IDFMTSTR ", %d, %d, %d, " IDFMTSTR ", %f, %f, '%s')", 

      qStr = StrUtil_appendString(qStr, tmpStr);
      // Every tuple after the first is preceded by a comma
      commaStr[0] = ','; 
    sth = gaa->prepare((BaseAdaptor *)gaa, qStr, strlen(qStr));

  if (qStr)

  if (tmpStr)
Ejemplo n.º 24
// Entry point of the BAM read-coverage program.
//
// Visible flow: parse command-line options, create a DBAdaptor, fetch a
// Slice for `chrName`, open the input BAM and its index, then call
// calcCoverage() for each Slice held in `slices`.
// Returns 1 when the BAM file or its index cannot be opened, 0 at the end.
//
// NOTE(review): this listing is fragmentary - closing braces and the bodies
// of most option branches (everything except -P, -t and -v) are not visible,
// and nothing visible adds `slice` to `slices`. Confirm against the full file.
int main(int argc, char *argv[]) {
  DBAdaptor *      dba;
  StatementHandle *sth;
  ResultRow *      row;
  Vector *         slices;
  int              nSlices;
  htsFile *      out;

  int   argNum = 1;

  char *inFName  = NULL;
  char *outFName = NULL;

  // Default database connection settings
  char *dbUser = "******";
  char *dbPass = NULL;
  int   dbPort = 3306;

  char *dbHost = "ens-staging.internal.sanger.ac.uk";
  char *dbName = "homo_sapiens_core_71_37";

  char *assName = "GRCh37";

  char *chrName = "1";

  int flags = 0;
  int   threads  = 1;

  initEnsC(argc, argv);

  while (argNum < argc) {
    char *arg = argv[argNum];
    char *val;

// Ones without a val go here
    if (!strcmp(arg, "-U") || !strcmp(arg,"--ucsc_naming")) {
      flags |= M_UCSC_NAMING;
    } else {
// Ones with a val go in this block
      // An option that needs a value is the last argument on the command line
      if (argNum == argc-1) {

      val = argv[++argNum];
      if (!strcmp(arg, "-i") || !strcmp(arg,"--in_file")) {
      } else if (!strcmp(arg, "-o") || !strcmp(arg,"--out_file")) {
      } else if (!strcmp(arg, "-h") || !strcmp(arg,"--host")) {
      } else if (!strcmp(arg, "-p") || !strcmp(arg,"--password")) {
      } else if (!strcmp(arg, "-P") || !strcmp(arg,"--port")) {
        // NOTE(review): atoi gives no way to detect a non-numeric value
        dbPort = atoi(val);
      } else if (!strcmp(arg, "-n") || !strcmp(arg,"--name")) {
      } else if (!strcmp(arg, "-u") || !strcmp(arg,"--user")) {
      } else if (!strcmp(arg, "-t") || !strcmp(arg,"--threads")) {
        threads = atoi(val);
      } else if (!strcmp(arg, "-a") || !strcmp(arg,"--assembly")) {
      } else if (!strcmp(arg, "-v") || !strcmp(arg,"--verbosity")) {
        verbosity = atoi(val);
// Temporary
      } else if (!strcmp(arg, "-c") || !strcmp(arg,"--chromosome")) {
      } else {
        fprintf(stderr,"Error in command line at %s\n\n",arg);

  if (verbosity > 0) {
    printf("Program for calculating read coverage in a BAM file \n"
           "Steve M.J. Searle.  [email protected]  Last update April 2013.\n");

  // Both an input and an output file name are checked for here
  if (!inFName || !outFName) {

  dba = DBAdaptor_new(dbHost,dbUser,dbPass,dbName,dbPort,NULL);

  //nSlices = getSlices(dba, destName);
  nSlices = 1;

  slices = Vector_new();

  SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba);

  // Fetch the slice for chrName (coord system and version passed as NULL)
  Slice *slice = SliceAdaptor_fetchByRegion(sa,NULL,chrName,POS_UNDEF,POS_UNDEF,1,NULL, 0);


  if (Vector_getNumElement(slices) == 0) {
    fprintf(stderr, "Error: No slices.\n");

  htsFile *in = hts_open(inFName, "rb");
  if (in == 0) {
    fprintf(stderr, "Fail to open BAM file %s\n", inFName);
    return 1;

  hts_set_threads(in, threads);
  hts_idx_t *idx;
  idx = bam_index_load(inFName); // load BAM index
  if (idx == 0) {
    fprintf(stderr, "BAM index file is not available.\n");
    return 1;

  int i;
  for (i=0; i<Vector_getNumElement(slices); i++) {
    // This `slice` shadows the one declared before the loop
    Slice *slice = Vector_getElementAt(slices,i);

    if (verbosity > 0) printf("Working on '%s'\n",Slice_getName(slice));

//    if (verbosity > 0) printf("Stage 1 - retrieving annotation from database\n");
//    Vector *genes = getGenes(slice, flags);

    if (verbosity > 0) printf("Stage 1 - calculating coverage\n");
    // NOTE(review): the return value of calcCoverage is not used here
    calcCoverage(inFName, slice, in, idx, flags);


  if (verbosity > 0) printf("Done\n");
  return 0;
Ejemplo n.º 25
// Iterates over the BAM records that fall within `slice` and prints one
// "position coverage" line per slice position from a calloc'd Coverage array.
//
// Parameters:
//   fName - BAM file name (not referenced in the visible body)
//   slice - region to process; must have a chr start of 1
//   in    - open htsFile to read the header and records from
//   idx   - BAM index used to build the region iterator
//   flags - M_UCSC_NAMING selects a "chr" prefix on the region name
//
// Returns 1 on every visible return path (the slice-start check and the end
// of the function).
//
// NOTE(review): this listing is fragmentary - closing braces, the bodies of
// several `if`s/loops, the code that updates `coverage`, and the #endif lines
// for the two `#ifdef DONE` sections are not visible.
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  char region[1024];
  char region_name[512];

  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  // Build the sequence name to look up in the BAM header
  if (flags & M_UCSC_NAMING) {
    sprintf(region,"chr%s", Slice_getSeqRegionName(slice));
  } else {
    sprintf(region,"%s", Slice_getSeqRegionName(slice));
  // NOTE(review): the header from bam_hdr_init() is overwritten by the result
  // of bam_hdr_read() on the next line, so the first allocation looks leaked.
  bam_hdr_t *header = bam_hdr_init();
  header = bam_hdr_read(in->fp.bgzf);
  ref = bam_name2id(header, region);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region);
  // NOTE(review): region_name is not written anywhere in the visible text
  // before being read here - confirm against the full file.
  sprintf(region,"%s:%ld-%ld", region_name,
  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);

  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  bam1_t *b = bam_init1();

  // One zero-initialised Coverage entry per position of the slice.
  // NOTE(review): the calloc result is not checked in the visible code.
  Coverage *coverage = calloc(Slice_getLength(slice),sizeof(Coverage));

  long counter = 0;
  long overlapping = 0;
  long bad = 0;
  int startIndex = 0;
  while (bam_itr_next(in, iter, b) >= 0) {
    // Reads flagged unmapped, secondary, QC-fail or duplicate are singled out here
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {

    int end;
    //end = bam_calend(&b->core, bam1_cigar(b));
    end = bam_endpos(b);

    // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in).
    // That is the reason for the || end == begRange test
    if (end == begRange) {

    // Every millionth read; prints a progress dot when verbosity > 1
    if (!(counter%1000000)) {
      if (verbosity > 1) { printf("."); }

// Remember: b->core.pos is zero based!
    int cigInd;
    int refPos;
    int readPos;
    uint32_t *cigar = bam_get_cigar(b);
    // Walk the CIGAR: refPos tracks the reference position, readPos the position in the read
    for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) {
      int k;
      // Low 4 bits of a CIGAR word hold the operation, the remaining bits the length
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
        // Match/mismatch: advances both reference and read positions
        for (k = 0; k < lenCigBlock; ++k) {
          //if (ref[refPos+k] == 0) break; // out of boundary
        if (k < lenCigBlock) break;
        refPos += lenCigBlock; readPos += lenCigBlock;
      } else if (op == BAM_CDEL) {
        // Deletion: advances the reference position only
        for (k = 0; k < lenCigBlock; ++k) {
        //  if (ref[refPos+k] == 0) break;
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
      } else if (op == BAM_CSOFT_CLIP) {
        readPos += lenCigBlock;
      } else if (op == BAM_CHARD_CLIP) {
      } else if (op == BAM_CINS) {
         readPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
         refPos += lenCigBlock;

// The section below is compiled only when DONE is defined; it refers to
// `genes`, `gs` and `geneCountsHash`, none of which are declared in the visible text.
#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;
    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j); 
      if (!gene) {
// Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;
            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  hadOverlap = 1;

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                doneGene = 1;
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                          Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 

        if (verbosity > 1) { 
          printf("Removing gene %s (index %d) with extent %d to %d\n", 

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
        if (verbosity > 1) { 
          printf("startIndex now %d\n",startIndex);
  if (verbosity > 1) { printf("\n"); }

#ifdef DONE
// Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                        Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 


  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  // Emit one "position coverage" line per slice position (positions printed 1-based)
  long i;
  for (i=0; i< Slice_getLength(slice); i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);


  // NOTE(review): this is the same value returned by the error path at the top,
  // so callers cannot tell success from failure - confirm intent.
  return 1;
Ejemplo n.º 26
// Fetches all prediction transcripts on `slice` (optionally restricted by
// `logicName`) and, when `loadExons` is set and at least two transcripts were
// found, loads their exons up front instead of leaving them to lazy loading.
//
// Returns the Vector produced by BaseFeatureAdaptor_fetchAllBySlice. The same
// Vector is returned early when exon preloading is skipped or when the query
// buffer cannot be allocated.
//
// NOTE(review): this listing is fragmentary - closing braces and some
// statements (e.g. executing/finishing `sth`, anything releasing `qStr` or
// `uniqueIds`) are not visible.
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {

  //my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name);
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  // (sentinels chosen so that any real coordinate replaces them)
  long minStart =  2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);

  Slice *extSlice;

  // Reuse the caller's slice when it already spans every transcript;
  // otherwise fetch a slice covering minStart..maxEnd on the same seq region
  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice),
                                          minStart, maxEnd, Slice_getStrand(slice), CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0);

  // associate exon identifiers with transcripts
  // trHash: transcript dbID -> transcript (first occurrence of each dbID wins)
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);

  IDType *uniqueIds = IDHash_getKeys(trHash);

  char tmpStr[1024];
  char *qStr = NULL;
  // NOTE(review): fixed-size query buffer; nothing visible checks that the
  // generated id list fits within it.
  if ((qStr = (char *)calloc(655500,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    return transcripts;

  int lenNum;
  // endPoint is the write offset into qStr
  int endPoint = sprintf(qStr, "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE  prediction_transcript_id IN (");
  for (i=0; i<IDHash_getNumValues(trHash); i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    lenNum = sprintf(tmpStr,IDFMTSTR,uniqueIds[i]);
    // NOTE(review): no visible statement advances endPoint by lenNum after
    // this copy - presumably elided from the listing; confirm.
    memcpy(&(qStr[endPoint]), tmpStr, lenNum);
  qStr[endPoint++] = ')';
  qStr[endPoint] = '\0';


  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));

  // exTrHash: exon dbID -> Vector of (transcript, rank) pairs
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);

  // NULL free function: the hash is released without freeing the transcripts
  IDHash_free(trHash, NULL);


  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

  // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    // Exons fetched on the extended slice are transferred back onto the caller's slice
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
    } else {
      newEx = ex;

    // Attach the exon to every transcript recorded for it, passing the stored rank
    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);

  IDHash_free(exTrHash, Vector_free);

  return transcripts;
Ejemplo n.º 27
// Stores the external reference `exObj` and links it to the Ensembl object
// identified by (`ensType`, `ensObject`).
//
// Visible flow: look up the external_db row, look up or insert the xref
// (plus its synonyms), then insert the object_xref row - and an
// identity_xref row when exObj carries an IdentityXref - if the mapping is
// not already present.
//
// Returns dbX, the xref id that was found or inserted.
// `ignoreRelease` is accepted but not implemented; a message saying so is
// written to stderr on every call.
//
// NOTE(review): this listing is fragmentary - closing braces, the
// `sprintf(qStr, ...` openers and arguments for most queries, and the
// statement execute/finish calls are not visible. The SQL is built by string
// formatting into the fixed 512-byte qStr; whether the inserted values are
// escaped or length-checked cannot be seen from here.
IDType DBEntryAdaptor_store(DBEntryAdaptor *dbea, DBEntry *exObj, 
                            IDType ensObject, char *ensType, int ignoreRelease) {
  fprintf(stderr,"DBEntryAdaptor_store does not implement ignoreRelease functionality yet\n");

  char qStr[512];
  StatementHandle *sth;
  ResultRow *row;
  IDType dbRef;
  IDType dbX;

  // Check for the existance of the external_db, throw if it does not exist
     "SELECT external_db_id"
     "  FROM external_db"
     " WHERE db_name = '%s'"
     "   AND db_release = %s",

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  row = sth->fetchRow(sth);
  if( row == NULL ) {
    fprintf(stderr,"Error: external_db [%s] release [%s] does not exist\n", 
            DBEntry_getDbName(exObj), DBEntry_getRelease(exObj));

  // NOTE(review): row is dereferenced here; whatever leaves the function when
  // row is NULL is not visible in this listing - confirm.
  dbRef =  row->getLongLongAt(row,0);
  // Check for the existance of the external reference, add it if not present
       "SELECT xref_id"
       "  FROM xref"
       " WHERE external_db_id = " IDFMTSTR
       "   AND dbprimary_acc = '%s'"
       "   AND version = %s",

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

  row = sth->fetchRow(sth);
  if (row != NULL) {
    // The xref already exists: reuse its id
    dbX =  row->getLongLongAt(row,0);
  } else {
    // store the new xref

    // First finish the old sth

// NIY Handling NULL values
       "INSERT ignore INTO xref"
       " SET dbprimary_acc = '%s',"
       "    display_label = '%s',"
       "    version = %s,"
       "    description = '%s',"
       "    external_db_id = " IDFMTSTR,
    sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

    dbX = sth->getInsertId(sth);

    // store the synonyms for the new xref
    if (DBEntry_getAllSynonyms(exObj)) {
      StatementHandle *checkSth;
      StatementHandle *storeSth;
      int i;
      Vector *synonyms;

      // The doubled '%' in the two templates below presumably leaves
      // placeholders for the later execute() calls to fill in - TODO confirm
              "SELECT xref_id, synonym"
              " FROM external_synonym"
              " WHERE xref_id = %" IDFMTSTR
              " AND synonym = '%%s'");

      checkSth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

        "INSERT ignore INTO external_synonym"
        " SET xref_id = %" IDFMTSTR ", synonym = '%%s'");     

      storeSth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

      synonyms = DBEntry_getAllSynonyms(exObj);

      // Insert each synonym only when the check query finds no existing row
      for (i=0;i<Vector_getNumElement(synonyms); i++) {	    
        char *syn = Vector_getElementAt(synonyms,i);
        checkSth->execute(checkSth, dbX, syn);
        row = checkSth->fetchRow(checkSth);
        if (!row) {
          storeSth->execute(storeSth, dbX, syn);

  // check if the object mapping was already stored
           "SELECT xref_id"
           " FROM object_xref"
           " WHERE xref_id = " IDFMTSTR
           " AND   ensembl_object_type = '%s'"
           " AND   ensembl_id = " IDFMTSTR,
         dbX, ensType, ensObject);

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));


  row = sth->fetchRow(sth);
// NOTE row will be invalid after this call but will still
//      indicate whether something was found
  if (!row) {
    IDType Xidt;

    // Store the reference to the internal ensembl object
         "INSERT ignore INTO object_xref"
         " SET xref_id = " IDFMTSTR ","
         "     ensembl_object_type = '%s',"
         "     ensembl_id = " IDFMTSTR,
        dbX, ensType, ensObject);

    sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
    // Record the stored id and owning adaptor on the DBEntry itself
    DBEntry_setDbID(exObj, dbX);
    DBEntry_setAdaptor(exObj, (BaseAdaptor *)dbea);
    Xidt = sth->getInsertId(sth);

    // If this is an IdentityXref need to store in that table too
    if (DBEntry_getIdentityXref(exObj)) {
      IdentityXref *idx = DBEntry_getIdentityXref(exObj);
             "INSERT ignore INTO identity_xref"
             " SET object_xref_id = " IDFMTSTR ","
             "     query_identity = %f,"
             "     target_identity = %f",

      sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  return dbX;    
Ejemplo n.º 28
// Gene test driver: fetches genes on a fixed chromosome 20 region and then on
// every "chromosome" slice, dumping each set with dumpGenes() and recording
// numbered checks through ok().
//
// Always returns 0 in the visible code; the ok() results and `failed` are not
// folded into the exit status.
//
// NOTE(review): this listing is fragmentary - the closing braces of the loop
// and of the function are not visible.
int main(int argc, char *argv[]) {
  DBAdaptor *dba;
  GeneAdaptor *ga;
  Slice *slice = NULL;
  Vector *genes = NULL;
  int i = 0;
  int failed = 0;
  initEnsC(argc, argv);

//  ProcUtil_showBacktrace(EnsC_progName);

  dba = Test_initROEnsDB();
  slice = Test_getStandardSlice(dba);

//  DBAdaptor *seqdba = DBAdaptor_new("genebuild6.internal.sanger.ac.uk","ensadmin","ensembl","steve_chicken_rnaseq_missing_reference",3306,NULL);
//  dba = DBAdaptor_new("genebuild1.internal.sanger.ac.uk","ensadmin","ensembl","steve_chicken_rnaseq_missing_refined",3306,seqdba);

  ok(1, slice!=NULL);

  ga = DBAdaptor_getGeneAdaptor(dba);
  SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba);

  ok(2, ga!=NULL);

  // Replaces the standard test slice with a fixed region of chromosome 20
  slice = SliceAdaptor_fetchByRegion(sa,"chromosome","20",10000000,50000000,1,NULL,0);
//  slice = SliceAdaptor_fetchByRegion(sa,"chromosome","17",1000000,5000000,1,NULL,0);
//  slice = SliceAdaptor_fetchByRegion(sa,"chromosome","17",1,5000000,1,NULL,0);
// Has a seleno
//  slice = SliceAdaptor_fetchByRegion(sa,"chromosome","1",1000000,27000000,1,NULL,0);
//  slice = SliceAdaptor_fetchByRegion(sa,"chromosome","MT",1,17000,1,NULL,0);
  genes =  Slice_getAllGenes(slice, NULL, NULL, 1, NULL, NULL);

  // NOTE(review): genes is passed to Vector_getNumElement before the
  // genes!=NULL check on the following line.
  fprintf(stdout, "Have %d genes\n", Vector_getNumElement(genes));
  ok(3, genes!=NULL);
  ok(4, Vector_getNumElement(genes)!=0);

  // Dump the region's genes including supporting features
  failed = dumpGenes(genes, 1);
  ok(5, !failed);

  //Vector *toplevelSlices = SliceAdaptor_fetchAll(sa, "toplevel", NULL, 0);
  Vector *toplevelSlices = SliceAdaptor_fetchAll(sa, "chromosome", NULL, 0);

  // Dump genes slice by slice (without supporting features), stopping once a dump reports failure
  for (i=0;i<Vector_getNumElement(toplevelSlices) && !failed;i++) {
    Slice *tlSlice = Vector_getElementAt(toplevelSlices, i);
    fprintf(stderr, "Slice %s\n", Slice_getName(tlSlice));
    genes =  Slice_getAllGenes(tlSlice, NULL, NULL, 1, NULL, NULL);
    fprintf(stderr, "Got %d genes on %s\n", Vector_getNumElement(genes), Slice_getName(tlSlice));
    failed = dumpGenes(genes, 0);


  fprintf(stderr,"\nEcostring table stats:\n");

  ProcUtil_timeInfo("at end of GeneTest");

  return 0;
Ejemplo n.º 29
// Writes a textual dump of every gene in `genes` to stderr: the gene line,
// then each transcript, its exons, and (when present) its translation with
// the translated sequence and translation attributes.
//
// When `withSupport` is non-zero, supporting features are also printed for
// each transcript (including intron supporting evidence) and for each exon.
//
// Returns `failed`. In the visible code `failed` is initialised to 0 and never
// assigned again.
//
// NOTE(review): this listing is fragmentary - closing braces and the
// arguments of the final fprintf are not visible.
int dumpGenes(Vector *genes, int withSupport) {
  FILE *fp = stderr;
  int i;
  int failed = 0;
  for (i=0;i<Vector_getNumElement(genes) && !failed;i++) {
    Gene *g = Vector_getElementAt(genes,i);
    // Display xref id is printed when the gene has one, otherwise an empty string
    fprintf(fp,"Gene %s (%s) coords: %ld %ld %d\n",Gene_getStableId(g),(Gene_getDisplayXref(g) ? DBEntry_getDisplayId(Gene_getDisplayXref(g)) : ""),Gene_getStart(g),Gene_getEnd(g),Gene_getStrand(g));

    int j;
    for (j=0;j<Gene_getTranscriptCount(g);j++) {
      Transcript *t = Gene_getTranscriptAt(g,j);
      int k;
      fprintf(fp," Trans %s coords: %ld %ld %d biotype: %s\n",Transcript_getStableId(t), Transcript_getStart(t),Transcript_getEnd(t),Transcript_getStrand(t),Transcript_getBiotype(t));
      if (withSupport) {
        // Transcript-level supporting features
        Vector *support = Transcript_getAllSupportingFeatures(t);
        for (k=0; k<Vector_getNumElement(support); k++) {
          BaseAlignFeature *baf = Vector_getElementAt(support, k);
          fprintf(fp,"   support %s coords: %ld %ld %d\n", BaseAlignFeature_getHitSeqName(baf), BaseAlignFeature_getStart(baf), BaseAlignFeature_getEnd(baf), BaseAlignFeature_getStrand(baf));
        // Intron supporting evidence
        Vector *intronSupport = Transcript_getAllIntronSupportingEvidence(t);
        for (k=0; k<Vector_getNumElement(intronSupport); k++) {
          IntronSupportingEvidence *ise = Vector_getElementAt(intronSupport, k);
          fprintf(fp,"   intron support %s coords: %ld %ld %d\n", IntronSupportingEvidence_getHitName(ise), IntronSupportingEvidence_getStart(ise), IntronSupportingEvidence_getEnd(ise), IntronSupportingEvidence_getStrand(ise));

      for (k=0;k<Transcript_getExonCount(t);k++) {
        Exon *e = Transcript_getExonAt(t,k);
        // The exon's pointer value is printed alongside its stable id
        fprintf(fp,"  exon %s (%p) coords: %ld %ld %d\n",Exon_getStableId(e), e, Exon_getStart(e), Exon_getEnd(e), Exon_getStrand(e));
        if (withSupport) {
          // Exon-level supporting features
          Vector *support = Exon_getAllSupportingFeatures(e);
          int m;
          for (m=0; m<Vector_getNumElement(support); m++) {
            BaseAlignFeature *baf = Vector_getElementAt(support, m);
            fprintf(fp,"   support %s coords: %ld %ld %d\n", BaseAlignFeature_getHitSeqName(baf), BaseAlignFeature_getStart(baf), BaseAlignFeature_getEnd(baf), BaseAlignFeature_getStrand(baf));
      Translation *tln = Transcript_getTranslation(t);
      if (tln) {
        fprintf(fp," translation id: %s %s %d %s %d\n",Translation_getStableId(tln), 
                Exon_getStableId(Translation_getStartExon(tln)), Translation_getStart(tln),
                Exon_getStableId(Translation_getEndExon(tln)), Translation_getEnd(tln));
        // NOTE(review): no visible statement releases tSeq - confirm ownership.
        char *tSeq = Transcript_translate(t);
        fprintf(fp," translation: %s\n",tSeq);
        Vector *tlnAttribs = Translation_getAllAttributes(tln, NULL);
        if (Vector_getNumElement(tlnAttribs)) {
          fprintf(fp, " translation attributes:\n");
          int n;
          for (n=0; n<Vector_getNumElement(tlnAttribs); n++) {
            Attribute *attrib = Vector_getElementAt(tlnAttribs, n);
            fprintf(fp, "  code %s name %s desc %s value %s\n", 
  return failed;
Ejemplo n.º 30
// Merges two sets of GenomicAligns by sweeping over their sorted start/end
// positions and calling GenomicAlignAdaptor_addDerivedAlignments for each
// pair of alignments (one from each set) that is open at the same time.
//
// alignSet1 entries are keyed by their query DNAFrag and query coordinates;
// alignSet2 entries by their consensus DNAFrag and consensus coordinates.
//
// Returns a new Vector, mergedAligns, which is passed to every
// addDerivedAlignments call.
//
// NOTE(review): this listing is fragmentary - closing braces are not visible.
// Nothing visible releases bigList, its elements (see "NIY Free gale"), the
// arrays returned by IDHash_getValues, or the two overlappingSets hashes.
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  int i;
  Vector *bigList = Vector_new();
  IDHash *overlappingSets[2];
  Vector *mergedAligns;

  // Two list elements per alignment in set 0: one at its start, one at its end.
  // The +0.5 on the end presumably makes an end sort after a start at the
  // same coordinate - TODO confirm against GenomicAlignListElem_compFunc.
  for (i=0;i<Vector_getNumElement(alignSet1); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet1, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryStart(align), align, 0));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryEnd(align)+0.5, align, 0));

  // Same for set 1, using the consensus side of each alignment
  for (i=0;i<Vector_getNumElement(alignSet2); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet2, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusStart(align), align, 1));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusEnd(align)+0.5, align, 1));
  Vector_sort(bigList, GenomicAlignListElem_compFunc);

  // walking from start to end through sortlist and keep track of the 
  // currently overlapping set of Alignments
  overlappingSets[0] = IDHash_new(IDHASH_SMALL);
  overlappingSets[1] = IDHash_new(IDHASH_SMALL);

  mergedAligns = Vector_new();

  for (i=0; i<Vector_getNumElement(bigList); i++) {
    GenomicAlignListElem *gale  = Vector_getElementAt(bigList,i);

    GenomicAlign *align = gale->align;
    IDType alignID      = GenomicAlign_getDbID(align);
    int setNo           = gale->setNum;

    // An alignment already in its set's hash is being seen for the second
    // time, so it is removed from the open set
    if (IDHash_contains(overlappingSets[setNo], alignID)) {
      // remove from current overlapping set
      IDHash_remove(overlappingSets[setNo], alignID, NULL);
    } else {
      int j;
      // Snapshot of the alignments currently open in the other set
      void **values = IDHash_getValues(overlappingSets[1-setNo]);

      // insert into the set and do all the overlap business
      IDHash_add(overlappingSets[setNo], alignID, align);

      // the other set contains everything this align overlaps with
      for (j=0; j<IDHash_getNumValues(overlappingSets[1-setNo]); j++) {
        GenomicAlign *align2 = values[j];
        // The set-0 alignment is always passed before the set-1 alignment
        if (setNo == 0) {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align, align2);
        } else {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align2, align);

// NIY Free gale

  return mergedAligns;