Exemple #1
0
/*
 * Advances through cigList, starting at *cigListPos, until a match ('M')
 * element has been consumed or the end of the list is reached.
 *
 *   gaa         adaptor handle (not used by this routine)
 *   cigList     Vector of cigar elements such as "12M", "3D" or "I"
 *   cigListPos  in/out: index of the next element to read
 *   cs, ce      in/out: start/end on the consensus side
 *   qs, qe      in/out: start/end on the query side
 *
 * 'D' extends *qe, 'I' extends *ce, and 'M' starts a new block on both
 * sides directly after the current ends. An element without a count
 * counts as 1. A malformed element is reported on stderr and stops the scan.
 *
 * The elements in cigList are only read, never modified, so the same list
 * can be walked more than once.
 */
void GenomicAlignAdaptor_nextCig(GenomicAlignAdaptor *gaa,
    Vector *cigList, int *cigListPos, int *cs, int *ce, int *qs, int *qe)  {
  char type;

  do {
    char *cigElem = Vector_getElementAt(cigList, *cigListPos);
    int   lenElem;
    int   count;

    (*cigListPos)++;
    lenElem = strlen(cigElem);

    // An empty element has no type character - reading cigElem[-1] would be out of bounds
    if (lenElem == 0) {
      fprintf(stderr,"Error: Cigar string format error for %s\n",cigElem);
      break;
    }

    type = cigElem[lenElem-1];

    if (type!='M' && type!='I' && type!='D') {
      fprintf(stderr,"Error: Cigar string format error for %s\n",cigElem);
      break;
    }

    // strtol stops at the trailing type character, so the element does not
    // have to be truncated in place to read the count
    if (lenElem > 1) {
      count = (int)strtol(cigElem, NULL, 10);
    } else {
      count = 1;
    }

    switch (type) {
      case 'D':
        *qe += count;
        break;
      case 'I':
        *ce += count;
        break;
      case 'M':
        *cs = *ce + 1;
        *ce = *cs + count - 1;
        *qs = *qe + 1;
        *qe = *qs + count - 1;
        break;
      default:
        // Unreachable: type was validated above
        break;
    }
  } while (type != 'M' && *cigListPos!=Vector_getNumElement(cigList));
}
/*
 * Maps the genomic range start..end on the given strand into cDNA
 * coordinates of the prediction transcript.
 *
 *   trans   transcript whose cDNA coordinate mapper is used
 *   contig  contig the genomic coordinates refer to; if NULL, the slice of
 *           the first translateable exon is used instead
 *
 * Returns the MapperRangeSet produced by the mapper, or a new empty set when
 * no contig was given and the transcript has no translateable exons.
 */
MapperRangeSet *PredictionTranscript_genomic2cDNA(PredictionTranscript *trans, int start, int end, int strand, BaseContig *contig) {
    Mapper *mapper;

    // "ids" in mapper are contigs of exons, so use the same contig that should
    // be attached to all of the exons...
    if (!contig) {
        Vector *translateable = PredictionTranscript_getAllTranslateableExons(trans);
        PredictionExon *firstExon;
        if (!Vector_getNumElement(translateable)) {
            // Release the temporary vector on this path too (it is freed below on the normal path)
            Vector_free(translateable);
            return MapperRangeSet_new();
        }
        firstExon = Vector_getElementAt(translateable, 0);
        contig = (BaseContig*)PredictionExon_getSlice(firstExon);
        Vector_free(translateable);
    }

    mapper = PredictionTranscript_getcDNACoordMapper(trans);

    return Mapper_mapCoordinates(mapper,(IDType)contig, start, end, strand, "genomic");
}
Exemple #3
0
/*
 * Fetches the prediction transcript whose display_label equals stableId.
 *
 * Exits the program if stableId is NULL or if the resulting constraint does
 * not fit into the local buffer. Returns the first element of the fetch
 * result; the result vector itself is freed before returning.
 */
PredictionTranscript *PredictionTranscriptAdaptor_fetchByStableId(PredictionTranscriptAdaptor *pta, char *stableId) {
  if (stableId == NULL) {
    fprintf(stderr,"Error: Stable_id argument expected in PredictionTranscriptAdaptor_fetchByStableId\n");
    exit(1);
  }

  NameTableType *tables = pta->getTables();
  char **primTab = (*tables)[0];
  char *tableSynonym = primTab[SYN];

  // Bounded formatting: an over-long stableId must not overrun the stack buffer
  char constraint[1024];
  int written = snprintf(constraint, sizeof(constraint), "%s.display_label = '%s'", tableSynonym, stableId);
  if (written < 0 || (size_t)written >= sizeof(constraint)) {
    fprintf(stderr,"Error: Stable_id too long in PredictionTranscriptAdaptor_fetchByStableId\n");
    exit(1);
  }

  Vector *pts = PredictionTranscriptAdaptor_genericFetch(pta, constraint, NULL, NULL);
  PredictionTranscript *pt = Vector_getElementAt(pts, 0);
  Vector_free(pts);
// NIY: Free pts if there are more than 1

// NIY: Perl seemed to allow no pts so undef return, but I've not done that in other places eg. TranscriptAdaptor so didn't here either
  return pt;
}
Exemple #4
0
/*
 * Returns the number of positions in start..end (inclusive) that are covered
 * by the ranges registered under id.
 *
 * Returns 0 when start > end, when nothing is registered for id, or when the
 * registered list is empty (the latter is also reported on stderr).
 */
long RangeRegistry_overlapSize(RangeRegistry *registry, IDType id, long start, long end)  {
  if (start > end) {
    return 0;
  }

  IDHash *regReg = RangeRegistry_getRegistry(registry);

  // No list for this id, so can't be any overlap
  if (!IDHash_contains(regReg, id)) {
    return 0;
  }
  Vector *ranges = IDHash_getValue(regReg, id);

  int nRange = Vector_getNumElement(ranges);
  if (nRange == 0) {
    fprintf(stderr, "Odd have zero length list in RangeRegistry_overlapSize\n");
    return 0;
  }

  // Binary search to narrow down where to start scanning; pays off on big lists
  int lo = 0;
  int hi = nRange - 1;
  while (hi - lo > 1) {
    int mid = (lo + hi) >> 1;
    CoordPair *probe = Vector_getElementAt(ranges, mid);

    if (CoordPair_getEnd(probe) < start) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  long total = 0;
  int idx = lo;
  while (idx < nRange) {
    CoordPair *cur = Vector_getElementAt(ranges, idx);
    long curStart  = CoordPair_getStart(cur);
    long curEnd    = CoordPair_getEnd(cur);

    // Ranges from here on start after the query, so the scan is finished
    if (curStart > end) {
      break;
    }

    // Query lies completely inside this range
    if (curStart <= start && curEnd >= end) {
      total = end - start + 1;
      break;
    }

    // Clip the range to the query and count what is left, if anything
    long clipStart = (start < curStart) ? curStart : start;
    long clipEnd   = (end < curEnd) ? end : curEnd;

    if (clipEnd - clipStart >= 0) {
      total += clipEnd - clipStart + 1;
    }

    idx++;
  }

  return total;
}
Exemple #5
0
  rc = Vector_getElementAt(rcVector,0);

  Vector_free(rcVector);

  return rc;   
}

/*
 * Fetches the repeat consensus matching both the given repeat name and
 * repeat class (the constraint is limited to one row).
 *
 * Returns the first element of the fetch result; the result vector itself is
 * freed before returning. Returns NULL, after reporting on stderr, if name
 * and class do not fit into the constraint buffer.
 */
RepeatConsensus *RepeatConsensusAdaptor_fetchByNameAndClass(RepeatConsensusAdaptor *rca, char *name, char *class) {
  char constraintStr[256];
  Vector *rcVector;
  RepeatConsensus *rc;
  int written;

  // Bounded formatting: long names or classes must not overrun the stack buffer
  written = snprintf(constraintStr, sizeof(constraintStr),
                     "repeat_name = \'%s\' AND repeat_class = \'%s\' limit 1", name,class);
  if (written < 0 || (size_t)written >= sizeof(constraintStr)) {
    fprintf(stderr,"Error: name and class too long in RepeatConsensusAdaptor_fetchByNameAndClass\n");
    return NULL;
  }

  rcVector = RepeatConsensusAdaptor_genericFetch(rca, constraintStr); 

  rc = Vector_getElementAt(rcVector,0);

  Vector_free(rcVector);

  return rc;   
}


Vector *RepeatConsensusAdaptor_fetchByClassAndSeq(RepeatConsensusAdaptor *rca, char *class, char *seq) {
  Vector *result = NULL;
  char *constraintStr = NULL;

  if ((constraintStr = (char *)calloc(655500,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating constraintStr\n");
    return result;
  }
Exemple #6
0
int main(int argc, char *argv[]) {
  DBAdaptor *      dba;
  StatementHandle *sth;
  ResultRow *      row;
  Vector *         slices;
  int              nSlices;
  htsFile *      out;

  int   argNum = 1;

  char *inFName  = NULL;
  char *outFName = NULL;

  char *dbUser = "******";
  char *dbPass = NULL;
  int   dbPort = 3306;

  char *dbHost = "ens-staging.internal.sanger.ac.uk";
  char *dbName = "homo_sapiens_core_71_37";

  char *assName = "GRCh37";

  char *chrName = "1";


  int flags = 0;
  int   threads  = 1;

  initEnsC(argc, argv);

  while (argNum < argc) {
    char *arg = argv[argNum];
    char *val;

// Ones without a val go here
    if (!strcmp(arg, "-U") || !strcmp(arg,"--ucsc_naming")) {
      flags |= M_UCSC_NAMING;
    } else {
// Ones with a val go in this block
      if (argNum == argc-1) {
        Bamcov_usage();
      }

      val = argv[++argNum];
  
      if (!strcmp(arg, "-i") || !strcmp(arg,"--in_file")) {
        StrUtil_copyString(&inFName,val,0);
      } else if (!strcmp(arg, "-o") || !strcmp(arg,"--out_file")) {
        StrUtil_copyString(&outFName,val,0);
      } else if (!strcmp(arg, "-h") || !strcmp(arg,"--host")) {
        StrUtil_copyString(&dbHost,val,0);
      } else if (!strcmp(arg, "-p") || !strcmp(arg,"--password")) {
        StrUtil_copyString(&dbPass,val,0);
      } else if (!strcmp(arg, "-P") || !strcmp(arg,"--port")) {
        dbPort = atoi(val);
      } else if (!strcmp(arg, "-n") || !strcmp(arg,"--name")) {
        StrUtil_copyString(&dbName,val,0);
      } else if (!strcmp(arg, "-u") || !strcmp(arg,"--user")) {
        StrUtil_copyString(&dbUser,val,0);
      } else if (!strcmp(arg, "-t") || !strcmp(arg,"--threads")) {
        threads = atoi(val);
      } else if (!strcmp(arg, "-a") || !strcmp(arg,"--assembly")) {
        StrUtil_copyString(&assName,val,0);
      } else if (!strcmp(arg, "-v") || !strcmp(arg,"--verbosity")) {
        verbosity = atoi(val);
// Temporary
      } else if (!strcmp(arg, "-c") || !strcmp(arg,"--chromosome")) {
        StrUtil_copyString(&chrName,val,0);
      } else {
        fprintf(stderr,"Error in command line at %s\n\n",arg);
        Bamcov_usage();
      }
    }
    argNum++;
  }

  if (verbosity > 0) {
    printf("Program for calculating read coverage in a BAM file \n"
           "Steve M.J. Searle.  [email protected]  Last update April 2013.\n");
  }

  if (!inFName || !outFName) {
    Bamcov_usage();
  }

  dba = DBAdaptor_new(dbHost,dbUser,dbPass,dbName,dbPort,NULL);

  //nSlices = getSlices(dba, destName);
  nSlices = 1;

  slices = Vector_new();

  SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba);

  Slice *slice = SliceAdaptor_fetchByRegion(sa,NULL,chrName,POS_UNDEF,POS_UNDEF,1,NULL, 0);

  Vector_addElement(slices,slice);

  if (Vector_getNumElement(slices) == 0) {
    fprintf(stderr, "Error: No slices.\n");
    exit(1);
  }

  htsFile *in = hts_open(inFName, "rb");
  if (in == 0) {
    fprintf(stderr, "Fail to open BAM file %s\n", inFName);
    return 1;
  }

  hts_set_threads(in, threads);
  hts_idx_t *idx;
  idx = bam_index_load(inFName); // load BAM index
  if (idx == 0) {
    fprintf(stderr, "BAM index file is not available.\n");
    return 1;
  }

  int i;
  for (i=0; i<Vector_getNumElement(slices); i++) {
    Slice *slice = Vector_getElementAt(slices,i);

    if (verbosity > 0) printf("Working on '%s'\n",Slice_getName(slice));

//    if (verbosity > 0) printf("Stage 1 - retrieving annotation from database\n");
//    Vector *genes = getGenes(slice, flags);

    if (verbosity > 0) printf("Stage 1 - calculating coverage\n");
    calcCoverage(inFName, slice, in, idx, flags);
  }


  hts_idx_destroy(idx);
  hts_close(in);

  if (verbosity > 0) printf("Done\n");
  return 0;
}
Exemple #7
0
/*
 * Computes per-base read coverage for slice from the opened BAM file and
 * prints it to stdout, one "position coverage" line per base, after a
 * summary line with the read counts.
 *
 *   fName  BAM file name (not used by this routine)
 *   slice  region to scan; its chromosome start must be 1
 *   in     opened BAM file; its header is read from in->fp.bgzf
 *   idx    index for the BAM file
 *   flags  M_UCSC_NAMING prefixes the sequence region name with "chr"
 *
 * Reads flagged unmapped, secondary, QC-fail or duplicate are counted as bad
 * and skipped. Match and deletion cigar blocks add to the coverage; positions
 * outside the slice are not counted.
 *
 * Returns 1 on every return path (both the error paths and normal
 * completion). Exits the program if the header cannot be read or the region
 * is unknown or cannot be parsed.
 */
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  char region[1024];
  char region_name[512];


  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  }

  // Build the reference name in region_name; it is needed both for the header
  // lookup and as the prefix of the region string parsed further down
  if (flags & M_UCSC_NAMING) {
    snprintf(region_name, sizeof(region_name), "chr%s", Slice_getSeqRegionName(slice));
  } else {
    snprintf(region_name, sizeof(region_name), "%s", Slice_getSeqRegionName(slice));
  }

  // bam_hdr_read allocates the header itself, so no bam_hdr_init() beforehand
  bam_hdr_t *header = bam_hdr_read(in->fp.bgzf);
  if (header == NULL) {
    fprintf(stderr, "Failed reading BAM header\n");
    exit(1);
  }
  ref = bam_name2id(header, region_name);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region_name);
    exit(1);
  }
  snprintf(region, sizeof(region), "%s:%ld-%ld", region_name,
                             Slice_getSeqRegionStart(slice),
                             Slice_getSeqRegionEnd(slice));
  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);
    exit(2);
  }
  bam_hdr_destroy(header);


  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  bam1_t *b = bam_init1();

  long sliceLen = Slice_getLength(slice);
  Coverage *coverage = calloc(sliceLen,sizeof(Coverage));
  if (coverage == NULL) {
    fprintf(stderr, "Failed allocating coverage array\n");
    sam_itr_destroy(iter);
    bam_destroy1(b);
    return 1;
  }

  long counter = 0;
  long overlapping = 0;
  long bad = 0;
  int startIndex = 0;
  while (bam_itr_next(in, iter, b) >= 0) {
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {
      bad++;
      continue;
    }

    int end;
    //end = bam_calend(&b->core, bam1_cigar(b));
    end = bam_endpos(b);

    // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in).
    // That is the reason for the || end == begRange test
    if (end == begRange) {
      continue;
    }
    counter++;

    if (!(counter%1000000)) {
      if (verbosity > 1) { printf("."); }
      fflush(stdout);
    }

// Remember: b->core.pos is zero based!
    int cigInd;
    int refPos;
    int readPos;
    uint32_t *cigar = bam_get_cigar(b);
    for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) {
      int k;
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
        for (k = 0; k < lenCigBlock; ++k) {
          // Stop at the slice boundary so the coverage array is never overrun
          if (refPos+k < 0 || refPos+k >= sliceLen) break;
          coverage[refPos+k].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock; readPos += lenCigBlock;
      } else if (op == BAM_CDEL) {
        for (k = 0; k < lenCigBlock; ++k) {
          // Stop at the slice boundary so the coverage array is never overrun
          if (refPos+k < 0 || refPos+k >= sliceLen) break;
          coverage[refPos+k].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
      } else if (op == BAM_CSOFT_CLIP) {
        readPos += lenCigBlock;
      } else if (op == BAM_CHARD_CLIP) {
      } else if (op == BAM_CINS) {
         readPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
         refPos += lenCigBlock;
      }
    }

#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;
    
    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j); 
      if (!gene) {
        continue;
      }
// Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;
     
            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  overlapping++;
                  hadOverlap = 1;
                }

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                gs->score++;
                
                doneGene = 1;
              }
            }
          }
        }
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                          Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 
                                          gs->score);

        if (verbosity > 1) { 
          printf("Removing gene %s (index %d) with extent %d to %d\n", 
                 Gene_getStableId(gene), 
                 gs->index,
                 Gene_getStart(gene),
                 Gene_getEnd(gene));
        }
        Vector_setElementAt(genes,j,NULL);

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
            break;
          }
          startIndex++;
        }
        if (verbosity > 1) { 
          printf("startIndex now %d\n",startIndex);
        }
      }
    }
#endif
  }
  if (verbosity > 1) { printf("\n"); }

#ifdef DONE
// Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), 
                                        Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", 
                                        gs->score);
    }

  }
#endif

  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  long i;
  for (i=0; i< sliceLen; i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);
  }

  sam_itr_destroy(iter);
  bam_destroy1(b);
  free(coverage);


  return 1;
}
Exemple #8
0
/*
 * Fetches all prediction transcripts on slice, optionally restricted by
 * logicName, and optionally preloads their exons.
 *
 * When loadExons is zero, or fewer than two transcripts were found, the
 * transcripts are returned as fetched. Otherwise one query collects the
 * transcript/exon/rank associations, all prediction exons on the region
 * spanned by the transcripts are fetched in one go, and each exon is added
 * to its transcripts (after transfer onto slice where necessary).
 *
 * Returns the Vector of transcripts. If the query string cannot be
 * allocated, the transcripts are returned without preloaded exons. Exits if
 * an exon cannot be transferred onto slice.
 */
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {

  // Perl: my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name);
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  long minStart =  2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    }
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);
    }
  }

  Slice *extSlice;

  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice),
                                          minStart, maxEnd, Slice_getStrand(slice), CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0);
  }

  // associate exon identifiers with transcripts
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(trHash);
  int     nUniqueId = IDHash_getNumValues(trHash);

  static const char queryHead[] =
    "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE  prediction_transcript_id IN (";

  // Size the query for the actual ids rather than relying on a fixed buffer:
  // head (including its terminator), closing bracket, and for every id its
  // printed length plus the ", " separator
  size_t qCap = sizeof(queryHead) + 2;
  for (i=0; i<nUniqueId; i++) {
    int idLen = snprintf(NULL, 0, IDFMTSTR, uniqueIds[i]);
    qCap += (idLen > 0 ? (size_t)idLen : 0) + 2;
  }

  char *qStr = calloc(qCap, sizeof(char));
  if (qStr == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    // Release what was built so far before falling back to lazy loading
    free(uniqueIds);
    IDHash_free(trHash, NULL);
    return transcripts;
  }

  size_t endPoint = sizeof(queryHead) - 1;
  memcpy(qStr, queryHead, endPoint);
  for (i=0; i<nUniqueId; i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    int lenNum = snprintf(&(qStr[endPoint]), qCap - endPoint, IDFMTSTR, uniqueIds[i]);
    if (lenNum > 0) {
      endPoint += lenNum;
    }
  }
  qStr[endPoint++] = ')';
  qStr[endPoint] = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  // exon id -> Vector of (transcript, rank) pairs that use the exon
  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  IDHash_free(trHash, NULL);

  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

  // Perl didn't have this line - it was in GeneAdaptor version so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
Exemple #9
0
// Note I didn't implement the stable id fetching ugliness here. I'll probably make a separate method for that
// if necessary
/*
 * Fetches the objects with the given dbIDs, bypassing any cache.
 *
 *   ba      adaptor whose first table supplies the "<synonym>.<name>_id" column
 *   idList  Vector of pointers to IDType values; duplicates are ignored
 *   slice   passed through to BaseAdaptor_genericFetch
 *
 * The unique ids are fetched in chunks of at most 16384, one query per chunk.
 *
 * Returns a new Vector with the results of all chunks, a new empty Vector
 * when idList is empty, or NULL when idList is NULL. If the constraint
 * buffer cannot be allocated an empty result is returned.
 */
Vector *BaseAdaptor_uncachedFetchAllByDbIDList(BaseAdaptor *ba, Vector *idList, Slice *slice) {
  if ( idList == NULL) {
    fprintf(stderr, "id_list list reference argument is required - bye!");
    return NULL;
  }

  if (!Vector_getNumElement(idList)) {
    return Vector_new();
  }

  NameTableType *tables = ba->getTables();
  char **t = (*tables)[0];

  char constraintPref[1024];
  snprintf(constraintPref, sizeof(constraintPref), "%s.%s_id ", t[SYN], t[NAME] ); 

  // Ensure that we do not exceed MySQL's max_allowed_packet (defaults to
  // 1 MB) splitting large queries into smaller queries of at most 256 KB.
  // Assuming a (generous) average dbID string
  // length of 16, this means 16384 dbIDs in each query.
  int maxSize = 16384;

  // Uniquify the list
  IDHash *idListHash = IDHash_new(IDHASH_MEDIUM);

  int i;
  for (i=0; i<Vector_getNumElement(idList); i++) {
    IDType id = *(IDType *)(Vector_getElementAt(idList, i));
    if (!IDHash_contains(idListHash, id)) {
      IDHash_add(idListHash, id, &trueVal);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(idListHash);
  int nUniqueId = IDHash_getNumValues(idListHash);

  IDHash_free(idListHash, NULL);

  Vector *out = Vector_new();

  // One buffer is reused for every chunk; each chunk writes a complete,
  // terminated string into it
  char *constraint = calloc(655500,sizeof(char));
  if (constraint == NULL) {
    fprintf(stderr,"Failed allocating constraint\n");
    free(uniqueIds);
    return out;
  }

  for (i=0; i<nUniqueId; i+=maxSize) {
    // The constraint is formatted from constraintPref directly: formatting a
    // buffer into itself (sprintf(buf, "%s ...", buf)) is undefined behaviour

    // Special case for one remaining Id
    if (i == nUniqueId-1) {
      sprintf(constraint, "%s = " IDFMTSTR, constraintPref, uniqueIds[i]);
    } else {
      char tmpStr[1024];
      int endPoint = sprintf(constraint, "%s IN (", constraintPref);
      int j;
      for (j=0; j<maxSize && j+i<nUniqueId; j++) {
        if (j!=0) {
          constraint[endPoint++] = ',';
          constraint[endPoint++] = ' ';
        }
        int lenNum = snprintf(tmpStr, sizeof(tmpStr), IDFMTSTR, uniqueIds[i+j]);
        memcpy(&(constraint[endPoint]), tmpStr, lenNum);
        endPoint+=lenNum;
      }
      constraint[endPoint++] = ')';
      constraint[endPoint] = '\0';
    }

    Vector *resChunk = BaseAdaptor_genericFetch(ba, constraint, NULL, slice);

    Vector_append(out, resChunk);

    Vector_free(resChunk);
  }
  free(constraint);
  free(uniqueIds);

  return out;
}
Exemple #10
0
// Prints one "support" line to out for every align feature in support
static void dumpGenes_printSupport(FILE *out, Vector *support) {
  int nSupport = Vector_getNumElement(support);
  int s;

  for (s = 0; s < nSupport; s++) {
    BaseAlignFeature *feature = Vector_getElementAt(support, s);
    fprintf(out,"   support %s coords: %ld %ld %d\n",
            BaseAlignFeature_getHitSeqName(feature),
            BaseAlignFeature_getStart(feature),
            BaseAlignFeature_getEnd(feature),
            BaseAlignFeature_getStrand(feature));
  }
}

/*
 * Writes a textual dump of genes to stderr: every gene with its transcripts,
 * their exons and, where present, the translation with its sequence and
 * attributes. With withSupport set, transcript and exon supporting features
 * and intron supporting evidence are listed as well.
 *
 * Returns the failure flag, which nothing in this routine ever sets, so the
 * result is always 0.
 */
int dumpGenes(Vector *genes, int withSupport) {
  FILE *out = stderr;
  int status = 0;
  int geneIdx;

  for (geneIdx = 0; geneIdx < Vector_getNumElement(genes) && !status; geneIdx++) {
    Gene *gene = Vector_getElementAt(genes, geneIdx);
    int transIdx;

    fprintf(out,"Gene %s (%s) coords: %ld %ld %d\n",
            Gene_getStableId(gene),
            (Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : ""),
            Gene_getStart(gene),
            Gene_getEnd(gene),
            Gene_getStrand(gene));

    for (transIdx = 0; transIdx < Gene_getTranscriptCount(gene); transIdx++) {
      Transcript *trans = Gene_getTranscriptAt(gene, transIdx);
      int exonIdx;

      fprintf(out," Trans %s coords: %ld %ld %d biotype: %s\n",
              Transcript_getStableId(trans),
              Transcript_getStart(trans),
              Transcript_getEnd(trans),
              Transcript_getStrand(trans),
              Transcript_getBiotype(trans));

      if (withSupport) {
        dumpGenes_printSupport(out, Transcript_getAllSupportingFeatures(trans));

        Vector *intronEvidence = Transcript_getAllIntronSupportingEvidence(trans);
        int nEvidence = Vector_getNumElement(intronEvidence);
        int ev;
        for (ev = 0; ev < nEvidence; ev++) {
          IntronSupportingEvidence *evidence = Vector_getElementAt(intronEvidence, ev);
          fprintf(out,"   intron support %s coords: %ld %ld %d\n",
                  IntronSupportingEvidence_getHitName(evidence),
                  IntronSupportingEvidence_getStart(evidence),
                  IntronSupportingEvidence_getEnd(evidence),
                  IntronSupportingEvidence_getStrand(evidence));
        }
      }

      for (exonIdx = 0; exonIdx < Transcript_getExonCount(trans); exonIdx++) {
        Exon *exon = Transcript_getExonAt(trans, exonIdx);

        fprintf(out,"  exon %s (%p) coords: %ld %ld %d\n",
                Exon_getStableId(exon), exon,
                Exon_getStart(exon), Exon_getEnd(exon), Exon_getStrand(exon));

        if (withSupport) {
          dumpGenes_printSupport(out, Exon_getAllSupportingFeatures(exon));
        }
      }

      // Everything below only applies to transcripts with a translation
      Translation *translation = Transcript_getTranslation(trans);
      if (!translation) {
        continue;
      }

      fprintf(out," translation id: %s %s %d %s %d\n",
              Translation_getStableId(translation),
              Exon_getStableId(Translation_getStartExon(translation)),
              Translation_getStart(translation),
              Exon_getStableId(Translation_getEndExon(translation)),
              Translation_getEnd(translation));

      char *peptide = Transcript_translate(trans);
      fprintf(out," translation: %s\n",peptide);
      free(peptide);

      Vector *attribs = Translation_getAllAttributes(translation, NULL);
      int nAttrib = Vector_getNumElement(attribs);
      if (nAttrib == 0) {
        continue;
      }

      fprintf(out, " translation attributes:\n");
      int a;
      for (a = 0; a < nAttrib; a++) {
        Attribute *attrib = Vector_getElementAt(attribs, a);
        fprintf(out, "  code %s name %s desc %s value %s\n",
                Attribute_getCode(attrib),
                Attribute_getName(attrib),
                Attribute_getDescription(attrib),
                Attribute_getValue(attrib));
      }
    }
  }

  return status;
}
Exemple #11
0
/*
 * Gene test driver: fetches the genes on a fixed region of chromosome 20,
 * dumps them with supporting evidence, then dumps the genes of every
 * "chromosome" slice without supporting evidence, and finally prints the
 * EcoString table statistics and timing information.
 *
 * Always returns 0; the individual checks are reported through ok().
 */
int main(int argc, char *argv[]) {
  int failed = 0;

  initEnsC(argc, argv);

  // (optional) ProcUtil_showBacktrace(EnsC_progName);

  DBAdaptor *dba = Test_initROEnsDB();
  Slice *slice   = Test_getStandardSlice(dba);

  ok(1, slice!=NULL);

  GeneAdaptor *ga  = DBAdaptor_getGeneAdaptor(dba);
  SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(dba);

  ok(2, ga!=NULL);

  // Alternative regions used during development: chromosome 17 (1000000-5000000
  // and 1-5000000), chromosome 1 1000000-27000000 (has a seleno) and MT 1-17000
  slice = SliceAdaptor_fetchByRegion(sa,"chromosome","20",10000000,50000000,1,NULL,0);

  Vector *genes = Slice_getAllGenes(slice, NULL, NULL, 1, NULL, NULL);

  fprintf(stdout, "Have %d genes\n", Vector_getNumElement(genes));
  ok(3, genes!=NULL);
  ok(4, Vector_getNumElement(genes)!=0);

  failed = dumpGenes(genes, 1);
  ok(5, !failed);

  // "toplevel" could be used here instead of "chromosome"
  Vector *toplevelSlices = SliceAdaptor_fetchAll(sa, "chromosome", NULL, 0);

  // Dump the genes of each slice in turn, stopping at the first failure
  int sliceIdx = 0;
  while (sliceIdx < Vector_getNumElement(toplevelSlices) && !failed) {
    Slice *tlSlice = Vector_getElementAt(toplevelSlices, sliceIdx);

    fprintf(stderr, "Slice %s\n", Slice_getName(tlSlice));
    genes = Slice_getAllGenes(tlSlice, NULL, NULL, 1, NULL, NULL);
    fprintf(stderr, "Got %d genes on %s\n", Vector_getNumElement(genes), Slice_getName(tlSlice));
    failed = dumpGenes(genes, 0);

    sliceIdx++;
  }

  fprintf(stderr,"\nEcostring table stats:\n");
  EcoString_getInfo(ecoSTable);

  fprintf(stderr,"\n");
  ProcUtil_timeInfo("at end of GeneTest");

  return 0;
}
Exemple #12
0
// Exits if a snprintf() result shows that a statement did not fit into its buffer
static void DBEntryAdaptor_checkQueryFits(int written, size_t bufSize) {
  if (written < 0 || (size_t)written >= bufSize) {
    fprintf(stderr,"Error: SQL statement too long for query buffer in DBEntryAdaptor_store\n");
    exit(1);
  }
}

/*
 * Stores the external reference exObj and links it to an ensembl object.
 *
 *   dbea           adaptor used to prepare the statements
 *   exObj          entry to store; its dbID and adaptor are set when a new
 *                  object_xref row is inserted
 *   ensObject      id of the ensembl object the entry belongs to
 *   ensType        ensembl object type stored with the mapping
 *   ignoreRelease  not implemented yet (a notice is printed on every call)
 *
 * Steps: look up the external_db row (the program exits if it is missing),
 * look up or insert the xref together with its synonyms, then insert the
 * object_xref row, and the identity_xref row if exObj carries one, unless
 * the mapping already exists.
 *
 * Returns the xref id. Exits if a statement does not fit into the query
 * buffer.
 */
IDType DBEntryAdaptor_store(DBEntryAdaptor *dbea, DBEntry *exObj, 
                            IDType ensObject, char *ensType, int ignoreRelease) {
  fprintf(stderr,"DBEntryAdaptor_store does not implement ignoreRelease functionality yet\n");

  char qStr[512];
  int qLen;
  StatementHandle *sth;
  ResultRow *row;
  IDType dbRef;
  IDType dbX;

  //
  // Check for the existence of the external_db, throw if it does not exist
  //
  qLen = snprintf(qStr, sizeof(qStr),
     "SELECT external_db_id"
     "  FROM external_db"
     " WHERE db_name = '%s'"
     "   AND db_release = %s",
     DBEntry_getDbName(exObj),
     DBEntry_getRelease(exObj));
  DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);
    
  row = sth->fetchRow(sth);
  if( row == NULL ) {
    sth->finish(sth);
    fprintf(stderr,"Error: external_db [%s] release [%s] does not exist\n", 
            DBEntry_getDbName(exObj), DBEntry_getRelease(exObj));
    exit(1);
  }

  dbRef =  row->getLongLongAt(row,0);
  sth->finish(sth);
    
  //
  // Check for the existence of the external reference, add it if not present
  //
  qLen = snprintf(qStr, sizeof(qStr),
       "SELECT xref_id"
       "  FROM xref"
       " WHERE external_db_id = " IDFMTSTR
       "   AND dbprimary_acc = '%s'"
       "   AND version = %s",
      dbRef,
      DBEntry_getPrimaryId(exObj),
      DBEntry_getVersion(exObj));
  DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
  sth->execute(sth);

  row = sth->fetchRow(sth);
    
  if (row != NULL) {
    dbX =  row->getLongLongAt(row,0);
    sth->finish(sth);
  } else {
    //
    // store the new xref
    //

    // First finish the old sth
    sth->finish(sth);

// NIY Handling NULL values
    qLen = snprintf(qStr, sizeof(qStr),
       "INSERT ignore INTO xref"
       " SET dbprimary_acc = '%s',"
       "    display_label = '%s',"
       "    version = %s,"
       "    description = '%s',"
       "    external_db_id = " IDFMTSTR,
       DBEntry_getPrimaryId(exObj),
       DBEntry_getDisplayId(exObj),
       DBEntry_getVersion(exObj),
       DBEntry_getDescription(exObj),
       dbRef
      );
    DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));
    sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

    sth->execute(sth);
    dbX = sth->getInsertId(sth);

    sth->finish(sth);
	
    //
    // store the synonyms for the new xref
    // 
    Vector *synonyms = DBEntry_getAllSynonyms(exObj);
    if (synonyms) {
      StatementHandle *checkSth;
      StatementHandle *storeSth;
      int i;

      // The doubled '%' leaves the conversion specifiers in the prepared text;
      // the values are supplied to execute() below
      qLen = snprintf(qStr, sizeof(qStr),
              "SELECT xref_id, synonym"
              " FROM external_synonym"
              " WHERE xref_id = %" IDFMTSTR
              " AND synonym = '%%s'");
      DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

      checkSth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

      qLen = snprintf(qStr, sizeof(qStr),
        "INSERT ignore INTO external_synonym"
        " SET xref_id = %" IDFMTSTR ", synonym = '%%s'");     
      DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

      storeSth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

      for (i=0;i<Vector_getNumElement(synonyms); i++) {	    
        char *syn = Vector_getElementAt(synonyms,i);
        checkSth->execute(checkSth, dbX, syn);
        row = checkSth->fetchRow(checkSth);
        if (!row) {
          storeSth->execute(storeSth, dbX, syn);
        }
      }
  	
      checkSth->finish(checkSth);
      storeSth->finish(storeSth);
    }
  }

  //
  // check if the object mapping was already stored
  //
  qLen = snprintf(qStr, sizeof(qStr),
           "SELECT xref_id"
           " FROM object_xref"
           " WHERE xref_id = " IDFMTSTR
           " AND   ensembl_object_type = '%s'"
           " AND   ensembl_id = " IDFMTSTR,
         dbX, ensType, ensObject);
  DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

  sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));

  sth->execute(sth);

  // Record whether a row was found before finishing the statement, so the
  // row pointer is not looked at afterwards
  row = sth->fetchRow(sth);
  int alreadyStored = (row != NULL);
  sth->finish(sth);
    
  if (!alreadyStored) {
    IDType Xidt;

    //
    // Store the reference to the internal ensembl object
    //
    qLen = snprintf(qStr, sizeof(qStr),
         "INSERT ignore INTO object_xref"
         " SET xref_id = " IDFMTSTR ","
         "     ensembl_object_type = '%s',"
         "     ensembl_id = " IDFMTSTR,
        dbX, ensType, ensObject);
    DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

    sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
	
    sth->execute(sth);
    DBEntry_setDbID(exObj, dbX);
    DBEntry_setAdaptor(exObj, (BaseAdaptor *)dbea);
      
    Xidt = sth->getInsertId(sth);

    // Finish the insert statement; it was previously left open (or
    // overwritten by the identity_xref statement below)
    sth->finish(sth);

    //
    // If this is an IdentityXref need to store in that table too
    //
    if (DBEntry_getIdentityXref(exObj)) {
      IdentityXref *idx = DBEntry_getIdentityXref(exObj);
      qLen = snprintf(qStr, sizeof(qStr),
             "INSERT ignore INTO identity_xref"
             " SET object_xref_id = " IDFMTSTR ","
             "     query_identity = %f,"
             "     target_identity = %f",
             Xidt, 
             IdentityXref_getQueryIdentity(idx),
             IdentityXref_getTargetIdentity(idx));
      DBEntryAdaptor_checkQueryFits(qLen, sizeof(qStr));

      sth = dbea->prepare((BaseAdaptor *)dbea,qStr,strlen(qStr));
      sth->execute(sth);
      sth->finish(sth);
    }
  } 
  return dbX;    
}
Exemple #13
0
// Stores every GenomicAlign in genomicAligns into the genomic_align_block
// table using one multi-row INSERT statement.
//
// gaa           - adaptor used to prepare the statement and to look up
//                 method_link ids by alignment type
// genomicAligns - Vector of GenomicAlign pointers to store
//
// The store is all-or-nothing: if any alignment has a DNAFrag without a dbID,
// an alignment type without a method_link id, or a row that does not fit in
// the formatting buffer, an error is printed to stderr and nothing is
// executed. An empty vector is a no-op.
void GenomicAlignAdaptor_store(GenomicAlignAdaptor *gaa, Vector *genomicAligns) {
  const size_t rowBufSize = 65556;
  int ok = 1;
  int nRows = 0;
  char *qStr = NULL;
  char *tmpStr = NULL;
  StatementHandle *sth;
  char commaStr[2] = {'\0','\0'};
  int i;
  
  if ((tmpStr = (char *)calloc(rowBufSize,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating tmpStr\n");
    ok = 0;
  }

  if (ok) {
    // check that everything has dbIDs before building any SQL
    for (i=0; i<Vector_getNumElement(genomicAligns); i++) {
      GenomicAlign *ga = Vector_getElementAt(genomicAligns,i);
      DNAFrag *consDNAFrag  = GenomicAlign_getConsensusDNAFrag(ga);
      DNAFrag *queryDNAFrag = GenomicAlign_getQueryDNAFrag(ga);

      if (!DNAFrag_getDbID(consDNAFrag) || !DNAFrag_getDbID(queryDNAFrag)) {
        fprintf(stderr, "Error: dna_fragment in GenomicAlign is not in DB\n");
        ok = 0;
        break;
      }
    }
  }

  if (ok) {
    StrUtil_copyString(&qStr, "INSERT INTO genomic_align_block"
                       " (consensus_dnafrag_id, consensus_start, consensus_end,"
                       "  query_dnafrag_id, query_start, query_end, query_strand, method_link_id,"
                       "  score, perc_id, cigar_line) VALUES ",0);

    // Append one VALUES tuple per alignment. The tuple must be formatted
    // inside the loop, otherwise only the last alignment would be stored.
    for (i=0; i<Vector_getNumElement(genomicAligns); i++) {
      GenomicAlign *ga = Vector_getElementAt(genomicAligns,i);
      DNAFrag *consDNAFrag  = GenomicAlign_getConsensusDNAFrag(ga);
      DNAFrag *queryDNAFrag = GenomicAlign_getQueryDNAFrag(ga);
      IDType methodLinkId   = GenomicAlignAdaptor_methodLinkIdByAlignmentType(gaa, GenomicAlign_getAlignmentType(ga));
      int written;

      if (!methodLinkId) {
        fprintf(stderr, "Error: There is no method_link with this type [%s] in the DB.\n",
                GenomicAlign_getAlignmentType(ga));
        ok = 0;
        break;
      }

      written = snprintf(tmpStr, rowBufSize,
              " %s(" IDFMTSTR ", %d, %d, " IDFMTSTR ", %d, %d, %d, " IDFMTSTR ", %f, %f, '%s')", 
              commaStr, 
              DNAFrag_getDbID(consDNAFrag),
              GenomicAlign_getConsensusStart(ga),
              GenomicAlign_getConsensusEnd(ga),
              DNAFrag_getDbID(queryDNAFrag),  
              GenomicAlign_getQueryStart(ga),
              GenomicAlign_getQueryEnd(ga),
              GenomicAlign_getQueryStrand(ga),
              methodLinkId,
              GenomicAlign_getScore(ga),
              GenomicAlign_getPercentId(ga),
              GenomicAlign_getCigarString(ga));

      // A truncated row would produce broken SQL, so refuse to store at all
      if (written < 0 || (size_t)written >= rowBufSize) {
        fprintf(stderr, "Error: GenomicAlign row too long to store\n");
        ok = 0;
        break;
      }

      qStr = StrUtil_appendString(qStr, tmpStr);
      commaStr[0] = ','; // every tuple after the first is comma separated
      nRows++;
    }
  }

  // Only run the statement when at least one tuple follows VALUES
  if (ok && nRows > 0) {
    sth = gaa->prepare((BaseAdaptor *)gaa, qStr, strlen(qStr));
    sth->execute(sth);
    sth->finish(sth);
  }

  free(qStr);
  free(tmpStr);
}
Exemple #14
0
// Derives alignments from pairs of overlapping alignments taken from two sets.
//
// Each alignment contributes two entries to a combined list: one at its start
// and one at its end + 0.5. Entries of alignSet1 use the query DNAFrag and
// query coordinates (set number 0); entries of alignSet2 use the consensus
// DNAFrag and consensus coordinates (set number 1). After sorting with
// GenomicAlignListElem_compFunc the list is walked once while tracking, per
// set, which alignments are currently open. When an alignment opens, it is
// combined with every alignment currently open in the other set via
// GenomicAlignAdaptor_addDerivedAlignments (set 0 alignment passed first).
//
// Returns a new Vector filled by GenomicAlignAdaptor_addDerivedAlignments.
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  Vector *events = Vector_new();
  Vector *merged;
  IDHash *openAligns[2];
  int idx;

  // Set 0: positions on the query DNAFrag
  for (idx = 0; idx < Vector_getNumElement(alignSet1); idx++) {
    GenomicAlign *ga   = Vector_getElementAt(alignSet1, idx);
    DNAFrag *queryFrag = GenomicAlign_getQueryDNAFrag(ga);

    Vector_addElement(events,
                      GenomicAlignListElem_new(DNAFrag_getDbID(queryFrag), GenomicAlign_getQueryStart(ga), ga, 0));
    Vector_addElement(events,
                      GenomicAlignListElem_new(DNAFrag_getDbID(queryFrag), GenomicAlign_getQueryEnd(ga)+0.5, ga, 0));
  }

  // Set 1: positions on the consensus DNAFrag
  for (idx = 0; idx < Vector_getNumElement(alignSet2); idx++) {
    GenomicAlign *ga  = Vector_getElementAt(alignSet2, idx);
    DNAFrag *consFrag = GenomicAlign_getConsensusDNAFrag(ga);

    Vector_addElement(events,
                      GenomicAlignListElem_new(DNAFrag_getDbID(consFrag), GenomicAlign_getConsensusStart(ga), ga, 1));
    Vector_addElement(events,
                      GenomicAlignListElem_new(DNAFrag_getDbID(consFrag), GenomicAlign_getConsensusEnd(ga)+0.5, ga, 1));
  }

  Vector_sort(events, GenomicAlignListElem_compFunc);

  // Walk the sorted entries from start to end, keeping the set of currently
  // open alignments for each of the two inputs
  openAligns[0] = IDHash_new(IDHASH_SMALL);
  openAligns[1] = IDHash_new(IDHASH_SMALL);

  merged = Vector_new();

  for (idx = 0; idx < Vector_getNumElement(events); idx++) {
    GenomicAlignListElem *elem = Vector_getElementAt(events, idx);
    GenomicAlign *current      = elem->align;
    IDType currentId           = GenomicAlign_getDbID(current);
    int ownNo                  = elem->setNum;
    IDHash *ownSet             = openAligns[ownNo];
    IDHash *otherSet           = openAligns[1-ownNo];
    void **partners;
    int nPartner;
    int k;

    // Second sighting of this alignment: it is closing, so drop it
    if (IDHash_contains(ownSet, currentId)) {
      IDHash_remove(ownSet, currentId, NULL);
      continue;
    }

    // First sighting: open it and pair it with everything open in the other set
    partners = IDHash_getValues(otherSet);
    IDHash_add(ownSet, currentId, current);
    nPartner = IDHash_getNumValues(otherSet);

    for (k = 0; k < nPartner; k++) {
      GenomicAlign *partner = partners[k];

      if (ownNo == 0) {
        GenomicAlignAdaptor_addDerivedAlignments(gaa, merged, current, partner);
      } else {
        GenomicAlignAdaptor_addDerivedAlignments(gaa, merged, partner, current);
      }
    }
    free(partners);
  }

// NIY Free the list elements, the combined list and the two hashes

  return merged;
}
Exemple #15
0
// Fetches the GenomicAligns between dnaFrag and targetGenome for the given
// alignment type.
//
// gaa           - adaptor used for all lookups
// dnaFrag       - DNAFrag to fetch alignments for; must be non NULL
// targetGenome  - genome to align against
// startP, endP  - passed through unchanged to the direct fetch
// alignmentType - used to look up the method_link id
//
// If the two genomes are directly linked (as consensus or query) the direct
// fetch result is returned. Otherwise alignments are derived through every
// genome that is linked to both: dnaFrag is aligned to each such genome, and
// each resulting query fragment is then aligned on to targetGenome.
//
// Returns a Vector of GenomicAligns, or NULL when dnaFrag is NULL (an error is
// printed to stderr in that case).
Vector *GenomicAlignAdaptor_fetchAllByDNAFragGenomeDB(GenomicAlignAdaptor *gaa,
               DNAFrag *dnaFrag, GenomeDB *targetGenome, int *startP, int *endP, 
               char *alignmentType) {

  Vector *result = NULL;
  GenomeDB *genomeCons;
  IDType methodLinkId;
  GenomeDB *genomeQuery;
  Vector *mergedAligns;
  int ok = 1;

  if (!dnaFrag) {
    fprintf(stderr, "Error: dnaFrag argument must be non NULL\n");
    ok = 0;
  }

  if (ok) {
    methodLinkId = GenomicAlignAdaptor_methodLinkIdByAlignmentType(gaa, alignmentType);

    genomeCons = DNAFrag_getGenomeDB(dnaFrag);
    genomeQuery = targetGenome;
  
    // direct or indirect ??
    if (GenomeDB_hasConsensus(genomeCons, genomeQuery, methodLinkId) ||
        GenomeDB_hasQuery(genomeCons, genomeQuery, methodLinkId)) {
      result = GenomicAlignAdaptor_fetchAllByDNAFragGenomeDBDirect(gaa, 
                                                                   dnaFrag, targetGenome, startP, endP, methodLinkId);
    } else {
      // indirect checks
      Vector *linkedCons  = GenomeDB_linkedGenomesByMethodLinkId(genomeCons, methodLinkId);
      Vector *linkedQuery = GenomeDB_linkedGenomesByMethodLinkId(genomeQuery, methodLinkId);
    
      // there are not many genomes, square effort is cheap
      Vector *linked = Vector_new();
      Vector *set1 = Vector_new();
      mergedAligns = Vector_new();
      int i;

      // linked = genomes present in both linkedCons and linkedQuery
      for (i=0; i<Vector_getNumElement(linkedCons); i++) {
        int j;
        GenomeDB *g1 = Vector_getElementAt(linkedCons, i);
        
        for (j=0; j<Vector_getNumElement(linkedQuery); j++) {
          // index with j: the inner loop walks linkedQuery, not linkedCons
          GenomeDB *g2 = Vector_getElementAt(linkedQuery, j);
          if (g1 == g2) {
            Vector_addElement(linked, g1);
          }
        }
      }
      Vector_free(linkedCons);
      Vector_free(linkedQuery);

      // collect GenomicAligns from all linked genomes
      for (i=0; i<Vector_getNumElement(linked); i++) {
        GenomeDB *g = Vector_getElementAt(linked, i);

        Vector *gres = GenomicAlignAdaptor_fetchAllByDNAFragGenomeDBDirect(gaa, 
                                                                           dnaFrag, g, startP, endP, methodLinkId);
        Vector_append(set1, gres);
        
        Vector_free(gres);
      }

      // go from each dnafrag in the result set to target_genome
      // there is room for improvement here: create start end
      // my %frags = map { $_->query_dnafrag->dbID => $_->query_dnafrag } @$set1;
    

      for (i=0; i<Vector_getNumElement(set1); i++) {
        GenomicAlign *alignA = Vector_getElementAt(set1,i);
        DNAFrag *frag = GenomicAlign_getQueryDNAFrag(alignA);
        int qStart = GenomicAlign_getQueryStart(alignA);
        int qEnd   = GenomicAlign_getQueryEnd(alignA);

        Vector *dres = GenomicAlignAdaptor_fetchAllByDNAFragGenomeDBDirect(gaa,
                                                                           frag, genomeQuery, &qStart, &qEnd, methodLinkId);
        int j;

        for (j=0; j<Vector_getNumElement(dres); j++) {
          GenomicAlign *alignB = Vector_getElementAt(dres,j);
          GenomicAlignAdaptor_addDerivedAlignments(gaa,  mergedAligns, alignA, alignB);
        } 
        Vector_free(dres);
      }
      // NIY freeing (linked and set1 are not released here)
      result = mergedAligns;
    }
  }

  return result;
}
Exemple #16
0
// Registers the range rStart..rEnd for id and reports which parts of it were
// not registered before.
//
// registry     - registry holding one list of CoordPairs per id
// id           - identifier whose range list is consulted and updated
// start, end   - the range actually required; must lie within rStart..rEnd
// rStart, rEnd - the (possibly larger) range to register
// wantGaps     - non-zero if the caller wants the vector of gap pairs; callers
//                that ignore the result should pass 0 so that no gap vector is
//                allocated for them to leak
//
// Returns:
//   - on the first request for an id: a copy (Vector_copy) of the newly
//     registered list, which holds the single pair rStart..rEnd; this is
//     returned regardless of wantGaps
//   - if an existing pair already covers start..end: a new empty Vector
//   - otherwise: the Vector of CoordPair gaps, or NULL when wantGaps is 0
//
// Exits the program with status 1 if the arguments are inconsistent.
Vector *RangeRegistry_checkAndRegister(RangeRegistry *registry, IDType id, long start, long end, 
                                       long rStart, long rEnd, int wantGaps) {

  // The following was commented out due to Ensembl Genomes requirements
  // for bacterial genomes.
  // The following was uncommented because I'm not caring about those requirements
  if ( start > end ) {
    fprintf(stderr, "start argument [%ld] must be less than (or equal to) end argument [%ld]\n", start, end);
    exit(1);
  }
  
  if ( rStart > rEnd ) {
    fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) rEnd argument [%ld]\n", rStart, rEnd);
    exit(1);
  }

  if ( rStart > start ) {
    fprintf(stderr, "rStart argument [%ld] must be less than (or equal to) start [%ld]\n", rStart, start);
    exit(1);
  }

  if ( rEnd < end ) {
    fprintf(stderr, "rEnd argument [%ld] must be greater than (or equal to) end [%ld]\n", rEnd, end);
    exit(1);
  }

  // Fetch the list of registered pairs for this id, creating it on first use
  IDHash *regReg = RangeRegistry_getRegistry(registry);
  Vector *list;
  if (IDHash_contains(regReg, id)) {
    list = IDHash_getValue(regReg, id);
  } else {
    list = Vector_new();
    IDHash_add(regReg, id, list);
  }

  int len = Vector_getNumElement(list);

  if (len == 0) {
    //this is the first request for this id, return a gap pair for the
    // entire range and register it as seen
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_addElement(list, cp);

    return Vector_copy(list);
  }

  // Allocate the gap vector only after the first-request return above, so
  // that path does not leak an unused vector
  Vector *gapPairs = NULL;
  if (wantGaps) {
    gapPairs = Vector_new();
  }

  //####
  // loop through the list of existing ranges recording any "gaps" where
  // the existing range does not cover part of the requested range
  // 

  int startIdx = 0;
  int endIdx   = Vector_getNumElement(list)-1;
  int midIdx;
  CoordPair *range;

  // binary search the relevant pairs
  // helps if the list is big
  while ( ( endIdx - startIdx ) > 1 ) {
    midIdx = ( startIdx + endIdx ) >> 1;
    range  = Vector_getElementAt(list, midIdx);

    if ( CoordPair_getEnd(range) < rStart ) {
      startIdx = midIdx;
    } else {
      endIdx = midIdx;
    }
  }

  long gapStart;
  long gapEnd;
  int rIdx = -1;       // index of the first pair ending at or after rEnd
  int rStartIdx = -1;  // first pair adjacent to or overlapping rStart..rEnd
  int rEndIdx = -1;    // last such pair; only read when rStartIdx >= 0

  gapStart = rStart;

  int i;
  for (i=startIdx; i < len ; i++ ) {
    CoordPair *pRange = Vector_getElementAt(list, i);
    long pStart = CoordPair_getStart(pRange);
    long pEnd   = CoordPair_getEnd(pRange);
    
    // no work needs to be done at all if we find a range pair that
    // entirely overlaps the requested region
    if ( pStart <= start && pEnd >= end ) {
      // NOTE(review): if wantGaps is set, gapPairs (and any gaps already
      // added to it) is not released on this path - confirm how CoordPairs
      // should be freed before cleaning this up
      return Vector_new(); // perl returns undef, but that causes me problems
    }

    // find adjacent or overlapping regions already registered
    if ( pEnd >= ( rStart - 1 ) && pStart <= ( rEnd + 1 ) ) {
      if ( rStartIdx < 0 ) { // Not yet been set
        rStartIdx = i;
      }
      rEndIdx = i;
    }

    // the stretch from gapStart up to this pair is not yet registered
    if ( pStart > rStart ) {
      gapEnd = ( rEnd < pStart ) ? rEnd : pStart - 1;
      if (wantGaps) {
        CoordPair *cp = CoordPair_new(gapStart, gapEnd);
        Vector_addElement(gapPairs, cp);
      }
    }

    gapStart = ( rStart > pEnd ) ? rStart : pEnd + 1;

    if ( pEnd >= rEnd && rIdx < 0 ) {
      rIdx = i;
      break;
    }
  }

  // do we have to make another gap?
  if ( gapStart <= rEnd ) {
    if (wantGaps) {
      CoordPair *cp = CoordPair_new(gapStart, rEnd);
      Vector_addElement(gapPairs, cp);
    }
  }

  // 
  // Merge the new range into the registered list
  // 
  if (rStartIdx >= 0 ) { // rStartIdx has been set to something 
    long newStart;
    long newEnd;
    CoordPair *rStartIdxRange = Vector_getElementAt(list, rStartIdx); 
    CoordPair *rEndIdxRange   = Vector_getElementAt(list, rEndIdx); 

    if ( rStart < CoordPair_getStart(rStartIdxRange)) {
      newStart = rStart;
    } else {
      newStart = CoordPair_getStart(rStartIdxRange);
    }

    if ( rEnd > CoordPair_getEnd(rEndIdxRange)) {
      newEnd = rEnd;
    } else {
      newEnd = CoordPair_getEnd(rEndIdxRange);
    }

    CoordPair *cp = CoordPair_new(newStart, newEnd);

    // Replace pairs rStartIdx..rEndIdx (inclusive) with the single merged pair
    for (i=rStartIdx; i<=rEndIdx; i++) {
      Vector_removeElementAt(list, rStartIdx); // Always remove from rStartIdx as array is shrinking by one each time called
    }
    Vector_insertElementAt(list, rStartIdx, cp);
    //splice( @$list, $rstart_idx,
    //        $rend_idx - $rstart_idx + 1,
    //        [ $new_start, $new_end ] );

  } else if (rIdx >= 0) {
    // nothing to merge with: insert before the first pair that ends at or after rEnd
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_insertElementAt(list, rIdx, cp);
    //splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
  } else {
    // new range lies beyond every scanned pair: append it
    CoordPair *cp = CoordPair_new(rStart, rEnd);
    Vector_addElement(list, cp);
  }

  // Note if wantGaps is not set then gapPairs will be NULL - but you said you didn't want it so that should be OK
  return gapPairs;
}
Exemple #17
0
// Test driver for TopLevelAssemblyMapper: fetches clone->toplevel and
// supercontig->toplevel mappers from the read-only test database and checks
// map, listSeqRegions and listIds on each. Results are reported through ok();
// mapped coordinates and list contents are also printed.
// Always returns 0.
int main(int argc, char *argv[]) {
  DBAdaptor *dba;
  AssemblyMapperAdaptor *asma;
  int testNum = 1;

  initEnsC(argc, argv);

  dba = Test_initROEnsDB();

  //
  // 1 Test AssemblyMapperAdaptor constructor
  //
  asma = DBAdaptor_getAssemblyMapperAdaptor(dba);

  ok(testNum++, asma!=NULL);

  //
  // Test fetch_by_CoordSystems
  //

  CoordSystemAdaptor *csa = DBAdaptor_getCoordSystemAdaptor(dba);

  CoordSystemAdaptor_dumpCachedMappings(csa);

  CoordSystem *toplevelCs  = CoordSystemAdaptor_fetchByName(csa, "toplevel", NULL);
  CoordSystem *clnCs  = CoordSystemAdaptor_fetchByName(csa, "clone", NULL);
  CoordSystem *superctgCs = CoordSystemAdaptor_fetchByName(csa, "supercontig", NULL);

  TopLevelAssemblyMapper *clnToplevelMapper = (TopLevelAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, toplevelCs, clnCs);
  TopLevelAssemblyMapper *superctgToplevelMapper = (TopLevelAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, toplevelCs, superctgCs);

  ok(testNum++, clnToplevelMapper!=NULL); //  && $cln_toplevel_mapper->isa('Bio::EnsEMBL::TopLevelAssemblyMapper'));
  ok(testNum++, superctgToplevelMapper!=NULL); //  && $cln_toplevel_mapper->isa('Bio::EnsEMBL::TopLevelAssemblyMapper'));


//
// test db has chr 20  (50KB -> 62MB)
//

//
// Test map
//
  MapperRangeSet *coords = NULL;

  if (clnToplevelMapper) {
    fprintf(stderr, "MAP 'AL359765.6'->toplevel\n");
    coords = TopLevelAssemblyMapper_map(clnToplevelMapper,"AL359765.6", 1, 13780, 1, clnCs,  0, NULL);
    printCoords(coords);
    ok(testNum++, coords!=NULL);
  }

  if (superctgToplevelMapper) {
    fprintf(stderr, "MAP NT_028392->toplevel\n");
    coords = TopLevelAssemblyMapper_map(superctgToplevelMapper, "NT_028392", 600000, 1000000, 1, superctgCs, 0, NULL);
    printCoords(coords);
    ok(testNum++, coords!=NULL);
  }


//
// Test list_seq_regions
//
  Vector *seqRegions;
  int i;
    
  if (clnToplevelMapper) {
    seqRegions = TopLevelAssemblyMapper_listSeqRegions(clnToplevelMapper, "AL359765.6", 1, 13780, clnCs);
    ok(testNum++, seqRegions!=NULL && Vector_getNumElement(seqRegions) == 1 && !strcmp("20", Vector_getElementAt(seqRegions,0)));
    // a NULL result has already been reported by ok(); don't dereference it
    if (seqRegions) {
      for (i=0;i<Vector_getNumElement(seqRegions); i++) {
        char *regionName = Vector_getElementAt(seqRegions, i);
        fprintf(stderr, "%s\n",regionName);
      }
    }
  }

  if (superctgToplevelMapper) {
    seqRegions = TopLevelAssemblyMapper_listSeqRegions(superctgToplevelMapper, "NT_028392", 600000, 1000000, superctgCs);
    ok(testNum++, seqRegions!=NULL && Vector_getNumElement(seqRegions) == 1 && !strcmp("20", Vector_getElementAt(seqRegions,0)));
    if (seqRegions) {
      for (i=0;i<Vector_getNumElement(seqRegions); i++) {
        char *regionName = Vector_getElementAt(seqRegions, i);
        fprintf(stderr, "%s\n",regionName);
      }
    }
  }

//
// Test list_seq_ids
//
  Vector *ids;

  if (clnToplevelMapper) {
    ids = TopLevelAssemblyMapper_listIds(clnToplevelMapper, "AL359765.6", 1, 13780, clnCs);
    ok(testNum++, ids!=NULL && Vector_getNumElement(ids) == 1 && *((IDType *)Vector_getElementAt(ids,0)) == 469283 );
    if (ids) {
      for (i=0;i<Vector_getNumElement(ids); i++) {
        IDType id = *((IDType *)Vector_getElementAt(ids, i));
        fprintf(stderr, IDFMTSTR"\n",id);
      }
    }
  }

  if (superctgToplevelMapper) {
    ids = TopLevelAssemblyMapper_listIds(superctgToplevelMapper, "NT_028392", 600000, 1000000, superctgCs);
    ok(testNum++, ids!=NULL && Vector_getNumElement(ids) == 1 && *((IDType *)Vector_getElementAt(ids,0)) == 469283 );
    if (ids) {
      for (i=0;i<Vector_getNumElement(ids); i++) {
        IDType id = *((IDType *)Vector_getElementAt(ids, i));
        fprintf(stderr, IDFMTSTR"\n",id);
      }
    }
  }

// Test for a not implemented method
//  seqRegions = TopLevelAssemblyMapper_listContigIds(clnToplevelMapper, "AL359765.6", 1, 13780, 1);

  return 0;
}
Exemple #18
0
int main(int argc, char *argv[]) {
  DBAdaptor *dba;
  AssemblyMapperAdaptor *asma;
  int testNum = 1;
  
  initEnsC(argc, argv);

  dba = Test_initROEnsDB();

  //
  // 1 Test AssemblyMapperAdaptor constructor
  //
  asma = DBAdaptor_getAssemblyMapperAdaptor(dba);

  ok(testNum++, asma!=NULL);


  //
  // 2 Test fetch_by_CoordSystems
  //

  CoordSystemAdaptor *csa = DBAdaptor_getCoordSystemAdaptor(dba);

  CoordSystemAdaptor_dumpCachedMappings(csa);

  CoordSystem *chrCs  = CoordSystemAdaptor_fetchByName(csa, "chromosome", NULL);
  CoordSystem *clnCs  = CoordSystemAdaptor_fetchByName(csa, "clone", NULL);
  CoordSystem *sCtgCs = CoordSystemAdaptor_fetchByName(csa, "supercontig", NULL);

  ChainedAssemblyMapper *asmMapper =  (ChainedAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, clnCs, chrCs);

  ok(testNum++,  asmMapper!=NULL); // Need to make it an object before can do this && asmMapper->objectType == ( "Bio::EnsEMBL::ChainedAssemblyMapper" ));
  
  ChainedAssemblyMapper *chrSCtgMapper = (ChainedAssemblyMapper *)AssemblyMapperAdaptor_fetchByCoordSystems(asma, chrCs, sCtgCs);

  ok(testNum++, chrSCtgMapper!=NULL);// && $chr_sctg_mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper'));

//
// test db has chr 20  (50KB -> 62MB)
//

  MapperRangeSet *coords;

  if (asmMapper)
    {
      fprintf(stderr,"MAP 20->clone\n");
      coords = ChainedAssemblyMapper_map(asmMapper, "20", 500001, 60000000, 1, chrCs, 0, NULL);
      ok(testNum++, coords!=NULL);
      printCoords(coords);
    }

  if (asmMapper)
    {
      fprintf(stderr,"MAP 'AL359765.6'->chromosome\n");
      coords = ChainedAssemblyMapper_map(asmMapper, "AL359765.6", 1, 13780, 1, clnCs, 0, NULL);
      ok(testNum++, coords!=NULL);
      printCoords(coords);
    }

  if (chrSCtgMapper)
    {
      fprintf(stderr,"MAP 20->supercontig\n");
      coords = ChainedAssemblyMapper_map(chrSCtgMapper, "20", 500001, 60000000, 1, chrCs, 0, NULL);
      ok(testNum++, coords!=NULL);
      printCoords(coords);
    }

  //
  // Test list_seq_regions
  //
  fprintf(stderr,"Starting list tests\n");
  int i;

  if (asmMapper)
    {
      Vector *seqRegions = ChainedAssemblyMapper_listSeqRegions(asmMapper, "20", 500001, 60000000, chrCs);
      ok(testNum++, seqRegions != NULL);
      for (i=0;i<Vector_getNumElement(seqRegions); i++) {
        char *regionName = Vector_getElementAt(seqRegions, i);
        fprintf(stderr, "%s\n",regionName);
      }
    }

  if (asmMapper)
    {
      Vector *seqRegions = ChainedAssemblyMapper_listSeqRegions(asmMapper, "AL359765.6", 1, 13780, clnCs);
      ok(testNum++, seqRegions!=NULL);
      for (i=0;i<Vector_getNumElement(seqRegions); i++) {
        char *regionName = Vector_getElementAt(seqRegions, i);
        fprintf(stderr, "%s\n",regionName);
      }
    }


  if (chrSCtgMapper)
    {
      Vector *seqRegions = ChainedAssemblyMapper_listSeqRegions(chrSCtgMapper, "NT_028392", 600000, 1000000, sCtgCs);
      ok(testNum++, seqRegions!=NULL);
      for (i=0;i<Vector_getNumElement(seqRegions); i++) {
        char *regionName = Vector_getElementAt(seqRegions, i);
        fprintf(stderr, "%s\n",regionName);
      }
    }

  if (chrSCtgMapper)
    {
      Vector *seqRegions = ChainedAssemblyMapper_listSeqRegions(chrSCtgMapper, "20", 3000000, 31000000, chrCs);
      ok(testNum++, seqRegions!=NULL);
      for (i=0;i<Vector_getNumElement(seqRegions); i++) {
        char *regionName = Vector_getElementAt(seqRegions, i);
        fprintf(stderr, "%s\n",regionName);
      }
    }



  //
  // Test list_seq_ids
  //

  if (asmMapper)
    {
      Vector *seqIds = ChainedAssemblyMapper_listIds(asmMapper, "20", 500001, 60000000, chrCs);

      ok(testNum++, seqIds!=NULL);
      for (i=0;i<Vector_getNumElement(seqIds); i++) {
        IDType regionId = *((IDType *)Vector_getElementAt(seqIds, i));
        fprintf(stderr, IDFMTSTR"\n",regionId);
      }
    }

  if (asmMapper)
    {
      Vector *seqIds = ChainedAssemblyMapper_listIds(asmMapper, "AL359765.6", 1, 13780, clnCs);
      ok(testNum++, seqIds!=NULL);
      for (i=0;i<Vector_getNumElement(seqIds); i++) {
        IDType regionId = *((IDType *)Vector_getElementAt(seqIds, i));
        fprintf(stderr, IDFMTSTR"\n",regionId);
      }
    }

  if (chrSCtgMapper)
    {
      Vector *seqIds = ChainedAssemblyMapper_listIds(chrSCtgMapper, "NT_028392", 600000, 1000000, sCtgCs);
      ok(testNum++, seqIds!=NULL);
      for (i=0;i<Vector_getNumElement(seqIds); i++) {
        IDType regionId = *((IDType *)Vector_getElementAt(seqIds, i));
        fprintf(stderr, IDFMTSTR"\n",regionId);
      }
    }

  if (chrSCtgMapper)
    {
      Vector *seqIds = ChainedAssemblyMapper_listIds(chrSCtgMapper, "20", 3000000, 31000000, chrCs);
      ok(testNum++, seqIds!=NULL);
      for (i=0;i<Vector_getNumElement(seqIds); i++) {
        IDType regionId = *((IDType *)Vector_getElementAt(seqIds, i));
        fprintf(stderr, IDFMTSTR"\n",regionId);
      }
    }

  return 0;
  
}