Example No. 1
void Mapper_sort(Mapper *m) {

  IDHash *fromHash = Mapper_getPairHash(m, MAPPER_FROM_IND);
  IDHash *toHash   = Mapper_getPairHash(m, MAPPER_TO_IND);
  MapperPairSet **sets;
  int nSet;
  int i;

  sets = (MapperPairSet **)IDHash_getValues(fromHash);
  nSet = IDHash_getNumValues(fromHash);
  for (i=0;i<nSet;i++) {
    MapperPairSet_sort(sets[i],MAPPER_FROM_IND);
  }
  free(sets);

  sets = (MapperPairSet **)IDHash_getValues(toHash);
  nSet = IDHash_getNumValues(toHash);
  for (i=0;i<nSet;i++) {
    MapperPairSet_sort(sets[i],MAPPER_TO_IND);
  }
  free(sets);

  // Merge adjacent pairs now that both pair hashes are sorted
  Mapper_mergePairs(m);

  Mapper_setIsSorted(m, 1);

}
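
For reference, a minimal standalone sketch of the per-set sorting step: each pair set is ordered by the start coordinate of one of its two sides, which is what the MapperPairSet_sort() calls above do for the 'from' and 'to' hashes. The Pair struct and comparator below are illustrative stand-ins, not the library's types.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for a coordinate pair; not the library's MapperPair. */
typedef struct { long start, end; } Pair;

/* Order pairs by ascending start coordinate. */
static int Pair_compareByStart(const void *a, const void *b) {
  const Pair *pa = (const Pair *)a;
  const Pair *pb = (const Pair *)b;
  return (pa->start > pb->start) - (pa->start < pb->start);
}

int main(void) {
  Pair pairs[] = { {300, 399}, {100, 199}, {200, 299} };
  size_t n = sizeof(pairs) / sizeof(pairs[0]);

  qsort(pairs, n, sizeof(Pair), Pair_compareByStart);

  for (size_t i = 0; i < n; i++) {
    printf("%ld-%ld\n", pairs[i].start, pairs[i].end);
  }
  return 0;
}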
Example No. 2
void Mapper_dump(Mapper *m, FILE *fp) {
  IDHash *fromHash;
  IDType *keys;
  int   nKey;
  int   i;

  if (fp==NULL) {
    fp = stderr;
  }

  fromHash = Mapper_getPairHash(m, MAPPER_FROM_IND);

  keys = IDHash_getKeys(fromHash);
  nKey = IDHash_getNumValues(fromHash);

  for (i=0;i<nKey;i++) {
    IDType id = keys[i];
    MapperPairSet *set = IDHash_getValue(fromHash,id);
    int j;

    fprintf(fp, "From Hash " IDFMTSTR " with %d pairs\n",id, MapperPairSet_getNumPair(set));

    for (j=0; j<MapperPairSet_getNumPair(set); j++) {
      MapperPair *pair = MapperPairSet_getPairAt(set,j);
      MapperUnit *fromCoord = MapperPair_getUnit(pair, MAPPER_FROM_IND);
      MapperUnit *toCoord   = MapperPair_getUnit(pair, MAPPER_TO_IND);

      fprintf(fp, "    %ld %ld:%ld %ld " IDFMTSTR "\n",fromCoord->start,fromCoord->end,
              toCoord->start,toCoord->end,toCoord->id);
    }
  }

  free(keys);
}
Example No. 3
// this function merges pairs that are adjacent into one
// This function is a pain in the arse to implement in C
void Mapper_mergePairs(Mapper *m) {

  int to   = MAPPER_TO_IND;
  int from = MAPPER_FROM_IND;

  Mapper_setPairCount(m, 0);

  IDHash *toPairHash   = Mapper_getPairHash(m, MAPPER_TO_IND);
  IDHash *fromPairHash = Mapper_getPairHash(m, MAPPER_FROM_IND);
 
  MapperPairSet **toPairValues = (MapperPairSet **)IDHash_getValues(toPairHash);

  
  int pairInd;
  for (pairInd = 0; pairInd<IDHash_getNumValues(toPairHash); pairInd++) {
    MapperPairSet *pairs = toPairValues[pairInd];

    int i = 0;
    int next = 1;
    int length = MapperPairSet_getNumPair(pairs)-1; //$#{$lr};

    while (next <= length) {
      MapperPair *currentPair = MapperPairSet_getPairAt(pairs, i);
      MapperPair *nextPair    = MapperPairSet_getPairAt(pairs, next);
      MapperPair *delPair     = NULL;

      if (MapperPair_isIndel(currentPair) || MapperPair_isIndel(nextPair)) {
        // Necessary to modify the merge function so it does not merge indels
        next++;
        i++;

      } else {
        // duplicate filter
        if ( MapperPair_getUnit(currentPair,to)->start == MapperPair_getUnit(nextPair,to)->start  &&
             MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) {

          delPair = nextPair;

        } else if (( MapperPair_getUnit(currentPair,from)->id == MapperPair_getUnit(nextPair,from)->id ) &&
                   ( nextPair->ori == currentPair->ori ) &&
                   ( MapperPair_getUnit(nextPair,to)->start-1 == MapperPair_getUnit(currentPair,to)->end )) {

          if ( currentPair->ori == 1 ) {

            // check forward strand merge
            if ( MapperPair_getUnit(nextPair,from)->start-1 == MapperPair_getUnit(currentPair,from)->end) {
              // normal merge with previous element
              MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end;
              MapperPair_getUnit(currentPair,from)->end = MapperPair_getUnit(nextPair,from)->end;
              delPair = nextPair;
            }
          } else {

            // check backward strand merge
            if ( MapperPair_getUnit(nextPair,from)->end+1  == MapperPair_getUnit(currentPair,from)->start ) {

              // yes its a merge
              MapperPair_getUnit(currentPair,to)->end = MapperPair_getUnit(nextPair,to)->end;
              MapperPair_getUnit(currentPair,from)->start = MapperPair_getUnit(nextPair,from)->start;
              delPair = nextPair;
            }
          }
        }

        if (delPair != NULL) { // Have a pair to delete
          // Remove from the to pair set
          MapperPairSet_removePairAt(pairs, next); //splice( @$lr, $next, 1 );

          MapperPairSet *fromPairs = IDHash_getValue(fromPairHash, MapperPair_getUnit(delPair, from)->id); //$self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})}; 

          int j;
          for (j=0; j < MapperPairSet_getNumPair(fromPairs); j++) {
            MapperPair *fromPair = MapperPairSet_getPairAt(fromPairs, j);
            if ( fromPair == delPair) { // Is this really going to be an equality ??? //$lr_from->[$j] == $del_pair )
              MapperPairSet_removePairAt(fromPairs, j); //splice( @$lr_from, $j, 1 );
              break;
            }
          }

          // NIY: Do we need to free delPair???

          length--;
          if ( length < next ) break;
        } else {
          next++;
          i++;
        }
      }
    }

    Mapper_addToPairCount(m, MapperPairSet_getNumPair(pairs)); //    $self->{'pair_count'} += scalar( @$lr );
  }

  free(toPairValues); // free the values array returned by IDHash_getValues
}
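
The heart of the merge above is the adjacency test: two pairs on the same 'from' sequence with the same orientation are merged when the next pair begins exactly one base after the current pair ends on both the 'to' and 'from' sides (with start and end swapped on the reverse strand). A minimal standalone sketch of the forward-strand case follows; the structs are illustrative stand-ins, not the library's MapperPair/MapperUnit.

#include <stdio.h>

/* Illustrative stand-ins; the real code also checks that both pairs refer
 * to the same 'from' sequence id before attempting a merge. */
typedef struct { long start, end; } Range;
typedef struct { Range from, to; int ori; } PairSketch;

/* Forward-strand merge: extend 'current' in place and report success. */
static int mergeAdjacentForward(PairSketch *current, const PairSketch *next) {
  if (current->ori == 1 && next->ori == 1 &&
      next->to.start   - 1 == current->to.end &&
      next->from.start - 1 == current->from.end) {
    current->to.end   = next->to.end;
    current->from.end = next->from.end;
    return 1;
  }
  return 0;
}

int main(void) {
  PairSketch a = { {100, 199}, {1000, 1099}, 1 };
  PairSketch b = { {200, 299}, {1100, 1199}, 1 };

  if (mergeAdjacentForward(&a, &b)) {
    printf("merged: from %ld-%ld, to %ld-%ld\n",
           a.from.start, a.from.end, a.to.start, a.to.end);
  }
  return 0;
}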
Example No. 4
Vector *PredictionTranscriptAdaptor_fetchAllBySlice(PredictionTranscriptAdaptor *pta, Slice *slice, char *logicName, int loadExons) {

  //my $transcripts = $self->SUPER::fetch_all_by_Slice($slice,$logic_name);
  Vector *transcripts = BaseFeatureAdaptor_fetchAllBySlice((BaseFeatureAdaptor *)pta, slice, logicName);

  // if there are 0 or 1 transcripts still do lazy-loading
  if ( ! loadExons || Vector_getNumElement(transcripts) < 2 ) {
    return transcripts;
  }

  // preload all of the exons now, instead of lazy loading later
  // faster than 1 query per transcript

  // get extent of region spanned by transcripts
  long minStart =  2000000000;
  long maxEnd   = -2000000000;

  int i;
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if (PredictionTranscript_getSeqRegionStart((SeqFeature*)t) < minStart) {
      minStart = PredictionTranscript_getSeqRegionStart((SeqFeature*)t);
    }
    if (PredictionTranscript_getSeqRegionEnd((SeqFeature*)t) > maxEnd) {
      maxEnd = PredictionTranscript_getSeqRegionEnd((SeqFeature*)t);
    }
  }

  Slice *extSlice;

  if (minStart >= Slice_getStart(slice) && maxEnd <= Slice_getEnd(slice)) {
    extSlice = slice;
  } else {
    SliceAdaptor *sa = DBAdaptor_getSliceAdaptor(pta->dba);
    extSlice = SliceAdaptor_fetchByRegion(sa, Slice_getCoordSystemName(slice), Slice_getSeqRegionName(slice),
                                          minStart, maxEnd, Slice_getStrand(slice), CoordSystem_getVersion(Slice_getCoordSystem(slice)), 0);
  }

  // associate exon identifiers with transcripts
  IDHash *trHash = IDHash_new(IDHASH_MEDIUM);
  for (i=0; i<Vector_getNumElement(transcripts); i++) {
    PredictionTranscript *t  = Vector_getElementAt(transcripts, i);
    if ( ! IDHash_contains(trHash, PredictionTranscript_getDbID(t))) {
      IDHash_add(trHash, PredictionTranscript_getDbID(t), t);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(trHash);

  char tmpStr[1024];
  char *qStr = NULL;
  if ((qStr = (char *)calloc(655500,sizeof(char))) == NULL) {
    fprintf(stderr,"Failed allocating qStr\n");
    return transcripts;
  }

  int lenNum;
  int endPoint = sprintf(qStr, "SELECT prediction_transcript_id, prediction_exon_id, exon_rank FROM prediction_exon WHERE  prediction_transcript_id IN (");
  for (i=0; i<IDHash_getNumValues(trHash); i++) {
    if (i!=0) {
      qStr[endPoint++] = ',';
      qStr[endPoint++] = ' ';
    }
    lenNum = sprintf(tmpStr,IDFMTSTR,uniqueIds[i]);
    memcpy(&(qStr[endPoint]), tmpStr, lenNum);
    endPoint+=lenNum;
  }
  qStr[endPoint++] = ')';
  qStr[endPoint] = '\0';

  free(uniqueIds);

  StatementHandle *sth = pta->prepare((BaseAdaptor *)pta,qStr,strlen(qStr));
  sth->execute(sth);

  IDHash *exTrHash = IDHash_new(IDHASH_MEDIUM);
  ResultRow *row;
  while ((row = sth->fetchRow(sth))) {
    IDType trId = row->getLongLongAt(row,0);
    IDType exId = row->getLongLongAt(row,1);
    int    rank = row->getIntAt(row,2);

    if (! IDHash_contains(exTrHash, exId)) {
      Vector *vec = Vector_new();
      Vector_setFreeFunc(vec, PredictionTranscriptRankPair_free);
      IDHash_add(exTrHash, exId, vec);
    }
    Vector *exVec = IDHash_getValue(exTrHash, exId);
    PredictionTranscriptRankPair *trp = PredictionTranscriptRankPair_new(IDHash_getValue(trHash, trId), rank);
    Vector_addElement(exVec, trp);
  }

  IDHash_free(trHash, NULL);

  sth->finish(sth);

  PredictionExonAdaptor *pea = DBAdaptor_getPredictionExonAdaptor(pta->dba);
  Vector *exons = PredictionExonAdaptor_fetchAllBySlice(pea, extSlice);

  // move exons onto transcript slice, and add them to transcripts
  for (i=0; i<Vector_getNumElement(exons); i++) {
    PredictionExon *ex = Vector_getElementAt(exons, i);

    // The Perl version didn't have this line - it was in the GeneAdaptor version, so I think I'm going to keep it
    if (!IDHash_contains(exTrHash, PredictionExon_getDbID(ex))) continue;

    PredictionExon *newEx;
    if (slice != extSlice) {
      newEx = (PredictionExon*)PredictionExon_transfer((SeqFeature*)ex, slice);
      if (newEx == NULL) {
        fprintf(stderr, "Unexpected. Exon could not be transferred onto PredictionTranscript slice.\n");
        exit(1);
      }
    } else {
      newEx = ex;
    }

    Vector *exVec = IDHash_getValue(exTrHash, PredictionExon_getDbID(newEx));
    int j;
    for (j=0; j<Vector_getNumElement(exVec); j++) {
      PredictionTranscriptRankPair *trp = Vector_getElementAt(exVec, j);
      PredictionTranscript_addExon(trp->transcript, newEx, &trp->rank);
    }
  }

  IDHash_free(exTrHash, Vector_free);
  free(qStr);

  return transcripts;
}
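
The exon preloading above depends on first working out the overall genomic extent covered by the returned transcripts, so that all prediction exons can be fetched with a single slice query instead of one lazy load per transcript. A standalone sketch of that extent calculation, using a plain struct in place of the library's feature types:

#include <stdio.h>

/* Illustrative feature with sequence-region coordinates. */
typedef struct { long start, end; } FeatureSketch;

int main(void) {
  FeatureSketch features[] = { {5200, 6100}, {1500, 2300}, {4000, 4800} };
  int nFeature = sizeof(features) / sizeof(features[0]);
  int i;

  /* Same sentinel values as the adaptor code above. */
  long minStart =  2000000000;
  long maxEnd   = -2000000000;

  for (i = 0; i < nFeature; i++) {
    if (features[i].start < minStart) minStart = features[i].start;
    if (features[i].end   > maxEnd)   maxEnd   = features[i].end;
  }

  /* A single query over [minStart, maxEnd] then covers every feature. */
  printf("extent: %ld-%ld\n", minStart, maxEnd);
  return 0;
}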
Example No. 5
Vector *GenomicAlignAdaptor_mergeAlignsets(GenomicAlignAdaptor *gaa, Vector *alignSet1, Vector *alignSet2) {
  int i;
  Vector *bigList = Vector_new();
  IDHash *overlappingSets[2];
  Vector *mergedAligns;


  for (i=0;i<Vector_getNumElement(alignSet1); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet1, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryStart(align), align, 0));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getQueryDNAFrag(align)),
                                                        GenomicAlign_getQueryEnd(align)+0.5, align, 0));
  }

  for (i=0;i<Vector_getNumElement(alignSet2); i++) {
    GenomicAlign *align = Vector_getElementAt(alignSet2, i);
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusStart(align), align, 1));
    Vector_addElement(bigList, GenomicAlignListElem_new(DNAFrag_getDbID(GenomicAlign_getConsensusDNAFrag(align)),
                                                        GenomicAlign_getConsensusEnd(align)+0.5, align, 1));
  }
  
  Vector_sort(bigList, GenomicAlignListElem_compFunc);

  // Walk from start to end through the sorted list and keep track of the
  // currently overlapping set of alignments
 
  overlappingSets[0] = IDHash_new(IDHASH_SMALL);
  overlappingSets[1] = IDHash_new(IDHASH_SMALL);

  mergedAligns = Vector_new();

  for (i=0; i<Vector_getNumElement(bigList); i++) {
    GenomicAlignListElem *gale  = Vector_getElementAt(bigList,i);

    GenomicAlign *align = gale->align;
    IDType alignID      = GenomicAlign_getDbID(align);
    int setNo           = gale->setNum;

    if (IDHash_contains(overlappingSets[setNo], alignID)) {
      // remove from current overlapping set
      IDHash_remove(overlappingSets[setNo], alignID, NULL);
    } else {
      int j;
      void **values = IDHash_getValues(overlappingSets[1-setNo]);

      // insert into the set and do all the overlap business
      IDHash_add(overlappingSets[setNo], alignID, align);

      // the other set contains everything this align overlaps with
      for (j=0; j<IDHash_getNumValues(overlappingSets[1-setNo]); j++) {
        GenomicAlign *align2 = values[j];
        if (setNo == 0) {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align, align2);
        } else {
          GenomicAlignAdaptor_addDerivedAlignments(gaa, mergedAligns, align2, align);
        }
      }
      free(values);
    }
  }

// NIY: free the GenomicAlignListElem objects in bigList (and bigList itself)

  return mergedAligns;
}
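
The merge above is effectively a sweep over event points: every alignment contributes a start event and an end event (the end offset by 0.5 so it sorts after any start at the same coordinate), and each newly opened alignment is paired with everything from the other set that is still open. Below is a minimal standalone sketch of that sweep using plain arrays instead of the library's Vector and IDHash types.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative event; not the library's GenomicAlignListElem. */
typedef struct {
  double pos;    /* start, or end + 0.5 so ends sort after equal starts */
  int    setNo;  /* which input set the interval came from (0 or 1)     */
  int    idx;    /* index of the interval within its set                */
} Event;

typedef struct { long start, end; } Interval;

static int Event_compare(const void *a, const void *b) {
  const Event *ea = (const Event *)a, *eb = (const Event *)b;
  return (ea->pos > eb->pos) - (ea->pos < eb->pos);
}

int main(void) {
  Interval setA[] = { {100, 250}, {400, 500} };
  Interval setB[] = { {200, 450} };
  int nA = 2, nB = 1;

  /* Two events per interval, as in the bigList construction above. */
  Event events[6];
  int nEvent = 0, i, e;
  for (i = 0; i < nA; i++) {
    events[nEvent++] = (Event){ (double)setA[i].start,     0, i };
    events[nEvent++] = (Event){ (double)setA[i].end + 0.5, 0, i };
  }
  for (i = 0; i < nB; i++) {
    events[nEvent++] = (Event){ (double)setB[i].start,     1, i };
    events[nEvent++] = (Event){ (double)setB[i].end + 0.5, 1, i };
  }
  qsort(events, nEvent, sizeof(Event), Event_compare);

  /* open[s][i] is non-zero while interval i of set s is currently open. */
  int open[2][2] = {{0}};

  for (e = 0; e < nEvent; e++) {
    Event ev = events[e];
    if (open[ev.setNo][ev.idx]) {
      open[ev.setNo][ev.idx] = 0;              /* end event: close it   */
    } else {
      int other = 1 - ev.setNo;
      int nOther = (other == 0) ? nA : nB;
      open[ev.setNo][ev.idx] = 1;              /* start event: open it  */
      for (i = 0; i < nOther; i++) {           /* report overlaps       */
        if (open[other][i]) {
          printf("set%d[%d] overlaps set%d[%d]\n", ev.setNo, ev.idx, other, i);
        }
      }
    }
  }
  return 0;
}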
Example No. 6
// Note I didn't implement the stable id fetching ugliness here. I'll probably make a separate method for that
// if necessary
Vector *BaseAdaptor_uncachedFetchAllByDbIDList(BaseAdaptor *ba, Vector *idList, Slice *slice) {
  if ( idList == NULL) {
    fprintf(stderr, "id_list list reference argument is required - bye!\n");
    return NULL;
  }
  char constraintPref[1024];
  

  if (!Vector_getNumElement(idList)) {
    return Vector_new();
  }

  NameTableType *tables = ba->getTables();
  char **t = (*tables)[0];

  sprintf(constraintPref, "%s.%s_id ", t[SYN], t[NAME] ); 

  // Ensure that we do not exceed MySQL's max_allowed_packet (which defaults
  // to 1 MB) by splitting large queries into smaller queries of at most
  // 256 KB. Assuming a (generous) average dbID string length of 16, this
  // means 16384 dbIDs in each query.
  int maxSize = 16384;

  // Uniquify the list
  IDHash *idListHash = IDHash_new(IDHASH_MEDIUM);

  int i;
  for (i=0; i<Vector_getNumElement(idList); i++) {
    IDType id = *(IDType *)(Vector_getElementAt(idList, i));
    if (!IDHash_contains(idListHash, id)) {
      IDHash_add(idListHash, id, &trueVal);
    }
  }

  IDType *uniqueIds = IDHash_getKeys(idListHash);
  int nUniqueId = IDHash_getNumValues(idListHash);

  IDHash_free(idListHash, NULL);

  Vector *out = Vector_new();

  int lenNum;
  for (i=0; i<nUniqueId; i+=maxSize) {
    char *constraint = NULL;

    if ((constraint = (char *)calloc(655500,sizeof(char))) == NULL) {
      fprintf(stderr,"Failed allocating constraint\n");
      return out;
    }

    strcpy(constraint, constraintPref);
  
    // Special case for one remaining Id
    if (i == nUniqueId-1) {
      sprintf(constraint + strlen(constraint), "= " IDFMTSTR, uniqueIds[i]);
    } else {
      char tmpStr[1024];
      int endPoint = strlen(constraint);
      int j;

      endPoint += sprintf(constraint + endPoint, "IN (");
      for (j=0; j<maxSize && j+i<nUniqueId; j++) {
        if (j!=0) {
          constraint[endPoint++] = ',';
          constraint[endPoint++] = ' ';
        }
        lenNum = sprintf(tmpStr, IDFMTSTR, uniqueIds[i+j]);
        memcpy(&(constraint[endPoint]), tmpStr, lenNum);
        endPoint+=lenNum;
      }
      constraint[endPoint++] = ')';
      constraint[endPoint] = '\0';
    }

    Vector *resChunk = BaseAdaptor_genericFetch(ba, constraint, NULL, slice);

    Vector_append(out, resChunk);

    Vector_free(resChunk);
    free(constraint);
  }
  free(uniqueIds);

  return out;
}
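
A standalone sketch of the constraint-building pattern used above: split the id list into fixed-size chunks and build one "col IN (...)" clause per chunk, with a single remaining id becoming "col = id". The column name, chunk size and buffer sizes below are illustrative; the real adaptor writes into a large calloc'd buffer and formats ids with the IDFMTSTR macro.

#include <stdio.h>
#include <string.h>

/* Build "col = id" or "col IN (id, id, ...)" for ids[offset .. offset+count-1].
 * Purely illustrative; assumes buf is large enough for the chunk. */
static void buildConstraint(char *buf, size_t bufSize, const char *col,
                            const long *ids, int offset, int count) {
  size_t len = (size_t)snprintf(buf, bufSize, "%s", col);
  int j;

  if (count == 1) {
    snprintf(buf + len, bufSize - len, " = %ld", ids[offset]);
    return;
  }

  len += (size_t)snprintf(buf + len, bufSize - len, " IN (");
  for (j = 0; j < count; j++) {
    len += (size_t)snprintf(buf + len, bufSize - len, "%s%ld",
                            (j == 0) ? "" : ", ", ids[offset + j]);
  }
  snprintf(buf + len, bufSize - len, ")");
}

int main(void) {
  long ids[] = { 11, 42, 97, 150, 151, 152, 203 };
  int nId = sizeof(ids) / sizeof(ids[0]);
  int maxSize = 3;                 /* chunk size; the adaptor uses 16384 */
  char constraint[256];
  int i;

  for (i = 0; i < nId; i += maxSize) {
    int count = (nId - i < maxSize) ? (nId - i) : maxSize;
    buildConstraint(constraint, sizeof(constraint), "g.gene_id", ids, i, count);
    printf("%s\n", constraint);    /* one genericFetch() call per chunk  */
  }
  return 0;
}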