// Counts the genes that lie completely inside the closed interval
// [start, finish].
//
// A gene is counted only when the smallest of its indices is >= start and
// the largest of its indices is <= finish.
//
// start  - lower bound of the interval (inclusive)
// finish - upper bound of the interval (inclusive)
// Returns the number of genes fully contained in the interval.
int Genome::CountGenes( int start, int finish )
{
    int contained = 0;

    // The gene container is addressed with 1-based positions via operator().
    int pos = 1;
    while (pos <= genes.size())
    {
        const bool beginsInside = min(genes(pos)->indices) >= start;
        if (beginsInside && max(genes(pos)->indices) <= finish)
            ++contained;
        ++pos;
    }

    return contained;
}
// Insert a gene bool DNA::insert_gene(unsigned int index, unsigned char* gene, unsigned int size) { unsigned int amountgenes = genes(); assert(index < amountgenes); // Case 1: gene at start if (index == 0) { if (amountgenes > 1) { unsigned char* gene_mod = (unsigned char*) malloc((size+1) * sizeof(unsigned char)); memcpy(gene_mod, gene, size); gene_mod[size] = 0; insert(0, gene_mod, size+1); free(gene_mod); } else { dataGenes = (unsigned char*) malloc(size * sizeof(unsigned char)); memcpy(dataGenes, gene, size); dataSize = size; } } // Case 2: gene at midst else { unsigned char* gene_mod = (unsigned char*) malloc((size+1) * sizeof(unsigned char)); memcpy(gene_mod, gene, size); gene_mod[size] = 0; unsigned int i_self = gene_start(index); insert(i_self, gene_mod, size+1); free(gene_mod); } return true; }
// Erase a gene bool DNA::erase_gene(unsigned int index) { unsigned int amountgenes = genes(); assert(index < amountgenes); // Case 1: gene at start if (index == 0) { if (amountgenes > 1) { unsigned int i_next = gene_start(1); erase(0, i_next); } else { free(dataGenes); dataGenes = 0; dataSize = 0; } } // Case 2: gene at midst else if (index < amountgenes-1) { unsigned int i_self = gene_start(index); unsigned int i_next = gene_start(index+1); erase(i_self, i_next); } // Case 3: gene at end else { if (amountgenes > 1) { unsigned int i_prev = gene_end(index-1); erase(i_prev, dataSize); } else { free(dataGenes); dataSize = 0; } } return true; }
// Returns a boolean value - tells us if we can cut the genome at given // position. bool Genome::CanCut(int location) { for(int i=1;i<=genes.size();i++) { Feature* g = genes(i); if (min(g->indices) <= location && location <= max(g->indices)) return false; } return true; }
void DNA::debug() const { // Debug message std::cout << "* DNA.debug" << std::endl; // Process chararray std::cout << "Contents of DNA object (" << genes() << " genes): " << std::endl; for (unsigned int i = 0; i < genes(); i++) { std::cout << "\tgene " << i+1 << ":"; unsigned int start = gene_start(i); unsigned int end = gene_end(i); while (start < end) { std::cout << " 0x" << std::hex << std::setfill('0') << std::setw(2) << (int)dataGenes[start++] << std::dec; } std::cout << std::endl; } }
// Extract a gene bool DNA::extract_gene(unsigned int index, unsigned char*& gene, unsigned int& size) const { unsigned int amountgenes = genes(); assert(index < amountgenes); unsigned int i_start = gene_start(index); unsigned int i_end = gene_end(index); size = i_end-i_start; extract(i_start, i_end, gene); return true; }
// Replace a gene bool DNA::replace_gene(unsigned int index, unsigned char* gene, unsigned int size) { unsigned int amountgenes = genes(); assert(index < amountgenes); if (!erase_gene(index)) return false; if (index < amountgenes-1) return insert_gene(index, gene, size); else return push_back_gene(gene, size); }
RankTransformer(const std::string& aMatrixDir, const std::string& aOutputfile, bool aQuantileNormalisation = false, bool aUseInverse = false, bool aScramble = false) : mMatrixDir(aMatrixDir), mOutputFile(NULL), mData(NULL), mBuf(NULL), mRanks(NULL), mInvRanks(NULL), mRankAvgs(NULL), mRankCounts(NULL), mQuantileNormalisation(aQuantileNormalisation), mUseInverse(aUseInverse), mScramble(aScramble) { mOutputFile = fopen(aOutputfile.c_str(), "w"); fs::path data(mMatrixDir); if (mUseInverse) data /= "inverse_data"; else data /= "data"; mData = fopen(data.string().c_str(), "r"); fs::path genes(mMatrixDir); // Ugly hack: if we are using the inverted data, we simply swap out the // list of genes for the list of arrays, so that nGenes is actually the // number of arrays. This means that the normalisation occurs as normal, // except for each gene across arrays instead of for each array across // genes. if (mUseInverse) genes /= "arrays"; else genes /= "genes"; nGenes = 0; std::ifstream gs(genes.string().c_str()); while (gs.good()) { std::string l; std::getline(gs, l); if (!gs.good()) break; nGenes++; } mBuf = new double[nGenes]; mRanks = new double[nGenes]; if (mQuantileNormalisation) { mRankAvgs = new double[nGenes]; mRankCounts = new uint32_t[nGenes]; memset(mRankAvgs, 0, sizeof(double) * nGenes); memset(mRankCounts, 0, sizeof(uint32_t) * nGenes); } mInvRanks = new uint32_t[nGenes]; processAllData(); }
// Add a gene bool DNA::push_back_gene(unsigned char* gene, unsigned int size) { unsigned int amountgenes = genes(); if (amountgenes == 0) push_back(gene, size); else { unsigned char* gene_mod = (unsigned char*) malloc((size+1) * sizeof(unsigned char)); memcpy(gene_mod+1, gene, size); gene_mod[0] = 0; push_back(gene_mod, size+1); free(gene_mod); } return true; }
// Splits the genome into two parts at a position between two genes.
//
// Candidate cut positions are the midpoints between the end of one gene and
// the start of the next. Scanning starts at the gene whose 1-based number is
// round(n * wanted_ratio) and moves outward in both directions (i downward,
// j upward). A candidate is considered only if CanCut() accepts it; its ratio
// is the number of genes in [1, candidate] divided by the total gene count.
// The candidate whose ratio is closest to wanted_ratio wins.
//
// wanted_ratio - desired fraction of genes in the first part
// impTh        - a direction stops being scanned once this many cuttable
//                candidates in a row failed to improve the best ratio
// Returns two genomes: GetSubset(1, best_position) followed by
// GetSubset(best_position + 1, sequence.size()).
// NOTE(review): if no candidate is accepted, best_position stays 0.
// NOTE(review): ownership of the returned pointers is not visible here -
// confirm against GetSubset().
//
// Fix over the previous version: the trace message of the upward scan printed
// i (already decremented) instead of j.
mvec<Genome*> Genome::Split( float wanted_ratio, int impTh )
{
    /*
    % For each gene go and try to divide the genome after each gene, make
    % sure we do not cut any genes in the middle and select the ratio
    % closest to wanted_ratio
    */
    float best_ratio = FLT_MAX;
    int best_position = 0;
    int n = genes.size();

    // Number of consecutive non-improving candidates seen per direction.
    int last_impI = 0;
    int last_impJ = 0;

    int i = (int)(n * wanted_ratio + 0.5f) - 1;
    int j = (int)(n * wanted_ratio + 0.5f);

    while ((i > 0 && last_impI < impTh) || (j < n && last_impJ < impTh))
    {
        // Downward scan: gap between gene i and gene i+1.
        if (i > 0 && last_impI < impTh)
        {
            Feature* cur = genes(i);
            Feature* next = genes(i + 1);
            int cur_end = max(cur->indices);
            int next_start = min(next->indices);
            int middle = round(0.5 * (cur_end+next_start));

            if (CanCut(middle))
            {
                float ratio = CountGenes(1, middle) / (float)n;
                if (fabsf(ratio - wanted_ratio) < fabsf(best_ratio - wanted_ratio))
                {
                    best_ratio = ratio;
                    d_trace("[+] (%d) New best ratio attained - %f\n", i, best_ratio);
                    best_position = middle;
                    last_impI = 0;
                }
                else
                {
                    last_impI = last_impI + 1;
                }
            }
        }
        i--;

        // Upward scan: gap between gene j and gene j+1.
        if (j < n && last_impJ < impTh)
        {
            Feature* cur = genes(j);
            Feature* next = genes(j + 1);
            int cur_end = max(cur->indices);
            int next_start = min(next->indices);
            int middle = round(0.5 * (cur_end+next_start));

            if (CanCut(middle))
            {
                float ratio = CountGenes(1, middle) / (float) n;
                if (fabsf(ratio - wanted_ratio) < fabsf(best_ratio - wanted_ratio))
                {
                    best_ratio = ratio;
                    d_trace("[+] (%d) New best ratio attained - %f\n", j, best_ratio);
                    best_position = middle;
                    last_impJ = 0;
                }
                else
                {
                    last_impJ = last_impJ + 1;
                }
            }
        }
        j++;
    }

    // % BTW, this works only because the genes are sorted in increasing order
    // % of their lower index (lower != first)
    d_trace("[i] Cutting sequence at %d\n", best_position);

    mvec<Genome*> r;

    r.push_back(GetSubset(1, best_position));
    // train
    // train.Sequence = g.Sequence(1:best_position);
    // train.gene = get_all_genes(f, 1, best_position);

    r.push_back(GetSubset(best_position + 1, sequence.size()));
    // test.Sequence = g.Sequence(best_position + 1:seq_length);
    // test.gene = get_all_genes(f, best_position + 1, seq_length);
    // test.gene = shift_genes(test.gene, best_position);

    return r;
}
// Builds an offspring from two parents: HUX crossover followed by a local
// search restricted to the genes in which the parents differ.
//
// The solution starts as a copy of parent_a. The `heuristic` argument is not
// used inside this constructor body.
Merz1999Solution::Merz1999Solution(const QUBOInstance& qi,
                                   const Merz1999Solution& parent_a,
                                   const Merz1999Solution& parent_b,
                                   QUBOHeuristic *heuristic) :
  QUBOSolution(parent_a) {
  // Genes on which both parents agree; they stay frozen during local search.
  std::vector<bool> frozen(N_, false);

  // http://en.wikipedia.org/wiki/Crossover_(genetic_algorithm)#Uniform_Crossover_and_Half_Uniform_Crossover
  // Half uniform crossover (HUX) swaps exactly half of the non-matching
  // bits: compute the Hamming distance between the parents, halve it, and
  // flip that many of the differing genes.
  int flips_remaining = parent_a.SymmetricDifference(parent_b) / 2;

  // Visit the genes in a random order so the flipped half is chosen randomly.
  std::vector<int> visit_order(N_, 0);
  for (int k = 0; k < N_; k++) {
    visit_order[k] = k;
  }
  std::random_shuffle(visit_order.begin(), visit_order.end());

  const std::vector<int>& from_a = parent_a.get_assignments();
  const std::vector<int>& from_b = parent_b.get_assignments();
  for (int step = 0; step < N_; step++) {
    const int gene = visit_order[step];

    if (from_a[gene] == from_b[gene]) {
      // Parents agree here: remember it and leave the value alone.
      frozen[gene] = true;
      continue;
    }

    // Parents disagree. The paper produces a single child, so the value of
    // parent_a is kept by default and flipped only while flips are left.
    if (flips_remaining > 0) {
      UpdateCutValues(gene);
      --flips_remaining;
    }
  }

  // PAPER: The local search applied to the resulting offspring after
  //        recombination is restricted to a region of the search space
  //        defined by the two parents: the genes with equal values in
  //        the two parents are not modified during local search.
  for (;;) {
    // Pick the non-frozen gene with the largest strictly positive gain.
    double top_gain = 0.0;
    int top_gene = -1;
    for (int k = 0; k < N_; ++k) {
      if (!frozen[k] && diff_weights_[k] > top_gain) {
        top_gain = diff_weights_[k];
        top_gene = k;
      }
    }

    // No more profitable moves
    if (top_gene < 0 || !ImprovingMove(top_gene)) {
      break;
    }

    // Update the diff_weights_ variable and objective
    UpdateCutValues(top_gene);
  }
}
// Returns the index of the last data part of a gene (exclusive) // NOTE: this value is not guaranteed to be accessible! unsigned int DNA::gene_end(unsigned int index) const { if (index < genes()-1) return separator(index+1); else return dataSize; }