void selectAlignments(std::vector<std::vector<Chain*>>& dst, DbAlignment*** alignments, int32_t* alignments_lengths, Chain** queries, int32_t queries_length, float threshold) { dst.resize(queries_length); fprintf(stderr, "** Selecting alignments with median threshold: %.2f **\n", threshold); std::vector<ThreadPoolTask*> thread_tasks(queries_length, nullptr); for (int32_t i = 0; i < queries_length; ++i) { if (alignments_lengths[i] == 0) { continue; } auto thread_data = new ThreadSelectionData(dst[i], alignments[i], alignments_lengths[i], queries[i], threshold); thread_tasks[i] = threadPoolSubmit(threadSelectAlignments, (void*) thread_data); } for (int32_t i = 0; i < queries_length; ++i) { threadPoolTaskWait(thread_tasks[i]); threadPoolTaskDelete(thread_tasks[i]); queryLog(i + 1, queries_length); } fprintf(stderr, "\n\n"); }
uint64_t searchDatabase(std::vector<std::vector<uint32_t>>& dst, const std::string& database_path, Chain** queries, int32_t queries_length, uint32_t kmer_length, uint32_t max_candidates, uint32_t num_threads) { fprintf(stderr, "** Searching database for candidate sequences **\n"); std::shared_ptr<Hash> query_hash = createHash(queries, queries_length, 0, queries_length, kmer_length); Chain** database = nullptr; int database_length = 0; int database_start = 0; FILE* handle = nullptr; int serialized = 0; readFastaChainsPartInit(&database, &database_length, &handle, &serialized, database_path.c_str()); uint64_t database_cells = 0; std::vector<float> min_scores(queries_length, 1000000.0); std::vector<std::vector<std::vector<Candidate>>> candidates(num_threads); uint32_t part = 1; float part_size = database_chunk / (float) 1000000000; while (true) { int status = 1; status &= readFastaChainsPart(&database, &database_length, handle, serialized, database_chunk); databaseLog(part, part_size, 0); uint32_t database_split_size = (database_length - database_start) / num_threads; std::vector<uint32_t> database_splits(num_threads + 1, database_start); for (uint32_t i = 1; i < num_threads; ++i) { database_splits[i] += i * database_split_size; } database_splits[num_threads] = database_length; std::vector<ThreadPoolTask*> thread_tasks(num_threads, nullptr); for (uint32_t i = 0; i < num_threads; ++i) { auto thread_data = new ThreadSearchData(query_hash, queries_length, min_scores, database, database_splits[i], database_splits[i + 1], kmer_length, max_candidates, candidates[i], i == num_threads - 1, part, part_size); thread_tasks[i] = threadPoolSubmit(threadSearchDatabase, (void*) thread_data); } for (uint32_t i = 0; i < num_threads; ++i) { threadPoolTaskWait(thread_tasks[i]); threadPoolTaskDelete(thread_tasks[i]); } for (int i = database_start; i < database_length; ++i) { database_cells += chainGetLength(database[i]); chainDelete(database[i]); database[i] = nullptr; } // merge candidates from all threads for (int32_t i = 0; i < queries_length; ++i) { for (uint32_t j = 1; j < num_threads; ++j) { if (candidates[j][i].empty()) { continue; } candidates[0][i].insert(candidates[0][i].end(), candidates[j][i].begin(), candidates[j][i].end()); std::vector<Candidate>().swap(candidates[j][i]); } if (num_threads > 1) { std::sort(candidates[0][i].begin(), candidates[0][i].end()); if (candidates[0][i].size() > max_candidates) { std::vector<Candidate> tmp(candidates[0][i].begin(), candidates[0][i].begin() + max_candidates); candidates[0][i].swap(tmp); } } if (!candidates[0][i].empty()) { min_scores[i] = candidates[0][i].back().score; } } databaseLog(part, part_size, 100); ++part; if (status == 0) { break; } database_start = database_length; } fprintf(stderr, "\n\n"); fclose(handle); deleteFastaChains(database, database_length); dst.clear(); dst.resize(queries_length); for (int32_t i = 0; i < queries_length; ++i) { dst[i].reserve(candidates[0][i].size()); for (uint32_t j = 0; j < candidates[0][i].size(); ++j) { dst[i].emplace_back(candidates[0][i][j].id); } std::vector<Candidate>().swap(candidates[0][i]); std::sort(dst[i].begin(), dst[i].end()); } return database_cells; }
static void scoreCpu(int** scores_, int type, Chain** queries, int queriesLen, Chain** database_, int databaseLen_, Scorer* scorer, int* indexes, int indexesLen) { TIMER_START("CPU database scoring"); *scores_ = (int*) malloc(queriesLen * databaseLen_ * sizeof(int)); int i, j; int* scores; Chain** database; int databaseLen; //************************************************************************** // INIT STRUCTURES if (indexes == NULL) { scores = *scores_; database = database_; databaseLen = databaseLen_; } else { scores = (int*) malloc(indexesLen * queriesLen * sizeof(int)); database = (Chain**) malloc(indexesLen * sizeof(Chain*)); databaseLen = indexesLen; for (i = 0; i < indexesLen; ++i) { database[i] = database_[indexes[i]]; } } //************************************************************************** //************************************************************************** // SOLVE MULTITHREADED int maxLen = (queriesLen * databaseLen) / CPU_THREAD_CHUNK + queriesLen; int length = 0; size_t contextsSize = maxLen * sizeof(ScoreCpuContext); ScoreCpuContext* contexts = (ScoreCpuContext*) malloc(contextsSize); size_t tasksSize = maxLen * sizeof(ThreadPoolTask*); ThreadPoolTask** tasks = (ThreadPoolTask**) malloc(tasksSize); for (i = 0; i < queriesLen; ++i) { for (j = 0; j < databaseLen; j += CPU_THREAD_CHUNK) { contexts[length].scores = scores + i * databaseLen + j; contexts[length].type = type; contexts[length].query = queries[i]; contexts[length].database = database + j; contexts[length].databaseLen = MIN(CPU_THREAD_CHUNK, databaseLen - j); contexts[length].scorer = scorer; tasks[length] = threadPoolSubmit(scoreCpuThread, &(contexts[length])); length++; } } for (i = 0; i < length; ++i) { threadPoolTaskWait(tasks[i]); threadPoolTaskDelete(tasks[i]); } free(tasks); free(contexts); //************************************************************************** //************************************************************************** // SAVE RESULTS if (indexes != NULL) { for (i = 0; i < queriesLen; ++i) { for (j = 0; j < databaseLen_; ++j) { (*scores_)[i * databaseLen_ + j] = NO_SCORE; } for (j = 0; j < indexesLen; ++j) { (*scores_)[i * databaseLen_ + indexes[j]] = scores[i * indexesLen + j]; } } free(database); free(scores); } //************************************************************************** TIMER_STOP; }
static void databaseSearchStep(DbAlignment*** dbAlignments, int* dbAlignmentsLen, int type, Chain** queries, int queriesStart, int queriesLen, ChainDatabase* chainDatabase, Scorer* scorer, int maxAlignments, ValueFunction valueFunction, void* valueFunctionParam, double valueThreshold, int* indexes, int indexesLen, int* cards, int cardsLen) { Chain** database = chainDatabase->database; int databaseStart = chainDatabase->databaseStart; int databaseLen = chainDatabase->databaseLen; long databaseElems = chainDatabase->databaseElems; ChainDatabaseGpu* chainDatabaseGpu = chainDatabase->chainDatabaseGpu; int i, j, k; //************************************************************************** // CALCULATE CELL NUMBER long queriesElems = 0; for (i = 0; i < queriesLen; ++i) { queriesElems += chainGetLength(queries[i]); } if (indexes != NULL) { databaseElems = 0; for (i = 0; i < indexesLen; ++i) { databaseElems += chainGetLength(database[indexes[i]]); } } long long cells = (long long) queriesElems * databaseElems; //************************************************************************** //************************************************************************** // CALCULATE SCORES int* scores; if (cells < GPU_DB_MIN_CELLS || cardsLen == 0) { scoreCpu(&scores, type, queries, queriesLen, database, databaseLen, scorer, indexes, indexesLen); } else { scoreDatabasesGpu(&scores, type, queries, queriesLen, chainDatabaseGpu, scorer, indexes, indexesLen, cards, cardsLen, NULL); } //************************************************************************** //************************************************************************** // EXTRACT BEST CHAINS AND SAVE THEIR DATA MULTITHREADED TIMER_START("Extract best"); DbAlignmentData** dbAlignmentsData = (DbAlignmentData**) malloc(queriesLen * sizeof(DbAlignmentData*)); ExtractContext* eContexts = (ExtractContext*) malloc(queriesLen * sizeof(ExtractContext)); for (i = 0; i < queriesLen; ++i) { eContexts[i].dbAlignmentData = &(dbAlignmentsData[i]); eContexts[i].dbAlignmentLen = &(dbAlignmentsLen[i]); eContexts[i].query = queries[i]; eContexts[i].database = database; eContexts[i].databaseLen = databaseLen; eContexts[i].scores = scores + i * databaseLen; eContexts[i].maxAlignments = maxAlignments; eContexts[i].valueFunction = valueFunction; eContexts[i].valueFunctionParam = valueFunctionParam; eContexts[i].valueThreshold = valueThreshold; eContexts[i].cards = cards; eContexts[i].cardsLen = cardsLen; } if (cardsLen == 0) { size_t tasksSize = queriesLen * sizeof(ThreadPoolTask*); ThreadPoolTask** tasks = (ThreadPoolTask**) malloc(tasksSize); for (i = 0; i < queriesLen; ++i) { tasks[i] = threadPoolSubmit(extractThread, (void*) &(eContexts[i])); } for (i = 0; i < queriesLen; ++i) { threadPoolTaskWait(tasks[i]); threadPoolTaskDelete(tasks[i]); } free(tasks); } else { int chunks = MIN(queriesLen, cardsLen); int cardsChunk = cardsLen / chunks; int cardsAdd = cardsLen % chunks; int cardsOff = 0; int contextsChunk = queriesLen / chunks; int contextsAdd = queriesLen % chunks; int contextsOff = 0; size_t contextsSize = chunks * sizeof(ExtractContexts); ExtractContexts* contexts = (ExtractContexts*) malloc(contextsSize); size_t tasksSize = chunks * sizeof(Thread); Thread* tasks = (Thread*) malloc(tasksSize); for (i = 0; i < chunks; ++i) { int* cards_ = cards + cardsOff; int cardsLen_ = cardsChunk + (i < cardsAdd); cardsOff += cardsLen_; ExtractContext* contexts_ = eContexts + contextsOff; int contextsLen_ = contextsChunk + (i < contextsAdd); contextsOff += contextsLen_; for (j = 0; j < contextsLen_; ++j) { contexts_[j].cards = cards_; contexts_[j].cardsLen = cardsLen_; } contexts[i].contexts = contexts_; contexts[i].contextsLen = contextsLen_; threadCreate(&(tasks[i]), extractsThread, &(contexts[i])); } for (i = 0; i < chunks; ++i) { threadJoin(tasks[i]); } free(tasks); free(contexts); } free(eContexts); free(scores); // this is big, release immediately TIMER_STOP; //************************************************************************** //************************************************************************** // ALIGN BEST TARGETS MULTITHREADED TIMER_START("Database aligning"); // create structure for (i = 0; i < queriesLen; ++i) { size_t dbAlignmentsSize = dbAlignmentsLen[i] * sizeof(DbAlignment*); dbAlignments[i] = (DbAlignment**) malloc(dbAlignmentsSize); } // count tasks int aTasksLen = 0; for (i = 0; i < queriesLen; ++i) { aTasksLen += dbAlignmentsLen[i]; } size_t aTasksSize = aTasksLen * sizeof(ThreadPoolTask*); ThreadPoolTask** aTasks = (ThreadPoolTask**) malloc(aTasksSize); size_t aContextsSize = aTasksLen * sizeof(AlignContext); AlignContext* aContextsCpu = (AlignContext*) malloc(aContextsSize); AlignContext* aContextsGpu = (AlignContext*) malloc(aContextsSize); int aContextsCpuLen = 0; int aContextsGpuLen = 0; for (i = 0, k = 0; i < queriesLen; ++i, ++k) { Chain* query = queries[i]; int rows = chainGetLength(query); for (j = 0; j < dbAlignmentsLen[i]; ++j, ++k) { DbAlignmentData data = dbAlignmentsData[i][j]; Chain* target = database[data.idx]; int cols = chainGetLength(target); long long cells = (long long) rows * cols; AlignContext* context; if (cols < GPU_MIN_LEN || cells < GPU_MIN_CELLS || cardsLen == 0) { context = &(aContextsCpu[aContextsCpuLen++]); context->cards = NULL; context->cardsLen = 0; } else { context = &(aContextsGpu[aContextsGpuLen++]); } context->dbAlignment = &(dbAlignments[i][j]); context->type = type; context->query = query; context->queryIdx = i; context->target = target; context->targetIdx = data.idx + databaseStart; context->value = data.value; context->score = data.score; context->scorer = scorer; context->cells = cells; } } LOG("Aligning %d cpu, %d gpu", aContextsCpuLen, aContextsGpuLen); // run cpu tasks int aCpuTasksLen; AlignContextsPacked* aContextsCpuPacked; if (aContextsCpuLen < 10000) { aCpuTasksLen = aContextsCpuLen; aContextsCpuPacked = NULL; for (i = 0; i < aCpuTasksLen; ++i) { aTasks[i] = threadPoolSubmit(alignThread, &(aContextsCpu[i])); } } else { aCpuTasksLen = aContextsCpuLen / CPU_PACKED_CHUNK; aCpuTasksLen += (aContextsCpuLen % CPU_PACKED_CHUNK) != 0; size_t contextsSize = aCpuTasksLen * sizeof(AlignContextsPacked); AlignContextsPacked* contexts = (AlignContextsPacked*) malloc(contextsSize); for (i = 0; i < aCpuTasksLen; ++i) { int length = MIN(CPU_PACKED_CHUNK, aContextsCpuLen - i * CPU_PACKED_CHUNK); contexts[i].contexts = aContextsCpu + i * CPU_PACKED_CHUNK; contexts[i].contextsLen = length; } for (i = 0; i < aCpuTasksLen; ++i) { aTasks[i] = threadPoolSubmit(alignsPackedThread, &(contexts[i])); } aContextsCpuPacked = contexts; } if (aContextsGpuLen) { int chunks = MIN(aContextsGpuLen, cardsLen); size_t contextsSize = chunks * sizeof(AlignContexts); AlignContexts* contexts = (AlignContexts*) malloc(contextsSize); size_t balancedSize = chunks * aContextsGpuLen * sizeof(AlignContext*); AlignContext** balanced = (AlignContext**) malloc(balancedSize); // set phony contexts, init data for (i = 0; i < chunks; ++i) { contexts[i].contexts = balanced + i * aContextsGpuLen; contexts[i].contextsLen = 0; contexts[i].cells = 0; } // balance tasks by round roobin, chunks are pretty small (CUDA cards) for (i = 0; i < aContextsGpuLen; ++i) { int minIdx = 0; long long min = contexts[0].cells; for (j = 1; j < chunks; ++j) { if (contexts[j].cells < min) { min = contexts[j].cells; minIdx = j; } } AlignContext* context = &(aContextsGpu[i]); contexts[minIdx].contexts[contexts[minIdx].contextsLen++] = context; contexts[minIdx].cells += context->cells; } // set context cards int cardsChunk = cardsLen / chunks; int cardsAdd = cardsLen % chunks; int cardsOff = 0; for (i = 0; i < chunks; ++i) { int cCardsLen = cardsChunk + (i < cardsAdd); int* cCards = cards + cardsOff; cardsOff += cCardsLen; for (j = 0; j < contexts[i].contextsLen; ++j) { contexts[i].contexts[j]->cards = cCards; contexts[i].contexts[j]->cardsLen = cCardsLen; } } size_t tasksSize = chunks * sizeof(Thread); Thread* tasks = (Thread*) malloc(tasksSize); // run gpu tasks first for (i = 0; i < chunks; ++i) { threadCreate(&(tasks[i]), alignsThread, &(contexts[i])); } // wait for gpu tasks to finish for (i = 0; i < chunks; ++i) { threadJoin(tasks[i]); } free(balanced); free(contexts); } // wait for cpu tasks for (i = 0; i < aCpuTasksLen; ++i) { threadPoolTaskWait(aTasks[i]); threadPoolTaskDelete(aTasks[i]); } free(aContextsCpuPacked); free(aContextsCpu); free(aContextsGpu); free(aTasks); TIMER_STOP; //************************************************************************** //************************************************************************** // CLEAN MEMORY for (i = 0; i < queriesLen; ++i) { free(dbAlignmentsData[i]); } free(dbAlignmentsData); //************************************************************************** }