bool corpus_gen::generate(const char* pCmd_line) { static const command_line_params::param_desc param_desc_array[] = { { "corpus_gen", 0, false }, { "in", 1, true }, { "deep", 0, false }, { "blockpercentage", 1, false }, { "width", 1, false }, { "height", 1, false }, { "alpha", 0, false }, }; command_line_params params; if (!params.parse(pCmd_line, CRNLIB_ARRAY_SIZE(param_desc_array), param_desc_array, true)) return false; if (!params.has_key("in")) { console::error("Must specify one or more input files using the /in option!"); return false; } uint num_dst_blocks_x = params.get_value_as_int("width", 0, 4096, 128, 4096); num_dst_blocks_x = (num_dst_blocks_x + 3) / 4; uint num_dst_blocks_y = params.get_value_as_int("height", 0, 4096, 128, 4096); num_dst_blocks_y = (num_dst_blocks_y + 3) / 4; const uint total_dst_blocks = num_dst_blocks_x * num_dst_blocks_y; image_u8 dst_img(num_dst_blocks_x * 4, num_dst_blocks_y * 4); uint next_dst_block = 0; uint total_dst_images = 0; random rm; block_hash_map block_hash; block_hash.reserve(total_dst_blocks); uint total_images_loaded = 0; uint total_blocks_written = 0; command_line_params::param_map_const_iterator it = params.begin(); for ( ; it != params.end(); ++it) { if (it->first != "in") continue; if (it->second.m_values.empty()) { console::error("Must follow /in parameter with a filename!\n"); return false; } for (uint in_value_index = 0; in_value_index < it->second.m_values.size(); in_value_index++) { const dynamic_string& filespec = it->second.m_values[in_value_index]; find_files file_finder; if (!file_finder.find(filespec.get_ptr(), find_files::cFlagAllowFiles | (params.has_key("deep") ? find_files::cFlagRecursive : 0))) { console::warning("Failed finding files: %s", filespec.get_ptr()); continue; } if (file_finder.get_files().empty()) { console::warning("No files found: %s", filespec.get_ptr()); return false; } const find_files::file_desc_vec& files = file_finder.get_files(); for (uint file_index = 0; file_index < files.size(); file_index++) { const find_files::file_desc& file_desc = files[file_index]; console::printf("Loading image: %s", file_desc.m_fullname.get_ptr()); image_u8 img; if (!image_utils::read_from_file(img, file_desc.m_fullname.get_ptr(), 0)) { console::warning("Failed loading image file: %s", file_desc.m_fullname.get_ptr()); continue; } if (!params.has_key("alpha")) { for (uint y = 0; y < img.get_height(); y++) for (uint x = 0; x < img.get_width(); x++) img(x, y).a = 255; } total_images_loaded++; uint width = img.get_width(); uint height = img.get_height(); uint num_blocks_x = (width + 3) / 4; uint num_blocks_y = (height + 3) / 4; uint total_blocks = num_blocks_x * num_blocks_y; float percentage = params.get_value_as_float("blockpercentage", 0, .1f, .001f, 1.0f); uint total_rand_blocks = math::maximum<uint>(1U, (uint)(total_blocks * percentage)); crnlib::vector<uint> remaining_blocks(total_blocks); for (uint i = 0; i < total_blocks; i++) remaining_blocks[i] = i; uint num_blocks_remaining = total_rand_blocks; while (num_blocks_remaining) { if (remaining_blocks.empty()) break; uint rand_block_index = rm.irand(0, remaining_blocks.size()); uint block_index = remaining_blocks[rand_block_index]; remaining_blocks.erase_unordered(rand_block_index); uint block_y = block_index / num_blocks_x; uint block_x = block_index % num_blocks_x; block b; for (uint y = 0; y < 4; y++) { for (uint x = 0; x < 4; x++) { b.m_c[x+y*4] = img.get_clamped(block_x*4+x, block_y*4+y); } } if (!block_hash.insert(b).second) continue; if (block_hash.size() == total_dst_blocks) { block_hash.clear(); block_hash.reserve(total_dst_blocks); } uint dst_block_x = next_dst_block % num_dst_blocks_x; uint dst_block_y = next_dst_block / num_dst_blocks_x; for (uint y = 0; y < 4; y++) { for (uint x = 0; x < 4; x++) { dst_img(dst_block_x * 4 + x, dst_block_y * 4 + y) = b.m_c[x + y*4]; } } next_dst_block++; if (total_dst_blocks == next_dst_block) { sort_blocks(dst_img); dynamic_string dst_filename(cVarArg, "test_%u.tga", total_dst_images); console::printf("Writing image: %s", dst_filename.get_ptr()); image_utils::write_to_file(dst_filename.get_ptr(), dst_img, 0); dst_img.set_all(color_quad_u8::make_black()); next_dst_block = 0; total_dst_images++; } total_blocks_written++; num_blocks_remaining--; } } // file_index } // in_value_index } if (next_dst_block) { sort_blocks(dst_img); dynamic_string dst_filename(cVarArg, "test_%u.tga", total_dst_images); console::printf("Writing image: %s", dst_filename.get_ptr()); image_utils::write_to_file(dst_filename.get_ptr(), dst_img, 0); next_dst_block = 0; total_dst_images++; } console::printf("Found %u input images, %u output images, %u total blocks", total_images_loaded, total_dst_images, total_blocks_written); return true; }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int sort_type) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; char *fnout = 0; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = realloc(buf, max_k * sizeof(void*)); memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; if (b->data_len < b->m_data>>2) { // shrink b->m_data = b->data_len; kroundup32(b->m_data); b->data = realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); // output file name fnout = calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); // write the final output if (n_files == 0) { // a single block char mode[8]; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); sort_aux_core(k, buf, sort_type); #ifndef _PBGZF_USE write_buffer(fnout, mode, k, buf, header, n_threads); #else write_buffer(fnout, mode, k, buf, header); #endif } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } #ifndef _PBGZF_USE bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); #else bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, level); #endif for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } free(fnout); // free for (k = 0; k < max_k; ++k) { if (!buf[k]) continue; free(buf[k]->data); free(buf[k]); } free(buf); bam_header_destroy(header); bam_close(fp); }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) { int n, ret, k, i; size_t mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; g_is_by_qname = is_by_qname; n = k = 0; mem = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); // write sub files for (;;) { if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; mem += ret; ++k; if (mem >= max_mem) { sort_blocks(n++, k, buf, prefix, header, 0); mem = 0; k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); else { // then merge char **fns, *fnout; fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); sort_blocks(n++, k, buf, prefix, header, 0); fnout = (char*)calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); fns = (char**)calloc(n, sizeof(char*)); for (i = 0; i < n; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); free(fnout); for (i = 0; i < n; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } for (k = 0; (size_t)k < max_mem / BAM_CORE_SIZE; ++k) { if (buf[k]) { free(buf[k]->data); free(buf[k]); } } free(buf); bam_header_destroy(header); bam_close(fp); }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the temporary files (prefix.NNNN.bam are written) @param fnout name of the final output file to be written @param modeout sam_open() mode to be used to create the final output file @param max_mem approxiate maximum memory (very inaccurate) @return 0 for successful sorting, negative on errors @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const char *fnout, const char *modeout, size_t _max_mem, int n_threads) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_hdr_t *header; samFile *fp; bam1_t *b, **buf; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = NULL; fp = sam_open(fn, "r"); if (fp == NULL) { fprintf(pysamerr, "[bam_sort_core] fail to open file %s\n", fn); return -1; } header = sam_hdr_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t kk, old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*)); for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL; } if (buf[k] == NULL) buf[k] = bam_init1(); b = buf[k]; if ((ret = sam_read1(fp, header, b)) < 0) break; if (b->l_data < b->m_data>>2) { // shrink b->m_data = b->l_data; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); mem = k = 0; } } if (ret != -1) fprintf(pysamerr, "[bam_sort_core] truncated file. Continue anyway.\n"); // write the final output if (n_files == 0) { // a single block ks_mergesort(sort, k, buf, 0); write_buffer(fnout, modeout, k, buf, header, n_threads); } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); fprintf(pysamerr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads) < 0) { // Propagate bam_merge_core2() failure; it has already emitted a // message explaining the failure, so no further message is needed. return -1; } for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } // free for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]); free(buf); bam_hdr_destroy(header); sam_close(fp); return 0; }
/*===========================================================================* * rw_scattered * *===========================================================================*/ static void rw_scattered( dev_t dev, /* major-minor device number */ struct buf **bufq, /* pointer to array of buffers */ unsigned int bufqsize, /* number of buffers */ int rw_flag /* READING or WRITING */ ) { /* Read or write scattered data from a device. */ register struct buf *bp; register iovec_t *iop; static iovec_t iovec[NR_IOREQS]; off_t pos; unsigned int i, iov_per_block; #if !defined(NDEBUG) unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize; #endif /* !defined(NDEBUG) */ if(bufqsize == 0) return; #if !defined(NDEBUG) /* for READING, check all buffers on the list are obtained and held * (count > 0) */ if (rw_flag == READING) { assert(bufqsize <= LMFS_MAX_PREFETCH); for(i = 0; i < bufqsize; i++) { assert(bufq[i] != NULL); assert(bufq[i]->lmfs_count > 0); } /* therefore they are all 'in use' and must be at least this many */ assert(start_in_use >= start_bufqsize); } assert(dev != NO_DEV); assert(fs_block_size > 0); assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS); #endif /* !defined(NDEBUG) */ /* For WRITING, (Shell) sort buffers on lmfs_blocknr. * For READING, the buffers are already sorted. */ if (rw_flag == WRITING) sort_blocks(bufq, bufqsize); /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything * went fine, otherwise the error code for the first failed transfer. */ while (bufqsize > 0) { unsigned int p, nblocks = 0, niovecs = 0; int r; for (iop = iovec; nblocks < bufqsize; nblocks++) { vir_bytes vdata, blockrem; bp = bufq[nblocks]; if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks) break; blockrem = bp->lmfs_bytes; iov_per_block = howmany(blockrem, PAGE_SIZE); if (niovecs > NR_IOREQS - iov_per_block) break; vdata = (vir_bytes) bp->data; for(p = 0; p < iov_per_block; p++) { vir_bytes chunk = blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE; iop->iov_addr = vdata; iop->iov_size = chunk; vdata += PAGE_SIZE; blockrem -= chunk; iop++; niovecs++; } assert(p == iov_per_block); assert(blockrem == 0); } assert(nblocks > 0); assert(niovecs > 0 && niovecs <= NR_IOREQS); pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size; if (rw_flag == READING) r = bdev_gather(dev, pos, iovec, niovecs, BDEV_NOFLAGS); else r = bdev_scatter(dev, pos, iovec, niovecs, BDEV_NOFLAGS); /* Harvest the results. The driver may have returned an error, or it * may have done less than what we asked for. */ if (r < 0) { printf("fs cache: I/O error %d on device %d/%d, " "block %"PRIu64"\n", r, major(dev), minor(dev), bufq[0]->lmfs_blocknr); } for (i = 0; i < nblocks; i++) { bp = bufq[i]; if (r < (ssize_t)bp->lmfs_bytes) { /* Transfer failed. */ if (i == 0) { bp->lmfs_dev = NO_DEV; /* Invalidate block */ } break; } if (rw_flag == READING) { lmfs_put_block(bp); } else { MARKCLEAN(bp); } r -= bp->lmfs_bytes; } bufq += i; bufqsize -= i; if (rw_flag == READING) { /* Don't bother reading more than the device is willing to * give at this time. Don't forget to release those extras. */ while (bufqsize > 0) { bp = *bufq++; bp->lmfs_dev = NO_DEV; /* invalidate block */ lmfs_put_block(bp); bufqsize--; } } if (rw_flag == WRITING && i == 0) { /* We're not making progress, this means we might keep * looping. Buffers remain dirty if un-written. Buffers are * lost if invalidate()d or LRU-removed while dirty. This * is better than keeping unwritable blocks around forever.. */ break; } } #if !defined(NDEBUG) if(rw_flag == READING) { assert(start_in_use >= start_bufqsize); /* READING callers assume all bufs are released. */ assert(start_in_use - start_bufqsize == bufs_in_use); } #endif /* !defined(NDEBUG) */ }