Esempio n. 1
0
static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout)
{
	char *name, mode[3];
	int i;
	bamFile fp;
	ks_mergesort(sort, k, buf, 0);
	name = (char*)calloc(strlen(prefix) + 20, 1);
	if (n >= 0) {
		sprintf(name, "%s.%.4d.bam", prefix, n);
		strcpy(mode, "w1");
	} else {
		sprintf(name, "%s.bam", prefix);
		strcpy(mode, "w");
	}
	fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode);
	if (fp == 0) {
		fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name);
		free(name);
		// FIXME: possible memory leak
		return;
	}
	free(name);
	bam_header_write(fp, h);
	for (i = 0; i < k; ++i)
		bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);
	bam_close(fp);
}
Esempio n. 2
0
static void *worker(void *data)
{
    worker_t *w = (worker_t*)data;
    char *name;
    ks_mergesort(sort, w->buf_len, w->buf, 0);
    name = (char*)calloc(strlen(w->prefix) + 20, 1);
    sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
    write_buffer(name, "wb1", w->buf_len, w->buf, w->h, 0);
    free(name);
    return 0;
}
Esempio n. 3
0
static void
sort_aux_core(int k, bam1_p *buf, int sort_type)
{
  switch(sort_type) {
    case 0:
        ks_mergesort(sort, k, buf, 0);
        break;
    case 1:
        ks_introsort(sort, k, buf);
        break;
    case 2:
        ks_combsort(sort, k, buf);
        break;
    case 3:
    default:
        ks_heapmake(sort, k, buf);
        ks_heapsort(sort, k, buf);
        break;
  }
}
Esempio n. 4
0
/*!
  @abstract Sort an unsorted BAM file based on the chromosome order
  and the leftmost position of an alignment

  @param  is_by_qname whether to sort by query name
  @param  fn       name of the file to be sorted
  @param  prefix   prefix of the temporary files (prefix.NNNN.bam are written)
  @param  fnout    name of the final output file to be written
  @param  modeout  sam_open() mode to be used to create the final output file
  @param  max_mem  approxiate maximum memory (very inaccurate)
  @return 0 for successful sorting, negative on errors

  @discussion It may create multiple temporary subalignment files
  and then merge them by calling bam_merge_core(). This function is
  NOT thread safe.
 */
int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const char *fnout, const char *modeout, size_t _max_mem, int n_threads)
{
    int ret, i, n_files = 0;
    size_t mem, max_k, k, max_mem;
    bam_hdr_t *header;
    samFile *fp;
    bam1_t *b, **buf;

    if (n_threads < 2) n_threads = 1;
    g_is_by_qname = is_by_qname;
    max_k = k = 0; mem = 0;
    max_mem = _max_mem * n_threads;
    buf = NULL;
    fp = sam_open(fn, "r");
    if (fp == NULL) {
        fprintf(pysamerr, "[bam_sort_core] fail to open file %s\n", fn);
        return -1;
    }
    header = sam_hdr_read(fp);
    if (is_by_qname) change_SO(header, "queryname");
    else change_SO(header, "coordinate");
    // write sub files
    for (;;) {
        if (k == max_k) {
            size_t kk, old_max = max_k;
            max_k = max_k? max_k<<1 : 0x10000;
            buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*));
            for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL;
        }
        if (buf[k] == NULL) buf[k] = bam_init1();
        b = buf[k];
        if ((ret = sam_read1(fp, header, b)) < 0) break;
        if (b->l_data < b->m_data>>2) { // shrink
            b->m_data = b->l_data;
            kroundup32(b->m_data);
            b->data = (uint8_t*)realloc(b->data, b->m_data);
        }
        mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
        ++k;
        if (mem >= max_mem) {
            n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
            mem = k = 0;
        }
    }
    if (ret != -1)
        fprintf(pysamerr, "[bam_sort_core] truncated file. Continue anyway.\n");
    // write the final output
    if (n_files == 0) { // a single block
        ks_mergesort(sort, k, buf, 0);
        write_buffer(fnout, modeout, k, buf, header, n_threads);
    } else { // then merge
        char **fns;
        n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
        fprintf(pysamerr, "[bam_sort_core] merging from %d files...\n", n_files);
        fns = (char**)calloc(n_files, sizeof(char*));
        for (i = 0; i < n_files; ++i) {
            fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
            sprintf(fns[i], "%s.%.4d.bam", prefix, i);
        }
        if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads) < 0) {
            // Propagate bam_merge_core2() failure; it has already emitted a
            // message explaining the failure, so no further message is needed.
            return -1;
        }
        for (i = 0; i < n_files; ++i) {
            unlink(fns[i]);
            free(fns[i]);
        }
        free(fns);
    }
    // free
    for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
    free(buf);
    bam_hdr_destroy(header);
    sam_close(fp);
    return 0;
}
Esempio n. 5
0
/*!
  @abstract Sort an unsorted BAM file based on the chromosome order
  and the leftmost position of an alignment

  @param  is_by_qname whether to sort by query name
  @param  fn       name of the file to be sorted
  @param  prefix   prefix of the output and the temporary files; upon
	                   sucessess, prefix.bam will be written.
  @param  max_mem  approxiate maximum memory (very inaccurate)
  @param full_path the given output path is the full path and not just the prefix

  @discussion It may create multiple temporary subalignment files
  and then merge them by calling bam_merge_core(). This function is
  NOT thread safe.
 */
void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int full_path)
{
	int ret, i, n_files = 0;
	size_t mem, max_k, k, max_mem;
	bam_header_t *header;
	bamFile fp;
	bam1_t *b, **buf;
	char *fnout = 0;
	char const *suffix = ".bam";
	if (full_path) suffix += 4;

	if (n_threads < 2) n_threads = 1;
	g_is_by_qname = is_by_qname;
	max_k = k = 0; mem = 0;
	max_mem = _max_mem * n_threads;
	buf = 0;
	fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
		return;
	}
	header = bam_header_read(fp);
	if (is_by_qname) change_SO(header, "queryname");
	else change_SO(header, "coordinate");
	// write sub files
	for (;;) {
		if (k == max_k) {
			size_t old_max = max_k;
			max_k = max_k? max_k<<1 : 0x10000;
			buf = realloc(buf, max_k * sizeof(void*));
			memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
		}
		if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
		b = buf[k];
		if ((ret = bam_read1(fp, b)) < 0) break;
		if (b->data_len < b->m_data>>2) { // shrink
			b->m_data = b->data_len;
			kroundup32(b->m_data);
			b->data = realloc(b->data, b->m_data);
		}
		mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
		++k;
		if (mem >= max_mem) {
			n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
			mem = k = 0;
		}
	}
	if (ret != -1)
		fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
	// output file name
	fnout = calloc(strlen(prefix) + 20, 1);
	if (is_stdout) sprintf(fnout, "-");
	else sprintf(fnout, "%s%s", prefix, suffix);
	// write the final output
	if (n_files == 0) { // a single block
		char mode[8];
		strcpy(mode, "w");
		if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
		ks_mergesort(sort, k, buf, 0);
		write_buffer(fnout, mode, k, buf, header, n_threads);
	} else { // then merge
		char **fns;
		n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
		fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
		fns = (char**)calloc(n_files, sizeof(char*));
		for (i = 0; i < n_files; ++i) {
			fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
			sprintf(fns[i], "%s.%.4d%s", prefix, i, suffix);
		}
		bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
		for (i = 0; i < n_files; ++i) {
			unlink(fns[i]);
			free(fns[i]);
		}
		free(fns);
	}
	free(fnout);
	// free
	for (k = 0; k < max_k; ++k) {
		if (!buf[k]) continue;
		free(buf[k]->data);
		free(buf[k]);
	}
	free(buf);
	bam_header_destroy(header);
	bam_close(fp);
}