// Splits a sam file into individual files, one per chromosome. The files are created in the specified directory. // Each splitted file contains the same header as the original samFile. The generated files may only contain // a header if no alignments to that chromosome exist. The names of the split files come from the sam header // with an additional .sam extension. The unmapped alignments are collected in the file splitChrSam_unaligned.sam // Returns the chromosome names in the order in which they occur in the sam file header SEXP split_sam_chr(SEXP samFile, SEXP outDir) { if (!Rf_isString(samFile) || 1 != Rf_length(samFile)){ Rf_error("'samFile' must be character(1)"); } if (!Rf_isString(outDir) || 1 != Rf_length(outDir)){ Rf_error("'outDir' must be character(1)"); } const char * sam_file = Rf_translateChar(STRING_ELT(samFile, 0)); const char * out_dir = Rf_translateChar(STRING_ELT(outDir, 0)); // open the input sam file samfile_t *fin = _bam_tryopen(sam_file, "r", NULL); if (fin->header == 0) { samclose(fin); Rf_error("invalid header"); } // remove \r from header if exists (for windows) int j, k = 0; for(j = 0; j<fin->header->l_text; j++){ if(fin->header->text[j] != '\r'){ fin->header->text[k++] = fin->header->text[j]; } } if(j != k){ fin->header->text[k] = '\0'; fin->header->l_text = (uint32_t)strlen(fin->header->text); } // allocate memory for a list of filehandles (n+1 because of the unaligned reads) samfile_t **foutList = (samfile_t**)calloc((size_t)(fin->header->n_targets+1), sizeof(samfile_t*)); // open the output file handles (n+1 due to the unaligned reads) int i; SEXP chrNames; PROTECT(chrNames = allocVector(STRSXP, (fin->header->n_targets+1))); // protect from garbage collector for (i = 0; i < (fin->header->n_targets); i++) { foutList[i] = _bam_tryopen(_assemble_file_name(out_dir,fin->header->target_name[i]), "wh", fin->header); SET_STRING_ELT(chrNames, i, mkChar(fin->header->target_name[i])); } foutList[fin->header->n_targets] = 
_bam_tryopen(_assemble_file_name(out_dir,"splitChrSam_unaligned"), "wh", fin->header); SET_STRING_ELT(chrNames, fin->header->n_targets, mkChar("splitChrSam_unaligned")); // split the sam file based on chromosome _walk_through_sam_and_split(fin,foutList); // close all the file handles for (i = 0; i < (fin->header->n_targets+1); i++){samclose(foutList[i]);} samclose(fin); UNPROTECT(1); // release return chrNames; }
/*
 * Build a BAM_FILE for reading.
 *
 * filename:  character vector; when non-empty, its first element is opened via
 *            _bam_tryopen using the mode string taken from 'filemode'.
 * indexname: character vector; when non-empty, its first element is loaded via
 *            _bam_tryindexload.
 * filemode:  character vector whose first element is the open mode.
 *
 * Returns the newly allocated BAM_FILE with 'iter' and 'pbuffer' set to NULL.
 * 'file' and 'index' stay NULL when the corresponding argument is empty.
 * Raises an R error (after closing the file and freeing the struct) when the
 * opened file does not carry the TYPE_BAM flag or the index fails to load.
 */
static BAM_FILE _bamfile_open_r(SEXP filename, SEXP indexname, SEXP filemode)
{
    BAM_FILE bf = (BAM_FILE) Calloc(1, _BAM_FILE);

    /* --- data file (optional) --- */
    bf->file = NULL;
    if (Rf_length(filename) != 0) {
        const char *path = translateChar(STRING_ELT(filename, 0));
        const char *mode = CHAR(STRING_ELT(filemode, 0));

        bf->file = _bam_tryopen(path, mode, 0);
        if (1 != (bf->file->type & TYPE_BAM)) {
            samclose(bf->file);
            Free(bf);
            Rf_error("'filename' is not a BAM file\n file: %s", path);
        }
        /* remember where the stream stands right after opening */
        bf->pos0 = bam_tell(bf->file->x.bam);
        bf->irange0 = 0;
    }

    /* --- index file (optional) --- */
    bf->index = NULL;
    if (Rf_length(indexname) != 0) {
        const char *index_path = translateChar(STRING_ELT(indexname, 0));

        bf->index = _bam_tryindexload(index_path);
        if (bf->index == NULL) {
            samclose(bf->file);
            Free(bf);
            Rf_error("failed to open BAM index\n index: %s\n", index_path);
        }
    }

    bf->iter = NULL;
    bf->pbuffer = NULL;
    return bf;
}
/*
 * Build a BAM_FILE for writing.
 *
 * file0: character vector; its first element is the path opened with mode "wb".
 * file1: character(1) path opened with mode "rb"; its header is handed to the
 *        open call for file0, after which file1 is closed again.
 *
 * Returns a newly allocated BAM_FILE wrapping the output handle, with 'pos0'
 * set from bam_tell on that handle and 'irange0' set to 0.
 * Raises an R error when 'file1' is empty.
 */
static BAM_FILE _bamfile_open_w(SEXP file0, SEXP file1)
{
    if (Rf_length(file1) == 0)
        Rf_error("'file1' must be a character(1) path to a valid bam file");

    /* the input handle is needed only long enough to lend its header */
    samfile_t *src = _bam_tryopen(translateChar(STRING_ELT(file1, 0)), "rb", 0);
    samfile_t *dst = _bam_tryopen(translateChar(STRING_ELT(file0, 0)), "wb", src->header);
    samclose(src);

    BAM_FILE result = (BAM_FILE) Calloc(1, _BAM_FILE);
    result->file = dst;
    result->pos0 = bam_tell(result->file->x.bam);
    result->irange0 = 0;
    return result;
}