int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
    BGZF *fp;
    bam_header_t *old;
    int len;
    uint8_t *buf;
    if (in->open_mode != 'r') return -1;
    buf = malloc(BUF_SIZE);
    old = bam_header_read(in);
    fp = bgzf_dopen(fd, "w");
    bam_header_write(fp, h);
    if (in->block_offset < in->block_length) {
        bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
        bgzf_flush(fp);
    }
#ifdef _USE_KNETFILE
    while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0)
        fwrite(buf, 1, len, fp->fp);
#else
    while (!feof(in->fp) && (len = fread(buf, 1, BUF_SIZE, in->fp)) > 0)
        fwrite(buf, 1, len, fp->fp);
#endif
    free(buf);
    fp->block_offset = in->block_offset = 0;
    bgzf_close(fp);
    return 0;
}
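/*
 * A minimal sketch, not taken from any of the projects on this page, of the idiom every
 * example here shares: attach a BGZF writer to an already-open file descriptor with
 * bgzf_dopen(), push bytes through bgzf_write(), and let bgzf_close() flush the last
 * block and append the BGZF EOF marker. Assumes the modern htslib layout
 * ("htslib/bgzf.h", link with -lhts); older samtools trees use a plain "bgzf.h" instead.
 */
#include <stdio.h>
#include <string.h>
#include "htslib/bgzf.h"

int main(void)
{
    const char *line = "hello, bgzf\n";
    BGZF *fp = bgzf_dopen(fileno(stdout), "w");   /* BGZF takes ownership of the fd */
    if (fp == NULL) {
        perror("bgzf_dopen");
        return 1;
    }
    if (bgzf_write(fp, line, strlen(line)) < 0) { /* returns bytes written, <0 on error */
        fprintf(stderr, "bgzf_write failed (errcode %d)\n", fp->errcode);
        return 1;
    }
    if (bgzf_close(fp) < 0) {                     /* flushes and writes the EOF block */
        fprintf(stderr, "bgzf_close failed\n");
        return 1;
    }
    return 0;
}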
void VariantList::printToCompressedVCF(IHeader::SharedPtr headerPtr, bool printHeader, int out)
{
    BGZF* fp = bgzf_dopen(out, "w");
    if (printHeader) {
        bgzf_write(fp, headerPtr->getHeader().c_str(), headerPtr->getHeader().size());
    }
    for (const auto variantPtr : this->m_variant_ptrs) {
        bgzf_write(fp, variantPtr->getVariantLine(headerPtr).c_str(), variantPtr->getVariantLine(headerPtr).size());
    }
    bgzf_close(fp);
}
void ReadDB::import_reads(const std::string& input_filename, const std::string& out_fasta_filename)
{
    // Open readers
    FILE* read_fp = fopen(input_filename.c_str(), "r");
    if(read_fp == NULL) {
        fprintf(stderr, "error: could not open %s for read\n", input_filename.c_str());
        exit(EXIT_FAILURE);
    }

    gzFile gz_read_fp = gzdopen(fileno(read_fp), "r");
    if(gz_read_fp == NULL) {
        fprintf(stderr, "error: could not open %s using gzdopen\n", input_filename.c_str());
        exit(EXIT_FAILURE);
    }

    // Open writers
    FILE* write_fp = fopen(out_fasta_filename.c_str(), "w");
    if(write_fp == NULL) {
        fprintf(stderr, "error: could not open %s for write\n", out_fasta_filename.c_str());
        exit(EXIT_FAILURE);
    }

    BGZF* bgzf_write_fp = bgzf_dopen(fileno(write_fp), "w");
    if(bgzf_write_fp == NULL) {
        fprintf(stderr, "error: could not open %s for bgzipped write\n", out_fasta_filename.c_str());
        exit(EXIT_FAILURE);
    }

    // read input sequences, add to DB and convert to fasta
    int ret = 0;
    kseq_t* seq = kseq_init(gz_read_fp);
    while((ret = kseq_read(seq)) >= 0) {
        // Check for a path to the fast5 file in the comment of the read
        std::string path = "";
        if(seq->comment.l > 0) {
            // This splitting code implicitly handles both the 2 and 3 field
            // fasta format that poretools will output. The FAST5 path
            // is always the last field.
            std::vector<std::string> fields = split(seq->comment.s, ' ');
            path = fields.back();

            // as a sanity check we require the path name to end in ".fast5"
            if(path.length() < 6 || path.substr(path.length() - 6) != ".fast5") {
                path = "";
            }
        }

        // sanity check that the read does not exist in the database
        // JTS 04/2019: changed error to warning to account for duplicate reads coming out of
        // some versions of guppy.
        auto iter = m_data.find(seq->name.s);
        if(iter != m_data.end()) {
            fprintf(stderr, "Warning: duplicate read name %s found in fasta file\n", seq->name.s);
            continue;
        }

        // add path
        add_signal_path(seq->name.s, path);

        // write sequence in gzipped fasta for fai indexing later
        std::string out_record;
        out_record += ">";
        out_record += seq->name.s;
        out_record += "\n";
        out_record += seq->seq.s;
        out_record += "\n";
        size_t write_length = bgzf_write(bgzf_write_fp, out_record.c_str(), out_record.length());
        if(write_length != out_record.length()) {
            fprintf(stderr, "error in bgzf_write, aborting\n");
            exit(EXIT_FAILURE);
        }
    }

    // check for abnormal exit conditions
    if(ret <= -2) {
        fprintf(stderr, "kseq_read returned %d indicating an error with the input file %s\n", ret, input_filename.c_str());
        exit(EXIT_FAILURE);
    }

    // cleanup
    kseq_destroy(seq);
    gzclose(gz_read_fp);
    fclose(read_fp);
    bgzf_close(bgzf_write_fp);
    fclose(write_fp);
}
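/*
 * A small sketch of the reading side of the pattern above: kseq (the parser ReadDB drives
 * via kseq_init/kseq_read) reads plain, gzip'd or bgzip'd FASTA/FASTQ through zlib.
 * Illustrative only -- the "reads.fa.gz" path and the KSEQ_INIT(gzFile, gzread)
 * instantiation are assumptions, not part of ReadDB; kseq.h ships with both klib and htslib.
 */
#include <stdio.h>
#include <zlib.h>
#include "htslib/kseq.h"

KSEQ_INIT(gzFile, gzread)

int main(void)
{
    gzFile gz = gzopen("reads.fa.gz", "r");     /* hypothetical input path */
    if (gz == NULL) {
        fprintf(stderr, "could not open reads.fa.gz\n");
        return 1;
    }
    kseq_t *seq = kseq_init(gz);
    int ret;
    while ((ret = kseq_read(seq)) >= 0)         /* >= 0: one record parsed */
        printf("%s\t%zu bp\n", seq->name.s, seq->seq.l);
    if (ret <= -2)                              /* -1 is normal EOF, <= -2 is a parse/stream error */
        fprintf(stderr, "kseq_read returned %d\n", ret);
    kseq_destroy(seq);
    gzclose(gz);
    return 0;
}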
void bcf_file::print_bcf(const parameters &params)
{
    LOG.printLOG("Outputting BCF file...\n");
    BGZF * out;
    if (!params.stream_out)
    {
        string output_file = params.output_prefix + ".recode.bcf";
        out = bgzf_open(output_file.c_str(), "w");
    }
    else
        out = bgzf_dopen(1, "w");

    string header_str;
    uint32_t len_text = 0;
    vector<char> header;
    char magic[5] = {'B','C','F','\2','\2'};
    bgzf_write(out, magic, 5);

    for (unsigned int ui=0; ui<meta_data.lines.size(); ui++)
    {
        for (unsigned int uj=0; uj<meta_data.lines[ui].length(); uj++)
            header.push_back( meta_data.lines[ui][uj] );
        header.push_back('\n');
    }

    header_str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
    if (meta_data.N_indv > 0)
        header_str += "\tFORMAT";

    for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
        if (include_indv[ui])
        {
            header_str += "\t";
            header_str += meta_data.indv[ui];
        }
    header_str += "\n";

    for (unsigned int ui=0; ui<header_str.length(); ui++)
        header.push_back( header_str[ui] );
    header.push_back( '\0' );

    len_text = header.size();
    bgzf_write(out, (char *)&len_text, sizeof(len_text) );
    bgzf_write(out, (char *)&header[0], len_text );

    vector<char> variant_line;
    entry * e = new bcf_entry(meta_data, include_indv);
    while(!eof())
    {
        get_entry(variant_line);
        e->reset(variant_line);
        N_entries += e->apply_filters(params);
        if(!e->passed_filters)
            continue;
        N_kept_entries++;
        e->parse_basic_entry(true, true, true);
        e->parse_full_entry(true);
        e->parse_genotype_entries(true);
        e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO);
    }
    delete e;
    bgzf_close(out);
}
void vcf_file::print_bcf(const parameters &params)
{
    LOG.printLOG("Outputting BCF file...\n");
    BGZF * out;
    if (!params.stream_out)
    {
        string output_file = params.output_prefix + ".recode.bcf";
        out = bgzf_open(output_file.c_str(), "w");
    }
    else
        out = bgzf_dopen(1, "w");

    string header_str;
    uint32_t len_text = 0;
    vector<char> header;
    char magic[5] = {'B','C','F','\2','\1'};
    bgzf_write(out, magic, 5);

    if (meta_data.has_idx)
    {
        LOG.warning("VCF file contains IDX values in header. These are being removed for conversion to BCF.");
        meta_data.reprint();
        meta_data.reparse();
    }

    for (unsigned int ui=0; ui<meta_data.lines.size(); ui++)
    {
        for (unsigned int uj=0; uj<meta_data.lines[ui].length(); uj++)
            header.push_back( meta_data.lines[ui][uj] );
        header.push_back('\n');
    }

    if (meta_data.has_contigs == false)
    {
        vector<string> contig_vector;
        get_contigs(params.contigs_file, contig_vector);
        for(unsigned int ui=0; ui<contig_vector.size(); ui++)
        {
            meta_data.add_CONTIG_descriptor(contig_vector[ui].substr(10, contig_vector[ui].size()-8), int(ui));
            for(unsigned int uj=0; uj<contig_vector[ui].size(); uj++)
                header.push_back(contig_vector[ui][uj]);
            header.push_back('\n');
        }
    }

    header_str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
    if (meta_data.N_indv > 0)
        header_str += "\tFORMAT";

    for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
        if (include_indv[ui])
        {
            header_str += "\t";
            header_str += meta_data.indv[ui];
        }
    header_str += "\n";

    for (unsigned int ui=0; ui<header_str.length(); ui++)
        header.push_back( header_str[ui] );
    header.push_back( '\0' );

    len_text = header.size();
    bgzf_write(out, (char *)&len_text, sizeof(len_text) );
    bgzf_write(out, (char *)&header[0], len_text );

    vector<char> variant_line;
    entry * e = new vcf_entry(meta_data, include_indv);
    while(!eof())
    {
        get_entry(variant_line);
        e->reset(variant_line);
        N_entries += e->apply_filters(params);
        if(!e->passed_filters)
            continue;
        N_kept_entries++;
        e->parse_basic_entry(true, true, true);
        e->parse_full_entry(true);
        e->parse_genotype_entries(true,true,true,true);
        e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO);
    }
    delete e;
    bgzf_close(out);
}
static void reheader_vcf_gz(args_t *args)
{
    BGZF *fp = bgzf_open(args->fname,"r");
    if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
        error("Failed to read %s: %s\n", args->fname, strerror(errno));

    kstring_t hdr = {0,0,0};
    char *buffer = (char*) fp->uncompressed_block;

    // Read the header and find the position of the data block
    if ( buffer[0]!='#' )
        error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);

    int skip_until = 1;     // end of the header in the current uncompressed block
    while (1)
    {
        if ( buffer[skip_until]=='\n' )
        {
            skip_until++;
            if ( skip_until>=fp->block_length )
            {
                kputsn(buffer,skip_until,&hdr);
                if ( bgzf_read_block(fp) != 0 || !fp->block_length )
                    error("FIXME: No body in the file: %s\n", args->fname);
                skip_until = 0;
            }
            // The header has finished
            if ( buffer[skip_until]!='#' )
            {
                kputsn(buffer,skip_until,&hdr);
                break;
            }
        }
        skip_until++;
        if ( skip_until>=fp->block_length )
        {
            kputsn(buffer,fp->block_length,&hdr);
            if (bgzf_read_block(fp) != 0 || !fp->block_length)
                error("FIXME: No body in the file: %s\n", args->fname);
            skip_until = 0;
        }
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        free(hdr.s);
        hdr.s = NULL;
        hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++)
            free(samples[i]);
        free(samples);
    }

    // Output the modified header
    BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
    bgzf_write(bgzf_out, hdr.s, hdr.l);
    free(hdr.s);

    // Output all remaining data read with the header block
    if ( fp->block_length - skip_until > 0 )
    {
        if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 )
            error("Error: %d\n",fp->errcode);
    }
    if ( bgzf_flush(bgzf_out)<0 )
        error("Error: %d\n",bgzf_out->errcode);

    // Stream the rest of the file as it is, without decompressing
    ssize_t nread;
    int page_size = getpagesize();
    char *buf = (char*) valloc(page_size);
    while (1)
    {
        nread = bgzf_raw_read(fp, buf, page_size);
        if ( nread<=0 )
            break;
        int count = bgzf_raw_write(bgzf_out, buf, nread);
        if (count != nread)
            error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
    }
    if (bgzf_close(bgzf_out) < 0)
        error("Error: %d\n",bgzf_out->errcode);
    if (bgzf_close(fp) < 0)
        error("Error: %d\n",fp->errcode);
    free(buf);
}
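/*
 * A minimal sketch, not taken from bcftools, of the block-copy idiom reheader_vcf_gz relies
 * on: after the new header has been written and flushed, bgzf_raw_read()/bgzf_raw_write()
 * move the remaining BGZF blocks verbatim, so the body is never recompressed. The helper
 * name and file paths are illustrative; error handling is reduced to the essentials.
 */
#include <sys/types.h>
#include "htslib/bgzf.h"

static int bgzf_raw_copy(const char *src, const char *dst)   /* hypothetical helper */
{
    char buf[0x10000];
    ssize_t nread;
    BGZF *in  = bgzf_open(src, "r");
    BGZF *out = bgzf_open(dst, "w");
    if (in == NULL || out == NULL) return -1;

    /* flush anything queued for compression before switching to raw writes */
    if (bgzf_flush(out) < 0) return -1;

    while ((nread = bgzf_raw_read(in, buf, sizeof buf)) > 0)
        if (bgzf_raw_write(out, buf, nread) != nread) return -1;

    /* bgzf_close() appends an EOF block; the one copied from src is harmless to keep */
    if (bgzf_close(out) < 0 || bgzf_close(in) < 0) return -1;
    return nread < 0 ? -1 : 0;
}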
int main_bam2fq(int argc, char *argv[])
{
    BGZF *fp, *fpse = 0;
    bam1_t *b;
    uint8_t *buf;
    int max_buf, c, has12 = 0;
    kstring_t str;
    int64_t n_singletons = 0, n_reads = 0;
    char last[512], *fnse = 0;

    while ((c = getopt(argc, argv, "as:")) > 0) {
        if (c == 'a') has12 = 1;
        else if (c == 's') fnse = optarg;
    }
    if (argc == optind) {
        fprintf(stderr, "\nUsage: bam2fq [-a] [-s outSE] <in.bam>\n\n");
        fprintf(stderr, "Options: -a        append /1 and /2 to the read name\n");
        fprintf(stderr, "         -s FILE   write singleton reads to FILE [assume single-end]\n");
        fprintf(stderr, "\n");
        return 1;
    }

    fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r");
    assert(fp);
    bam_hdr_destroy(bam_hdr_read(fp));
    buf = 0;
    max_buf = 0;
    str.l = str.m = 0; str.s = 0;
    last[0] = 0;
    if (fnse) fpse = bgzf_open(fnse, "w1");

    b = bam_init1();
    while (bam_read1(fp, b) >= 0) {
        int i, qlen = b->core.l_qseq, is_print = 0;
        uint8_t *qual, *seq;
        if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments
        ++n_reads;
        if (fpse) {
            if (str.l && strcmp(last, bam_get_qname(b))) {
                bgzf_write(fpse, str.s, str.l);
                str.l = 0;
                ++n_singletons;
            }
            if (str.l) is_print = 1;
            strcpy(last, bam_get_qname(b));
        } else is_print = 1;
        qual = bam_get_qual(b);
        kputc(qual[0] == 0xff? '>' : '@', &str);
        kputsn(bam_get_qname(b), b->core.l_qname - 1, &str);
        if (has12) {
            kputc('/', &str);
            kputw(b->core.flag>>6&3, &str);
        }
        kputc('\n', &str);
        if (max_buf < qlen + 1) {
            max_buf = qlen + 1;
            kroundup32(max_buf);
            buf = (uint8_t*)realloc(buf, max_buf);
        }
        buf[qlen] = 0;
        seq = bam_get_seq(b);
        for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence
        if (bam_is_rev(b)) { // reverse complement
            for (i = 0; i < qlen>>1; ++i) {
                int8_t t = seq_comp_table[buf[qlen - 1 - i]];
                buf[qlen - 1 - i] = seq_comp_table[buf[i]];
                buf[i] = t;
            }
            if (qlen&1) buf[i] = seq_comp_table[buf[i]];
        }
        for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]];
        kputsn((char*)buf, qlen, &str); kputc('\n', &str);
        if (qual[0] != 0xff) { // write the quality string only when it is present
            kputsn("+\n", 2, &str);
            for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i];
            if (bam_is_rev(b)) { // reverse
                for (i = 0; i < qlen>>1; ++i) {
                    uint8_t t = buf[qlen - 1 - i];
                    buf[qlen - 1 - i] = buf[i];
                    buf[i] = t;
                }
            }
            kputsn((char*)buf, qlen, &str); kputc('\n', &str);
        }
        if (is_print) {
            fwrite(str.s, 1, str.l, stdout);
            str.l = 0;
        }
    }
    if (fpse) {
        if (str.l) {
            bgzf_write(fpse, str.s, str.l);
            ++n_singletons;
        }
        fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons);
        bgzf_close(fpse);
    }
    fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads);
    free(buf); free(str.s);
    bam_destroy1(b);
    bgzf_close(fp);
    return 0;
}
// ClassifyFileType
// Attempt to classify the alignment file as one of CSV, BED or SAM from its initial 8k char contents
// Currently processes CSV, BED and SAM format file types
// Assumes the file must be SAM if the initial lines have at least one prefixed by a '@' followed by a 2 letter record type code
etClassifyFileType CUtility::ClassifyFileType(char *pszFileName)
{
    int hFile;
    gzFile gz;
    BGZF* pInBGZF;
    int BuffLen;
    int BuffIdx;
    UINT8 Buffer[cFileClassifyBuffLen];
    UINT8 *pBuff;
    bool bStartNL;
    bool bSkipEOL;
    UINT8 Chr;
    int NumLines;
    int FieldCnt;
    int TabCnt;
    int CommaCnt;
    int FldLen;
    bool bInQuotes;
    int LikelyCSV;
    int LikelyBED;
    int LikelySAM;
    int LikelyNonCSVSAMBED;
    bool bSeenSAMHdrs;
    int FileNameLen;
    bool bGZd;

    FileNameLen = (int)strlen(pszFileName);
    bGZd = false;
    if(FileNameLen >= 4)
    {
        if(!stricmp(&pszFileName[FileNameLen-3],".gz"))
            bGZd = true;
        else
        {
            if(FileNameLen >= 5 && !stricmp(&pszFileName[FileNameLen-4],".bam"))
            {
                hFile = open(pszFileName,O_READSEQ);
            if(hFile == -1)
                return(eCFTopenerr);

            // BAM will be using BGZF compression ..
            if((pInBGZF = bgzf_dopen(hFile, "r"))==NULL)
                {
                gDiagnostics.DiagOut(eDLFatal,gszProcName,"ClassifyFileType: unable to initialise for BGZF processing on file '%s'",pszFileName);
                close(hFile);
                return(eCFTopenerr);
                }
            hFile = -1;

            // try reading the header; bgzf_read will confirm it does start with "BAM\1" ....
            if((BuffLen = (int)bgzf_read(pInBGZF,Buffer,100)) < 100)    // will be < 100 if errors ...
                {
                gDiagnostics.DiagOut(eDLFatal,gszProcName,"ClassifyFileType: Not a BAM format file '%s'",pszFileName);
                bgzf_close(pInBGZF);
                return(eCFTopenerr);
                }
            bgzf_close(pInBGZF);
            return(eCFTSAM);
            }
        }
    }

    // now can try to actually open file and read in first cFileClassifyBuffLen chars
    if(bGZd)
    {
        gz = gzopen(pszFileName,"rb");
        if(gz == NULL)
        {
            gDiagnostics.DiagOut(eDLFatal,gszProcName,"Open: unable to open for reading gzip'd file '%s'",pszFileName);
            return(eCFTopenerr);
        }
        BuffLen = gzread(gz,Buffer,sizeof(Buffer)-1);
        gzclose(gz);
    }
    else
    {
        hFile = open(pszFileName,O_READSEQ);
        if(hFile == -1)
            return(eCFTopenerr);
        // read the first cFileClassifyBuffLen chars into buffer
        BuffLen = read(hFile,Buffer,sizeof(Buffer)-1);
        close(hFile);
    }

    if(BuffLen < cMinFileClassifyLen)    // an arbitrary lower limit!
        return(eCFTlenerr);

    Buffer[BuffLen] = '\0';
    pBuff = Buffer;
    NumLines = 0;
    LikelyCSV = 0;
    LikelyBED = 0;
    LikelySAM = 0;
    LikelyNonCSVSAMBED = 0;
    BuffIdx = 0;
    bStartNL = true;
    bSeenSAMHdrs = false;
    while((Chr = *pBuff++) != '\0')
    {
        BuffIdx += 1;
        if(bStartNL)
        {
            FieldCnt = 0;
            TabCnt = 0;
            CommaCnt = 0;
            FldLen = 0;
            bInQuotes = false;
            bStartNL = false;
            bSkipEOL = false;
            NumLines += 1;
        }

        if(Chr == '\n' || Chr == '\r')    // if at end of line
        {
            bStartNL = true;
            bSkipEOL = false;
            if(FieldCnt < 3)    // BED can have down to 3 fields, CSV alignment and SAM should have more
                continue;
            if(!bInQuotes)
            {
                if(CommaCnt >= 3 && CommaCnt > TabCnt)    // if at least as many commas as tabs as assumed field separators then most likely a CSV file
                    LikelyCSV += 10;
                else    // if more tabs than commas then could be either BED or SAM
                {
                    if(bSeenSAMHdrs)
                    {
                        LikelyBED += 5;
                        LikelySAM += 10;    // SAM would be distinguished by its header lines starting with '@'
                    }
                    else
                    {
                        LikelyBED += 20;
                        LikelySAM += 5;
                    }
                }
            }
            continue;
        }

        if(bSkipEOL)
            continue;

        if(!FieldCnt && !FldLen && (Chr == ' ' || Chr == '\t'))    // simply slough all leading whitespace before the initial field starts
            continue;

        // nested quotes are potentially a problem; currently quotes are simply sloughed
        if(Chr == '\'' || Chr == '"')
        {
            bInQuotes = !bInQuotes;
            continue;
        }

        if(!FieldCnt && !bInQuotes && (Chr == '@' || Chr == '>'))
        {
            if(Chr == '@')    // if SAM then header line(s) should be present and can be expected to start with "@HD", "@SQ", "@RG", "@PG", "@CO"
            {
                if(BuffIdx < (BuffLen - 3))
                {
                    if(((*pBuff == 'H' && pBuff[1] == 'D') ||
                        (*pBuff == 'S' && pBuff[1] == 'Q') ||
                        (*pBuff == 'R' && pBuff[1] == 'G') ||
                        (*pBuff == 'P' && pBuff[1] == 'G') ||
                        (*pBuff == 'C' && pBuff[1] == 'O')) &&
                        (pBuff[2] == ' ' || pBuff[2] == '\t' ))
                    {
                        bSeenSAMHdrs = true;
                        LikelyNonCSVSAMBED = 0;
                        LikelySAM += 10000;
                        bSkipEOL = true;
                        continue;
                    }
                    else
                    {
                        if(!bSeenSAMHdrs)    // if no SAM headers parsed then could easily be a fastq...
                        {
                            LikelyNonCSVSAMBED += 50;
                            bSkipEOL = true;
                            continue;
                        }
                    }
                }
            }
            if(Chr == '>')    // if at start of line then could easily be fasta...
                LikelyNonCSVSAMBED += 50;
        }

        switch(Chr) {
            case ' ':    // simply slough spaces
                continue;

            case ',':    // if comma then likely is a csv, but could still be BED if in optional fields 9 (itemRgb) onwards
                if(TabCnt < 8 && FieldCnt >= TabCnt)
                {
                    FieldCnt += 1;
                    CommaCnt += 1;
                    FldLen = 0;
                }
                break;

            case '\t':    // tabs are in BED and SAM as field separators, but could also be present in CSV as spacers
                if(CommaCnt < 3 && FieldCnt >= CommaCnt)
                {
                    FieldCnt += 1;
                    TabCnt += 1;
                    FldLen = 0;
                }
                break;

            default:    // any other char is assumed to be part of an actual field value
                FldLen += 1;
                break;
        }
    }

    if(LikelyNonCSVSAMBED >= 250 || (LikelyCSV < 10 && LikelyBED < 10 && LikelySAM < 500))
        return(eCFTunknown);
    if(LikelyCSV >= LikelyBED && LikelyCSV >= LikelySAM)
        return(eCFTCSV);
    if(LikelyBED >= LikelySAM)
        return(eCFTBED);
    return(eCFTSAM);
}
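/*
 * A minimal sketch of the check the BAM branch above alludes to: a BAM file, once its BGZF
 * layer is decompressed, begins with the 4-byte magic "BAM\1". ClassifyFileType itself only
 * verifies that a BGZF read of the first 100 bytes succeeds, so an explicit magic test like
 * the one below is a stricter, assumed variant; the helper name is illustrative.
 */
#include <string.h>
#include "htslib/bgzf.h"

static int looks_like_bam(const char *path)   /* hypothetical helper */
{
    char magic[4];
    int ok;
    BGZF *fp = bgzf_open(path, "r");
    if (fp == NULL) return 0;
    ok = bgzf_read(fp, magic, 4) == 4 && memcmp(magic, "BAM\1", 4) == 0;
    bgzf_close(fp);
    return ok;
}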
int reheader_file(const char *header, const char *file, int meta)
{
    char *buffer;
    int skip_until = 0;
    FILE *fh;
    int page_size;
    char *buf;
    BGZF *bgzf_out;
    ssize_t nread;

    BGZF *fp = bgzf_open(file,"r");
    if (bgzf_read_block(fp) != 0 || !fp->block_length)
        return -1;

    buffer = fp->uncompressed_block;
    if ( buffer[0]==meta )
    {
        skip_until = 1;

        // Skip the header
        while (1)
        {
            if ( buffer[skip_until]=='\n' )
            {
                skip_until++;
                if ( skip_until>=fp->block_length )
                {
                    if (bgzf_read_block(fp) != 0 || !fp->block_length)
                        error("no body?\n");
                    skip_until = 0;
                }
                // The header has finished
                if ( buffer[skip_until]!=meta )
                    break;
            }
            skip_until++;
            if ( skip_until>=fp->block_length )
            {
                if (bgzf_read_block(fp) != 0 || !fp->block_length)
                    error("no body?\n");
                skip_until = 0;
            }
        }
    }

    fh = fopen(header,"r");
    if ( !fh )
        error("%s: %s", header,strerror(errno));
    page_size = getpagesize();
    buf = malloc(page_size);

    //Dong Code
    bgzf_out = bgzf_dopen(fileno(stdout), "w");

    while ( (nread=fread(buf,1,page_size-1,fh))>0 )
    {
        if ( nread<page_size-1 && buf[nread-1]!='\n' )
            buf[nread++] = '\n';
        if (bgzf_write(bgzf_out, buf, nread) < 0)
            error("Error: %d\n",bgzf_out->errcode);
    }
    fclose(fh);

    if ( fp->block_length - skip_until > 0 )
    {
        if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
            error("Error: %d\n",fp->errcode);
    }
    if (bgzf_flush(bgzf_out) < 0)
        error("Error: %d\n",bgzf_out->errcode);

    while (1)
    {
        int count;
#ifdef _USE_KNETFILE
        nread = knet_read(fp->fp, buf, page_size);
#else
        nread = fread(buf, 1, page_size, fp->fp);
#endif
        if ( nread<=0 )
            break;
        count = fwrite(buf, 1, nread, bgzf_out->fp);
        if (count != nread)
            error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
    }
    if (bgzf_close(bgzf_out) < 0)
        error("Error: %d\n",bgzf_out->errcode);
    return 0;
}