tmap_refseq_t * tmap_refseq_read(const char *fn_fasta, uint32_t is_rev) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; char *fn_pac = NULL, *fn_anno = NULL; tmap_refseq_t *refseq = NULL; // allocate some memory refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->is_rev = is_rev; refseq->is_shm = 0; // read annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "rb", TMAP_ANNO_COMPRESSION); tmap_refseq_read_anno(fp_anno, refseq); tmap_file_fclose(fp_anno); free(fn_anno); // read the sequence fn_pac = tmap_get_file_name(fn_fasta, (0 == is_rev) ? TMAP_PAC_FILE : TMAP_REV_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "rb", (0 == is_rev) ? TMAP_PAC_COMPRESSION : TMAP_REV_PAC_COMPRESSION); refseq->seq = tmap_malloc(sizeof(uint8_t)*tmap_refseq_seq_memory(refseq->len), "refseq->seq"); // allocate if(tmap_refseq_seq_memory(refseq->len) != tmap_file_fread(refseq->seq, sizeof(uint8_t), tmap_refseq_seq_memory(refseq->len), fp_pac)) { tmap_error(NULL, Exit, ReadFileError); } tmap_file_fclose(fp_pac); free(fn_pac); return refseq; }
/*
 * Writes the packed sequence of `refseq` to disk and, for the forward
 * sequence only, its annotation file.
 *
 * refseq    the reference sequence to write
 * fn_fasta  FASTA file name from which the output file names are derived
 * is_rev    0 writes the annotation file and TMAP_PAC_FILE; any other value
 *           writes only TMAP_REV_PAC_FILE
 *
 * Every failed write is reported through tmap_error with the Exit action and
 * the name of the packed file.
 */
void
tmap_refseq_write(tmap_refseq_t *refseq, const char *fn_fasta, uint32_t is_rev)
{
  tmap_file_t *fp_pac = NULL, *fp_anno = NULL;
  char *fn_pac = NULL, *fn_anno = NULL;
  uint8_t x = 0;

  // write annotation file
  if(0 == is_rev) {
      fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE);
      fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION);
      tmap_refseq_write_anno(fp_anno, refseq);
      tmap_file_fclose(fp_anno);
      free(fn_anno);
  }

  // write the sequence
  fn_pac = tmap_get_file_name(fn_fasta, (0 == is_rev) ? TMAP_PAC_FILE : TMAP_REV_PAC_FILE);
  fp_pac = tmap_file_fopen(fn_pac, "wb", (0 == is_rev) ? TMAP_PAC_COMPRESSION : TMAP_REV_PAC_COMPRESSION);
  if(tmap_refseq_seq_memory(refseq->len)
     != tmap_file_fwrite(refseq->seq, sizeof(uint8_t), tmap_refseq_seq_memory(refseq->len), fp_pac)) {
      // report the file name like the trailer writes below do
      tmap_error(fn_pac, Exit, WriteFileError);
  }
  if(refseq->len % 4 == 0) { // add an extra (zero) byte if we completely filled all bits
      if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) {
          tmap_error(fn_pac, Exit, WriteFileError);
      }
  }
  // final trailer byte: refseq->len modulo 4
  x = refseq->len % 4;
  if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) {
      tmap_error(fn_pac, Exit, WriteFileError);
  }
  tmap_file_fclose(fp_pac);
  free(fn_pac);
}
int tmap_refseq_refinfo_main(int argc, char *argv[]) { int c, help=0; tmap_refseq_t *refseq = NULL; tmap_file_t *fp_anno = NULL; char *fn_anno = NULL; char *fn_fasta = NULL; while((c = getopt(argc, argv, "vh")) >= 0) { switch(c) { case 'v': tmap_progress_set_verbosity(1); break; case 'h': help = 1; break; default: return 1; } } if(1 != argc - optind || 1 == help) { tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-vh] <in.fasta>\n", PACKAGE, argv[0]); return 1; } fn_fasta = argv[optind]; // Note: 'tmap_file_stdout' should not have been previously modified tmap_file_stdout = tmap_file_fdopen(fileno(stdout), "wb", TMAP_FILE_NO_COMPRESSION); // allocate some memory refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->is_rev = 0; refseq->is_shm = 0; // read the annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "rb", TMAP_ANNO_COMPRESSION); tmap_refseq_read_anno(fp_anno, refseq); tmap_file_fclose(fp_anno); free(fn_anno); // no need to read in the pac refseq->seq = NULL; // print the header tmap_refseq_print_header(tmap_file_stdout, refseq); // destroy tmap_refseq_destroy(refseq); // close the output tmap_file_fclose(tmap_file_stdout); return 0; }
tmap_sa_t * tmap_sa_read(const char *fn_fasta) { char *fn_sa = NULL; tmap_file_t *fp_sa = NULL; tmap_sa_t *sa = NULL; fn_sa = tmap_get_file_name(fn_fasta, TMAP_SA_FILE); fp_sa = tmap_file_fopen(fn_sa, "rb", TMAP_SA_COMPRESSION); sa = tmap_calloc(1, sizeof(tmap_sa_t), "sa"); if(1 != tmap_file_fread(&sa->primary, sizeof(tmap_bwt_int_t), 1, fp_sa) || 1 != tmap_file_fread(&sa->sa_intv, sizeof(tmap_bwt_int_t), 1, fp_sa) || 1 != tmap_file_fread(&sa->seq_len, sizeof(tmap_bwt_int_t), 1, fp_sa)) { tmap_error(NULL, Exit, ReadFileError); } sa->n_sa = (sa->seq_len + sa->sa_intv) / sa->sa_intv; sa->sa = tmap_calloc(sa->n_sa, sizeof(tmap_bwt_int_t), "sa->sa"); sa->sa[0] = -1; if(sa->n_sa-1 != tmap_file_fread(sa->sa + 1, sizeof(tmap_bwt_int_t), sa->n_sa - 1, fp_sa)) { tmap_error(NULL, Exit, ReadFileError); } sa->sa_intv_log2 = tmap_log2(sa->sa_intv); tmap_file_fclose(fp_sa); free(fn_sa); sa->is_shm = 0; return sa; }
size_t tmap_refseq_shm_read_num_bytes(const char *fn_fasta, uint32_t is_rev) { size_t n = 0; tmap_file_t *fp_anno = NULL; char *fn_anno = NULL; tmap_refseq_t *refseq = NULL; // allocate some memory refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->is_rev = is_rev; refseq->is_shm = 0; // read the annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "rb", TMAP_ANNO_COMPRESSION); tmap_refseq_read_anno(fp_anno, refseq); tmap_file_fclose(fp_anno); free(fn_anno); // No need to read in the pac refseq->seq = NULL; // get the number of bytes n = tmap_refseq_shm_num_bytes(refseq); // destroy tmap_refseq_destroy(refseq); return n; }
/*
 * Writes the suffix array `sa` to the suffix array file that belongs to
 * `fn_fasta`.
 *
 * The output consists of primary, sa_intv and seq_len (one tmap_bwt_int_t
 * each) followed by entries 1..n_sa-1 of sa->sa; entry 0 is not written.
 *
 * fn_fasta  FASTA file name from which the output file name is derived
 * sa        the suffix array to write
 *
 * A failed write is reported through tmap_error with the Exit action and the
 * output file name.
 */
void
tmap_sa_write(const char *fn_fasta, tmap_sa_t *sa)
{
  char *fn_sa = NULL;
  tmap_file_t *fp_sa = NULL;

  fn_sa = tmap_get_file_name(fn_fasta, TMAP_SA_FILE);
  fp_sa = tmap_file_fopen(fn_sa, "wb", TMAP_SA_COMPRESSION);

  if(1 != tmap_file_fwrite(&sa->primary, sizeof(tmap_bwt_int_t), 1, fp_sa)
     || 1 != tmap_file_fwrite(&sa->sa_intv, sizeof(tmap_bwt_int_t), 1, fp_sa)
     || 1 != tmap_file_fwrite(&sa->seq_len, sizeof(tmap_bwt_int_t), 1, fp_sa)
     || sa->n_sa-1 != tmap_file_fwrite(sa->sa+1, sizeof(tmap_bwt_int_t), sa->n_sa-1, fp_sa)) {
      // name the file so the user can tell which output failed
      tmap_error(fn_sa, Exit, WriteFileError);
  }

  tmap_file_fclose(fp_sa);
  free(fn_sa);
}
size_t tmap_sa_shm_read_num_bytes(const char *fn_fasta) { size_t n = 0; char *fn_sa = NULL; tmap_file_t *fp_sa = NULL; tmap_sa_t *sa = NULL; fn_sa = tmap_get_file_name(fn_fasta, TMAP_SA_FILE); fp_sa = tmap_file_fopen(fn_sa, "rb", TMAP_SA_COMPRESSION); sa = tmap_calloc(1, sizeof(tmap_sa_t), "sa"); if(1 != tmap_file_fread(&sa->primary, sizeof(tmap_bwt_int_t), 1, fp_sa) || 1 != tmap_file_fread(&sa->sa_intv, sizeof(tmap_bwt_int_t), 1, fp_sa) || 1 != tmap_file_fread(&sa->seq_len, sizeof(tmap_bwt_int_t), 1, fp_sa)) { tmap_error(NULL, Exit, ReadFileError); } sa->n_sa = (sa->seq_len + sa->sa_intv) / sa->sa_intv; // No need to read in sa->sa sa->sa = NULL; tmap_file_fclose(fp_sa); free(fn_sa); sa->is_shm = 0; sa->is_mm = 0; // get the number of bytes n = tmap_sa_shm_num_bytes(sa); tmap_sa_destroy(sa); return n; }
int tmap_index(int argc, char *argv[]) { int c; tmap_index_opt_t opt; opt.fn_fasta = NULL; opt.occ_interval = TMAP_BWT_OCC_INTERVAL; opt.hash_width = INT32_MAX; opt.sa_interval = TMAP_SA_INTERVAL; opt.is_large = -1; opt.check_hash = 1; if(2 == argc && 0 == strcmp("--version", argv[1])) { tmap_file_stdout = tmap_file_fdopen(fileno(stdout), "wb", TMAP_FILE_NO_COMPRESSION); tmap_file_fprintf(tmap_file_stdout, "%s\n", tmap_refseq_get_version_format(PACKAGE_VERSION)); tmap_file_fclose(tmap_file_stdout); return 0; } while((c = getopt(argc, argv, "f:o:i:w:a:hvH")) >= 0) { switch(c) { case 'f': opt.fn_fasta = tmap_strdup(optarg); break; case 'o': opt.occ_interval = atoi(optarg); break; case 'i': opt.sa_interval = atoi(optarg); break; case 'w': opt.hash_width = atoi(optarg); break; case 'a': if(0 == strcmp("is", optarg)) opt.is_large = 0; else if(0 == strcmp("bwtsw", optarg)) opt.is_large = 1; else tmap_error("Option -a value not correct", Exit, CommandLineArgument); break; case 'v': tmap_progress_set_verbosity(1); break; case 'h': return usage(&opt); case 'H': opt.check_hash = 0; break; default: return usage(&opt); } } if(argc != optind || 1 == argc) { return usage(&opt); } if(NULL == opt.fn_fasta) { tmap_error("required option -f", Exit, CommandLineArgument); } if(opt.occ_interval < TMAP_BWT_OCC_MOD || 0 != (opt.occ_interval % 2) || 0 != (opt.occ_interval % TMAP_BWT_OCC_MOD)) { tmap_error("option -o out of range", Exit, CommandLineArgument); } if(opt.hash_width < 0) { tmap_error("option -w out of range", Exit, CommandLineArgument); } if(opt.sa_interval <= 0 || (1 < opt.sa_interval && 0 != (opt.sa_interval % 2))) { tmap_error("option -i out of range", Exit, CommandLineArgument); } tmap_index_core(&opt); free(opt.fn_fasta); tmap_progress_print2("terminating successfully"); return 0; }
int tmap_refseq_pac2fasta_main(int argc, char *argv[]) { int c, help=0, amb=0; uint32_t i, j, k; char *fn_fasta = NULL; tmap_refseq_t *refseq = NULL; while((c = getopt(argc, argv, "avh")) >= 0) { switch(c) { case 'a': amb = 1; break; case 'v': tmap_progress_set_verbosity(1); break; case 'h': help = 1; break; default: return 1; } } if(1 != argc - optind || 1 == help) { tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-avh] <in.fasta>\n", PACKAGE, argv[0]); return 1; } fn_fasta = argv[optind]; // Note: 'tmap_file_stdout' should not have been previously modified tmap_file_stdout = tmap_file_fdopen(fileno(stdout), "wb", TMAP_FILE_NO_COMPRESSION); // read in the reference sequence refseq = tmap_refseq_read(fn_fasta, 0); for(i=0;i<refseq->num_annos;i++) { tmap_file_fprintf(tmap_file_stdout, ">%s", refseq->annos[i].name->s); // new line handled later for(j=k=0;j<refseq->annos[i].len;j++) { if(0 == (j % TMAP_REFSEQ_FASTA_LINE_LENGTH)) { tmap_file_fprintf(tmap_file_stdout, "\n"); } if(1 == amb && 0 < refseq->annos[i].num_amb) { // move the next ambiguous region while(k < refseq->annos[i].num_amb && refseq->annos[i].amb_positions_end[k] < j+1) { k++; } // check for the ambiguous region if(k < refseq->annos[i].num_amb && 0 == tmap_interval_overlap(j+1, j+1, refseq->annos[i].amb_positions_start[k], refseq->annos[i].amb_positions_end[k])) { tmap_file_fprintf(tmap_file_stdout, "%c", tmap_iupac_int_to_char[refseq->annos[i].amb_bases[k]]); } else { tmap_file_fprintf(tmap_file_stdout, "%c", "ACGTN"[(int)tmap_refseq_seq_i(refseq, j + refseq->annos[i].offset)]); } } else { tmap_file_fprintf(tmap_file_stdout, "%c", "ACGTN"[(int)tmap_refseq_seq_i(refseq, j + refseq->annos[i].offset)]); } } tmap_file_fprintf(tmap_file_stdout, "\n"); } // destroy tmap_refseq_destroy(refseq); // close the output tmap_file_fclose(tmap_file_stdout); return 0; }
uint64_t tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; tmap_seq_io_t *seqio = NULL; tmap_seq_t *seq = NULL; tmap_refseq_t *refseq = NULL; char *fn_pac = NULL, *fn_anno = NULL; uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE]; int32_t i, j, l, buffer_length; uint32_t num_IUPAC_found= 0, amb_bases_mem = 0; uint8_t x = 0; uint64_t ref_len; tmap_progress_print("packing the reference FASTA"); refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->version_id = TMAP_VERSION_ID; refseq->package_version = tmap_string_clone2(PACKAGE_VERSION); refseq->seq = buffer; // IMPORTANT: must nullify later refseq->annos = NULL; refseq->num_annos = 0; refseq->len = 0; refseq->is_rev = 0; refseq->is_shm = 0; memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; // input files seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression); seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ); // output files fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION); // read in sequences while(0 <= (l = tmap_seq_io_read(seqio, seq))) { tmap_anno_t *anno = NULL; tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l); refseq->num_annos++; refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos"); anno = &refseq->annos[refseq->num_annos-1]; anno->name = tmap_string_clone(seq->data.fq->name); anno->len = l; anno->offset = (1 == refseq->num_annos) ? 
0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len; anno->amb_positions_start = NULL; anno->amb_positions_end = NULL; anno->amb_bases = NULL; anno->num_amb = 0; amb_bases_mem = 0; // fill the buffer for(i=0;i<l;i++) { uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]]; // handle IUPAC codes if(4 <= c) { int32_t k; // warn users about IUPAC codes if(0 == num_IUPAC_found) { tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange); for(j=4;j<15;j++) { c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]]; // get the lexicographically smallest base not compatible with this code for(k=0;k<4;k++) { if(!(c & (0x1 << k))) { break; } } tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]); } } num_IUPAC_found++; // change it to a mismatched base than the IUPAC code c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]]; // store IUPAC bases if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary amb_bases_mem = anno->num_amb + 1; tmap_roundup32(amb_bases_mem); anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } // encode stretches of the same base if(0 < anno->num_amb && anno->amb_positions_end[anno->num_amb-1] == i && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) { anno->amb_positions_end[anno->num_amb-1]++; // expand the range } else { // new ambiguous base and range anno->num_amb++; anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based anno->amb_bases[anno->num_amb-1] = 
tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]; } // get the lexicographically smallest base not compatible with // this code for(j=0;j<4;j++) { if(!(c & (0x1 << j))) { break; } } c = j & 3; // Note: Ns will go to As } if(3 < c) { tmap_error("bug encountered", Exit, OutOfRange); } if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; } tmap_refseq_seq_store_i(refseq, buffer_length, c); buffer_length++; } refseq->len += l; // re-size the amibiguous bases if(anno->num_amb < amb_bases_mem) { amb_bases_mem = anno->num_amb; anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } } // write out the buffer if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } } // store number of unused bits at the last byte x = refseq->len % 4; if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } refseq->seq = NULL; // IMPORTANT: nullify this ref_len = refseq->len; // save for return tmap_progress_print2("total genome length [%u]", refseq->len); if(0 < num_IUPAC_found) { if(1 == num_IUPAC_found) { tmap_progress_print("%u IUPAC base was found and converted to a DNA base", 
num_IUPAC_found); } else { tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found); } } // write annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION); tmap_refseq_write_anno(fp_anno, refseq); // close files tmap_file_fclose(fp_pac); tmap_file_fclose(fp_anno); // check sequence name uniqueness for(i=0;i<refseq->num_annos;i++) { for(j=i+1;j<refseq->num_annos;j++) { if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) { tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n", i+1, refseq->annos[i].name->s, j+1, refseq->annos[j].name->s); tmap_error("Contig names must be unique", Exit, OutOfRange); } } } tmap_refseq_destroy(refseq); tmap_seq_io_destroy(seqio); tmap_seq_destroy(seq); free(fn_pac); free(fn_anno); tmap_progress_print2("packed the reference FASTA"); tmap_refseq_pac2revpac(fn_fasta); return ref_len; }