/**
 * Fills opt->max_diff_table with the maximum number of differences allowed
 * for every read length from 0 through TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH.
 *
 * A negative opt->max_diff requests per-length values: lengths 17 and above
 * are computed by tmap_map1_cal_maxdiff() from opt->max_err_rate and
 * opt->max_diff_fnr (shorter lengths stay at zero), and a progress line is
 * printed each time the computed value differs from the previous length's.
 * A non-negative opt->max_diff is copied into every entry instead.
 *
 * @param  opt    options providing the inputs and receiving the table
 * @param  stage  stage number named in the progress message when positive
 */
void tmap_map1_print_max_diff(tmap_map_opt_t *opt, int32_t stage)
{
    int32_t read_len, prev_diff, cur_diff;

    // start from an all-zero table
    for(read_len = 0; read_len <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH; read_len++) {
        opt->max_diff_table[read_len] = 0;
    }

    // a fixed limit was supplied: it applies to every read length
    if(0 <= opt->max_diff) {
        for(read_len = 0; read_len <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH; read_len++) {
            opt->max_diff_table[read_len] = opt->max_diff;
        }
        return;
    }

    if(stage <= 0) {
        tmap_progress_print("calculating maximum differences in map1");
    }
    else {
        tmap_progress_print("calculating maximum differences in map1 for stage %d", stage);
    }

    // derive the limit per read length, reporting only when it changes
    prev_diff = 0;
    for(read_len = 17; read_len <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH; read_len++) {
        cur_diff = tmap_map1_cal_maxdiff(read_len, opt->max_err_rate, opt->max_diff_fnr);
        if(cur_diff != prev_diff) {
            tmap_progress_print("%dbp reads will have at most %d differences", read_len, cur_diff);
        }
        opt->max_diff_table[read_len] = cur_diff;
        prev_diff = cur_diff;
    }
}
/**
 * Builds the index files for the reference named by opt->fn_fasta.
 *
 * Steps, in order: pack the reference (which also yields its length), reject
 * references at or above TMAP_INDEX_TOO_BIG_GENOME, pick a BWT construction
 * algorithm when opt->is_large is negative, build the BWT, build the suffix
 * array, and finally run the packing routine a second time with its last
 * argument set to 1.
 *
 * @param  opt  indexing options; opt->is_large is overwritten with 0 or 1
 *              when it was negative on entry
 */
static void tmap_index_core(tmap_index_opt_t *opt)
{
    uint64_t genome_len = 0;

    // first packing pass; its return value drives the checks below
    genome_len = tmap_refseq_fasta2pac(opt->fn_fasta, TMAP_FILE_NO_COMPRESSION, 0);

    // too big (2^32 - 1)!
    if(TMAP_INDEX_TOO_BIG_GENOME <= genome_len) {
        tmap_error("Reference sequence too large", Exit, OutOfRange);
    }

    // no algorithm was chosen: decide from the genome size
    if(opt->is_large < 0) {
        if(genome_len < TMAP_INDEX_LARGE_GENOME) {
            opt->is_large = 0;
            tmap_progress_print("defaulting to \"is\" BWT construction algorithm");
        }
        else {
            opt->is_large = 1;
            tmap_progress_print("defaulting to \"bwtsw\" BWT construction algorithm");
        }
    }

    // BWT string
    tmap_bwt_pac2bwt(opt->fn_fasta, opt->is_large, opt->occ_interval, opt->hash_width, opt->check_hash);

    // suffix array
    tmap_sa_bwt2sa(opt->fn_fasta, opt->sa_interval);

    // second packing pass with the last argument set to 1; the returned
    // length is not needed any more
    // NOTE(review): the meaning of the last argument is not visible here - confirm
    (void)tmap_refseq_fasta2pac(opt->fn_fasta, TMAP_FILE_NO_COMPRESSION, 1);
}
static int32_t tmap_shmget(key_t key, size_t size, int32_t shmflg, int32_t create) { int32_t shmid, i; if(0 == create) { // try a number of times before failing for(i=0,shmid=-1;shmid<0 && i<TMAP_SHMGET_RETRIES-1;i++) { if(0 <= (shmid = shmget(key, size, shmflg))) { return shmid; } tmap_progress_print("could not get shared memory, %d more %s", TMAP_SHMGET_RETRIES-i-1, (1 != TMAP_SHMGET_RETRIES-i-1) ? "retries" : "retry"); tmap_progress_print("retrying in %d seconds", TMAP_SHMGET_SLEEP); // sleep and retry sleep(TMAP_SHMGET_SLEEP); } } if((shmid = shmget(key, size, shmflg)) < 0) { tmap_error(NULL, Exit, SharedMemoryGet); } return shmid; }
/**
 * Allocates a tmap_index_t and loads the reference sequence, BWT and SA.
 *
 * The source is chosen in this order: the memory-map readers when mm is 1,
 * the shared memory segment identified by shm_key when that key is non-zero,
 * and otherwise the plain file readers.  A listing missing from shared memory
 * or a length mismatch between the structures is reported through
 * tmap_error() with the Exit action.
 *
 * @param  fn_fasta  file name handed to the refseq/BWT/SA readers
 * @param  shm_key   shared memory key, or 0 when shared memory is not used
 * @param  mm        1 to load through the memory-map readers
 * @return           the populated index
 */
tmap_index_t* tmap_index_init(const char *fn_fasta, key_t shm_key, int32_t mm)
{
    tmap_index_t *idx = NULL;

    idx = tmap_calloc(1, sizeof(tmap_index_t), "index");
    idx->shm_key = shm_key;
    idx->mm = mm;

    if(1 == idx->mm) {
        // memory map
        tmap_progress_print("Retrieving reference data from memory map");
        idx->refseq = tmap_refseq_mm_read(fn_fasta);
        idx->bwt = tmap_bwt_mm_read(fn_fasta);
        idx->sa = tmap_sa_mm_read(fn_fasta);
        tmap_progress_print2("Reference data retrieved from memory map");
    }
    else if(0 != idx->shm_key) {
        // shared memory: every listing must be present
        tmap_progress_print("retrieving reference data from shared memory");
        idx->shm = tmap_shm_init(idx->shm_key, 0, 0);

        idx->refseq = tmap_refseq_shm_unpack(tmap_shm_get_buffer(idx->shm, TMAP_SHM_LISTING_REFSEQ));
        if(NULL == idx->refseq) {
            tmap_error("the packed reference sequence was not found in shared memory", Exit, SharedMemoryListing);
        }

        idx->bwt = tmap_bwt_shm_unpack(tmap_shm_get_buffer(idx->shm, TMAP_SHM_LISTING_BWT));
        if(NULL == idx->bwt) {
            tmap_error("the BWT string was not found in shared memory", Exit, SharedMemoryListing);
        }

        idx->sa = tmap_sa_shm_unpack(tmap_shm_get_buffer(idx->shm, TMAP_SHM_LISTING_SA));
        if(NULL == idx->sa) {
            tmap_error("the SA was not found in shared memory", Exit, SharedMemoryListing);
        }

        tmap_progress_print2("reference data retrieved from shared memory");
    }
    else {
        // plain files
        tmap_progress_print("reading in reference data");
        idx->refseq = tmap_refseq_read(fn_fasta);
        idx->bwt = tmap_bwt_read(fn_fasta);
        idx->sa = tmap_sa_read(fn_fasta);
        tmap_progress_print2("reference data read in");
    }

    // the BWT and the SA must each span twice the reference length
    if(index_len_mismatch_placeholder_unused: 0) { }
    if(idx->bwt->seq_len != (idx->refseq->len << 1)) {
        tmap_error("refseq and bwt lengths do not match", Exit, OutOfRange);
    }
    if(idx->sa->seq_len != (idx->refseq->len << 1)) {
        tmap_error("refseq and sa lengths do not match", Exit, OutOfRange);
    }

    return idx;
}
void tmap_refseq_pac2revpac(const char *fn_fasta) { uint32_t i, j, c; tmap_refseq_t *refseq=NULL, *refseq_rev=NULL; tmap_progress_print("reversing the packed reference FASTA"); refseq = tmap_refseq_read(fn_fasta, 0); // shallow copy refseq_rev = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq_rev"); (*refseq_rev) = (*refseq); // update sequence refseq_rev->seq = NULL; refseq_rev->seq = tmap_calloc(tmap_refseq_seq_memory(refseq->len), sizeof(uint8_t), "refseq_rev->seq"); for(i=0;i<refseq->len;i++) { c = tmap_refseq_seq_i(refseq, i); j = refseq->len - i - 1; tmap_refseq_seq_store_i(refseq_rev, j, c); } // write tmap_refseq_write(refseq_rev, fn_fasta, 1); // free free(refseq_rev->seq); free(refseq_rev); tmap_refseq_destroy(refseq); tmap_progress_print2("reversed the packed reference FASTA"); }
/**
 * Allocates a tmap_index_t and loads the reference sequence, BWT and SA.
 *
 * With a non-zero shm_key the three structures are unpacked from the shared
 * memory segment identified by that key; otherwise they are read with the
 * plain file readers.  A listing missing from shared memory or a length
 * mismatch between the structures is reported through tmap_error() with the
 * Exit action.
 *
 * @param  fn_fasta  file name handed to the refseq/BWT/SA readers
 * @param  shm_key   shared memory key, or 0 to read from files
 * @return           the populated index
 */
tmap_index_t* tmap_index_init(const char *fn_fasta, key_t shm_key)
{
    tmap_index_t *idx = NULL;

    idx = tmap_calloc(1, sizeof(tmap_index_t), "index");
    idx->shm_key = shm_key;

    if(0 != idx->shm_key) {
        // shared memory: every listing must be present
        tmap_progress_print("retrieving reference data from shared memory");
        idx->shm = tmap_shm_init(idx->shm_key, 0, 0);

        idx->refseq = tmap_refseq_shm_unpack(tmap_shm_get_buffer(idx->shm, TMAP_SHM_LISTING_REFSEQ));
        if(NULL == idx->refseq) {
            tmap_error("the packed reference sequence was not found in shared memory", Exit, SharedMemoryListing);
        }

        idx->bwt = tmap_bwt_shm_unpack(tmap_shm_get_buffer(idx->shm, TMAP_SHM_LISTING_BWT));
        if(NULL == idx->bwt) {
            tmap_error("the BWT string was not found in shared memory", Exit, SharedMemoryListing);
        }

        idx->sa = tmap_sa_shm_unpack(tmap_shm_get_buffer(idx->shm, TMAP_SHM_LISTING_SA));
        if(NULL == idx->sa) {
            tmap_error("the SA was not found in shared memory", Exit, SharedMemoryListing);
        }

        tmap_progress_print2("reference data retrieved from shared memory");
    }
    else {
        // plain files
        tmap_progress_print("reading in reference data");
        idx->refseq = tmap_refseq_read(fn_fasta);
        idx->bwt = tmap_bwt_read(fn_fasta);
        idx->sa = tmap_sa_read(fn_fasta);
        tmap_progress_print2("reference data read in");
    }

    // the BWT and the SA must each span twice the reference length
    if(idx->bwt->seq_len != (idx->refseq->len << 1)) {
        tmap_error("refseq and bwt lengths do not match", Exit, OutOfRange);
    }
    if(idx->sa->seq_len != (idx->refseq->len << 1)) {
        tmap_error("refseq and sa lengths do not match", Exit, OutOfRange);
    }

    return idx;
}
void tmap_sa_bwt2sa(const char *fn_fasta, uint32_t intv) { int64_t isa, s; // S(isa) = sa uint64_t i; tmap_bwt_t *bwt = NULL; tmap_sa_t *sa = NULL; tmap_progress_print("constructing the SA from the BWT string"); bwt = tmap_bwt_read(fn_fasta); sa = tmap_calloc(1, sizeof(tmap_sa_t), "sa"); sa->primary = bwt->primary; sa->sa_intv = intv; sa->seq_len = bwt->seq_len; sa->n_sa = (bwt->seq_len + intv) / intv; // calculate SA value sa->sa = tmap_calloc(sa->n_sa, sizeof(tmap_bwt_int_t), "sa->sa"); isa = 0; s = bwt->seq_len; for(i = 0; i < bwt->seq_len; ++i) { if(isa % intv == 0) sa->sa[isa/intv] = s; --s; isa = tmap_bwt_invPsi(bwt, isa); } if(isa % intv == 0) sa->sa[isa/intv] = s; sa->sa[0] = (tmap_bwt_int_t)-1; // before this line, bwt->sa[0] = bwt->seq_len tmap_sa_write(fn_fasta, sa); tmap_bwt_destroy(bwt); tmap_sa_destroy(sa); sa=NULL; bwt=NULL; tmap_progress_print2("constructed the SA from the BWT string"); }
tmap_shm_t * tmap_shm_init(key_t key, size_t size, int32_t create) { tmap_shm_t *shm = NULL; int32_t i, shmflg = 0; struct shmid_ds buf; shm = tmap_calloc(1, sizeof(tmap_shm_t), "shm"); shm->key = key; shm->size = size; if(1 == create) { shm->size += sizeof(uint32_t); // add for synchronization shm->size += sizeof(uint32_t); // add for on/off bits for listing what is in memory shm->size += 32*sizeof(size_t); // add for the byte size of each listing shmflg = IPC_CREAT | IPC_EXCL | 0666; shm->creator = 1; } else { shmflg = 0666; shm->creator = 0; } // get the shared memory id shm->shmid = tmap_shmget(shm->key, shm->size, shmflg, create); // attach the shared memory shm->ptr = tmap_shmat(shm->shmid, NULL, 0); shm->buf = ((uint8_t*)shm->ptr); shm->buf += sizeof(uint32_t); // synchronization shm->buf += sizeof(uint32_t) + 32*sizeof(size_t); // listings tmap_shmctl(shm->shmid, IPC_STAT, &buf); if(1 == create) { // check that the current process created the shared memory if(buf.shm_cpid != getpid() || TMAP_SHM_READY == tmap_shm_get_state(shm)) { tmap_error("shared memory was not created by the current process", Exit, OutOfRange); } tmap_shm_set_not_ready(shm); if(buf.shm_segsz != shm->size) { tmap_error("shared memory size does not match the expected size", Exit, OutOfRange); } } else { // try a number of times before failing for(i=0;i<TMAP_SHMGET_RETRIES;i++) { if(TMAP_SHM_READY == tmap_shm_get_state(shm)) { break; } tmap_progress_print("shared memory not ready, %d more retries", TMAP_SHMGET_RETRIES-i-1); tmap_progress_print("retrying in %d seconds", TMAP_SHMGET_SLEEP); // sleep and retry sleep(TMAP_SHMGET_SLEEP); } if(TMAP_SHMGET_RETRIES == i) { tmap_error("shared memory did not become available", Exit, SharedMemoryGet); } // set the size shm->size = buf.shm_segsz; } return shm; }
uint64_t tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression) { tmap_file_t *fp_pac = NULL, *fp_anno = NULL; tmap_seq_io_t *seqio = NULL; tmap_seq_t *seq = NULL; tmap_refseq_t *refseq = NULL; char *fn_pac = NULL, *fn_anno = NULL; uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE]; int32_t i, j, l, buffer_length; uint32_t num_IUPAC_found= 0, amb_bases_mem = 0; uint8_t x = 0; uint64_t ref_len; tmap_progress_print("packing the reference FASTA"); refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq"); refseq->version_id = TMAP_VERSION_ID; refseq->package_version = tmap_string_clone2(PACKAGE_VERSION); refseq->seq = buffer; // IMPORTANT: must nullify later refseq->annos = NULL; refseq->num_annos = 0; refseq->len = 0; refseq->is_rev = 0; refseq->is_shm = 0; memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; // input files seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression); seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ); // output files fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE); fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION); // read in sequences while(0 <= (l = tmap_seq_io_read(seqio, seq))) { tmap_anno_t *anno = NULL; tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l); refseq->num_annos++; refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos"); anno = &refseq->annos[refseq->num_annos-1]; anno->name = tmap_string_clone(seq->data.fq->name); anno->len = l; anno->offset = (1 == refseq->num_annos) ? 
0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len; anno->amb_positions_start = NULL; anno->amb_positions_end = NULL; anno->amb_bases = NULL; anno->num_amb = 0; amb_bases_mem = 0; // fill the buffer for(i=0;i<l;i++) { uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]]; // handle IUPAC codes if(4 <= c) { int32_t k; // warn users about IUPAC codes if(0 == num_IUPAC_found) { tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange); for(j=4;j<15;j++) { c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]]; // get the lexicographically smallest base not compatible with this code for(k=0;k<4;k++) { if(!(c & (0x1 << k))) { break; } } tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]); } } num_IUPAC_found++; // change it to a mismatched base than the IUPAC code c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]]; // store IUPAC bases if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary amb_bases_mem = anno->num_amb + 1; tmap_roundup32(amb_bases_mem); anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } // encode stretches of the same base if(0 < anno->num_amb && anno->amb_positions_end[anno->num_amb-1] == i && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) { anno->amb_positions_end[anno->num_amb-1]++; // expand the range } else { // new ambiguous base and range anno->num_amb++; anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based anno->amb_bases[anno->num_amb-1] = 
tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]; } // get the lexicographically smallest base not compatible with // this code for(j=0;j<4;j++) { if(!(c & (0x1 << j))) { break; } } c = j & 3; // Note: Ns will go to As } if(3 < c) { tmap_error("bug encountered", Exit, OutOfRange); } if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE); buffer_length = 0; } tmap_refseq_seq_store_i(refseq, buffer_length, c); buffer_length++; } refseq->len += l; // re-size the amibiguous bases if(anno->num_amb < amb_bases_mem) { amb_bases_mem = anno->num_amb; anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start"); anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end"); anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases"); } } // write out the buffer if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } } // store number of unused bits at the last byte x = refseq->len % 4; if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) { tmap_error(fn_pac, Exit, WriteFileError); } refseq->seq = NULL; // IMPORTANT: nullify this ref_len = refseq->len; // save for return tmap_progress_print2("total genome length [%u]", refseq->len); if(0 < num_IUPAC_found) { if(1 == num_IUPAC_found) { tmap_progress_print("%u IUPAC base was found and converted to a DNA base", 
num_IUPAC_found); } else { tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found); } } // write annotation file fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE); fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION); tmap_refseq_write_anno(fp_anno, refseq); // close files tmap_file_fclose(fp_pac); tmap_file_fclose(fp_anno); // check sequence name uniqueness for(i=0;i<refseq->num_annos;i++) { for(j=i+1;j<refseq->num_annos;j++) { if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) { tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n", i+1, refseq->annos[i].name->s, j+1, refseq->annos[j].name->s); tmap_error("Contig names must be unique", Exit, OutOfRange); } } } tmap_refseq_destroy(refseq); tmap_seq_io_destroy(seqio); tmap_seq_destroy(seq); free(fn_pac); free(fn_anno); tmap_progress_print2("packed the reference FASTA"); tmap_refseq_pac2revpac(fn_fasta); return ref_len; }