/*
 * Append `len` bytes of `text` to header->text and keep the buffer
 * NUL-terminated; header->l_text grows by `len`.
 *
 * A NULL `text` is a no-op.  The buffer is always resized to the rounded-up
 * size that is needed: the previous "only grow when the rounded capacity
 * changes" shortcut assumed the existing buffer had itself been allocated
 * with a rounded-up capacity, which does not hold for a NULL buffer or one
 * allocated with an exact size elsewhere.
 * If the reallocation fails the header is left untouched.
 */
static void append_header_text(bam_header_t *header, char* text, int len)
{
	int need;
	char *grown;

	if (text == 0) return;
	need = header->l_text + len + 1; // 1 byte null
	kroundup32(need);
	grown = (char*)realloc(header->text, need);
	if (grown == 0) return; // out of memory: keep the old, still valid, text
	header->text = grown;
	strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
	header->l_text += len;
	header->text[header->l_text] = 0;
}
s_align* mengyao_ssw_core ( char *pacseq,//Refernce seq(this is the pacseq),len int len,//orignal seq length.. char *seq,//Original seq... int reflen, //length of reference string.. int filter, int Skip_DP, s_profile* p ) { //init_SSW(); kseq_t *read_seq, *ref_seq; //int32_t m; int8_t* ref_num = (int8_t*)pacseq; int8_t* num = (int8_t*)seq; //s_profile* p; //this block can be called once for every difference query (1 read is aligned to multiple references and thus we do not have to init these lines again //jq: we throw in the query here read_seq = (kseq_t*)calloc(1, sizeof(kseq_t)); read_seq->seq.l=len; read_seq->seq.m=len; read_seq->seq.s=seq; kroundup32(read_seq->seq.m); //for (m = 0; m < read_seq->seq.l; ++m) num[m] = table[(int)read_seq->seq.s[m]]; //p = ssw_init(num, read_seq->seq.l, mata, n, 1); ref_seq = (kseq_t*)calloc(1, sizeof(kseq_t)); ref_seq->seq.l=reflen; ref_seq->seq.m=reflen; ref_seq->seq.s=pacseq; kroundup32(ref_seq->seq.m); s_align* result; //for (m = 0; m < ref_seq->seq.l; ++m) ref_num[m] = table[(int)ref_seq->seq.s[m]]; result = ssw_align (p, ref_num, ref_seq->seq.l, gap_open, gap_extension, 2, filter,Skip_DP, 0); //if (result && result->score1 >= filter){ result = ssw_write(result, ref_seq, read_seq); //} //align_destroy(result); //free(num); //free(ref_num); free (read_seq);free(ref_seq); return result; }
/*
 * Build the per-sequence linear index: for every bin (coordinate >>
 * LIDX_SHIFT) list->idx[bin] holds the index of the first region that
 * overlaps the bin, or -1 when no region does.
 *
 * list->nidx is set to one past the highest bin touched by any region
 * processed so far.  It used to be overwritten with the end bin of the
 * current region, which shrinks it when a later region ends earlier than a
 * previous one.
 *
 * Returns 0 on success, -1 if the index array cannot be grown.
 */
int _regidx_build_index(regidx_t *idx)
{
    int iseq;
    for (iseq=0; iseq<idx->nseq; iseq++)
    {
        reglist_t *list = &idx->seq[iseq];
        int j,k, imax = 0;   // number of allocated index bins
        int nidx = 0;        // one past the highest bin in use
        for (j=0; j<list->nregs; j++)
        {
            int ibeg = list->regs[j].start >> LIDX_SHIFT;
            int iend = list->regs[j].end >> LIDX_SHIFT;
            if ( imax < iend + 1 )
            {
                int old_imax = imax;
                int *grown;
                imax = iend + 1;
                kroundup32(imax);
                grown = (int*) realloc(list->idx, imax*sizeof(int));
                if ( !grown ) return -1;
                list->idx = grown;
                for (k=old_imax; k<imax; k++) list->idx[k] = -1;
            }
            // Only the first region seen for a bin is recorded.
            for (k=ibeg; k<=iend; k++)
                if ( list->idx[k]<0 ) list->idx[k] = j;
            if ( nidx < iend + 1 ) nidx = iend + 1;
            list->nidx = nidx;
        }
    }
    return 0;
}
bwa_seq_t *load_reads(const char *fa_fn, uint32_t *n_seqs) { bwa_seq_t *seqs, *part_seqs; bwa_seqio_t *ks; int n_part_seqs = 0, n_seqs_full = 0, n_seqs_loaded = 0; clock_t t = clock(); ks = bwa_open_reads(BWA_MODE, fa_fn); n_seqs_full = N_CHUNK_SEQS; show_msg(__func__, "Loading reads from library %s...\n", fa_fn); seqs = (bwa_seq_t*) calloc (N_DF_MAX_SEQS, sizeof(bwa_seq_t)); while ((part_seqs = bwa_read_seq(ks, N_CHUNK_SEQS, &n_part_seqs, BWA_MODE, 0)) != 0) { show_msg(__func__, "%d sequences loaded: %.2f sec... \n", n_seqs_loaded + n_part_seqs, fa_fn, (float) (clock() - t) / CLOCKS_PER_SEC); pe_reverse_seqs(part_seqs, n_part_seqs); if ((n_seqs_loaded + n_part_seqs) > n_seqs_full) { n_seqs_full += n_part_seqs + 2; kroundup32(n_seqs_full); seqs = (bwa_seq_t*) realloc(seqs, sizeof(bwa_seq_t) * n_seqs_full); } memmove(&seqs[n_seqs_loaded], part_seqs, sizeof(bwa_seq_t) * n_part_seqs); free(part_seqs); n_seqs_loaded += n_part_seqs; } bwa_seq_close(ks); if (n_seqs_loaded < 1) { err_fatal(__func__, "No sequence in file %s, make sure the format is correct! \n", fa_fn); } *n_seqs = n_seqs_loaded; return seqs; }
// Make sure `data` can hold at least `new_min` bytes.  The capacity `m_data`
// is rounded up with kroundup32() and never shrinks.
void resize(uint32_t new_min) {
    if (!(new_min > m_data)) return;   // current capacity already suffices

    m_data = new_min;
    kroundup32(m_data);
    data = (uint8_t *)realloc(data, m_data * sizeof(uint8_t));
}
/*
 * Read one alignment record from `fp` into `b`.
 *
 * Returns 4 + block_len (the number of bytes consumed) on success, or
 *   -1  normal end of file (nothing could be read),
 *   -2  truncated block length,
 *   -3  truncated fixed-size core,
 *   -4  inconsistent record (negative lengths), allocation failure, or
 *       truncated variable-length data.
 *
 * b->data is grown when needed; the capacity b->m_data is only updated once
 * the reallocation has succeeded.
 */
int bam_read1(bamFile fp, bam1_t *b)
{
	bam1_core_t *c = &b->core;
	int32_t block_len, ret, i;
	uint32_t x[8];

	assert(BAM_CORE_SIZE == 32);
	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
		if (ret == 0) return -1; // normal end-of-file
		else return -2; // truncated
	}
	if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
	if (bam_is_be) {
		bam_swap_endian_4p(&block_len);
		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
	}
	c->tid = x[0]; c->pos = x[1];
	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
	c->l_qseq = x[4];
	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
	b->data_len = block_len - BAM_CORE_SIZE;
	// A corrupt block length or sequence length must not reach realloc/read.
	if (b->data_len < 0 || c->l_qseq < 0) return -4;
	if (b->m_data < b->data_len) {
		int new_m = b->data_len;
		uint8_t *grown;
		kroundup32(new_m);
		grown = (uint8_t*)realloc(b->data, new_m);
		if (grown == 0) return -4;
		b->data = grown;
		b->m_data = new_m;
	}
	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
	// The fixed fields claim more bytes than the record actually holds.
	if (b->l_aux < 0) return -4;
	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
	return 4 + block_len;
}
/*
 * Enlarge the slot array of an overlapSetList: the capacity `m` is bumped
 * by one and rounded up with kroundup32(), and every slot from index `l`
 * up to the new capacity is set to NULL.
 */
void osl_grow(overlapSetList *osl) {
    int slot = osl->l;

    osl->m++;
    kroundup32(osl->m);
    osl->os = realloc(osl->os, osl->m * sizeof(overlapSet*));
    assert(osl->os);

    while(slot < osl->m) {
        osl->os[slot] = NULL;
        slot++;
    }
}
//auxiliary functions for low level BAM record creation uint8_t* realloc_bdata(bam1_t *b, int size) { if (b->m_data < size) { b->m_data = size; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } if (b->data_len<size) b->data_len=size; return b->data; }
/*
 * Make sure b->data has room for `size` bytes, growing the buffer to the
 * kroundup32()-rounded size when it is too small.  Returns b->data.
 */
static inline uint8_t *alloc_data(bam1_t *b, int size)
{
	if (!(b->m_data < size))
		return b->data; /* already large enough */

	b->m_data = size;
	kroundup32(b->m_data);
	b->data = (uint8_t*)realloc(b->data, b->m_data);
	return b->data;
}
// Grow the storage of `k` to kroundup32(newlen) elements.  The elements
// between the old capacity k->m and the new capacity are zero-filled, and
// k->m is updated to the new capacity.
void realloc(keep<T> *k,size_t newlen){
  kroundup32(newlen);

  k->d = (T*) realloc(k->d,sizeof(T)*newlen);
  assert(k->d!=NULL);

  // only the freshly added tail is cleared; existing elements are kept
  T *tail = k->d+k->m;
  memset(tail,0,(newlen-k->m)*sizeof(T));

  k->m=newlen;
}
/*
 * Enlarge the `overlaps` array of an overlapSet.  The capacity `m` is bumped
 * by one and rounded up with kroundup32(); slots from index `l` to the new
 * capacity are set to NULL.  Returns `os` itself.
 */
overlapSet *os_grow(overlapSet *os) {
    int slot;

    os->m++;
    kroundup32(os->m);
    os->overlaps = realloc(os->overlaps, os->m * sizeof(GTFentry*));
    assert(os->overlaps);

    slot = os->l;
    while(slot < os->m) {
        os->overlaps[slot] = NULL;
        slot++;
    }
    return os;
}
/* from seqtk.c */

/*
 * Copy the contents of `src` (including its terminating byte) into `dst`,
 * growing dst->s when needed.  An empty source leaves `dst` untouched.
 */
static void cpy_kstr(kstring_t *dst, const kstring_t *src)
{
	if (src->l != 0) {
		if (src->l + 1 > dst->m) { /* need room for the text plus terminator */
			dst->m = src->l + 1;
			kroundup32(dst->m);
			dst->s = realloc(dst->s, dst->m);
		}
		dst->l = src->l;
		memcpy(dst->s, src->s, src->l + 1);
	}
}
// Same as realloc_bdata, but does not free the previous data buffer:
// a NEW buffer of (rounded-up) `size` bytes is ALWAYS allocated, the old
// contents are copied into it, and the OLD buffer is returned.
// The caller must free() the returned pointer.
//
// At most `size` bytes are copied: the previous code copied b->data_len
// bytes unconditionally, which writes past the new buffer when `size` is
// smaller than the old data length.
uint8_t* dupalloc_bdata(bam1_t *b, int size) {
  uint8_t* odata=b->data;
  int ncopy = b->data_len < size ? b->data_len : size;

  b->m_data = size;
  kroundup32(b->m_data);
  b->data = (uint8_t*)malloc(b->m_data);
  // nothing to copy when there was no previous buffer or it was empty
  if (odata != NULL && ncopy > 0)
    memcpy((void*)b->data, (void*)odata, ncopy);
  b->data_len=size;
  return odata; //user must FREE this after
}
/*
 * Duplicate a hit set: a new bwtsw2_t is allocated and the `n` hits of `b`
 * are copied into a fresh array whose capacity is n rounded up with
 * kroundup32().  Only the bsw2hit_t array itself is copied.
 */
bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
{
	bwtsw2_t *dup = calloc(1, sizeof(bwtsw2_t));

	dup->n = b->n;
	dup->max = dup->n;
	if (!b->n)
		return dup; /* empty set: no hit array is allocated */

	kroundup32(dup->max);
	dup->hits = calloc(dup->max, sizeof(bsw2hit_t));
	memcpy(dup->hits, b->hits, dup->n * sizeof(bsw2hit_t));
	return dup;
}
/*
 * Grow both parallel arrays of the hash table (`str` and `elements`) to
 * kroundup32(l + 1) slots, clear the slots from index `l` onwards, and
 * rehash the table through rehashHT2(ht, len).
 *
 * Both reallocations are asserted; previously only `str` was checked, so a
 * failed `elements` allocation was dereferenced in the clearing loop.
 */
static void growHT2(hashTable *ht, int len) {
    int i;
    ht->m = ht->l+1;
    kroundup32(ht->m);
    ht->str = realloc(ht->str, ht->m*sizeof(char*));
    assert(ht->str);
    ht->elements = realloc(ht->elements, ht->m*sizeof(hashTableElement*));
    assert(ht->elements);

    for(i=ht->l; i<ht->m; i++) {
        ht->str[i] = NULL;
        ht->elements[i] = NULL;
    }

    rehashHT2(ht, len);
}
/*
 * Read from `ks` into `str` up to (not including) the next `delimiter`, or
 * up to the next whitespace character when `delimiter` is 0.
 *
 * When *first is set, the stream is first positioned at
 * ((iter*thrds)+tid)*READ_SIZE, READ_SIZE bytes are read, move_to_start()
 * is called and *first is cleared.  Later refills read READ_IND_SIZE bytes
 * and set ks->atend.
 *
 * If `dret` is non-NULL it receives the delimiter that ended the token, or
 * 0 when the input ran out first.  Returns the token length, or -1 when no
 * data is available.  `str` is always NUL-terminated on a non-negative
 * return.
 */
static int ks_getuntil1(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int iter, int tid, int thrds, bool *first)
{
	if (dret) *dret = 0;
	str->l = 0;
	if (ks->begin >= ks->end && ks->is_eof) return -1;
	for (;;) {
		int i;
		if (ks->begin >= ks->end) {
			if (*first) {
				gzseek(ks->f, ((iter*thrds)+tid)*READ_SIZE, SEEK_SET);
				*first = false;
				ks->atend = false;
				ks->begin = 0;
				ks->end = gzread(ks->f, ks->buf, READ_SIZE);
				move_to_start(ks, iter, tid, thrds);
				if (ks->end < READ_SIZE) ks->is_eof = 1;
				if (ks->end == 0) return -1;
			} else if (!ks->is_eof) {
				ks->begin = 0;
				ks->atend = true;
				ks->end = gzread(ks->f, ks->buf, READ_IND_SIZE);
				if (ks->end < READ_IND_SIZE) ks->is_eof = 1;
				if (ks->end == 0) break;
			} else break;
		}
		if (delimiter) {
			for (i = ks->begin; i < ks->end; ++i)
				if (ks->buf[i] == delimiter) break;
		} else {
			for (i = ks->begin; i < ks->end; ++i)
				if (isspace(ks->buf[i])) break;
		}
		if (str->m - str->l < i - ks->begin + 1) {
			str->m = str->l + (i - ks->begin) + 1;
			kroundup32(str->m);
			str->s = (char*)realloc(str->s, str->m);
		}
		memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);
		str->l = str->l + (i - ks->begin);
		ks->begin = i + 1;
		if (i < ks->end) {
			if (dret) *dret = ks->buf[i];
			break;
		}
	}
	// The loop can end before anything was appended (a refill that returns
	// no data); make sure there is a buffer to terminate.
	if (str->s == 0) {
		str->m = 1;
		str->s = (char*)calloc(1, 1);
	}
	str->s[str->l] = '\0';
	return str->l;
}
/*
 * Enlarge the two parallel arrays of a uniqueSet (`IDs` and `cnts`).  The
 * capacity `m` is bumped by one and rounded up with kroundup32(); slots from
 * index `l` onwards are initialised to ID -1 and count 0.  Returns `us`.
 */
static uniqueSet *us_grow(uniqueSet *us) {
    int slot;

    us->m++;
    kroundup32(us->m);

    us->IDs = realloc(us->IDs, us->m * sizeof(int32_t));
    assert(us->IDs);
    us->cnts = realloc(us->cnts, us->m * sizeof(uint32_t));
    assert(us->cnts);

    slot = us->l;
    while(slot < us->m) {
        us->IDs[slot] = -1;
        us->cnts[slot] = 0;
        slot++;
    }
    return us;
}
/*
 * Replace the CIGAR of record `b` with the `n` operations in `cigar`.
 *
 * When the number of operations is unchanged the new values are written in
 * place.  Otherwise everything stored after the old CIGAR is shifted to its
 * new offset (the buffer is grown first if required), the new CIGAR is
 * written, and data_len / n_cigar are updated.
 */
static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
{
	int o;

	if (n == b->core.n_cigar) {
		/* same size: overwrite in place */
		memcpy(b->data + b->core.l_qname, cigar, n * 4);
		return;
	}

	o = b->core.l_qname + b->core.n_cigar * 4; /* end of the old CIGAR */
	if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) {
		b->m_data = b->data_len + (n - b->core.n_cigar) * 4;
		kroundup32(b->m_data);
		b->data = (uint8_t*)realloc(b->data, b->m_data);
	}
	/* move the tail first, then drop the new CIGAR into the gap */
	memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o);
	memcpy(b->data + b->core.l_qname, cigar, n * 4);
	b->data_len += (n - b->core.n_cigar) * 4;
	b->core.n_cigar = n;
}
/*
 * printf-style append to a kstring_t.
 *
 * The text is first formatted into the free space of `s`.  If it does not
 * fit, the buffer is grown and the text is formatted again.  Returns the
 * number of characters appended (excluding the terminator), or the negative
 * value reported by vsnprintf(), in which case s->l is left unchanged.
 *
 * Every va_start() is paired with exactly one va_end(): the previous code
 * called va_end() a second time on the path that did not need to grow.
 */
int ksprintf(kstring_t *s, const char *fmt, ...)
{
	va_list ap;
	int l;

	va_start(ap, fmt);
	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
	va_end(ap);
	if (l < 0) return l; // formatting error: do not touch s->l
	if (l + 1 > s->m - s->l) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
		va_start(ap, fmt);
		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
		va_end(ap);
		if (l < 0) return l;
	}
	s->l += l;
	return l;
}
/*
 * Modified from samtools bam_read1: map a BAM record that already sits in
 * memory at `bamChar` onto the bam1_t structure `b`.
 *
 * Layout read from bamChar: one 32-bit block length followed by eight 32-bit
 * core words; the variable-length data starts at byte offset 4*9.
 *
 * b->data is pointed INTO bamChar (no copy is made), so the record is only
 * valid while bamChar is, and b->data must not be freed or reallocated as if
 * it were heap memory.  For that reason no buffer is allocated here any
 * more: the former realloc() result was overwritten immediately (a leak),
 * and on later calls realloc() was applied to the borrowed pointer.
 * b->m_data is still raised to the rounded-up record size, as before.
 *
 * Returns 4 + block_len, or -4 when the lengths in the record are
 * inconsistent.
 *
 * NOTE(review): bamChar is read through a uint32_t pointer, so it is assumed
 * to be suitably aligned and in host byte order -- confirm against callers.
 */
int bam_read1_fromArray(char *bamChar, bam1_t *b)
{
    bam1_core_t *c = &b->core;
    int32_t block_len;
    uint32_t *x;

    uint32_t *bamU32=(uint32_t*) bamChar;
    block_len=bamU32[0];
    x=bamU32+1;

    c->tid = x[0]; c->pos = x[1];
    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
    c->l_qseq = x[4];
    c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
    b->l_data = block_len - 32;
    if (b->l_data < 0 || c->l_qseq < 0) return -4;
    // offset of the aux section must not exceed the record's data length
    if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
        return -4;
    if (b->m_data < b->l_data) {
        b->m_data = b->l_data;
        kroundup32(b->m_data);
    }

    b->data=(uint8_t*) bamChar+4*9;

    return 4 + block_len;
}
/*
 * Read the next FASTA/FASTQ record into `seq`.
 *
 * Returns the sequence length (>= 0) on success,
 *   -1 at end of file,
 *   -2 when the quality string is truncated or shorter than the sequence.
 *
 * iter/tid/thrds/first are forwarded to ks_getc1()/ks_getuntil1().
 */
static int kseq_read1(kseq_t *seq, int iter, int tid, int thrds, bool *first)
{
	int c;
	kstream_t *ks = seq->f;
	if (seq->last_char == 0) { /* then jump to the next header line */
		while ((c = ks_getc1(ks, iter, tid, thrds, first)) != -1 && c != '>' && c != '@');
		if (c == -1) return -1; /* end of file */
		seq->last_char = c;
	} /* the first header char has been read */
	seq->comment.l = seq->seq.l = seq->qual.l = 0;
	if (ks_getuntil1(ks, 0, &seq->name, &c, iter, tid, thrds, first) < 0) return -1;
	if (c != '\n') ks_getuntil1(ks, '\n', &seq->comment, 0, iter, tid, thrds, first);
	while ((c = ks_getc1(ks, iter, tid, thrds, first)) != -1 && c != '>' && c != '+' && c != '@') {
		if (isgraph(c)) { /* printable non-space character */
			if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */
				seq->seq.m = seq->seq.l + 2;
				kroundup32(seq->seq.m); /* rounded to next closest 2^k */
				seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m);
			}
			seq->seq.s[seq->seq.l++] = (char)c;
		}
	}
	if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */
	/* A record without any sequence character never allocated seq.s;
	 * make room for the terminator before writing it. */
	if (seq->seq.l + 1 > seq->seq.m) {
		seq->seq.m = seq->seq.l + 1;
		kroundup32(seq->seq.m);
		seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m);
	}
	seq->seq.s[seq->seq.l] = 0;	/* null terminated string */
	if (c != '+') return seq->seq.l; /* FASTA */
	if (seq->qual.m < seq->seq.m) {	/* allocate enough memory */
		seq->qual.m = seq->seq.m;
		seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);
	}
	while ((c = ks_getc1(ks, iter, tid, thrds, first)) != -1 && c != '\n'); /* skip the rest of '+' line */
	if (c == -1) return -2; /* we should not stop here */
	while ((c = ks_getc1(ks, iter, tid, thrds, first)) != -1 && seq->qual.l < seq->seq.l)
		if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c;
	seq->qual.s[seq->qual.l] = 0; /* null terminated string */
	seq->last_char = 0;	/* we have not come to the next header line */
	if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */
	return seq->seq.l;
}
/*
 * Read the whole content behind `url` into one NUL-terminated heap buffer.
 *
 * Returns the buffer (to be freed by the caller) and stores its length,
 * excluding the terminator, in *len when `len` is non-NULL.  Returns NULL
 * when the URL cannot be opened, when no data was read, or when memory runs
 * out; in the out-of-memory case *len is set to 0.
 *
 * The temporary transfer buffer is now released before returning (it used to
 * leak on every call).
 */
char *ibs_read_all(const char *url, int *len)
{
	int l, n = 0, m = 0;
	char *ret = 0, *buf;
	kurl_t *ku;

	ku = kurl_open(url, 0);
	if (ku == 0) return 0;
	buf = calloc(1, IBS_BUFSIZE);
	if (buf == 0) {
		kurl_close(ku);
		if (len) *len = 0;
		return 0;
	}
	while ((l = kurl_read(ku, buf, IBS_BUFSIZE)) > 0) {
		if (n + l + 1 > m) { /* room for the new chunk plus the terminator */
			char *grown;
			m = n + l + 1;
			kroundup32(m);
			grown = realloc(ret, m);
			if (grown == 0) { /* give up and report nothing */
				free(ret);
				ret = 0;
				n = 0;
				break;
			}
			ret = grown;
		}
		memcpy(ret + n, buf, l);
		n += l;
		ret[n] = 0;
	}
	free(buf);
	kurl_close(ku);
	if (len) *len = n;
	return ret;
}
int main (int argc, char * const argv[]) { clock_t start, end; float cpu_time; gzFile read_fp, ref_fp; kseq_t *read_seq, *ref_seq; int32_t l, m, k, match = 2, mismatch = 2, gap_open = 3, gap_extension = 1, path = 0, reverse = 0, n = 5, sam = 0, protein = 0, header = 0, s1 = 67108864, s2 = 128, filter = 0; int8_t* mata = (int8_t*)calloc(25, sizeof(int8_t)); const int8_t* mat = mata; char mat_name[16]; mat_name[0] = '\0'; int8_t* ref_num = (int8_t*)malloc(s1); int8_t* num = (int8_t*)malloc(s2), *num_rc = 0; char* read_rc = 0; static const int8_t mat50[] = { // A R N D C Q E G H I L K M F P S T W Y V B Z X * 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, -2, -1, -1, -5, // A -2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, -3, -3, -1, -1, -3, -1, -3, -1, 0, -1, -5, // R -1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, -4, -2, 1, 0, -4, -2, -3, 5, 0, -1, -5, // N -2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, -5, -1, 0, -1, -5, -3, -4, 6, 1, -1, -5, // D -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -3, -3, -1, -5, // C -1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, -4, -1, 0, -1, -1, -1, -3, 0, 4, -1, -5, // Q -1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, -3, -1, -1, -1, -3, -2, -3, 1, 5, -1, -5, // E 0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, -4, -2, 0, -2, -3, -3, -4, -1, -2, -1, -5, // G -2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, -1, -2, -1, -2, -3, 2, -4, 0, 0, -1, -5, // H -1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 0, -3, -3, -1, -3, -1, 4, -4, -3, -1, -5, // I -2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 1, -4, -3, -1, -2, -1, 1, -4, -3, -1, -5, // L -1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, -4, -1, 0, -1, -3, -2, -3, 0, 1, -1, -5, // K -1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 0, -3, -2, -1, -1, 0, 1, -3, -1, -1, -5, // M -3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 8, -4, -3, -2, 1, 4, -1, -4, -4, -1, -5, // F -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, 
-1, -4, -3, -3, -2, -1, -1, -5, // P 1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, -3, -1, 5, 2, -4, -2, -2, 0, 0, -1, -5, // S 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 2, 5, -3, -2, 0, 0, -1, -1, -5, // T -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 1, -4, -4, -3, 15, 2, -3, -5, -2, -1, -5, // W -2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 4, -3, -2, -2, 2, 8, -1, -3, -2, -1, -5, // Y 0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, -1, -3, -2, 0, -3, -1, 5, -3, -3, -1, -5, // V -2, -1, 5, 6, -3, 0, 1, -1, 0, -4, -4, 0, -3, -4, -2, 0, 0, -5, -3, -3, 6, 1, -1, -5, // B -1, 0, 0, 1, -3, 4, 5, -2, 0, -3, -3, 1, -1, -4, -1, 0, -1, -2, -2, -3, 1, 5, -1, -5, // Z -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, // X -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1 // * }; /* This table is used to transform amino acid letters into numbers. */ int8_t aa_table[128] = { 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23, 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23, 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23, 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23 }; /* This table is used to transform nucleotide letters into numbers. 
*/ int8_t nt_table[128] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; int8_t* table = nt_table; // Parse command line. while ((l = getopt(argc, argv, "m:x:o:e:a:f:pcrsh")) >= 0) { switch (l) { case 'm': match = atoi(optarg); break; case 'x': mismatch = atoi(optarg); break; case 'o': gap_open = atoi(optarg); break; case 'e': gap_extension = atoi(optarg); break; case 'a': strcpy(mat_name, optarg); break; case 'f': filter = atoi(optarg); break; case 'p': protein = 1; break; case 'c': path = 1; break; case 'r': reverse = 1; break; case 's': sam = 1; break; case 'h': header = 1; break; } } if (optind + 2 > argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: ssw_test [options] ... <target.fasta> <query.fasta>(or <query.fastq>)\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "\t-m N\tN is a positive integer for weight match in genome sequence alignment. [default: 2]\n"); fprintf(stderr, "\t-x N\tN is a positive integer. -N will be used as weight mismatch in genome sequence alignment. [default: 2]\n"); fprintf(stderr, "\t-o N\tN is a positive integer. -N will be used as the weight for the gap opening. [default: 3]\n"); fprintf(stderr, "\t-e N\tN is a positive integer. -N will be used as the weight for the gap extension. [default: 1]\n"); fprintf(stderr, "\t-p\tDo protein sequence alignment. Without this option, the ssw_test will do genome sequence alignment.\n"); fprintf(stderr, "\t-a FILE\tFILE is either the Blosum or Pam weight matrix. [default: Blosum50]\n"); fprintf(stderr, "\t-c\tReturn the alignment path.\n"); fprintf(stderr, "\t-f N\tN is a positive integer. 
Only output the alignments with the Smith-Waterman score >= N.\n"); fprintf(stderr, "\t-r\tThe best alignment will be picked between the original read alignment and the reverse complement read alignment.\n"); fprintf(stderr, "\t-s\tOutput in SAM format. [default: no header]\n"); fprintf(stderr, "\t-h\tIf -s is used, include header in SAM output.\n\n"); return 1; } // initialize scoring matrix for genome sequences for (l = k = 0; LIKELY(l < 4); ++l) { for (m = 0; LIKELY(m < 4); ++m) mata[k++] = l == m ? match : -mismatch; /* weight_match : -weight_mismatch */ mata[k++] = 0; // ambiguous base } for (m = 0; LIKELY(m < 5); ++m) mata[k++] = 0; if (protein == 1 && (! strcmp(mat_name, "\0"))) { n = 24; table = aa_table; mat = mat50; } else if (strcmp(mat_name, "\0")) { // Parse score matrix. FILE *f_mat = fopen(mat_name, "r"); char line[128]; mata = (int8_t*)realloc(mata, 1024 * sizeof(int8_t)); k = 0; m = 0; while (fgets(line, 128, f_mat)) { if (line[0] == '*' || (line[0] >= 'A' && line[0] <= 'Z')) { if (line[0] >= 'A' && line[0] <= 'Z') aa_table[(int)line[0]] = aa_table[(int)line[0] + 32] = m; char str[4], *s = str; str[0] = '\0'; l = 1; while (line[l]) { if ((line[l] >= '0' && line[l] <= '9') || line[l] == '-') *s++ = line[l]; else if (str[0] != '\0') { *s = '\0'; mata[k++] = (int8_t)atoi(str); s = str; str[0] = '\0'; } ++l; } if (str[0] != '\0') { *s = '\0'; mata[k++] = (int8_t)atoi(str); s = str; str[0] = '\0'; } ++m; } } if (k == 0) { fprintf(stderr, "Problem of reading the weight matrix file.\n"); return 1; } fclose(f_mat); n = m; table = aa_table; mat = mata; } //fprintf(stderr, "query: %s\n", argv[optind + 1]); read_fp = gzopen(argv[optind + 1], "r"); if (! 
read_fp) { fprintf (stderr, "gzopen of '%s' failed.\n", argv[optind + 1]); exit (EXIT_FAILURE); } read_seq = kseq_init(read_fp); if (sam && header && path) { fprintf(stdout, "@HD\tVN:1.4\tSO:queryname\n"); ref_fp = gzopen(argv[optind], "r"); ref_seq = kseq_init(ref_fp); while ((l = kseq_read(ref_seq)) >= 0) fprintf(stdout, "@SQ\tSN:%s\tLN:%d\n", ref_seq->name.s, (int32_t)ref_seq->seq.l); kseq_destroy(ref_seq); gzclose(ref_fp); } else if (sam && !path) { fprintf(stderr, "SAM format output is only available together with option -c.\n"); sam = 0; } // alignment if (reverse == 1 && n == 5) { read_rc = (char*)malloc(s2); num_rc = (int8_t*)malloc(s2); } start = clock(); while (kseq_read(read_seq) >= 0) { s_profile* p, *p_rc = 0; int32_t readLen = read_seq->seq.l; int32_t maskLen = readLen / 2; while (readLen >= s2) { ++s2; kroundup32(s2); num = (int8_t*)realloc(num, s2); if (reverse == 1 && n == 5) { read_rc = (char*)realloc(read_rc, s2); num_rc = (int8_t*)realloc(num_rc, s2); } } for (m = 0; m < readLen; ++m) num[m] = table[(int)read_seq->seq.s[m]]; p = ssw_init(num, readLen, mat, n, 2); if (reverse == 1 && n == 5) { reverse_comple(read_seq->seq.s, read_rc); for (m = 0; m < readLen; ++m) num_rc[m] = table[(int)read_rc[m]]; p_rc = ssw_init(num_rc, readLen, mat, n, 2); }else if (reverse == 1 && n == 24) { fprintf (stderr, "Reverse complement alignment is not available for protein sequences. 
\n"); return 1; } ref_fp = gzopen(argv[optind], "r"); ref_seq = kseq_init(ref_fp); while (kseq_read(ref_seq) >= 0) { s_align* result, *result_rc = 0; int32_t refLen = ref_seq->seq.l; int8_t flag = 0; while (refLen > s1) { ++s1; kroundup32(s1); ref_num = (int8_t*)realloc(ref_num, s1); } for (m = 0; m < refLen; ++m) ref_num[m] = table[(int)ref_seq->seq.s[m]]; if (path == 1) flag = 2; result = ssw_align (p, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen); if (reverse == 1 && protein == 0) result_rc = ssw_align(p_rc, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen); if (result_rc && result_rc->score1 > result->score1 && result_rc->score1 >= filter) { if (sam) ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 1); else ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 0); }else if (result && result->score1 >= filter){ if (sam) ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 1); else ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 0); } else if (! result) return 1; if (result_rc) align_destroy(result_rc); align_destroy(result); } if(p_rc) init_destroy(p_rc); init_destroy(p); kseq_destroy(ref_seq); gzclose(ref_fp); } end = clock(); cpu_time = ((float) (end - start)) / CLOCKS_PER_SEC; fprintf(stderr, "CPU time: %f seconds\n", cpu_time); if (num_rc) { free(num_rc); free(read_rc); } kseq_destroy(read_seq); gzclose(read_fp); free(num); free(ref_num); free(mata); return 0; }
/*---------------------------------------------------------------------------
 * Samtools compatibility portion
 */

/*
 * Fill the BAM record *bp from the individual alignment fields.
 *
 * The query name is padded with 1..4 NUL bytes so that its stored length is
 * a multiple of four; the call fails with -1 if that length exceeds 255 or
 * if the data buffer cannot be grown.  `pos` and `mpos` are stored minus
 * one.  Bases are packed two per byte through a 256-entry lookup table.
 * When `qual` is NULL the quality bytes are filled with 0xff.  `extra_len`
 * bytes are reserved after the qualities but not written here.
 *
 * Returns the total data length of the record, or -1 on failure.
 */
int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
		      const char *qname, size_t qname_len,
		      int flag,
		      int rname,      // Ref ID
		      int pos,
		      int end,        // aligned start/end coords
		      int mapq,
		      uint32_t ncigar, const uint32_t *cigar,
		      int mrnm,       // Mate Ref ID
		      int mpos,
		      int isize,
		      int len,
		      const char *seq,
		      const char *qual) {
    /* ASCII base letter -> 4-bit code; anything unknown maps to 15 */
    static const char nib[256] = {
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15,
	15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
	15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
	15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
	15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
	15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
    };
    bam1_t *rec = (bam1_t *)*bp;
    uint8_t *out;
    int k, qname_nuls, bam_len;

    qname_nuls = 4 - qname_len%4;
    if (qname_len + qname_nuls > 255) // Check for core.l_qname overflow
	return -1;

    bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len;
    if (rec->m_data < bam_len) {
	rec->m_data = bam_len;
	kroundup32(rec->m_data);
	rec->data = (uint8_t*)realloc(rec->data, rec->m_data);
	if (!rec->data)
	    return -1;
    }
    rec->l_data = bam_len;

    /* fixed-size core */
    rec->core.tid        = rname;
    rec->core.pos        = pos-1;
    rec->core.bin        = bam_reg2bin(pos-1, end);
    rec->core.qual       = mapq;
    rec->core.l_qname    = qname_len+qname_nuls;
    rec->core.l_extranul = qname_nuls-1;
    rec->core.flag       = flag;
    rec->core.n_cigar    = ncigar;
    rec->core.l_qseq     = len;
    rec->core.mtid       = mrnm;
    rec->core.mpos       = mpos-1;
    rec->core.isize      = isize;

    /* query name followed by its NUL padding */
    out = rec->data;
    strncpy((char *)out, qname, qname_len);
    k = 0;
    while (k < qname_nuls) {
	out[qname_len+k] = '\0';
	k++;
    }
    out += qname_len+qname_nuls;

    /* CIGAR */
    if (ncigar > 0)
	memcpy(out, cigar, ncigar*4);
    out += ncigar*4;

    /* bases: two per byte, an odd trailing base goes into the high nibble */
    k = 0;
    while (k+1 < len) {
	*out++ = (nib[(uc)seq[k]]<<4) + nib[(uc)seq[k+1]];
	k += 2;
    }
    if (k < len)
	*out++ = nib[(uc)seq[k]]<<4;

    /* qualities */
    if (!qual)
	memset(out, '\xff', len);
    else
	memcpy(out, qual, len);

    return bam_len;
}
/*
 * Scan a FASTA stream and build its index.
 *
 * For every sequence the name, total length, line length in bytes
 * (line_len), line length in bases (line_blen) and the offset of the first
 * base are passed to fai_insert_index().
 *
 * `state` tracks the line structure of the current sequence:
 *   0  lines so far have the same length as the first one,
 *   1  a header was just read, the first sequence line is expected,
 *   2  a line of different length was seen (only allowed as the last line),
 *   3  another line followed such a line.
 *
 * Returns the index, or NULL after printing a message when the input is
 * malformed.  Changes: a header with an empty name and an input without any
 * header no longer dereference a NULL name buffer, and characters are
 * converted to unsigned char before being classified.
 */
faidx_t *fai_build_core(RAZF *rz)
{
	char c, *name;
	int l_name, m_name, ret;
	int len, line_len, line_blen, state;
	int l1, l2;
	faidx_t *idx;
	uint64_t offset;

	idx = (faidx_t*)calloc(1, sizeof(faidx_t));
	idx->hash = kh_init(s);
	name = 0; l_name = m_name = 0;
	len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
	while (razf_read(rz, &c, 1)) {
		if (c == '\n') { // an empty line
			if (state == 1) {
				offset = razf_tell(rz);
				continue;
			} else if ((state == 0 && len < 0) || state == 2) continue;
		}
		if (c == '>') { // fasta header
			if (len >= 0)
				fai_insert_index(idx, name, len, line_len, line_blen, offset);
			l_name = 0;
			while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace((unsigned char)c)) {
				if (m_name < l_name + 2) {
					m_name = l_name + 2;
					kroundup32(m_name);
					name = (char*)realloc(name, m_name);
				}
				name[l_name++] = c;
			}
			// An empty name never entered the loop body, so the buffer may
			// not exist yet; make room for the terminator.
			if (m_name < l_name + 1) {
				m_name = l_name + 1;
				kroundup32(m_name);
				name = (char*)realloc(name, m_name);
			}
			name[l_name] = '\0';
			if (ret == 0) {
				fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
				free(name); fai_destroy(idx);
				return 0;
			}
			if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
			state = 1; len = 0;
			offset = razf_tell(rz);
		} else {
			if (state == 3) {
				fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
				free(name); fai_destroy(idx);
				return 0;
			}
			if (state == 2) state = 3;
			l1 = l2 = 0;
			do {
				++l1;
				if (isgraph((unsigned char)c)) ++l2;
			} while ((ret = razf_read(rz, &c, 1)) && c != '\n');
			if (state == 3 && l2) {
				fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
				free(name); fai_destroy(idx);
				return 0;
			}
			++l1; len += l2;
			if (l2 >= 0x10000) {
				fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
				free(name); fai_destroy(idx);
				return 0;
			}
			if (state == 1) line_len = l1, line_blen = l2, state = 0;
			else if (state == 0) {
				if (l1 != line_len || l2 != line_blen) state = 2;
			}
		}
	}
	if (name == 0) { // no header line at all (e.g. empty input)
		fprintf(stderr, "[fai_build_core] no sequence header found\n");
		fai_destroy(idx);
		return 0;
	}
	fai_insert_index(idx, name, len, line_len, line_blen, offset);
	free(name);
	return idx;
}
int main_bam2fq(int argc, char *argv[]) { BGZF *fp, *fpse = 0; bam1_t *b; uint8_t *buf; int max_buf, c, has12 = 0; kstring_t str; int64_t n_singletons = 0, n_reads = 0; char last[512], *fnse = 0; while ((c = getopt(argc, argv, "as:")) > 0) if (c == 'a') has12 = 1; else if (c == 's') fnse = optarg; if (argc == optind) { fprintf(stderr, "\nUsage: bam2fq [-a] [-s outSE] <in.bam>\n\n"); fprintf(stderr, "Options: -a append /1 and /2 to the read name\n"); fprintf(stderr, " -s FILE write singleton reads to FILE [assume single-end]\n"); fprintf(stderr, "\n"); return 1; } fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r"); assert(fp); bam_hdr_destroy(bam_hdr_read(fp)); buf = 0; max_buf = 0; str.l = str.m = 0; str.s = 0; last[0] = 0; if (fnse) fpse = bgzf_open(fnse, "w1"); b = bam_init1(); while (bam_read1(fp, b) >= 0) { int i, qlen = b->core.l_qseq, is_print = 0; uint8_t *qual, *seq; if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments ++n_reads; if (fpse) { if (str.l && strcmp(last, bam_get_qname(b))) { bgzf_write(fpse, str.s, str.l); str.l = 0; ++n_singletons; } if (str.l) is_print = 1; strcpy(last, bam_get_qname(b)); } else is_print = 1; qual = bam_get_qual(b); kputc(qual[0] == 0xff? 
'>' : '@', &str); kputsn(bam_get_qname(b), b->core.l_qname - 1, &str); if (has12) { kputc('/', &str); kputw(b->core.flag>>6&3, &str); } kputc('\n', &str); if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf); buf = (uint8_t*)realloc(buf, max_buf); } buf[qlen] = 0; seq = bam_get_seq(b); for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence if (bam_is_rev(b)) { // reverse complement for (i = 0; i < qlen>>1; ++i) { int8_t t = seq_comp_table[buf[qlen - 1 - i]]; buf[qlen - 1 - i] = seq_comp_table[buf[i]]; buf[i] = t; } if (qlen&1) buf[i] = seq_comp_table[buf[i]]; } for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]]; kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (qual[0] != 0xff) { kputsn("+\n", 2, &str); for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i]; if (bam_is_rev(b)) { // reverse for (i = 0; i < qlen>>1; ++i) { uint8_t t = buf[qlen - 1 - i]; buf[qlen - 1 - i] = buf[i]; buf[i] = t; } } } kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (is_print) { fwrite(str.s, 1, str.l, stdout); str.l = 0; } } if (fpse) { if (str.l) { bgzf_write(fpse, str.s, str.l); ++n_singletons; } fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons); bgzf_close(fpse); } fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads); free(buf); free(str.s); bam_destroy1(b); bgzf_close(fp); return 0; }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int sort_type) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; char *fnout = 0; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = realloc(buf, max_k * sizeof(void*)); memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; if (b->data_len < b->m_data>>2) { // shrink b->m_data = b->data_len; kroundup32(b->m_data); b->data = realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. 
Continue anyway.\n"); // output file name fnout = calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); // write the final output if (n_files == 0) { // a single block char mode[8]; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); sort_aux_core(k, buf, sort_type); #ifndef _PBGZF_USE write_buffer(fnout, mode, k, buf, header, n_threads); #else write_buffer(fnout, mode, k, buf, header); #endif } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } #ifndef _PBGZF_USE bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); #else bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, level); #endif for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } free(fnout); // free for (k = 0; k < max_k; ++k) { if (!buf[k]) continue; free(buf[k]->data); free(buf[k]); } free(buf); bam_header_destroy(header); bam_close(fp); }
static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { bam_lplbuf_t *tv = (bam_lplbuf_t*)data; freenode_t *p; int i, l, max_level; // allocate memory if necessary if (tv->max < n) { // enlarge tv->max = n; kroundup32(tv->max); tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); } tv->n_cur = n; // update cnt for (p = tv->head; p->next; p = p->next) if (p->cnt > 0) --p->cnt; // calculate cur_level[] max_level = 0; for (i = l = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; if (p->is_head) { if (tv->head->next && tv->head->cnt == 0) { // then take a free slot freenode_t *p = tv->head->next; tv->cur_level[i] = tv->head->level; mp_free(tv->mp, tv->head); tv->head = p; --tv->n_nodes; } else tv->cur_level[i] = ++tv->max_level; } else { tv->cur_level[i] = tv->pre_level[l++]; if (p->is_tail) { // then return a free slot tv->tail->level = tv->cur_level[i]; tv->tail->next = mp_alloc(tv->mp); tv->tail = tv->tail->next; ++tv->n_nodes; } } if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; ((bam_pileup1_t*)p)->level = tv->cur_level[i]; } assert(l == tv->n_pre); tv->func(tid, pos, n, pl, tv->user_data); // sort the linked list if (tv->n_nodes) { freenode_t *q; if (tv->n_nodes + 1 > tv->m_aux) { // enlarge tv->m_aux = tv->n_nodes + 1; kroundup32(tv->m_aux); tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); } for (p = tv->head, i = l = 0; p->next;) { if (p->level > max_level) { // then discard this entry q = p->next; mp_free(tv->mp, p); p = q; } else { tv->aux[i++] = p; p = p->next; } } tv->aux[i] = tv->tail; // add a proper tail for the loop below tv->n_nodes = i; if (tv->n_nodes) { ks_introsort(node, tv->n_nodes, tv->aux); for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; tv->head = tv->aux[0]; } else tv->head = tv->tail; } // clean up tv->max_level = max_level; memcpy(tv->pre_level, tv->cur_level, 
tv->n_cur * 4); // squeeze out terminated levels for (i = l = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; if (!p->is_tail) tv->pre_level[l++] = tv->pre_level[i]; } tv->n_pre = l; /* fprintf(stderr, "%d\t", pos+1); for (i = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; if (p->is_head) fprintf(stderr, "^"); if (p->is_tail) fprintf(stderr, "$"); fprintf(stderr, "%d,", p->level); } fprintf(stderr, "\n"); */ return 0; }
/*
 * Banded dynamic-programming alignment of `read` against `ref`, followed by
 * a traceback that produces the alignment as a cigar.
 *
 * The matrix is filled only for reference columns within band_width of the
 * diagonal. After each fill the band width is doubled, and the fill is
 * repeated while the best cell score seen so far (max) is below `score`.
 *
 * ref, read     sequences; ref[j] and read[i] are used as indices into mat
 * refLen        number of reference positions (columns j)
 * readLen       number of read positions (rows i)
 * score         target score: filling stops once max >= score
 * weight_gapO   gap-open penalty (subtracted)
 * weight_gapE   gap-extension penalty (subtracted)
 * band_width    initial band half-width; only the local copy is changed
 * mat           substitution scores, read as mat[ref[j] * n + read[i]]
 * n             row stride of mat
 *
 * Returns a malloc'ed cigar whose seq is a malloc'ed array of `length`
 * values built by to_cigar_int() (defined elsewhere), in forward order.
 * Returns 0 after releasing all buffers when the traceback meets an unknown
 * direction code. Calls exit(1) if the direction-buffer size turns negative.
 * NOTE(review): none of the malloc/realloc results are checked.
 */
static cigar* banded_sw (const int8_t* ref, const int8_t* read, int32_t refLen, int32_t readLen, int32_t score, const uint32_t weight_gapO, /* will be used as - */ const uint32_t weight_gapE, /* will be used as - */ int32_t band_width, const int8_t* mat, /* pointer to the weight matrix */ int32_t n) {
    /* c: cigar under construction (capacity s); c1: reversed copy that is returned */
    uint32_t *c = (uint32_t*)malloc(16 * sizeof(uint32_t)), *c1;
    /* s, s1, s2 are the current capacities of c, of the three row buffers, and of direction */
    int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, l, max = 0;
    int64_t s2 = 1024;
    char op, prev_op;
    /* h_b / e_b: H and E values of the previous row; h_c: H values of the current row */
    int32_t width, width_d, *h_b, *e_b, *h_c;
    /* direction: traceback codes, 3 bytes per band cell; direction_line: start of one row */
    int8_t *direction, *direction_line;
    cigar* result = (cigar*)malloc(sizeof(cigar));
    h_b = (int32_t*)malloc(s1 * sizeof(int32_t));
    e_b = (int32_t*)malloc(s1 * sizeof(int32_t));
    h_c = (int32_t*)malloc(s1 * sizeof(int32_t));
    direction = (int8_t*)malloc(s2 * sizeof(int8_t));

    do {
        /* width: row-buffer cells needed; width_d: direction cells per row */
        width = band_width * 2 + 3, width_d = band_width * 2 + 1;
        /* grow the row buffers until they hold `width` entries */
        while (width >= s1) {
            ++s1;
            kroundup32(s1);
            h_b = (int32_t*)realloc(h_b, s1 * sizeof(int32_t));
            e_b = (int32_t*)realloc(e_b, s1 * sizeof(int32_t));
            h_c = (int32_t*)realloc(h_c, s1 * sizeof(int32_t));
        }
        /* grow the direction buffer to width_d * readLen * 3 bytes.
           NOTE(review): the product is evaluated in int32_t before being
           compared with the int64_t s2, so it can overflow before the
           s2 < 0 guard below has a chance to fire. */
        while (width_d * readLen * 3 >= s2) {
            ++s2;
            kroundup32(s2);
            if (s2 < 0) {
                fprintf(stderr, "Alignment score and position are not consensus.\n");
                exit(1);
            }
            direction = (int8_t*)realloc(direction, s2 * sizeof(int8_t));
        }
        direction_line = direction;
        for (j = 1; LIKELY(j < width - 1); j ++) h_b[j] = 0;
        for (i = 0; LIKELY(i < readLen); i ++) {
            int32_t beg = 0, end = refLen - 1, u = 0, edge;
            j = i - band_width; beg = beg > j ? beg : j; // band start
            j = i + band_width; end = end < j ? end : j; // band end
            edge = end + 1 < width - 1 ? end + 1 : width - 1;
            /* zero the running F value and the boundary cells of the row buffers */
            f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0;
            direction_line = direction + width_d * i * 3;

            for (j = beg; LIKELY(j <= end); j ++) {
                int32_t b, e1, f1, d, de, df, dh;
                /* set_u / set_d are macros defined elsewhere. Presumably set_u
                   maps (row, column) to a band-relative row-buffer index and
                   set_d to a direction index for the plane given by its last
                   argument (0 = E, 1 = F, 2 = H) -- confirm against the macros.
                   u: this cell, e: cell above, b: cell to the left, d: diagonal. */
                set_u(u, band_width, i, j); set_u(e, band_width, i - 1, j);
                set_u(b, band_width, i, j - 1); set_u(d, band_width, i - 1, j - 1);
                set_d(de, band_width, i, j, 0);
                set_d(df, band_width, i, j, 1);
                set_d(dh, band_width, i, j, 2);

                /* E: open a gap from H above (code 3) or extend E above (code 2) */
                temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO;
                temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE;
                e_b[u] = temp1 > temp2 ? temp1 : temp2;
                direction_line[de] = temp1 > temp2 ? 3 : 2;

                /* F: open a gap from H on the left (code 5) or extend F (code 4) */
                temp1 = h_c[b] - weight_gapO;
                temp2 = f - weight_gapE;
                f = temp1 > temp2 ? temp1 : temp2;
                direction_line[df] = temp1 > temp2 ? 5 : 4;

                /* H: best of E and F (each clamped at 0) versus the diagonal move */
                e1 = e_b[u] > 0 ? e_b[u] : 0;
                f1 = f > 0 ? f : 0;
                temp1 = e1 > f1 ? e1 : f1;
                temp2 = h_b[d] + mat[ref[j] * n + read[i]];
                h_c[u] = temp1 > temp2 ? temp1 : temp2;

                if (h_c[u] > max) max = h_c[u];

                /* code 1 = diagonal; otherwise H inherits the E or F code */
                if (temp1 <= temp2) direction_line[dh] = 1;
                else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df];
            }
            /* the current row becomes the previous row for the next i */
            for (j = 1; j <= u; j ++) h_b[j] = h_c[j];
        }
        band_width *= 2;
    } while (LIKELY(max < score));
    /* undo the final doubling so band_width matches the fill that was kept */
    band_width /= 2;

    // trace back
    /* direction_line still points at the last filled row (i = readLen - 1) */
    i = readLen - 1;
    j = refLen - 1;
    e = 0;  // Count the number of M, D or I.
    l = 0;  // record length of current cigar
    op = prev_op = 'M';
    temp2 = 2;  // h
    while (LIKELY(i > 0)) {
        /* temp2 selects the plane to read next: 0 = E, 1 = F, 2 = H */
        set_d(temp1, band_width, i, j, temp2);
        switch (direction_line[temp1]) {
            case 1: /* diagonal: consume one read and one reference position */
                --i;
                --j;
                temp2 = 2;
                direction_line -= width_d * 3;
                op = 'M';
                break;
            case 2: /* E extended: move up one row and stay in E */
                --i;
                temp2 = 0;  // e
                direction_line -= width_d * 3;
                op = 'I';
                break;
            case 3: /* E opened from H: move up one row and return to H */
                --i;
                temp2 = 2;
                direction_line -= width_d * 3;
                op = 'I';
                break;
            case 4: /* F extended: move left and stay in F */
                --j;
                temp2 = 1;
                op = 'D';
                break;
            case 5: /* F opened from H: move left and return to H */
                --j;
                temp2 = 2;
                op = 'D';
                break;
            default:
                /* NOTE(review): the message prints the cell at temp1 - 1, not
                   the cell at temp1 that was just tested -- confirm intent. */
                fprintf(stderr, "Trace back error: %d.\n", direction_line[temp1 - 1]);
                free(direction);
                free(h_c);
                free(e_b);
                free(h_b);
                free(c);
                free(result);
                return 0;
        }
        if (op == prev_op) ++e;
        else {
            /* the operation changed: emit the finished run of prev_op */
            ++l;
            while (l >= s) {
                ++s;
                kroundup32(s);
                c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
            }
            c[l - 1] = to_cigar_int(e, prev_op);
            prev_op = op;
            e = 1;
        }
    }
    /* The loop stops once i reaches 0, so that row is never stepped through.
       The extra match below (e + 1, or a separate 1M) presumably accounts
       for it -- confirm. */
    if (op == 'M') {
        ++l;
        while (l >= s) {
            ++s;
            kroundup32(s);
            c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
        }
        c[l - 1] = to_cigar_int(e + 1, op);
    }else {
        l += 2;
        while (l >= s) {
            ++s;
            kroundup32(s);
            c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
        }
        c[l - 2] = to_cigar_int(e, op);
        c[l - 1] = to_cigar_int(1, 'M');
    }

    // reverse cigar
    /* the traceback emitted operations from the end of the alignment backwards;
       s and e are reused here as the front and back indices */
    c1 = (uint32_t*)malloc(l * sizeof(uint32_t));
    s = 0;
    e = l - 1;
    while (LIKELY(s <= e)) {
        c1[s] = c[e];
        c1[e] = c[s];
        ++ s;
        -- e;
    }
    result->seq = c1;
    result->length = l;

    free(direction);
    free(h_c);
    free(e_b);
    free(h_b);
    free(c);
    return result;
}
/*
 * Build a FASTA index by scanning a BGZF stream one byte at a time.
 *
 * For every '>' header the sequence name is the run of characters up to the
 * first whitespace; the rest of the header line is skipped. For each
 * sequence the function records its length (characters for which isgraph()
 * is true), the length of its first line with and without the non-graphic
 * characters and the newline, and the stream offset (bgzf_utell) of its
 * first base. Each finished record is handed to fai_insert_index().
 *
 * `state` tracks line-length consistency inside one sequence:
 *   1  header just read; the next line defines line_len / line_blen
 *   0  inside a sequence whose lines so far all match (also the initial state)
 *   2  a line with a different length was seen; it must be the last one
 *   3  a further line arrived after state 2
 *
 * Returns the new index (release it with fai_destroy(), as the error paths
 * here do), or NULL when
 *   - memory cannot be allocated,
 *   - the last header is cut off by end of input,
 *   - a sequence has lines of different length or an inlined empty line,
 *   - the input contains no header and no sequence characters.
 *
 * NOTE(review): sequence characters that precede the first '>' are still
 * counted against an empty name, as before.
 */
faidx_t *fai_build_core(BGZF *bgzf)
{
    char *name;
    int c;
    int l_name, m_name;
    int line_len, line_blen, state;
    int l1, l2;
    faidx_t *idx;
    uint64_t offset;
    int64_t len;

    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
    if (idx == NULL) return NULL;
    name = (char*)calloc(1, sizeof(char)); /* at least 1 byte, for '\0' */
    idx->hash = kh_init(s);
    if (name == NULL) {
        fai_destroy(idx);
        return NULL;
    }
    l_name = m_name = 0;
    len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;

    while ( (c=bgzf_getc(bgzf))>=0 ) {
        if (c == '\n') { // an empty line
            if (state == 1) {
                // blank line right after a header: the sequence starts later
                offset = bgzf_utell(bgzf);
                continue;
            } else if ((state == 0 && len < 0) || state == 2) continue;
        }
        if (c == '>') { // fasta header
            // flush the record of the sequence that just ended
            if (len >= 0)
                fai_insert_index(idx, name, len, line_len, line_blen, offset);
            l_name = 0;
            while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
                if (m_name < l_name + 2) {
                    char *grown;
                    m_name = l_name + 2;
                    kroundup32(m_name);
                    // keep the old buffer reachable if realloc fails
                    grown = (char*)realloc(name, m_name);
                    if (grown == NULL) {
                        fprintf(stderr, "[fai_build_core] out of memory\n");
                        free(name); fai_destroy(idx);
                        return 0;
                    }
                    name = grown;
                }
                name[l_name++] = c;
            }
            name[l_name] = '\0';
            if ( c<0 ) {
                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
                free(name); fai_destroy(idx);
                return 0;
            }
            // skip the remainder of the header line
            if (c != '\n') {
                while ( (c=bgzf_getc(bgzf))>=0 && c != '\n')
                    ;
            }
            state = 1; len = 0;
            offset = bgzf_utell(bgzf);
        } else {
            if (state == 3) {
                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
                free(name); fai_destroy(idx);
                return 0;
            }
            if (state == 2) state = 3;
            // l1 counts every character of the line, l2 only the graphic ones
            l1 = l2 = 0;
            do {
                ++l1;
                if (isgraph(c)) ++l2;
            } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
            if (state == 3 && l2) {
                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
                free(name); fai_destroy(idx);
                return 0;
            }
            ++l1; len += l2; // ++l1 accounts for the line terminator
            if (state == 1) line_len = l1, line_blen = l2, state = 0;
            else if (state == 0) {
                if (l1 != line_len || l2 != line_blen) state = 2;
            }
        }
    }

    // len is still negative: no header was read and no sequence characters
    // were counted, so there is nothing to index. Release everything through
    // fai_destroy(), as the other error paths do, so the hash is not leaked.
    if (len < 0) {
        free(name);
        fai_destroy(idx);
        return NULL;
    }
    fai_insert_index(idx, name, len, line_len, line_blen, offset);
    free(name);
    return idx;
}