/*
 * Record, for every FASTQ header found in `fastq_file`, the file position
 * reported by bgzf_tell() right after the leading '@' was consumed.
 *
 * table      - output array; the slot written is the index returned by
 *              cmph_search() for the header line
 * hash       - hash function used to map a header line to a table slot
 * fastq_file - input stream, scanned from its current position to EOF
 *
 * The scan stops at end of file, on a bgzf_tell() failure, or when
 * read_one_line() does not return 1.
 */
void populate_index(uint64_t *table, cmph_t *hash, BGZF *fastq_file)
{
    while (1) {
        /*
         * Find '@'. bgzf_getc() returns an int so that its negative
         * EOF/error codes stay distinguishable from byte values; keeping
         * the result in a plain char loses that (a 0xFF byte looks like
         * EOF, or EOF is never seen when char is unsigned).
         */
        int c;
        while ((c = bgzf_getc(fastq_file)) != '@' && c >= 0) {
        }
        if (c < 0) {
            break; /* EOF or read error: no further records */
        }

        /* 64-bit on purpose: `long` may be too narrow for a file offset. */
        int64_t pos = bgzf_tell(fastq_file);
        if (pos == -1) {
            break;
        }

        char *accession = NULL;
        cmph_uint32 accession_length;
        if (read_one_line(&accession, &accession_length, fastq_file) != 1) {
            break;
        }

        /* Save the position taken just after '@' under this header's slot. */
        unsigned int id = cmph_search(hash, accession, accession_length);
        table[id] = (uint64_t) pos;

        /*
         * NOTE(review): `accession` is passed in as NULL and never released
         * here. If read_one_line() allocates the buffer, this leaks once per
         * record - confirm ownership before adding a free().
         */
    }
}
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) { int l; char c; khiter_t iter; faidx1_t val; char *seq=NULL; // Adjust position iter = kh_get(s, fai->hash, c_name); if(iter == kh_end(fai->hash)) return 0; val = kh_value(fai->hash, iter); if(p_end_i < p_beg_i) p_beg_i = p_end_i; if(p_beg_i < 0) p_beg_i = 0; else if(val.len <= p_beg_i) p_beg_i = val.len - 1; if(p_end_i < 0) p_end_i = 0; else if(val.len <= p_end_i) p_end_i = val.len - 1; // Now retrieve the sequence int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); if ( ret<0 ) { *len = -1; fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); return NULL; } l = 0; seq = (char*)malloc(p_end_i - p_beg_i + 2); while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1) if (isgraph(c)) seq[l++] = c; seq[l] = '\0'; *len = l; return seq; }
/*
 * Key-reader callback: advance `data` (a BGZF stream) to the next FASTQ
 * header and hand the header line back as the key.
 *
 * data   - the BGZF stream, passed as an opaque pointer
 * key    - out: set to NULL, then filled in by read_one_line()
 * keylen - out: set to 0, then filled in by read_one_line()
 *
 * Returns the key length on success, or -1 when no further header can be
 * read (EOF, read error, or read_one_line() not returning 1).
 */
int key_fastq_read(void *data, char **key, cmph_uint32 *keylen)
{
    BGZF *fp = (BGZF *) data;
    int c; /* int, not char: bgzf_getc() signals EOF/error with negative values */

    *keylen = 0;
    *key = NULL;

    /* Find header start */
    while ((c = bgzf_getc(fp)) != '@' && c >= 0) {
    }
    if (c < 0) {
        return -1; /* ran out of input before finding another '@' */
    }

    if (read_one_line(key, keylen, fp) == 1) {
        return (int) *keylen;
    }
    return -1;
}
/*
 * fgets()-style line reader for a BGZF stream.
 *
 * Reads at most buffer_size - 1 characters into `buffer`, stopping after a
 * newline (which is stored), at a NUL byte, or at EOF / read error. The
 * result is always NUL-terminated when buffer_size > 0.
 *
 * Returns `buffer` when at least one character was stored, NULL otherwise
 * (including when `buffer` is NULL or buffer_size is 0). A final line that
 * lacks a trailing newline is returned rather than dropped.
 */
char * bgzf_fgets(char *buffer, size_t buffer_size, BGZF *fp)
{
    size_t i = 0;

    /* buffer_size - 1 would wrap around to SIZE_MAX for a zero-sized buffer. */
    if (buffer == NULL || buffer_size == 0) {
        return NULL;
    }

    while (i < buffer_size - 1) {
        /* int, not char: negative return codes must not collide with 0xFF. */
        int c = bgzf_getc(fp);
        if (c <= 0) {
            break; /* EOF, error, or NUL byte ends the line */
        }
        buffer[i++] = (char) c;
        if (c == '\n') {
            break;
        }
    }
    buffer[i] = '\0';

    return i > 0 ? buffer : NULL;
}
/*
 * Scan a FASTA stream and build an in-memory index of its sequences.
 *
 * For every '>' header the name (up to the first whitespace), the number of
 * printable characters in the sequence, the line geometry (bytes per line
 * and bases per line, taken from the first sequence line) and the
 * uncompressed offset of the first sequence byte are passed to
 * fai_insert_index().
 *
 * `state` tracks the line layout of the current sequence:
 *   1 - header just read; the next line defines line_len / line_blen
 *   0 - lines so far match line_len / line_blen
 *   2 - one line with a different length has been seen
 *   3 - a further line followed the differing one; more data is an error
 *
 * Returns the new index, or NULL on malformed input or allocation failure.
 */
faidx_t *fai_build_core(BGZF *bgzf)
{
    char *name;
    int c;
    int l_name, m_name;
    int line_len, line_blen, state;
    int l1, l2;
    faidx_t *idx;
    uint64_t offset;
    int64_t len;

    idx = (faidx_t *) calloc(1, sizeof(faidx_t));
    if (idx == NULL) return NULL;
    name = (char *) calloc(1, sizeof(char)); /* at least 1 byte, for '\0' */
    if (name == NULL) {
        free(idx);
        return NULL;
    }
    idx->hash = kh_init(s);
    if (idx->hash == NULL) {
        free(name);
        free(idx);
        return NULL;
    }

    l_name = m_name = 0;
    line_len = line_blen = -1;
    len = -1; /* -1 means "no header seen yet" */
    state = 0;
    l1 = l2 = -1;
    offset = 0;

    while ((c = bgzf_getc(bgzf)) >= 0) {
        if (c == '\n') { /* an empty line */
            if (state == 1) {
                /* Blank line right after the header: sequence starts later. */
                offset = bgzf_utell(bgzf);
                continue;
            } else if ((state == 0 && len < 0) || state == 2) {
                continue;
            }
            /* Otherwise fall through and treat it as a sequence line. */
        }
        if (c == '>') { /* fasta header */
            /* Flush the sequence that just ended, if there was one. */
            if (len >= 0)
                fai_insert_index(idx, name, len, line_len, line_blen, offset);

            l_name = 0;
            while ((c = bgzf_getc(bgzf)) >= 0 && !isspace(c)) {
                if (m_name < l_name + 2) {
                    char *grown;
                    m_name = l_name + 2;
                    kroundup32(m_name);
                    /* Keep the old pointer until realloc is known to work. */
                    grown = (char *) realloc(name, m_name);
                    if (grown == NULL) {
                        fprintf(stderr, "[fai_build_core] out of memory\n");
                        free(name);
                        fai_destroy(idx);
                        return NULL;
                    }
                    name = grown;
                }
                name[l_name++] = (char) c;
            }
            name[l_name] = '\0';
            if (c < 0) {
                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
                free(name);
                fai_destroy(idx);
                return 0;
            }
            /* Skip the rest of the header line (the description). */
            if (c != '\n') {
                while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n')
                    ;
            }
            state = 1;
            len = 0;
            offset = bgzf_utell(bgzf);
        } else {
            if (state == 3) {
                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
                free(name);
                fai_destroy(idx);
                return 0;
            }
            if (state == 2) state = 3;

            /* l1 counts every byte on the line, l2 only printable ones. */
            l1 = l2 = 0;
            do {
                ++l1;
                if (isgraph(c)) ++l2;
            } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');

            if (state == 3 && l2) {
                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
                free(name);
                fai_destroy(idx);
                return 0;
            }
            ++l1; /* account for the line terminator */
            len += l2;
            if (state == 1) {
                line_len = l1;
                line_blen = l2;
                state = 0;
            } else if (state == 0) {
                if (l1 != line_len || l2 != line_blen) state = 2;
            }
        }
    }

    /*
     * Flush the last sequence. `name` is always non-NULL here, so this call
     * is unconditional, as it effectively was before.
     * NOTE(review): for input with no header at all this still inserts an
     * entry with an empty name and len == -1 - confirm whether callers
     * expect NULL instead.
     */
    fai_insert_index(idx, name, len, line_len, line_blen, offset);
    free(name);
    return idx;
}
char *fai_fetch(const faidx_t *fai, const char *str, int *len) { char *s; int c, i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; int beg, end; beg = end = -1; h = fai->hash; name_end = l = strlen(str); s = (char*)malloc(l+1); // remove space for (i = k = 0; i < l; ++i) if (!isspace(str[i])) s[k++] = str[i]; s[k] = 0; l = k; // determine the sequence name for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end if (i >= 0) name_end = i; if (name_end < l) { // check if this is really the end int n_hyphen = 0; for (i = name_end + 1; i < l; ++i) { if (s[i] == '-') ++n_hyphen; else if (!isdigit(s[i]) && s[i] != ',') break; } if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name s[name_end] = 0; iter = kh_get(s, h, s); if (iter == kh_end(h)) { // cannot find the sequence name iter = kh_get(s, h, str); // try str as the name if (iter == kh_end(h)) { *len = 0; free(s); return 0; } else s[name_end] = ':', name_end = l; } } else iter = kh_get(s, h, str); if(iter == kh_end(h)) { fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); free(s); *len = -2; return 0; }; val = kh_value(h, iter); // parse the interval if (name_end < l) { for (i = k = name_end + 1; i < l; ++i) if (s[i] != ',') s[k++] = s[i]; s[k] = 0; beg = atoi(s + name_end + 1); for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; end = i < k? atoi(s + i + 1) : val.len; if (beg > 0) --beg; } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; free(s); // now retrieve the sequence int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); if ( ret<0 ) { *len = -1; fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. 
(Seeking in a compressed, .gzi unindexed, file?)\n"); return NULL; } l = 0; s = (char*)malloc(end - beg + 2); while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg ) if (isgraph(c)) s[l++] = c; s[l] = '\0'; *len = l; return s; }
/*
 * OCaml stub: read one character from the BGZF handle wrapped in `bgzf`
 * and return bgzf_getc()'s result as an OCaml int.
 */
value caml_bgzf_getc(value bgzf)
{
    CAMLparam1(bgzf);

    BGZF *stream = BGZF_val(bgzf);
    int next_char = bgzf_getc(stream);

    CAMLreturn(Val_int(next_char));
}