/**
 * Read a model definition from a file.
 *
 * The file is first offered to bin_mdef_read_text(); if that returns
 * non-NULL its result is returned directly.  Otherwise the file is
 * opened as a binary model definition: a byte-order marker, a format
 * version, a header length followed by a format descriptor (skipped),
 * ten 32-bit header fields, and then the bulk data (CI phone names,
 * CD tree, phone table, senone sequences).
 *
 * The bulk data is either memory-mapped (alloc_mode BIN_MDEF_ON_DISK)
 * or read into one heap buffer (alloc_mode BIN_MDEF_IN_MEMORY); the
 * ciname, cd_tree, phone and sseq pointers all point into that region.
 * Memory mapping is used when "-mmap" is set in config (or config is
 * NULL), the file has native byte order, and mmio_file_read() succeeds.
 *
 * @param config   Configuration consulted for "-mmap"; may be NULL.
 * @param filename Path of the model definition file.
 * @return A new bin_mdef_t with refcnt set to 1, or NULL if the file
 *         cannot be opened, a header field cannot be read, the format
 *         version is newer than BIN_MDEF_FORMAT_VERSION, or the header
 *         is malformed.  A short read of the bulk data is reported
 *         with E_FATAL.
 */
bin_mdef_t *
bin_mdef_read(cmd_ln_t *config, const char *filename)
{
    bin_mdef_t *m;
    FILE *fh;
    size_t tree_start;
    int32 val, i, swap, pos, end;
    int32 *sseq_size;
    int do_mmap;

    /* Try to read it as text first. */
    if ((m = bin_mdef_read_text(config, filename)) != NULL)
        return m;

    E_INFO("Reading binary model definition: %s\n", filename);
    if ((fh = fopen(filename, "rb")) == NULL)
        return NULL;

    /* Byte-order marker: tells us whether every field needs swapping. */
    if (fread(&val, 4, 1, fh) != 1) {
        fclose(fh);
        E_ERROR_SYSTEM("Failed to read byte-order marker from %s\n",
                       filename);
        return NULL;
    }
    swap = 0;
    if (val == BIN_MDEF_OTHER_ENDIAN) {
        swap = 1;
        E_INFO("Must byte-swap %s\n", filename);
    }

    /* Format version: refuse files newer than this library. */
    if (fread(&val, 4, 1, fh) != 1) {
        fclose(fh);
        E_ERROR_SYSTEM("Failed to read version from %s\n", filename);
        return NULL;
    }
    if (swap)
        SWAP_INT32(&val);
    if (val > BIN_MDEF_FORMAT_VERSION) {
        E_ERROR("File format version %d for %s is newer than library\n",
                val, filename);
        fclose(fh);
        return NULL;
    }

    /* Length of the format descriptor that follows. */
    if (fread(&val, 4, 1, fh) != 1) {
        fclose(fh);
        E_ERROR_SYSTEM("Failed to read header length from %s\n", filename);
        return NULL;
    }
    if (swap)
        SWAP_INT32(&val);

    /* Skip format descriptor.  The length comes from the file, so a
     * negative value (which would seek backwards) is rejected and the
     * seek itself is checked. */
    if (val < 0) {
        E_ERROR("Invalid header length %d in %s\n", val, filename);
        fclose(fh);
        return NULL;
    }
    if (fseek(fh, val, SEEK_CUR) != 0) {
        E_ERROR_SYSTEM("Failed to skip format descriptor in %s\n", filename);
        fclose(fh);
        return NULL;
    }

    /* Finally allocate it. */
    m = ckd_calloc(1, sizeof(*m));
    m->refcnt = 1;

    /* Check these, to make gcc/glibc shut up. */
#define FREAD_SWAP32_CHK(dest) \
    if (fread((dest), 4, 1, fh) != 1) { \
        fclose(fh); \
        ckd_free(m); \
        E_ERROR_SYSTEM("Failed to read %s from %s\n", #dest, filename); \
        return NULL; \
    } \
    if (swap) SWAP_INT32(dest);

    FREAD_SWAP32_CHK(&m->n_ciphone);
    FREAD_SWAP32_CHK(&m->n_phone);
    FREAD_SWAP32_CHK(&m->n_emit_state);
    FREAD_SWAP32_CHK(&m->n_ci_sen);
    FREAD_SWAP32_CHK(&m->n_sen);
    FREAD_SWAP32_CHK(&m->n_tmat);
    FREAD_SWAP32_CHK(&m->n_sseq);
    FREAD_SWAP32_CHK(&m->n_ctx);
    FREAD_SWAP32_CHK(&m->n_cd_tree);
    FREAD_SWAP32_CHK(&m->sil);

    /* ciname[0] and sseq[0] are written unconditionally below, so both
     * tables need at least one entry. */
    if (m->n_ciphone < 1 || m->n_sseq < 1) {
        E_ERROR("Invalid CI phone count %d or senone sequence count %d in %s\n",
                m->n_ciphone, m->n_sseq, filename);
        ckd_free(m);
        fclose(fh);
        return NULL;
    }

    /* CI names are first in the file. */
    m->ciname = ckd_calloc(m->n_ciphone, sizeof(*m->ciname));

    /* Decide whether to read in the whole file or mmap it. */
    do_mmap = config ? cmd_ln_boolean_r(config, "-mmap") : TRUE;
    /* The data is byte-swapped in place below, which cannot be done on
     * a mapping; only warn when mapping was actually requested. */
    if (swap && do_mmap) {
        E_WARN("-mmap specified, but mdef is other-endian. Will not memory-map.\n");
        do_mmap = FALSE;
    }
    /* Actually try to mmap it. */
    if (do_mmap) {
        m->filemap = mmio_file_read(filename);
        if (m->filemap == NULL)
            do_mmap = FALSE;
    }
    pos = ftell(fh);
    if (do_mmap) {
        /* Get the base pointer from the memory map. */
        m->ciname[0] = (char *) mmio_file_ptr(m->filemap) + pos;
        /* Success! */
        m->alloc_mode = BIN_MDEF_ON_DISK;
    }
    else {
        /* Read everything into memory. */
        m->alloc_mode = BIN_MDEF_IN_MEMORY;
        fseek(fh, 0, SEEK_END);
        end = ftell(fh);
        fseek(fh, pos, SEEK_SET);
        m->ciname[0] = ckd_malloc(end - pos);
        if (fread(m->ciname[0], 1, end - pos, fh) != end - pos)
            E_FATAL("Failed to read %d bytes of data from %s\n",
                    end - pos, filename);
    }

    /* The names are consecutive NUL-terminated strings. */
    for (i = 1; i < m->n_ciphone; ++i)
        m->ciname[i] = m->ciname[i - 1] + strlen(m->ciname[i - 1]) + 1;

    /* Skip past the padding: the CD tree starts at the next multiple
     * of four bytes after the last name. */
    tree_start =
        m->ciname[i - 1] + strlen(m->ciname[i - 1]) + 1 - m->ciname[0];
    tree_start = (tree_start + 3) & ~3;
    m->cd_tree = (cd_tree_t *) (m->ciname[0] + tree_start);
    if (swap) {
        for (i = 0; i < m->n_cd_tree; ++i) {
            SWAP_INT16(&m->cd_tree[i].ctx);
            SWAP_INT16(&m->cd_tree[i].n_down);
            SWAP_INT32(&m->cd_tree[i].c.down);
        }
    }

    /* The phone table follows the CD tree. */
    m->phone = (mdef_entry_t *) (m->cd_tree + m->n_cd_tree);
    if (swap) {
        for (i = 0; i < m->n_phone; ++i) {
            SWAP_INT32(&m->phone[i].ssid);
            SWAP_INT32(&m->phone[i].tmat);
        }
    }

    /* Then a 32-bit count followed by that many 16-bit senone IDs. */
    sseq_size = (int32 *) (m->phone + m->n_phone);
    if (swap)
        SWAP_INT32(sseq_size);
    m->sseq = ckd_calloc(m->n_sseq, sizeof(*m->sseq));
    m->sseq[0] = (uint16 *) (sseq_size + 1);
    if (swap) {
        for (i = 0; i < *sseq_size; ++i)
            SWAP_INT16(m->sseq[0] + i);
    }
    if (m->n_emit_state) {
        /* Fixed-length sequences. */
        for (i = 1; i < m->n_sseq; ++i)
            m->sseq[i] = m->sseq[0] + i * m->n_emit_state;
    }
    else {
        /* Variable-length sequences: per-sequence lengths are stored
         * right after the senone IDs. */
        m->sseq_len = (uint8 *) (m->sseq[0] + *sseq_size);
        for (i = 1; i < m->n_sseq; ++i)
            m->sseq[i] = m->sseq[i - 1] + m->sseq_len[i - 1];
    }

    /* Now build the CD-to-CI mappings using the senone sequences.
     * This is the only really accurate way to do it, though it is
     * still inaccurate in the case of heterogeneous topologies or
     * cross-state tying. */
    m->cd2cisen = (int16 *) ckd_malloc(m->n_sen * sizeof(*m->cd2cisen));
    m->sen2cimap = (int16 *) ckd_malloc(m->n_sen * sizeof(*m->sen2cimap));

    /* Default mappings (identity, none) */
    for (i = 0; i < m->n_ci_sen; ++i)
        m->cd2cisen[i] = i;
    for (; i < m->n_sen; ++i)
        m->cd2cisen[i] = -1;
    for (i = 0; i < m->n_sen; ++i)
        m->sen2cimap[i] = -1;

    for (i = 0; i < m->n_phone; ++i) {
        int32 j, ssid = m->phone[i].ssid;

        for (j = 0; j < bin_mdef_n_emit_state_phone(m, i); ++j) {
            int s = bin_mdef_sseq2sen(m, ssid, j);
            int ci = bin_mdef_pid2ci(m, i);

            /* Take the first one and warn if we have cross-state tying. */
            if (m->sen2cimap[s] == -1)
                m->sen2cimap[s] = ci;
            if (m->sen2cimap[s] != ci)
                E_WARN
                    ("Senone %d is shared between multiple base phones\n",
                     s);

            /* State j exists in the CI phone only when j is strictly
             * below its state count; j equal to the count would index
             * one past the end of the CI senone sequence. */
            if (j >= bin_mdef_n_emit_state_phone(m, ci))
                E_WARN("CD phone %d has more states than CI phone %d\n",
                       i, ci);
            else
                m->cd2cisen[s] =
                    bin_mdef_sseq2sen(m, m->phone[ci].ssid, j);
        }
    }

    /* Set the silence phone (replaces the value read from the header). */
    m->sil = bin_mdef_ciphone_id(m, S3_SILENCE_CIPHONE);

    E_INFO
        ("%d CI-phone, %d CD-phone, %d emitstate/phone, %d CI-sen, %d Sen, %d Sen-Seq\n",
         m->n_ciphone, m->n_phone - m->n_ciphone, m->n_emit_state,
         m->n_ci_sen, m->n_sen, m->n_sseq);
    fclose(fh);
    return m;
}
/**
 * Load 8-bit mixture weights from a senone dump file into s->mixw.
 *
 * File layout as parsed here: a length-prefixed title, a
 * length-prefixed header, further length-prefixed strings terminated
 * by a zero length, two 32-bit dimensions (rows, columns), and then
 * s->n_feat * rows * columns bytes of weights.  Byte order is guessed
 * from whether the title length is in the range 1..999.
 *
 * If "-mmap" is set in s->config and mmio_file_read() succeeds, the
 * row pointers in s->mixw point into the mapping; otherwise the
 * weights are read into a freshly allocated 3-D array.
 *
 * @param s    Destination; n_sen, mixw and possibly sendump_mmap are set.
 * @param mdef Model definition; supplies the initial senone count.
 * @param file Path of the dump file.
 * @return 0 on success; -1 if the file cannot be opened, a length or
 *         dimension field is missing or out of range, the file reports
 *         a non-zero cluster count, or the weight data is truncated.
 *         On a short read of the weights s->mixw is left allocated.
 *         An unreadable or unterminated title/header is reported with
 *         E_FATAL.
 */
static int32
read_sendump(s2_semi_mgau_t *s, mdef_t *mdef, char const *file)
{
    FILE *fp;
    char line[1000];
    /* Longest string accepted; leaves room for a forced terminator. */
    const int32 max_len = (int32) sizeof(line) - 1;
    int32 i, n;
    int32 do_swap, do_mmap;
    long data_start, file_end;
    size_t filesize, offset, n_rows;
    int n_clust = 256;          /* Number of clusters (if zero, we are just using
                                 * 8-bit quantized weights) */
    int r = s->n_density;
    int c = mdef_n_sen(mdef);

    s->n_sen = c;
    do_mmap = cmd_ln_boolean_r(s->config, "-mmap");

    if ((fp = fopen(file, "rb")) == NULL)
        return -1;

    E_INFO("Loading senones from dump file %s\n", file);

    /* Read title size, title */
    if (fread(&n, sizeof(n), 1, fp) != 1) {
        E_ERROR("Failed to read title length from dump file %s\n", file);
        goto error_out;
    }
    /* This is extremely bogus: byte order is inferred from whether
     * the title length looks plausible. */
    do_swap = 0;
    if (n < 1 || n > 999) {
        SWAP_INT32(&n);
        if (n < 1 || n > 999) {
            E_FATAL("Title length %x in dump file %s out of range\n", n, file);
        }
        do_swap = 1;
    }
    if (fread(line, sizeof(char), n, fp) != (size_t) n)
        E_FATAL("Cannot read title\n");
    if (line[n - 1] != '\0')
        E_FATAL("Bad title in dump file\n");
    E_INFO("%s\n", line);

    /* Read header size, header.  The length is file-controlled and is
     * used as a read size into line[], so it must be range-checked. */
    if (fread(&n, sizeof(n), 1, fp) != 1) {
        E_ERROR("Failed to read header length from dump file %s\n", file);
        goto error_out;
    }
    if (do_swap)
        SWAP_INT32(&n);
    if (n < 1 || n > max_len) {
        E_ERROR("Header length %d in dump file %s out of range\n", n, file);
        goto error_out;
    }
    if (fread(line, sizeof(char), n, fp) != (size_t) n)
        E_FATAL("Cannot read header\n");
    if (line[n - 1] != '\0')
        E_FATAL("Bad header in dump file\n");

    /* Read other header strings until string length = 0 */
    for (;;) {
        if (fread(&n, sizeof(n), 1, fp) != 1) {
            E_ERROR("Failed to read header string length from dump file %s\n",
                    file);
            goto error_out;
        }
        if (do_swap)
            SWAP_INT32(&n);
        if (n == 0)
            break;
        if (n < 0 || n > max_len) {
            E_ERROR("Header string length %d in dump file %s out of range\n",
                    n, file);
            goto error_out;
        }
        if (fread(line, sizeof(char), n, fp) != (size_t) n)
            E_FATAL("Cannot read header\n");
        /* Guarantee termination so atoi() cannot run past the data. */
        line[n] = '\0';

        /* Look for a cluster count, if present */
        if (!strncmp(line, "cluster_count ", strlen("cluster_count "))) {
            n_clust = atoi(line + strlen("cluster_count "));
        }
    }

    /* Read #codewords, #pdfs */
    if (fread(&r, sizeof(r), 1, fp) != 1
        || fread(&c, sizeof(c), 1, fp) != 1) {
        E_ERROR("Failed to read dimensions from dump file %s\n", file);
        goto error_out;
    }
    if (do_swap) {
        SWAP_INT32(&r);
        SWAP_INT32(&c);
    }
    E_INFO("Rows: %d, Columns: %d\n", r, c);

    if (n_clust) {
        E_ERROR ("Dump file is incompatible with PocketSphinx\n");
        goto error_out;
    }
    if (r <= 0 || c <= 0) {
        E_ERROR("Invalid dimensions %d x %d in dump file %s\n", r, c, file);
        goto error_out;
    }
    if (do_mmap) {
        E_INFO("Using memory-mapped I/O for senones\n");
    }

    /* Find where the weights start and how large the file is. */
    data_start = ftell(fp);
    if (data_start < 0 || fseek(fp, 0, SEEK_END) != 0) {
        E_ERROR("Failed to determine size of dump file %s\n", file);
        goto error_out;
    }
    file_end = ftell(fp);
    if (file_end < data_start || fseek(fp, data_start, SEEK_SET) != 0) {
        E_ERROR("Failed to determine size of dump file %s\n", file);
        goto error_out;
    }
    offset = (size_t) data_start;
    filesize = (size_t) file_end;

    /* Make sure all n_feat * r rows of c bytes are really present, so
     * that mapped row pointers never point past the end of the file.
     * Dividing instead of multiplying by c avoids overflow. */
    n_rows = (size_t) s->n_feat * (size_t) r;
    if (n_rows > (filesize - offset) / (size_t) c) {
        E_ERROR("Dump file %s is too short for %d x %d weights\n",
                file, r, c);
        goto error_out;
    }

    /* Allocate memory for pdfs (or memory map them) */
    if (do_mmap)
        s->sendump_mmap = mmio_file_read(file);
    /* Otherwise, set up all pointers, etc. */
    if (s->sendump_mmap) {
        s->mixw = ckd_calloc(s->n_feat, sizeof(*s->mixw));
        for (i = 0; i < s->n_feat; i++) {
            /* Pointers into the mmap()ed 2d array */
            s->mixw[i] = ckd_calloc(r, sizeof(**s->mixw));
        }
        for (n = 0; n < s->n_feat; n++) {
            for (i = 0; i < r; i++) {
                s->mixw[n][i] =
                    ((uint8 *) mmio_file_ptr(s->sendump_mmap)) + offset;
                offset += c;
            }
        }
    }
    else {
        s->mixw = ckd_calloc_3d(s->n_feat, r, c, sizeof(***s->mixw));
        /* Read pdf values and ids */
        for (n = 0; n < s->n_feat; n++) {
            for (i = 0; i < r; i++) {
                if (fread(s->mixw[n][i], sizeof(***s->mixw), c, fp)
                    != (size_t) c) {
                    E_ERROR("Failed to read %d bytes from sendump\n", c);
                    goto error_out;
                }
            }
        }
    }

    fclose(fp);
    return 0;

error_out:
    /* Every failure path after fopen() closes the stream. */
    fclose(fp);
    return -1;
}