/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_Y(const AzIntArr &ia_tokno, int dic_sz, const AzIntArr &ia_pos, int xpch_sz, /* patch size used to generate X */ int min_dist, int max_dist, AzSmat *m_y) const { AzX::throw_if_null(m_y, "AzPrepText2::gen_Y", "m_y"); int t_num; const int *tokno = ia_tokno.point(&t_num); m_y->reform(dic_sz*2, ia_pos.size()); /* *2 for left and right */ for (int ix = 0; ix < ia_pos.size(); ++ix) { int xtx0 = ia_pos.get(ix); int xtx1 = xtx0 + xpch_sz; AzIntArr ia_ctx0, ia_ctx1; for (int tx = MAX(0,xtx0+min_dist); tx < MIN(t_num,xtx0); ++tx) if (tokno[tx] >= 0) ia_ctx0.put(tokno[tx]); ia_ctx0.unique(); for (int tx = MAX(0,xtx1); tx < MIN(t_num,xtx1+max_dist); ++tx) if (tokno[tx] >= 0) ia_ctx1.put(tokno[tx]); ia_ctx1.unique(); ia_ctx1.add(dic_sz); AzIntArr ia_ctx; ia_ctx.concat(&ia_ctx0); ia_ctx.concat(&ia_ctx1); if (ia_ctx.size() > 0) { ia_ctx.unique(); m_y->col_u(ix)->load(&ia_ctx, 1); } } }
/*--------------------------------------*/ virtual void writeText(const char *fn, int digits, bool doSparse=false, bool doAppend=false) const { AzIntArr ia; ia.range(0, colNum()); writeText(fn, &ia, digits, doSparse, doAppend); }
/* Copy assignment: copies the index arrays, the value vector and the two
 * scalar members from inp.  Self-assignment is a no-op. */
AzSortedFeat_Sparse & operator =(const AzSortedFeat_Sparse &inp) { /* never tested */
  if (this != &inp) {
    ia_zero.reset(&inp.ia_zero);
    ia_index.reset(&inp.ia_index);
    v_value.set(&inp.v_value);
    _shouldDoBackward = inp._shouldDoBackward;
    data_num = inp.data_num;
  }
  return *this;
}
/* Append the sizes held in ia_sz to s, separated by " x ".
 * When do_reset is true, s is cleared first.
 * No surrounding parentheses are emitted (those calls are disabled). */
virtual void format(AzBytArr &s, bool do_reset=false) const {
  if (do_reset) {
    s.reset();
  }
  const int dim_num = ia_sz.size();
  for (int dx = 0; dx < dim_num; ++dx) {
    if (dx != 0) {
      s.c(" x ");
    }
    s.cn(ia_sz.get(dx));
  }
}
/*------------------------------------------------------------*/ inline void show(const AzOut &out) const { AzPrint::writeln(out, "AzpPoolingDflt::show"); int sz = pia2_out2inp.size(); int ox; for (ox = 0; ox < sz; ++ox) { AzBytArr s("["); s.cn(ox); s.c("] "); AzIntArr ia; pia2_out2inp.get(ox, &ia); ia.print(out, s.c_str()); } }
void show_below_above(const AzIntArr &ia_below, const AzIntArr &ia_above, AzBytArr &s) const { s << "("; for (int ix = 0; ix < ia_below.size(); ++ix) { if (ix > 0) s << ","; s << ia_below.get(ix); } s << ") -> ("; for (int ix = 0; ix < ia_above.size(); ++ix) { if (ix > 0) s << ","; s << ia_above.get(ix); } s << ")"; }
/* Return ia_index.point_u(len) so the caller can update the index array.
 * Throws (AzException *) when isOriginal is set. */
inline int *base_index_for_update(int *len) {
  if (!isOriginal) {
    return ia_index.point_u(len);
  }
  throw new AzException("AzSortedFeat_Dense::base_index_for_update", "Not allowed");
}
/* Return the object to its empty state: hand arrd/arrs to their owners'
 * free(), and zero the feature and active-feature bookkeeping. */
void reset() {
  /*---  storage  ---*/
  a_dense.free(&arrd);
  a_sparse.free(&arrs);

  /*---  counters and flags  ---*/
  f_num = 0;
  ia_isActive.reset();
  active_num = 0;
}
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_Y_ifeat(int top_num_each, int top_num_total, const AzSmat *m_feat, const AzIntArr &ia_tokno, const AzIntArr &ia_pos, int xpch_sz, int min_dist, int max_dist, bool do_nolr, int f_pch_sz, int f_pch_step, int f_padding, AzSmat *m_y, feat_info fi[2]) const { const char *eyec = "AzPrepText2::gen_neigh_topfeat"; AzX::throw_if_null(m_feat, eyec, "m_feat"); AzX::throw_if_null(m_y, eyec, "m_y"); int t_num; const int *tokno = ia_tokno.point(&t_num); int feat_sz = m_feat->rowNum(); int f_pch_num = DIVUP(t_num+f_padding*2-f_pch_sz, f_pch_step) + 1; if (m_feat->colNum() != f_pch_num) { AzBytArr s("#patch mismatch: Expcected: "); s << f_pch_num << " Actual: " << m_feat->colNum(); AzX::throw_if(true, AzInputError, eyec, s.c_str()); } if (do_nolr) m_y->reform(feat_sz, ia_pos.size()); else m_y->reform(feat_sz*2, ia_pos.size()); for (int ix = 0; ix < ia_pos.size(); ++ix) { int xtx0 = ia_pos[ix]; int xtx1 = xtx0 + xpch_sz; AzIFarr ifa_ctx; int offs = 0; for (int tx = xtx0+min_dist; tx < xtx0; ++tx) { if (tx + f_pch_sz > xtx0) break; set_ifeat(m_feat, top_num_each, (tx+f_padding)/f_pch_step, offs, &ifa_ctx, fi); } if (!do_nolr) offs = feat_sz; for (int tx = xtx1; tx < xtx1+max_dist; ++tx) { if (tx + f_pch_sz > xtx1+max_dist) break; set_ifeat(m_feat, top_num_each, (tx+f_padding)/f_pch_step, offs, &ifa_ctx, fi); } ifa_ctx.squeeze_Max(); if (top_num_total > 0 && ifa_ctx.size() > top_num_total) { ifa_ctx.sort_Float(false); ifa_ctx.cut(top_num_total); } m_y->col_u(ix)->load(&ifa_ctx); } }
/*-------------------------------------------------------------------------*/ void AzTools_text::tokenize(AzByte *buff, int &len, const AzDic *dic, AzIntArr &ia_nn, bool do_lower, bool do_utf8dashes, /*--- output ---*/ AzDataArr<AzIntArr> &aia_tokno) { const char *eyec = "AzTools_text::tokenize(multi n)"; AzStrPool sp_tok; tokenize(buff, len, do_utf8dashes, do_lower, &sp_tok); int t_num = sp_tok.size(); aia_tokno.reset(ia_nn.size()); for (int ix = 0; ix < ia_nn.size(); ++ix) { identify_tokens(&sp_tok, ia_nn[ix], dic, aia_tokno(ix)); if (aia_tokno[ix]->size() != t_num) throw new AzException(eyec, "Conflict in #tokens"); } }
/* Position cur at the start of a traversal: at ia_index.size() when
 * _shouldDoBackward is set, otherwise at 0. */
inline void rewind(AzCursor &cur) const {
  if (!_shouldDoBackward) {
    cur.set(0);
    return;
  }
  cur.set(ia_index.size());
}
/* Read this object from file: the AzpCNet3 part first, then a header,
 * then the members in the same order write() emits them. */
virtual void read(AzFile *file) {
  /*---  base class  ---*/
  AzpCNet3::read(file);

  /*---  header; the returned version is not used further in this method  ---*/
  int my_version = AzTools::read_header(file, reserved_len);

  /*---  members  ---*/
  iia_layer_conn.read(file);
  ia_layer_order.read(file);
  layer_info.read(file);
  conn.read(file);
  sp_conn.read(file);
}
/* Write this object to file: the AzpCNet3 part first, then a header
 * (version, reserved_len), then the members in a fixed order. */
virtual void write(AzFile *file) {
  /*---  base class  ---*/
  AzpCNet3::write(file);

  /*---  header  ---*/
  AzTools::write_header(file, version, reserved_len);

  /*---  members  ---*/
  iia_layer_conn.write(file);
  ia_layer_order.writec(file); /* write2: const */
  layer_info.write(file);
  conn.write(file);
  sp_conn.writec(file);
}
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_Y_ngram_bow(const AzIntArr &ia_nn, const AzDataArr<AzIntArr> &aia_tokno, int dic_sz, const AzIntArr &ia_pos, int xpch_sz, /* patch size used to generate X */ int min_dist, int max_dist, bool do_nolr, AzSmat *m_y) const { const char *eyec = "AzPrepText2::gen_Y_ngram_bow"; int t_num = aia_tokno[0]->size(); if (do_nolr) m_y->reform(dic_sz, ia_pos.size()); else m_y->reform(dic_sz*2, ia_pos.size()); /* *2 for left and right */ for (int ix = 0; ix < ia_pos.size(); ++ix) { int xtx0 = ia_pos.get(ix); int xtx1 = xtx0 + xpch_sz; AzIntArr ia_ctx; int base = xtx0+min_dist; for (int nx = 0; nx < aia_tokno.size(); ++nx) { const AzIntArr *ia_tokno = aia_tokno[nx]; int nn = ia_nn[nx]; for (int tx = MAX(0,base); tx <= MIN(t_num,xtx0)-nn; ++tx) { int tokno = ia_tokno->get(tx); if (tokno >= 0) ia_ctx.put(tokno); } } base = xtx1; for (int nx = 0; nx < aia_tokno.size(); ++nx) { const AzIntArr *ia_tokno = aia_tokno[nx]; int nn = ia_nn[nx]; for (int tx = MAX(0,base); tx <= MIN(t_num,xtx1+max_dist)-nn; ++tx) { int tokno = ia_tokno->get(tx); if (tokno >= 0) { if (do_nolr) ia_ctx.put(tokno); else ia_ctx.put(dic_sz+tokno); } } } ia_ctx.unique(); m_y->col_u(ix)->load(&ia_ctx, 1); } }
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_X_bow(const AzIntArr &ia_tokno, int dic_sz, int pch_sz, int pch_step, int padding, bool do_skip_stopunk, /*--- output ---*/ AzSmat *m_feat, AzIntArr *ia_pos) const /* patch position: may be NULL */ { const char *eyec = "AzPrepText2::gen_X_bow"; AzX::throw_if_null(m_feat, eyec, "m_feat"); int t_num; const int *tokno = ia_tokno.point(&t_num); int pch_num = DIVUP(t_num+padding*2-pch_sz, pch_step) + 1; m_feat->reform(dic_sz, pch_num); if (ia_pos != NULL) ia_pos->reset(); int col = 0; int tx0 = -padding; for (int pch_no = 0; pch_no < pch_num; ++pch_no) { int tx1 = tx0 + pch_sz; AzIntArr ia_rows; for (int tx = MAX(0, tx0); tx < MIN(t_num, tx1); ++tx) { if (tokno[tx] >= 0) ia_rows.put(tokno[tx]); } if (!do_skip_stopunk || ia_rows.size() > 0) { ia_rows.unique(); /* sorting too */ m_feat->col_u(col)->load(&ia_rows, 1); if (ia_pos != NULL) ia_pos->put(tx0); ++col; } if (tx1 >= t_num+padding) break; int dist = 1; if (do_skip_stopunk) { /*--- to avoid repeating the same bow ---*/ int tx; for (tx = tx0; tx < t_num; ++tx) if (tx >= 0 && tokno[tx] >= 0) break; int dist0 = tx-tx0+1; /* to lose a word, we have to slide a window this much */ tx = tx1; for (tx = tx1; tx < t_num; ++tx) if (tx >= 0 && tokno[tx] >= 0) break; int dist1 = tx-tx1+1; /* to get a new word, we have to slide a window this much */ dist = MIN(dist0, dist1); } tx0 += MAX(dist, pch_step); } m_feat->resize(col); }
/*------------------------------------------------------------------*/
/* Average loss over the selected data points.
 *   loss_type : loss to evaluate; AzLoss_None yields -1.
 *   v_p, v_y  : the two vectors handed to AzLoss::getLoss() element-wise;
 *               v_p's entries are multiplied by p_coeff first.
 *   inp_ia_dx : indexes to evaluate; NULL means range(0, v_p->rowNum()).
 *   p_coeff   : multiplier applied to each entry of v_p.
 * Returns 0 when no index is selected. */
double AzTaskTools::analyzeLoss(AzLossType loss_type,
                                const AzDvect *v_p,
                                const AzDvect *v_y,
                                const AzIntArr *inp_ia_dx,
                                double p_coeff)
{
  if (loss_type == AzLoss_None) {
    return -1;
  }

  const double *p_val = v_p->point();
  const double *y_val = v_y->point();

  /*---  fall back to all rows when no index list is given  ---*/
  AzIntArr ia_all;
  const AzIntArr *ia_sel = inp_ia_dx;
  if (ia_sel == NULL) {
    ia_all.range(0, v_p->rowNum());
    ia_sel = &ia_all;
  }

  int sel_num;
  const int *sel = ia_sel->point(&sel_num);

  double loss_sum = 0;
  for (int sx = 0; sx < sel_num; ++sx) {
    const int dx = sel[sx];
    loss_sum += AzLoss::getLoss(loss_type, p_val[dx]*p_coeff, y_val[dx]);
  }

  if (sel_num <= 0) {
    return 0;
  }
  return loss_sum / (double)sel_num;
}
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_X_seq(const AzIntArr &ia_tokno, int dic_sz, int pch_sz, int pch_step, int padding, bool do_allow_zero, bool do_skip_stopunk, /*--- output ---*/ AzSmat *m_feat, AzIntArr *ia_pos) const /* patch position: may be NULL */ { const char *eyec = "AzPrepText2::gen_X_seq"; AzX::throw_if_null(m_feat, eyec, "m_feat"); AzX::no_support(do_skip_stopunk, eyec, "variable strides with Seq"); int t_num; const int *tokno = ia_tokno.point(&t_num); int pch_num = DIVUP(t_num+padding*2-pch_sz, pch_step) + 1; m_feat->reform(dic_sz*pch_sz, pch_num); if (ia_pos != NULL) ia_pos->reset(); int col = 0; int tx0 = -padding; for (int pch_no = 0; pch_no < pch_num; ++pch_no) { int tx1 = tx0 + pch_sz; AzSmat m; for (int tx = tx0; tx < tx1; ++tx) { AzSmat m0(dic_sz, 1); if (tx >= 0 && tx < t_num && tokno[tx] >= 0) { AzIntArr ia_row; ia_row.put(tokno[tx]); m0.col_u(0)->load(&ia_row, 1); } if (tx == tx0) m.set(&m0); else m.rbind(&m0); } if (do_allow_zero || !m.isZero()) { m_feat->col_u(col)->set(m.col(0)); if (ia_pos != NULL) ia_pos->put(tx0); ++col; } if (tx1 >= t_num+padding) break; tx0 += pch_step; } m_feat->resize(col); }
/* Load the size array (ia_sz) from file. */
void read(AzFile *file) {
  ia_sz.read(file);
}
/* Store the size array (ia_sz) to file. */
void write(AzFile *file) {
  ia_sz.write(file);
}
/* Return the size stored for dimension dx.
 * Throws via AzX::throw_if when dx is outside [0, ia_sz.size()). */
virtual int sz(int dx) const {
  const bool is_out_of_range = (dx < 0 || dx >= ia_sz.size());
  AzX::throw_if(is_out_of_range, "AzxD::sz", "dim is out of range");
  return ia_sz.get(dx);
}
/* Number of dimensions, i.e. the number of entries in ia_sz. */
virtual int get_dim() const {
  const int dim_num = ia_sz.size();
  return dim_num;
}
/* Replace the sizes with the len values found at arr. */
virtual void reset(const int *arr, int len) {
  ia_sz.reset(arr, len);
}
/* Replace the sizes with the contents of *ia. */
virtual void reset(const AzIntArr *ia) {
  ia_sz.reset(ia);
}
virtual void reset(const AzxD *inp) { ia_sz.reset(&inp->ia_sz); }
/* True when ia_sz compares equal (compare() == 0) to inp's ia_sz. */
virtual bool isSame(const AzxD *inp) const {
  return ia_sz.compare(&inp->ia_sz) == 0;
}
/* Generate a unit size region: clear ia_sz, then, unless dim is 0,
 * reset it with (dim, 1). */
virtual void reset(int dim=0) {
  ia_sz.reset();
  if (dim != 0) {
    ia_sz.reset(dim, 1);
  }
}
/* Valid means: at least one dimension, and every size is positive. */
virtual bool is_valid() const {
  const int dim_num = ia_sz.size();
  if (dim_num <= 0) {
    return false;
  }
  for (int dx = 0; dx < dim_num; ++dx) {
    if (ia_sz.get(dx) <= 0) {
      return false;
    }
  }
  return true;
}
AzxD(const AzxD *inp) { ia_sz.reset(&inp->ia_sz); }
/* Return the last entry of ia_layer_order. */
virtual int top_lay_ind() const {
  const int last = ia_layer_order.size() - 1;
  return ia_layer_order.get(last);
}
/*--------------------------------------*/ virtual void writeText(const char *fn, int digits) const { AzIntArr ia; ia.range(0, rowNum()); writeText(fn, &ia, digits); }