/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_Y(const AzIntArr &ia_tokno, int dic_sz, const AzIntArr &ia_pos, int xpch_sz, /* patch size used to generate X */ int min_dist, int max_dist, AzSmat *m_y) const { AzX::throw_if_null(m_y, "AzPrepText2::gen_Y", "m_y"); int t_num; const int *tokno = ia_tokno.point(&t_num); m_y->reform(dic_sz*2, ia_pos.size()); /* *2 for left and right */ for (int ix = 0; ix < ia_pos.size(); ++ix) { int xtx0 = ia_pos.get(ix); int xtx1 = xtx0 + xpch_sz; AzIntArr ia_ctx0, ia_ctx1; for (int tx = MAX(0,xtx0+min_dist); tx < MIN(t_num,xtx0); ++tx) if (tokno[tx] >= 0) ia_ctx0.put(tokno[tx]); ia_ctx0.unique(); for (int tx = MAX(0,xtx1); tx < MIN(t_num,xtx1+max_dist); ++tx) if (tokno[tx] >= 0) ia_ctx1.put(tokno[tx]); ia_ctx1.unique(); ia_ctx1.add(dic_sz); AzIntArr ia_ctx; ia_ctx.concat(&ia_ctx0); ia_ctx.concat(&ia_ctx1); if (ia_ctx.size() > 0) { ia_ctx.unique(); m_y->col_u(ix)->load(&ia_ctx, 1); } } }
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_X_bow(const AzIntArr &ia_tokno, int dic_sz, int pch_sz, int pch_step, int padding, bool do_skip_stopunk, /*--- output ---*/ AzSmat *m_feat, AzIntArr *ia_pos) const /* patch position: may be NULL */ { const char *eyec = "AzPrepText2::gen_X_bow"; AzX::throw_if_null(m_feat, eyec, "m_feat"); int t_num; const int *tokno = ia_tokno.point(&t_num); int pch_num = DIVUP(t_num+padding*2-pch_sz, pch_step) + 1; m_feat->reform(dic_sz, pch_num); if (ia_pos != NULL) ia_pos->reset(); int col = 0; int tx0 = -padding; for (int pch_no = 0; pch_no < pch_num; ++pch_no) { int tx1 = tx0 + pch_sz; AzIntArr ia_rows; for (int tx = MAX(0, tx0); tx < MIN(t_num, tx1); ++tx) { if (tokno[tx] >= 0) ia_rows.put(tokno[tx]); } if (!do_skip_stopunk || ia_rows.size() > 0) { ia_rows.unique(); /* sorting too */ m_feat->col_u(col)->load(&ia_rows, 1); if (ia_pos != NULL) ia_pos->put(tx0); ++col; } if (tx1 >= t_num+padding) break; int dist = 1; if (do_skip_stopunk) { /*--- to avoid repeating the same bow ---*/ int tx; for (tx = tx0; tx < t_num; ++tx) if (tx >= 0 && tokno[tx] >= 0) break; int dist0 = tx-tx0+1; /* to lose a word, we have to slide a window this much */ tx = tx1; for (tx = tx1; tx < t_num; ++tx) if (tx >= 0 && tokno[tx] >= 0) break; int dist1 = tx-tx1+1; /* to get a new word, we have to slide a window this much */ dist = MIN(dist0, dist1); } tx0 += MAX(dist, pch_step); } m_feat->resize(col); }
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_Y_ifeat(int top_num_each, int top_num_total, const AzSmat *m_feat, const AzIntArr &ia_tokno, const AzIntArr &ia_pos, int xpch_sz, int min_dist, int max_dist, bool do_nolr, int f_pch_sz, int f_pch_step, int f_padding, AzSmat *m_y, feat_info fi[2]) const { const char *eyec = "AzPrepText2::gen_neigh_topfeat"; AzX::throw_if_null(m_feat, eyec, "m_feat"); AzX::throw_if_null(m_y, eyec, "m_y"); int t_num; const int *tokno = ia_tokno.point(&t_num); int feat_sz = m_feat->rowNum(); int f_pch_num = DIVUP(t_num+f_padding*2-f_pch_sz, f_pch_step) + 1; if (m_feat->colNum() != f_pch_num) { AzBytArr s("#patch mismatch: Expcected: "); s << f_pch_num << " Actual: " << m_feat->colNum(); AzX::throw_if(true, AzInputError, eyec, s.c_str()); } if (do_nolr) m_y->reform(feat_sz, ia_pos.size()); else m_y->reform(feat_sz*2, ia_pos.size()); for (int ix = 0; ix < ia_pos.size(); ++ix) { int xtx0 = ia_pos[ix]; int xtx1 = xtx0 + xpch_sz; AzIFarr ifa_ctx; int offs = 0; for (int tx = xtx0+min_dist; tx < xtx0; ++tx) { if (tx + f_pch_sz > xtx0) break; set_ifeat(m_feat, top_num_each, (tx+f_padding)/f_pch_step, offs, &ifa_ctx, fi); } if (!do_nolr) offs = feat_sz; for (int tx = xtx1; tx < xtx1+max_dist; ++tx) { if (tx + f_pch_sz > xtx1+max_dist) break; set_ifeat(m_feat, top_num_each, (tx+f_padding)/f_pch_step, offs, &ifa_ctx, fi); } ifa_ctx.squeeze_Max(); if (top_num_total > 0 && ifa_ctx.size() > top_num_total) { ifa_ctx.sort_Float(false); ifa_ctx.cut(top_num_total); } m_y->col_u(ix)->load(&ifa_ctx); } }
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_X_seq(const AzIntArr &ia_tokno, int dic_sz, int pch_sz, int pch_step, int padding, bool do_allow_zero, bool do_skip_stopunk, /*--- output ---*/ AzSmat *m_feat, AzIntArr *ia_pos) const /* patch position: may be NULL */ { const char *eyec = "AzPrepText2::gen_X_seq"; AzX::throw_if_null(m_feat, eyec, "m_feat"); AzX::no_support(do_skip_stopunk, eyec, "variable strides with Seq"); int t_num; const int *tokno = ia_tokno.point(&t_num); int pch_num = DIVUP(t_num+padding*2-pch_sz, pch_step) + 1; m_feat->reform(dic_sz*pch_sz, pch_num); if (ia_pos != NULL) ia_pos->reset(); int col = 0; int tx0 = -padding; for (int pch_no = 0; pch_no < pch_num; ++pch_no) { int tx1 = tx0 + pch_sz; AzSmat m; for (int tx = tx0; tx < tx1; ++tx) { AzSmat m0(dic_sz, 1); if (tx >= 0 && tx < t_num && tokno[tx] >= 0) { AzIntArr ia_row; ia_row.put(tokno[tx]); m0.col_u(0)->load(&ia_row, 1); } if (tx == tx0) m.set(&m0); else m.rbind(&m0); } if (do_allow_zero || !m.isZero()) { m_feat->col_u(col)->set(m.col(0)); if (ia_pos != NULL) ia_pos->put(tx0); ++col; } if (tx1 >= t_num+padding) break; tx0 += pch_step; } m_feat->resize(col); }