void append_const(double const_to_add) { int r_num = m_feat.rowNum()+1, c_num = m_feat.colNum(); m_feat.resize(r_num, c_num); int col; for (col = 0; col < c_num; ++col) { AzIFarr ifa; m_feat.col(col)->nonZero(&ifa); ifa.put(r_num-1, const_to_add); m_feat.col_u(col)->load(&ifa); } }
/*----------------------------------------------------------------------*/ void AzSvDataS::mergeData(const AzSmat *m_x, const AzSvFeatInfo *feat, const char *fn_template, const char *str, bool doSparse, int digits, const char *out_x_fn, const char *out_n_fn, int num, const char *names[]) { const char *eyec = "AzSvDataS::mergeData"; int data_num = m_x->colNum(); int f_num = m_x->rowNum(); if (feat->featNum() != f_num) { throw new AzException(eyec, "Conflict btw m_x and featInfo"); } AzFile n_file(out_n_fn); n_file.open("wb"); int fx; for (fx = 0; fx < feat->featNum(); ++fx) { AzBytArr s; feat->desc(fx, &s); s.nl(); s.writeText(&n_file); } AzSmat m; m_x->transpose(&m); m.resize(data_num, f_num+num); AzStrPool sp_names; for (fx = 0; fx < num; ++fx) { AzBytArr s_fn(fn_template); s_fn.replace("*", names[fx]); AzDvect v; AzSvDataS::readVector(s_fn.c_str(), &v); if (v.rowNum() != m.rowNum()) { throw new AzException(AzInputError, eyec, "conflict in #data:", s_fn.c_str()); } m.col_u(f_num+fx)->set(&v); AzBytArr s_nm; if (AzTools::isSpecified(str)) s_nm.c(str); s_nm.c(names[fx]); s_nm.nl(); s_nm.writeText(&n_file); } n_file.close(true); AzSmat m1; m.transpose(&m1); m1.writeText(out_x_fn, digits, doSparse); }
virtual void reset_data_for_test(const AzOut &out, const AzSmat *m_data) { bool doSparse = false; if (m_data->rowNum()*m_data->colNum() > Az_max_test_entries) { /* large data */ /*--- dense is faster but uses up more memory if data is sparse ---*/ double nz_ratio; m_data->nonZeroNum(&nz_ratio); if (nz_ratio < 0.6) { /* relatively sparse */ doSparse = true; AzBytArr s; s.c("Large and sparse test data (nonzero ratio=", nz_ratio); s.c("); treated as sparse data."); AzPrint::writeln(out, s); } } data_num = m_data->colNum(); m_tran_dense.reset(); m_tran_sparse.reset(); if (doSparse) { m_data->transpose(&m_tran_sparse); } else { m_tran_dense.transpose_from(m_data); } sorted_arr.reset(); feat.reset(m_data->rowNum()); }
virtual void reset_data(const AzOut &out, const AzSmat *m_data, AzParam &p, bool beTight, const AzSvFeatInfo *inp_feat=NULL) { resetParam(p); printParam(out); /*--- count nonzero components ---*/ double nz_ratio; m_data->nonZeroNum(&nz_ratio); AzBytArr s("Training data: "); s.cn(m_data->rowNum());s.c("x");s.cn(m_data->colNum()); s.c(", nonzero_ratio=", nz_ratio, 4); /*--- decide sparse or dense ---*/ AzBytArr s_dp("; managed as dense data"); bool doSparse = false; if (dataproc == dataproc_Auto && nz_ratio < Az_nz_ratio_threshold || dataproc == dataproc_Sparse) { doSparse = true; s_dp.reset("; managed as sparse data"); } if (dataproc != dataproc_Auto) s_dp.concat(" as requested."); else s_dp.concat("."); AzPrint::writeln(out, "-------------"); AzPrint::writeln(out, s, s_dp); AzPrint::writeln(out, "-------------"); /*--- pre-sort data ---*/ m_tran_sparse.reset(); m_tran_dense.unlock(); m_tran_dense.reset(); data_num = m_data->colNum(); if (doSparse) { m_data->transpose(&m_tran_sparse); sorted_arr.reset_sparse(&m_tran_sparse, beTight); } else { m_tran_dense.transpose_from(m_data); sorted_arr.reset_dense(&m_tran_dense, beTight); /* prohibit any action to change the pointers to the column vectors */ m_tran_dense.lock(); } if (inp_feat != NULL) { feat.reset(inp_feat); if (feat.featNum() != m_data->rowNum()) { throw new AzException(AzInputError, "AzDataForTrTree::reset", "#feat mismatch"); } } else { feat.reset(m_data->rowNum()); } }
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_X_seq(const AzIntArr &ia_tokno, int dic_sz, int pch_sz, int pch_step, int padding, bool do_allow_zero, bool do_skip_stopunk, /*--- output ---*/ AzSmat *m_feat, AzIntArr *ia_pos) const /* patch position: may be NULL */ { const char *eyec = "AzPrepText2::gen_X_seq"; AzX::throw_if_null(m_feat, eyec, "m_feat"); AzX::no_support(do_skip_stopunk, eyec, "variable strides with Seq"); int t_num; const int *tokno = ia_tokno.point(&t_num); int pch_num = DIVUP(t_num+padding*2-pch_sz, pch_step) + 1; m_feat->reform(dic_sz*pch_sz, pch_num); if (ia_pos != NULL) ia_pos->reset(); int col = 0; int tx0 = -padding; for (int pch_no = 0; pch_no < pch_num; ++pch_no) { int tx1 = tx0 + pch_sz; AzSmat m; for (int tx = tx0; tx < tx1; ++tx) { AzSmat m0(dic_sz, 1); if (tx >= 0 && tx < t_num && tokno[tx] >= 0) { AzIntArr ia_row; ia_row.put(tokno[tx]); m0.col_u(0)->load(&ia_row, 1); } if (tx == tx0) m.set(&m0); else m.rbind(&m0); } if (do_allow_zero || !m.isZero()) { m_feat->col_u(col)->set(m.col(0)); if (ia_pos != NULL) ia_pos->put(tx0); ++col; } if (tx1 >= t_num+padding) break; tx0 += pch_step; } m_feat->resize(col); }
/*--- For the sparse data format ---*/ inline static void parseDataLine_Sparse(const AzByte *inp, int inp_len, int f_num, const char *data_fn, int line_no, /*--- output ---*/ AzSmat *m_feat, int col) { AzIFarr ifa_ex_val; _parseDataLine_Sparse(inp, inp_len, f_num, data_fn, line_no, ifa_ex_val); m_feat->load(col, &ifa_ex_val); }
/*
 * Return true if the entry at (dx, fx) of the held transposed data is less
 * than or equal to border_val.  The dense copy is consulted when
 * AzSmat::isNull() reports the sparse copy as null; the sparse copy
 * otherwise.
 */
virtual inline bool isLE(int dx, int fx, double border_val) const {
  const bool use_dense = AzSmat::isNull(&m_tran_sparse);
  const double value = use_dense ? m_tran_dense.get(dx, fx)
                                 : m_tran_sparse.get(dx, fx);
  return value <= border_val;
}
/*
 * v_dst += (column `col` of the wrapped matrix) * coeff.
 * The dense matrix md is used when it is set; otherwise the sparse matrix
 * ms.  If neither is set, an AzException is thrown (as a pointer created
 * with `new`).
 */
inline void add_to(AzDvect *v_dst, int col, double coeff) const {
  if (md != NULL) {
    v_dst->add(md->col(col), coeff); /* dst += md[,col]*coeff */
    return;
  }
  if (ms != NULL) {
    v_dst->add(ms->col(col), coeff); /* dst += ms[,col]*coeff */
    return;
  }
  throw new AzException("AzDSmat::add", "No data");
}
/*
 * Number of columns of the wrapped matrix: md is preferred when set,
 * then ms; 0 when neither is set.
 */
inline int colNum() const {
  return (md != NULL) ? md->colNum()
       : (ms != NULL) ? ms->colNum()
       : 0;
}
/*
 * Number of rows of the wrapped matrix: md is preferred when set,
 * then ms; 0 when neither is set.
 */
inline int rowNum() const {
  return (md != NULL) ? md->rowNum()
       : (ms != NULL) ? ms->rowNum()
       : 0;
}
/*-----------------------------------------------*/
/* Normalize m_feat in place by delegating to its normalize(). */
void normalize_feat()
{
  m_feat.normalize(); /* to unit vectors */
}
/* Number of columns of m_feat. */
inline int size() const
{
  return m_feat.colNum();
}
/* Number of features, i.e. the number of rows of m_feat. */
inline int featNum() const
{
  return m_feat.rowNum();
}