/*----------------------------------------------------------------------*/ void AzSvDataS::mergeData(const AzSmat *m_x, const AzSvFeatInfo *feat, const char *fn_template, const char *str, bool doSparse, int digits, const char *out_x_fn, const char *out_n_fn, int num, const char *names[]) { const char *eyec = "AzSvDataS::mergeData"; int data_num = m_x->colNum(); int f_num = m_x->rowNum(); if (feat->featNum() != f_num) { throw new AzException(eyec, "Conflict btw m_x and featInfo"); } AzFile n_file(out_n_fn); n_file.open("wb"); int fx; for (fx = 0; fx < feat->featNum(); ++fx) { AzBytArr s; feat->desc(fx, &s); s.nl(); s.writeText(&n_file); } AzSmat m; m_x->transpose(&m); m.resize(data_num, f_num+num); AzStrPool sp_names; for (fx = 0; fx < num; ++fx) { AzBytArr s_fn(fn_template); s_fn.replace("*", names[fx]); AzDvect v; AzSvDataS::readVector(s_fn.c_str(), &v); if (v.rowNum() != m.rowNum()) { throw new AzException(AzInputError, eyec, "conflict in #data:", s_fn.c_str()); } m.col_u(f_num+fx)->set(&v); AzBytArr s_nm; if (AzTools::isSpecified(str)) s_nm.c(str); s_nm.c(names[fx]); s_nm.nl(); s_nm.writeText(&n_file); } n_file.close(true); AzSmat m1; m.transpose(&m1); m1.writeText(out_x_fn, digits, doSparse); }
/*-------------------------------------------------------------------------*/ void AzPrepText2::_write_XY_dic(const AzDic *dic, int x_row_num, const char *nm, const char *ext) const { const char *eyec = "AzPrepText2::_write_XY_dic"; if (dic == NULL) return; int pch_sz = x_row_num / dic->size(); AzX::throw_if((x_row_num % dic->size() != 0), eyec, "Conflict in #row and size of vocabulary"); AzBytArr s_fn(nm, ext); if (pch_sz > 1) { AzDic seq_dic; gen_seq_dic(dic, pch_sz, &seq_dic); seq_dic.writeText(s_fn.c_str()); } else { dic->writeText(s_fn.c_str()); } }
/*-------------------------------------------------------------------------*/ void AzPrepText2::gen_regions_parsup(int argc, const char *argv[]) const { const char *eyec = "AzPrepText2::gen_regions_parsup"; AzPrepText2_gen_regions_parsup_Param p(argc, argv, out); check_batch_id(p.s_batch_id); AzMats_file<AzSmat> mfile; int feat_data_num = mfile.reset_for_read(p.s_feat_fn.c_str()); AzStrPool sp_typ(10,10); sp_typ.put(kw_bow, kw_seq); AzXi::check_input(p.s_xtyp.c_str(), &sp_typ, eyec, kw_xtyp); bool do_xseq = p.s_xtyp.equals(kw_seq); bool do_skip_stopunk = (do_xseq) ? false : true; AzDic dic(p.s_xdic_fn.c_str()); AzX::throw_if((dic.size() <= 0), AzInputError, eyec, "No vocabulary"); /*--- scan files to determine buffer size and #data ---*/ AzOut noout; AzStrPool sp_list; AzIntArr ia_data_num; int buff_size = AzTools_text::scan_files_in_list(p.s_inp_fn.c_str(), p.s_txt_ext.c_str(), noout, &sp_list, &ia_data_num); int data_num = ia_data_num.sum(); AzX::throw_if ((data_num != feat_data_num), eyec, "#data mismatch"); /*--- read data and generate features ---*/ AzDataArr<AzSmat> am_x(data_num), am_y(data_num); buff_size += 256; AzBytArr s_buff; AzByte *buff = s_buff.reset(buff_size, 0); int no_data = 0, data_no = 0, cnum = 0, cnum_before_reduce = 0; feat_info fi[2]; for (int fx = 0; fx < sp_list.size(); ++fx) { /* for each file */ AzBytArr s_fn(sp_list.c_str(fx), p.s_txt_ext.c_str()); const char *fn = s_fn.c_str(); AzTimeLog::print(fn, log_out); AzFile file(fn); file.open("rb"); int num_in_file = ia_data_num.get(fx); int inc = num_in_file / 50, milestone = inc; int dx = 0; for ( ; ; ++dx) { /* for each doc */ AzTools::check_milestone(milestone, dx, inc); int len = file.gets(buff, buff_size); if (len <= 0) break; /*--- X ---*/ AzBytArr s_data(buff, len); int my_len = s_data.length(); AzIntArr ia_tokno; int nn = 1; AzTools_text::tokenize(s_data.point_u(), my_len, &dic, nn, p.do_lower, p.do_utf8dashes, &ia_tokno); AzIntArr ia_pos; bool do_allow_zero = false; if (do_xseq) gen_X_seq(ia_tokno, dic.size(), p.pch_sz, p.pch_step, p.padding, do_allow_zero, do_skip_stopunk, am_x.point_u(data_no), &ia_pos); else gen_X_bow(ia_tokno, dic.size(), p.pch_sz, p.pch_step, p.padding, do_skip_stopunk, am_x.point_u(data_no), &ia_pos); AzSmat m_feat; mfile.read(&m_feat); if (am_x.point(data_no)->colNum() <= 0) { ++no_data; continue; } if (p.top_num_each > 0 || p.top_num_total > 0 || p.scale_y > 0) { double min_ifeat = m_feat.min(); AzX::no_support((min_ifeat < 0), eyec, "Negative values for internal-feature components."); } /*--- Y (ifeat: internal features generated by a supervised model) ---*/ gen_Y_ifeat(p.top_num_each, p.top_num_total, &m_feat, &ia_tokno, &ia_pos, p.pch_sz, -p.dist, p.dist, p.do_nolr, p.f_pch_sz, p.f_pch_step, p.f_padding, am_y.point_u(data_no), fi); if (p.min_yval > 0) { am_y.point_u(data_no)->cut(p.min_yval); } cnum_before_reduce += am_x.point(data_no)->colNum(); reduce_xy(p.min_x, p.min_y, am_x.point_u(data_no), am_y.point_u(data_no)); if (am_x.point(data_no)->colNum() <= 0) { ++no_data; continue; } cnum += am_x.point(data_no)->colNum(); ++data_no; } /* for each doc */ AzTools::finish_milestone(milestone); AzBytArr s(" #data="); s << data_no << " no_data=" << no_data << " #col=" << cnum; AzPrint::writeln(out, s); } /* for each file */ mfile.done(); AzBytArr s("#data="); s << data_no << " no_data=" << no_data << " #col=" << cnum << " #col_all=" << cnum_before_reduce; AzPrint::writeln(out, s); s.reset("all:"); fi[0].show(s); AzPrint::writeln(out, s); s.reset("top:"); fi[1].show(s); AzPrint::writeln(out, s); if (p.do_binarize) { AzTimeLog::print("Binarizing Y ... ", log_out); for (int dx = 0; dx < data_no; ++dx) am_y(dx)->binarize(); /* (x>0) ? 1 : (x<0) ? -1 : 0 */ } else if (p.scale_y > 0) { double max_top = fi[1].max_val; double scale = 1; if (max_top < p.scale_y) for ( ; ; scale *= 2) if (max_top*scale >= p.scale_y) break; if (max_top > p.scale_y*2) for ( ; ; scale /= 2) if (max_top*scale <= p.scale_y*2) break; s.reset("Multiplying Y with "); s << scale; AzPrint::writeln(out, s); for (int dx = 0; dx < data_no; ++dx) am_y(dx)->multiply(scale); } const char *outnm = p.s_rnm.c_str(); AzTimeLog::print("Generating X ... ", out); write_XY(am_x, data_no, p.s_batch_id, outnm, p.s_x_ext.c_str(), &dic, xtext_ext); AzTimeLog::print("Generating Y ... ", out); write_XY(am_y, data_no, p.s_batch_id, outnm, p.s_y_ext.c_str()); }
/* Note: X and Y use different dictionaries */ void AzPrepText2::gen_regions_unsup(int argc, const char *argv[]) const { const char *eyec = "AzPrepText2::gen_regions_unsup"; AzPrepText2_gen_regions_unsup_Param p(argc, argv, out); check_batch_id(p.s_batch_id); bool do_xseq = p.s_xtyp.equals(kw_seq); bool do_skip_stopunk = (do_xseq) ? false : true; AzDic ydic(p.s_ydic_fn.c_str()); int ydic_nn = ydic.get_max_n(); AzPrint::writeln(out, "y dic n=", ydic_nn); AzX::throw_if((ydic.size() <= 0), AzInputError, eyec, "No Y (target) vocabulary."); AzDic xdic(p.s_xdic_fn.c_str()); int xdic_nn = xdic.get_max_n(); AzPrint::writeln(out, "x dic n=", xdic_nn); AzX::throw_if((xdic.size() <= 0), AzInputError, eyec, "No vocabulary."); AzX::no_support((xdic_nn > 1 && do_xseq), eyec, "X with multi-word vocabulary and Seq option"); /*--- scan files to determine buffer size and #data ---*/ AzOut noout; AzStrPool sp_list; AzIntArr ia_data_num; int buff_size = AzTools_text::scan_files_in_list(p.s_inp_fn.c_str(), p.s_txt_ext.c_str(), noout, &sp_list, &ia_data_num); int data_num = ia_data_num.sum(); /*--- read data and generate features ---*/ AzDataArr<AzSmat> am_x(data_num), am_y(data_num); buff_size += 256; AzBytArr s_buff; AzByte *buff = s_buff.reset(buff_size, 0); int no_data = 0, data_no = 0, cnum = 0, cnum_before_reduce = 0; int l_dist = -p.dist, r_dist = p.dist; AzIntArr ia_xnn; for (int ix = 1; ix <= xdic_nn; ++ix) ia_xnn.put(ix); AzIntArr ia_ynn; for (int ix = 1; ix <= ydic_nn; ++ix) ia_ynn.put(ix); for (int fx = 0; fx < sp_list.size(); ++fx) { /* for each file */ AzBytArr s_fn(sp_list.c_str(fx), p.s_txt_ext.c_str()); const char *fn = s_fn.c_str(); AzTimeLog::print(fn, out); AzFile file(fn); file.open("rb"); int num_in_file = ia_data_num.get(fx); int inc = num_in_file / 50, milestone = inc; int dx = 0; for ( ; ; ++dx) { /* for each doc */ AzTools::check_milestone(milestone, dx, inc); int len = file.gets(buff, buff_size); if (len <= 0) break; /*--- X ---*/ AzIntArr ia_pos; AzBytArr s_data(buff, len); int my_len = s_data.length(); bool do_allow_zero = false; if (p.do_no_skip) { do_allow_zero = true; do_skip_stopunk = false; } int xtok_num = 0; if (xdic_nn > 1) { /* n-grams */ AzDataArr<AzIntArr> aia_xtokno; AzTools_text::tokenize(s_data.point_u(), my_len, &xdic, ia_xnn, p.do_lower, p.do_utf8dashes, aia_xtokno); if (aia_xtokno.size() > 0) xtok_num = aia_xtokno[0]->size(); gen_X_ngram_bow(ia_xnn, aia_xtokno, xdic.size(), p.pch_sz, p.pch_step, p.padding, do_allow_zero, am_x.point_u(data_no), &ia_pos); } else { /* words */ AzIntArr ia_xtokno; int nn = 1; AzTools_text::tokenize(s_data.point_u(), my_len, &xdic, nn, p.do_lower, p.do_utf8dashes, &ia_xtokno); xtok_num = ia_xtokno.size(); if (do_xseq) gen_X_seq(ia_xtokno, xdic.size(), p.pch_sz, p.pch_step, p.padding, do_allow_zero, do_skip_stopunk, am_x.point_u(data_no), &ia_pos); else gen_X_bow(ia_xtokno, xdic.size(), p.pch_sz, p.pch_step, p.padding, do_skip_stopunk, am_x.point_u(data_no), &ia_pos); } if (am_x.point(data_no)->colNum() <= 0) { ++no_data; continue; } /*--- Y ---*/ s_data.reset(buff, len); my_len = s_data.length(); if (ydic_nn > 1) { /* n-grams */ AzDataArr<AzIntArr> aia_ytokno; AzTools_text::tokenize(s_data.point_u(), my_len, &ydic, ia_ynn, p.do_lower, p.do_utf8dashes, aia_ytokno); int ytok_num = (aia_ytokno.size() > 0) ? aia_ytokno[0]->size() : 0; AzX::throw_if((xtok_num != ytok_num), eyec, "conflict in the numbers of X tokens and Y tokens"); gen_Y_ngram_bow(ia_ynn, aia_ytokno, ydic.size(), ia_pos, p.pch_sz, l_dist, r_dist, p.do_nolr, am_y.point_u(data_no)); } else { /* words */ int nn = 1; AzIntArr ia_ytokno; AzTools_text::tokenize(s_data.point_u(), my_len, &ydic, nn, p.do_lower, p.do_utf8dashes, &ia_ytokno); AzX::throw_if((xtok_num != ia_ytokno.size()), eyec, "conflict in the numbers of X tokens and Y tokens"); if (p.do_nolr) gen_Y_nolr(ia_ytokno, ydic.size(), ia_pos, p.pch_sz, l_dist, r_dist, am_y.point_u(data_no)); else gen_Y(ia_ytokno, ydic.size(), ia_pos, p.pch_sz, l_dist, r_dist, am_y.point_u(data_no)); } cnum_before_reduce += am_x.point(data_no)->colNum(); reduce_xy(p.min_x, p.min_y, am_x.point_u(data_no), am_y.point_u(data_no)); if (am_x.point(data_no)->colNum() <= 0) { ++no_data; continue; } cnum += am_x.point(data_no)->colNum(); ++data_no; } /* for each doc */ AzTools::finish_milestone(milestone); AzBytArr s(" #data="); s << data_no << " no_data=" << no_data << " #col=" << cnum; AzPrint::writeln(out, s); } /* for each file */ AzBytArr s("#data="); s << data_no << " no_data=" << no_data << " #col=" << cnum << " #col_all=" << cnum_before_reduce; AzPrint::writeln(out, s); const char *outnm = p.s_rnm.c_str(); AzTimeLog::print("Generating X ... ", out); write_XY(am_x, data_no, p.s_batch_id, outnm, p.s_x_ext.c_str(), &xdic, xtext_ext); AzTimeLog::print("Generating Y ... ", out); write_XY(am_y, data_no, p.s_batch_id, outnm, p.s_y_ext.c_str(), &ydic, ytext_ext); AzTimeLog::print("Done ... ", out); }