/*----------------------------------------------------------------------*/
void AzSvDataS::mergeData(const AzSmat *m_x,
                         const AzSvFeatInfo *feat,
                         const char *fn_template,
                         const char *str,
                         bool doSparse,
                         int digits,
                         const char *out_x_fn,
                         const char *out_n_fn,
                         int num,
                         const char *names[])
{
  const char *eyec = "AzSvDataS::mergeData";

  int data_num = m_x->colNum();
  int f_num = m_x->rowNum();
  if (feat->featNum() != f_num) {
    throw new AzException(eyec, "Conflict btw m_x and featInfo");
  }
  AzFile n_file(out_n_fn);
  n_file.open("wb");
  int fx;
  for (fx = 0; fx < feat->featNum(); ++fx) {
    AzBytArr s; feat->desc(fx, &s); s.nl();
    s.writeText(&n_file);
  }

  AzSmat m;
  m_x->transpose(&m);
  m.resize(data_num, f_num+num);
  AzStrPool sp_names;
  for (fx = 0; fx < num; ++fx) {
    AzBytArr s_fn(fn_template);
    s_fn.replace("*", names[fx]);

    AzDvect v;
    AzSvDataS::readVector(s_fn.c_str(), &v);
    if (v.rowNum() != m.rowNum()) {
      throw new AzException(AzInputError, eyec, "conflict in #data:", s_fn.c_str());
    }
    m.col_u(f_num+fx)->set(&v);

    AzBytArr s_nm;
    if (AzTools::isSpecified(str)) s_nm.c(str);
    s_nm.c(names[fx]); s_nm.nl();
    s_nm.writeText(&n_file);
  }
  n_file.close(true);

  AzSmat m1;
  m.transpose(&m1);
  m1.writeText(out_x_fn, digits, doSparse);
}
Example #2
0
/*-------------------------------------------------------------------------*/
/*
 * Write the dictionary that goes with an X/Y matrix of x_row_num rows.
 *
 *   dic        vocabulary; nothing is done when it is NULL.
 *   x_row_num  number of rows of the matrix; must be a multiple of dic->size().
 *   nm, ext    used to build the output file name via AzBytArr(nm, ext)
 *              (presumably nm followed by ext -- confirm against AzBytArr).
 *
 * When the matrix has more than one vocabulary-sized block of rows
 * (x_row_num / dic->size() > 1), a dictionary produced by gen_seq_dic for
 * that many blocks is written; otherwise dic itself is written.
 *
 * Throws (via AzX::throw_if) when the dictionary is empty or when x_row_num
 * is not a multiple of the dictionary size.
 */
void AzPrepText2::_write_XY_dic(const AzDic *dic, int x_row_num, const char *nm, const char *ext) const
{
  const char *eyec = "AzPrepText2::_write_XY_dic"; 
  if (dic == NULL) return; 

  /* An empty dictionary must be rejected before it is used as a divisor: */
  /* integer division/modulo by zero is undefined behavior.               */
  const int dic_sz = dic->size(); 
  AzX::throw_if((dic_sz <= 0), eyec, "Empty vocabulary"); 

  int pch_sz = x_row_num / dic_sz; 
  AzX::throw_if((x_row_num % dic_sz != 0), eyec, "Conflict in #row and size of vocabulary"); 
  AzBytArr s_fn(nm, ext); 
  if (pch_sz > 1) {
    AzDic seq_dic; 
    gen_seq_dic(dic, pch_sz, &seq_dic); 
    seq_dic.writeText(s_fn.c_str()); 
  }
  else {
    dic->writeText(s_fn.c_str()); 
  }
} 
Example #3
0
/*-------------------------------------------------------------------------*/
/*
 * Generate region data (X) and targets (Y) where Y is derived from
 * internal features stored in a matrix file.
 *
 * For every document (one line of text) in the listed input files:
 *   - X is built from the tokenized text by gen_X_seq or gen_X_bow,
 *     depending on p.s_xtyp;
 *   - one matrix is read from the feature file p.s_feat_fn and turned into
 *     Y by gen_Y_ifeat;
 *   - reduce_xy is applied, and documents left with no X columns are dropped.
 * Y is then optionally binarized or rescaled, and X and Y are written
 * with write_XY.
 *
 * Throws (via AzX) when the vocabulary is empty, when the number of
 * documents differs from the number of matrices in the feature file, or when
 * negative feature values are found while an option that does not support
 * them (top_num_each, top_num_total, scale_y) is active.
 */
void AzPrepText2::gen_regions_parsup(int argc, const char *argv[]) const
{
  const char *eyec = "AzPrepText2::gen_regions_parsup"; 
  AzPrepText2_gen_regions_parsup_Param p(argc, argv, out);   
  check_batch_id(p.s_batch_id); 
  
  AzMats_file<AzSmat> mfile; 
  int feat_data_num = mfile.reset_for_read(p.s_feat_fn.c_str()); 

  AzStrPool sp_typ(10,10); sp_typ.put(kw_bow, kw_seq); 
  AzXi::check_input(p.s_xtyp.c_str(), &sp_typ, eyec, kw_xtyp);  
  bool do_xseq = p.s_xtyp.equals(kw_seq); 
  bool do_skip_stopunk = (do_xseq) ? false : true; 

  AzDic dic(p.s_xdic_fn.c_str()); 
  AzX::throw_if((dic.size() <= 0), AzInputError, eyec, "No vocabulary"); 
  
  /*---  scan files to determine buffer size and #data  ---*/
  AzOut noout; 
  AzStrPool sp_list; 
  AzIntArr ia_data_num; 
  int buff_size = AzTools_text::scan_files_in_list(p.s_inp_fn.c_str(), p.s_txt_ext.c_str(), 
                                                   noout, &sp_list, &ia_data_num);   
  int data_num = ia_data_num.sum(); 
  AzX::throw_if ((data_num != feat_data_num), eyec, "#data mismatch"); 
  
  /*---  read data and generate features  ---*/
  AzDataArr<AzSmat> am_x(data_num), am_y(data_num); 
  
  buff_size += 256; 
  AzBytArr s_buff; 
  AzByte *buff = s_buff.reset(buff_size, 0); 
  /* data_no: #documents kept so far (also the next slot in am_x/am_y); */
  /* no_data: #documents dropped because they produced no X columns.    */
  int no_data = 0, data_no = 0, cnum = 0, cnum_before_reduce = 0; 
  feat_info fi[2]; /* shown below as fi[0]="all", fi[1]="top" */
  for (int fx = 0; fx < sp_list.size(); ++fx) { /* for each file */
    AzBytArr s_fn(sp_list.c_str(fx), p.s_txt_ext.c_str()); 
    const char *fn = s_fn.c_str(); 
    AzTimeLog::print(fn, log_out);   
    AzFile file(fn); 
    file.open("rb"); 
    int num_in_file = ia_data_num.get(fx); 
    int inc = num_in_file / 50, milestone = inc; 
    int dx = 0; 
    for ( ; ; ++dx) {  /* for each doc */
      AzTools::check_milestone(milestone, dx, inc); 
      int len = file.gets(buff, buff_size); 
      if (len <= 0) break; 
     
      /*---  X  ---*/
      AzBytArr s_data(buff, len); 
      int my_len = s_data.length();
      AzIntArr ia_tokno;        
      int nn = 1; 
      AzTools_text::tokenize(s_data.point_u(), my_len, &dic, nn, p.do_lower, p.do_utf8dashes, &ia_tokno);        
      
      AzIntArr ia_pos; 
      bool do_allow_zero = false;       
      if (do_xseq) gen_X_seq(ia_tokno, dic.size(), p.pch_sz, p.pch_step, p.padding, 
                             do_allow_zero, do_skip_stopunk, am_x.point_u(data_no), &ia_pos);       
      else         gen_X_bow(ia_tokno, dic.size(), p.pch_sz, p.pch_step, p.padding, 
                             do_skip_stopunk, am_x.point_u(data_no), &ia_pos); 
      /* Read before the skip below so that every document consumes */
      /* exactly one matrix from the feature file.                  */
      AzSmat m_feat; 
      mfile.read(&m_feat);      
      if (am_x.point(data_no)->colNum() <= 0) {
        ++no_data; 
        continue; 
      }
      if (p.top_num_each > 0 || p.top_num_total > 0 || p.scale_y > 0) {
        double min_ifeat = m_feat.min(); 
        AzX::no_support((min_ifeat < 0), eyec, "Negative values for internal-feature components."); 
      }
 
      /*---  Y (ifeat: internal features generated by a supervised model) ---*/ 
      gen_Y_ifeat(p.top_num_each, p.top_num_total, &m_feat, &ia_tokno, &ia_pos, 
                  p.pch_sz, -p.dist, p.dist, p.do_nolr, 
                  p.f_pch_sz, p.f_pch_step, p.f_padding, 
                  am_y.point_u(data_no), fi); 
      if (p.min_yval > 0) {
        am_y.point_u(data_no)->cut(p.min_yval); 
      }                                         
      cnum_before_reduce += am_x.point(data_no)->colNum(); 
      reduce_xy(p.min_x, p.min_y, am_x.point_u(data_no), am_y.point_u(data_no));              
      if (am_x.point(data_no)->colNum() <= 0) {
        /* data_no is not advanced, so this slot is reused by the next document */
        ++no_data; 
        continue; 
      }
      cnum += am_x.point(data_no)->colNum(); 
      ++data_no;         
    } /* for each doc */
    AzTools::finish_milestone(milestone); 
    AzBytArr s("   #data="); s << data_no << " no_data=" << no_data << " #col=" << cnum; 
    AzPrint::writeln(out, s); 
  } /* for each file */
  mfile.done();   

  AzBytArr s("#data="); s << data_no << " no_data=" << no_data << " #col=" << cnum << " #col_all=" << cnum_before_reduce;        
  AzPrint::writeln(out, s); 
  s.reset("all:"); fi[0].show(s); AzPrint::writeln(out, s); 
  s.reset("top:"); fi[1].show(s); AzPrint::writeln(out, s); 

  if (p.do_binarize) {
    AzTimeLog::print("Binarizing Y ... ", log_out); 
    for (int dx = 0; dx < data_no; ++dx) am_y(dx)->binarize(); /* (x>0) ? 1 : (x<0) ? -1 : 0 */
  }
  else if (p.scale_y > 0) {
    /* Find a power-of-two factor that brings max_top*scale into [scale_y, 2*scale_y]. */
    double max_top = fi[1].max_val; 
    double scale = 1; 
    /* The search only makes sense for a positive maximum.  With max_top <= 0  */
    /* the doubling loop can never reach scale_y and would never terminate,    */
    /* so Y is left unscaled (scale=1) in that case.                           */
    if (max_top > 0) {
      if (max_top < p.scale_y) for ( ; ; scale *= 2) if (max_top*scale >= p.scale_y) break; 
      if (max_top > p.scale_y*2) for ( ; ; scale /= 2) if (max_top*scale <= p.scale_y*2) break; 
    }
    s.reset("Multiplying Y with "); s << scale; AzPrint::writeln(out, s); 
    for (int dx = 0; dx < data_no; ++dx) am_y(dx)->multiply(scale); 
  }  
  
  const char *outnm = p.s_rnm.c_str(); 
  AzTimeLog::print("Generating X ... ", out);  
  write_XY(am_x, data_no, p.s_batch_id, outnm, p.s_x_ext.c_str(), &dic, xtext_ext); 
  AzTimeLog::print("Generating Y ... ", out);  
  write_XY(am_y, data_no, p.s_batch_id, outnm, p.s_y_ext.c_str());   
}
Example #4
0
/* Note: X and Y use different dictionaries */
void AzPrepText2::gen_regions_unsup(int argc, const char *argv[]) const
{
  const char *eyec = "AzPrepText2::gen_regions_unsup"; 
  AzPrepText2_gen_regions_unsup_Param p(argc, argv, out);   
  check_batch_id(p.s_batch_id);     
 
  bool do_xseq = p.s_xtyp.equals(kw_seq); 
  bool do_skip_stopunk = (do_xseq) ? false : true; 
  
  AzDic ydic(p.s_ydic_fn.c_str()); 
  int ydic_nn = ydic.get_max_n(); 
  AzPrint::writeln(out, "y dic n=", ydic_nn); 
  AzX::throw_if((ydic.size() <= 0), AzInputError, eyec, "No Y (target) vocabulary."); 
  
  AzDic xdic(p.s_xdic_fn.c_str()); 
  int xdic_nn = xdic.get_max_n(); 
  AzPrint::writeln(out, "x dic n=", xdic_nn);   
  AzX::throw_if((xdic.size() <= 0), AzInputError, eyec, "No vocabulary.");   
  AzX::no_support((xdic_nn > 1 && do_xseq), eyec, "X with multi-word vocabulary and Seq option");    

  /*---  scan files to determine buffer size and #data  ---*/
  AzOut noout; 
  AzStrPool sp_list; 
  AzIntArr ia_data_num; 
  int buff_size = AzTools_text::scan_files_in_list(p.s_inp_fn.c_str(), p.s_txt_ext.c_str(), 
                                                   noout, &sp_list, &ia_data_num);   
  int data_num = ia_data_num.sum(); 
  
  /*---  read data and generate features  ---*/
  AzDataArr<AzSmat> am_x(data_num), am_y(data_num); 
  
  buff_size += 256; 
  AzBytArr s_buff; 
  AzByte *buff = s_buff.reset(buff_size, 0); 
  int no_data = 0, data_no = 0, cnum = 0, cnum_before_reduce = 0; 
  int l_dist = -p.dist, r_dist = p.dist; 

  AzIntArr ia_xnn; for (int ix = 1; ix <= xdic_nn; ++ix) ia_xnn.put(ix); 
  AzIntArr ia_ynn; for (int ix = 1; ix <= ydic_nn; ++ix) ia_ynn.put(ix); 
  
  for (int fx = 0; fx < sp_list.size(); ++fx) { /* for each file */
    AzBytArr s_fn(sp_list.c_str(fx), p.s_txt_ext.c_str()); 
    const char *fn = s_fn.c_str(); 
    AzTimeLog::print(fn, out);   
    AzFile file(fn); 
    file.open("rb"); 
    int num_in_file = ia_data_num.get(fx); 
    int inc = num_in_file / 50, milestone = inc; 
    int dx = 0; 
    for ( ; ; ++dx) {  /* for each doc */
      AzTools::check_milestone(milestone, dx, inc); 
      int len = file.gets(buff, buff_size); 
      if (len <= 0) break; 
      
      /*---  X  ---*/
      AzIntArr ia_pos;    
      AzBytArr s_data(buff, len); 
      int my_len = s_data.length();
      
      bool do_allow_zero = false; 
      if (p.do_no_skip) {
        do_allow_zero = true; 
        do_skip_stopunk = false; 
      }
      int xtok_num = 0; 
      if (xdic_nn > 1) { /* n-grams */
        AzDataArr<AzIntArr> aia_xtokno; 
        AzTools_text::tokenize(s_data.point_u(), my_len, &xdic, ia_xnn, p.do_lower, p.do_utf8dashes, aia_xtokno);  
        if (aia_xtokno.size() > 0) xtok_num = aia_xtokno[0]->size();        
        gen_X_ngram_bow(ia_xnn, aia_xtokno, xdic.size(), p.pch_sz, p.pch_step, p.padding, do_allow_zero, 
                        am_x.point_u(data_no), &ia_pos); 
      }
      else { /* words */
        AzIntArr ia_xtokno;        
        int nn = 1; 
        AzTools_text::tokenize(s_data.point_u(), my_len, &xdic, nn, p.do_lower, p.do_utf8dashes, &ia_xtokno);         
        xtok_num = ia_xtokno.size(); 
        if (do_xseq) gen_X_seq(ia_xtokno, xdic.size(), p.pch_sz, p.pch_step, p.padding, 
                               do_allow_zero, do_skip_stopunk, am_x.point_u(data_no), &ia_pos);       
        else         gen_X_bow(ia_xtokno, xdic.size(), p.pch_sz, p.pch_step, p.padding, 
                               do_skip_stopunk, am_x.point_u(data_no), &ia_pos); 
      }
      if (am_x.point(data_no)->colNum() <= 0) {
        ++no_data; 
        continue; 
      }
      
      /*---  Y  ---*/
      s_data.reset(buff, len); 
      my_len = s_data.length();        

      if (ydic_nn > 1) { /* n-grams */
        AzDataArr<AzIntArr> aia_ytokno; 
        AzTools_text::tokenize(s_data.point_u(), my_len, &ydic, ia_ynn, p.do_lower, p.do_utf8dashes, aia_ytokno);  
        int ytok_num = (aia_ytokno.size() > 0) ? aia_ytokno[0]->size() : 0; 
        AzX::throw_if((xtok_num != ytok_num), eyec, "conflict in the numbers of X tokens and Y tokens"); 
        gen_Y_ngram_bow(ia_ynn, aia_ytokno, ydic.size(), ia_pos, 
                        p.pch_sz, l_dist, r_dist, p.do_nolr, am_y.point_u(data_no));   
      }
      else { /* words */
        int nn = 1; 
        AzIntArr ia_ytokno; 
        AzTools_text::tokenize(s_data.point_u(), my_len, &ydic, nn, p.do_lower, p.do_utf8dashes, &ia_ytokno);  
        AzX::throw_if((xtok_num != ia_ytokno.size()), eyec, "conflict in the numbers of X tokens and Y tokens"); 
        if (p.do_nolr) gen_Y_nolr(ia_ytokno, ydic.size(), ia_pos, 
                                  p.pch_sz, l_dist, r_dist, am_y.point_u(data_no));  
        else           gen_Y(ia_ytokno, ydic.size(), ia_pos, 
                             p.pch_sz, l_dist, r_dist, am_y.point_u(data_no));      
      }

      cnum_before_reduce += am_x.point(data_no)->colNum(); 
      reduce_xy(p.min_x, p.min_y, am_x.point_u(data_no), am_y.point_u(data_no)); 
      if (am_x.point(data_no)->colNum() <= 0) {
        ++no_data; 
        continue; 
      }
      cnum += am_x.point(data_no)->colNum(); 
      ++data_no;         
    } /* for each doc */
    AzTools::finish_milestone(milestone); 
    AzBytArr s("   #data="); s << data_no << " no_data=" << no_data << " #col=" << cnum; AzPrint::writeln(out, s); 
  } /* for each file */
  AzBytArr s("#data="); s << data_no << " no_data=" << no_data << " #col=" << cnum << " #col_all=" << cnum_before_reduce;
  AzPrint::writeln(out, s);   
    
  const char *outnm = p.s_rnm.c_str(); 
  AzTimeLog::print("Generating X ... ", out);  
  write_XY(am_x, data_no, p.s_batch_id, outnm, p.s_x_ext.c_str(), &xdic, xtext_ext); 
  AzTimeLog::print("Generating Y ... ", out);  
  write_XY(am_y, data_no, p.s_batch_id, outnm, p.s_y_ext.c_str(), &ydic, ytext_ext);   

  AzTimeLog::print("Done ... ", out); 
}