Esempio n. 1
0
/*-------------------------------------------------------------------------*/
/*
 * Rewrites `data` in place, replacing selected 3-byte UTF-8 punctuation
 * sequences (0xE2 0x80 0x??) with ASCII text:
 *   0xe28093: en dash (often used as in 1900-1935)
 *               -> [-] between two ASCII digits; [ - ] otherwise
 *   0xe28094: em dash (long dash)    -> [ - ]
 *   0xe2809c: double quote begin     -> [ " ]
 *   0xe2809d: double quote end       -> [ " ]
 *   0xe28098: single quote begin     -> [ ' ]
 *   0xe28099: single quote end       -> [ '] if followed by "s "; [ ' ] otherwise
 * Any other 3 bytes starting with 0xE2 are copied through unchanged, and a
 * 0xE2 found within the last two bytes of the input is copied as is.
 *
 * Every replacement is at most 3 bytes long, so the converted text never
 * exceeds `len` bytes and always fits back into `data`.
 *
 * data: buffer holding `len` bytes; overwritten with the converted text.
 * len:  number of bytes in `data`.
 * Returns the number of converted bytes now stored at the start of `data`
 * (0 for NULL or empty input).  No terminating '\0' is written to `data`.
 */
int AzTools_text::replace_utf8dashes(AzByte *data, int len) /* inout */
{
  const char *eyec = "AzTools_text::replace_utf8dashes"; 
  if (data == NULL || len <= 0) return 0; /* nothing to convert */

  AzBytArr s_mydata;
  AzByte *mydata = s_mydata.reset(len*2, 0); /* scratch output buffer */
  AzByte *mywp = mydata;  

  const AzByte *data_end = data+len, *wp = data; 
  while (wp < data_end) {
    const AzByte *ptr = (const AzByte *)memchr(wp, 0xE2, data_end-wp); 
    if (ptr == NULL) {
      ptr = data_end; 
    }

    int mvlen = Az64::ptr_diff(ptr-wp, eyec); 
    if (mvlen > 0) {
      memcpy(mywp, wp, mvlen);  mywp += mvlen; /* string before 0xE2 */
    }

    /* compare lengths rather than forming pointers beyond the buffer */
    int remain = Az64::ptr_diff(data_end-ptr, eyec); 
    if (remain < 3) { 
      /* too short to be a 3-byte sequence: copy the tail as is and finish */
      if (remain > 0) {
        memcpy(mywp, ptr, remain);  mywp += remain; 
      }
      wp = data_end; 
      continue; 
    }

    AzByte prevch = (ptr > data) ? *(ptr-1) : 0; 
    AzByte nextch = (remain > 3) ? *(ptr+3) : 0; 
    const char *rep = NULL; /* NULL: keep the original 3 bytes */
    if (*(ptr+1) == 0x80) {
      switch (*(ptr+2)) {
      case 0x93: /* en dash */
        if (prevch>='0' && prevch<='9' && nextch>='0' && nextch<='9') rep = "-";  /* between digits */
        else                                                          rep = " - "; 
        break; 
      case 0x94: /* em dash */
        rep = " - "; 
        break; 
      case 0x98: /* single quote begin */
        rep = " ' "; 
        break; 
      case 0x9c: /* double quote begin */
      case 0x9d: /* double quote end */
        rep = " \" "; 
        break; 
      case 0x99: /* single quote end */
        if (remain >= 5 && memcmp(ptr+3, "s ", 2)==0) rep = " '"; 
        else                                          rep = " ' "; 
        break; 
      default: 
        break; 
      }
    }

    if (rep != NULL) {
      int rep_len = (int)strlen(rep); 
      memcpy(mywp, rep, rep_len);  mywp += rep_len; 
    }
    else {
      memcpy(mywp, ptr, 3);  mywp += 3; 
    }
    wp = ptr+3; 
  }

  int mydata_len = Az64::ptr_diff(mywp-mydata); 
  memcpy(data, mydata, mydata_len); 
  return mydata_len; 
}
Esempio n. 2
0
 inline virtual void throw_if_missing(const char *kw, const AzBytArr &s_val,
                                 const char *eyec) const {
   if (s_val.length() <= 0) {
     AzBytArr s_kw; s_kw.inQuotes(kw, "\"");
     throw new AzException(AzInputMissing, eyec, s_kw.c_str(), "is missing");
   }
 }
/*------------------------------------------------------------------*/
/*
 * Checks whether s_line is a sparse-data header, i.e. its first token
 * (as split by AzTools::getStrings) equals str.
 *
 * s_line:         the line to examine.
 * expected_f_num: if positive, the feature count the header must agree with.
 * str:            the keyword that marks a sparse header.
 *
 * Returns -1 when the first token is not str (or there are no tokens);
 * otherwise the value parsed from the second token with atol.
 * Throws AzException (AzInputError) when the header keyword is present but
 * the parsed count is missing or not positive, or when it conflicts with a
 * positive expected_f_num.
 */
int AzSvDataS::if_sparse(AzBytArr &s_line,
                         int expected_f_num,
                         const char *str)
{
  const char *eyec = "AzSvDataS::if_sparse";

  AzBytArr s_keyword(str);
  AzStrPool sp_tokens;
  AzTools::getStrings(s_line.point(), s_line.length(), &sp_tokens);

  /* not a sparse header: nothing to validate */
  if (sp_tokens.size() <= 0 || s_keyword.compare(sp_tokens.c_str(0)) != 0) {
    return -1;
  }

  int f_num = -1;
  if (sp_tokens.size() >= 2) {
    f_num = atol(sp_tokens.c_str(1));
  }

  if (f_num <= 0) {
    throw new AzException(AzInputError, eyec,
          "1st line of sparse data file must be \"sparse dd\" where dd is the feature dimensionality.");
  }

  const bool conflicts = (expected_f_num > 0 && f_num != expected_f_num);
  if (conflicts) {
    throw new AzException(AzInputError, eyec,
          "Conflict in feature dim: feature definition file vs. data file.");
  }
  return f_num;
}
Esempio n. 4
0
/*-------------------------------------------------------------------------*/
/*
 * Validates a batch id of the form "<batch#>of<#batches>", e.g. "1of5".
 *
 * An empty s_batch_id is accepted without any check.  Otherwise a failure
 * is reported through AzX::throw_if with AzInputError when:
 *   - the id does not contain "of";
 *   - any character other than the two of the first "of" is not an ASCII digit;
 *   - the batch# (parsed with atol) is less than 1 or exceeds the number of
 *     batches (parsed with atol from the text after "of").
 */
void AzPrepText2::check_batch_id(const AzBytArr &s_batch_id) const
{
  /* eyecatcher names this class so that error reports point here */
  const char *eyec = "AzPrepText2::check_batch_id"; 
  if (s_batch_id.length() <= 0) return; 

  AzBytArr s(kw_batch_id); s << " should look like \"1of5\""; 
  const char *batch_id = s_batch_id.c_str(); 
  const char *of_str = strstr(batch_id, "of"); 
  AzX::throw_if((of_str == NULL), AzInputError, eyec, s.c_str()); 

  /* everything except the "of" separator itself must be a digit */
  const char *batch_id_end = batch_id + s_batch_id.length(); 
  for (const char *wp = batch_id; wp < batch_id_end; ++wp) {
    if (wp >= of_str && wp < of_str+2) continue; 
    AzX::throw_if((*wp < '0' || *wp > '9'), AzInputError, eyec, s.c_str());  
  }

  int batch_no = atol(batch_id); 
  int batch_num = atol(of_str+2); 
  AzX::throw_if((batch_no < 1 || batch_no > batch_num), AzInputError, eyec, 
                s.c_str(), " batch# must start with 1 and must not exceed the number of batches. "); 
} 
Esempio n. 5
0
 inline void vLoss(const char *kw, AzLossType *out_loss) {
   if (param == NULL) return; 
   AzBytArr s; 
   vStr(kw, &s); 
   if (s.length() > 0) {
     *out_loss = AzLoss::lossType(s.c_str()); 
   }
   if (doCheck) sp_used_kw.put(kw); 
 }
 /*
  * Prints the data-processing parameter (kw_dataproc / s_dataproc) to out
  * inside a "Data processing" section.  Nothing is printed when out is
  * null or s_dataproc is empty.
  */
 virtual void printParam(const AzOut &out) const {
   if (out.isNull()) return; 

   AzPrint o(out); 
   const bool nothing_to_show = (s_dataproc.length() <= 0); 
   if (nothing_to_show) return; 

   o.ppBegin("AzDataForTrTree", "Data processing"); 
   o.printV(kw_dataproc, s_dataproc); 
   o.ppEnd(); 
 }
Esempio n. 7
0
 /*
  * Reads this component's parameters from azp under the prefix pfx.
  * The activation type (first byte of the kw_activ_typ value, when one is
  * given) and kw_trunc are read only when is_warmstart is false; the
  * kw_do_stat switch is read in either case.  The prefix is reset before
  * returning.
  */
 virtual void resetParam(AzParam &azp, const char *pfx, bool is_warmstart) {
   azp.reset_prefix(pfx); 

   const bool is_fresh_start = !is_warmstart; 
   if (is_fresh_start) {
     azp.vStr(kw_activ_typ, &s_activ_typ); 
     const bool has_typ = (s_activ_typ.length() > 0); 
     if (has_typ) {
       typ = s_activ_typ.point()[0]; 
     }
     azp.vFloat(kw_trunc, &trunc); 
   }

   azp.swOn(&do_stat, kw_do_stat);       
   azp.reset_prefix(); 
 }
 /*---  for parameters  ---*/
 /*
  * Reads kw_dataproc from p into s_dataproc and sets dataproc accordingly:
  * an empty value or "Auto" selects dataproc_Auto, "Sparse" selects
  * dataproc_Sparse, and "Dense" selects dataproc_Dense.
  * Any other value throws AzException (AzInputNotValid).
  */
 virtual void resetParam(AzParam &p) {
   p.vStr(kw_dataproc, &s_dataproc); 
   dataproc = dataproc_Auto; 

   /* empty or "Auto": keep the default chosen above */
   const bool is_explicit = (s_dataproc.length() > 0 && 
                             s_dataproc.compare("Auto") != 0); 
   if (is_explicit) {
     if (s_dataproc.compare("Sparse") == 0) {
       dataproc = dataproc_Sparse; 
     }
     else if (s_dataproc.compare("Dense") == 0) {
       dataproc = dataproc_Dense; 
     }
     else {
       throw new AzException(AzInputNotValid, kw_dataproc, 
             "must be either \"Auto\", \"Sparse\", or \"Dense\"."); 
     }
   }
 }
Esempio n. 9
0
 /*
  * Reads the pooling parameters from azp under the prefix pfx.  Nothing is
  * read on a warm start.  Otherwise ptyp becomes the first byte of the
  * kw_pl_type value, or AzpPoolingDflt_None when no value is given, and
  * pl_num, pl_sz, pl_step and the kw_no_pl_simple_grid switch are read.
  * The prefix is reset before returning.
  */
 void resetParam(AzParam &azp, const char *pfx, bool is_warmstart) {
   azp.reset_prefix(pfx); 

   const bool is_fresh_start = !is_warmstart; 
   if (is_fresh_start) {
     azp.vStr(kw_pl_type, &s_pl_type); 
     if (s_pl_type.length() <= 0) {
       ptyp = AzpPoolingDflt_None; 
     }
     else {
       ptyp = s_pl_type.point()[0]; 
     }

     azp.vInt(kw_pl_num, &pl_num); 
     azp.vInt(kw_pl_sz, &pl_sz); 
     azp.vInt(kw_pl_step, &pl_step); 
     azp.swOff(&do_pl_simple_grid, kw_no_pl_simple_grid); 
   }

   azp.reset_prefix(); 
 }
Esempio n. 10
0
/*-------------------------------------------------------------------------*/
/*
 * Moves the first data_num entries of am_xy into a Vmat, resets am_xy, and
 * writes the Vmat to a file.
 *
 * The file name is outnm followed by xy_ext, plus "." and the batch id
 * when s_batch_id is non-empty.  The name and the statistics text produced
 * by AzTools::show_smat_stat are logged through AzTimeLog::print.  When
 * xy_ext ends with "smat" the data is written with write_matrix, otherwise
 * with write.
 *
 * Returns rowNum() of the written Vmat.
 */
template <class Vmat> /* Vmat: AzSmatVar | AzSmatcVar */
int AzPrepText2::_write_XY(AzDataArr<AzSmat> &am_xy, /* destroyed to save memory */
                           int data_num, 
                           const AzBytArr &s_batch_id, 
                           const char *outnm, const char *xy_ext) const {
  Vmat mv_xy; 
  mv_xy.transfer_from(&am_xy, data_num); 
  am_xy.reset(); 

  AzBytArr s_stat(": "); 
  AzTools::show_smat_stat(*mv_xy.data(), s_stat); 

  AzBytArr s_fn(outnm, xy_ext); 
  const bool has_batch_id = (s_batch_id.length() > 0); 
  if (has_batch_id) {
    s_fn << "." << s_batch_id.c_str(); 
  }
  const char *fn = s_fn.c_str(); 
  AzTimeLog::print(fn, s_stat.c_str(), out); 

  if (AzBytArr::endsWith(xy_ext, "smat")) {
    mv_xy.write_matrix(fn); 
  }
  else {
    mv_xy.write(fn);  
  }
  return mv_xy.rowNum(); 
}  
/*------------------------------------------------------------------*/
int AzSvDataS::countFeatures(const AzByte *line,
                             const AzByte *line_end)
{
  const AzByte *wp = line;
  int count = 0;
  for ( ; wp < line_end; ) {
    AzBytArr s;
    AzTools::getString(&wp, line_end, &s);
    if (s.length() > 0) {
      ++count;
    }
  }

  return count;
}
Esempio n. 12
0
  /*-------------------*/
  void toplevel_header(const char *desc, AzByte dlm='*') 
  {
    if (out.isNull()) return; 

    newline(); 

    AzBytArr s; 
    s.fill(dlm, 3); 
    int dlm_len = Az64::cstrlen(desc) + s.length()*2; 
    dlm_len = MIN(line_width, dlm_len); 

    AzBytArr s_long; 
    s_long.fill(dlm, dlm_len); 

    AzPrint::writeln(out, s_long.c_str()); 
    AzPrint::write(out, s.c_str()); 
    AzPrint::write(out, desc); 
    AzPrint::writeln(out, s.c_str()); 
    AzPrint::writeln(out, s_long.c_str()); 
  }
Esempio n. 13
0
 /*
  * Starts a new item and streams kw followed by the contents of s to *o.
  * Does nothing when o is NULL or s is empty.
  */
 inline void printV_if_not_empty(const char *kw, const AzBytArr &s) {
   if (o == NULL || s.length() <= 0) return; 

   itemBegin(); 
   (*o) << kw << s.c_str(); 
 }
Esempio n. 14
0
 /*
  * Throws a heap-allocated AzException (AzInputError) stating that kw
  * "must be specified." when s is empty; does nothing otherwise.
  * The eyecatcher is fixed to "AzsLinear::resetParam".
  */
 void throw_if_empty(const char *kw, const AzBytArr &s) const {
   const bool is_given = (s.length() > 0); 
   if (is_given) return; 
   throw new AzException(AzInputError, "AzsLinear::resetParam", kw, "must be specified."); 
 }