/*-------------------------------------------------------------------------*/ int AzTools_text::replace_utf8dashes(AzByte *data, int len) /* inout */ { const char *eyec = "AzTools_text:;replace_utf8dashes"; AzBytArr s_mydata; AzByte *mydata = s_mydata.reset(len*2, 0); AzByte *mywp = mydata; /* 0xe28093: en dash (often used as in 1900-1935) */ /* 0xe28094: em dash (long dash) */ /* 0xe2809c: double quote begin -> [ " ] */ /* 0xe2809d: duoble quote end -> [ " ] */ /* 0xe28098: single quote begin -> [ ' ]*/ /* 0xe28099: single quote end -> [ 's] if ending a token; [ ' ] otherwise */ const AzByte *data_end = data+len, *wp = data; for ( ; wp < data_end; ) { const AzByte *ptr = (AzByte *)memchr(wp, 0xE2, data_end-wp); if (ptr == NULL) { ptr = data_end; } int mvlen = Az64::ptr_diff(ptr-wp, eyec); if (mvlen > 0) { memcpy(mywp, wp, mvlen); mywp += mvlen; /* string before 0xE2 */ } if (ptr+3 <= data_end) { AzByte prevch = (ptr-1 >= data) ? *(ptr-1) : 0; AzByte nextch = (ptr+3 < data_end) ? *(ptr+3) : 0; AzBytArr s; if (*(ptr+1) == 0x80) { if (*(ptr+2) == 0x93) { /* en dash */ if (prevch>='0' && prevch<='9' && nextch>='0' && nextch<='9') s.c("-"); /* between digits */ else s.c(" - "); } else if (*(ptr+2) == 0x94) s.c(" - "); /* em dash */ else if (*(ptr+2) == 0x98) s.c(" ' "); else if (*(ptr+2) == 0x9c || *(ptr+2) == 0x9d) s.c(" \" "); /* double quote */ else if (*(ptr+2) == 0x99) { if (ptr+5<=data_end && memcmp(ptr+3, "s ", 2)==0) s.c(" '"); else s.c(" ' "); } else s.c(ptr, 3); } else s.c(ptr, 3); memcpy(mywp, s.point(), s.length()); mywp += s.length(); wp = ptr+3; } else { mvlen = Az64::ptr_diff(data_end-ptr, eyec); if (mvlen > 0) { memcpy(mywp, ptr, mvlen); mywp += mvlen; } wp = data_end; } } int mydata_len = Az64::ptr_diff(mywp-mydata); memcpy(data, mydata, mydata_len); mydata[mydata_len] = '\0'; return mydata_len; }
inline virtual void throw_if_missing(const char *kw, const AzBytArr &s_val, const char *eyec) const { if (s_val.length() <= 0) { AzBytArr s_kw; s_kw.inQuotes(kw, "\""); throw new AzException(AzInputMissing, eyec, s_kw.c_str(), "is missing"); } }
/*------------------------------------------------------------------*/
/*
 * Check whether s_line is a sparse-format header, i.e. its first token
 * equals str.
 *
 * Returns -1 when the first token is not str.  Otherwise returns the
 * feature dimensionality parsed from the second token.
 * Throws AzInputError when the header carries no positive dimensionality,
 * or when expected_f_num is positive and differs from it.
 */
int AzSvDataS::if_sparse(AzBytArr &s_line, int expected_f_num, const char *str)
{
  const char *eyec = "AzSvDataS::if_sparse";

  AzBytArr s_marker(str);
  AzStrPool sp_tokens;
  AzTools::getStrings(s_line.point(), s_line.length(), &sp_tokens);

  /* not a sparse header */
  if (sp_tokens.size() <= 0 || s_marker.compare(sp_tokens.c_str(0)) != 0) {
    return -1;
  }

  int f_num = -1;
  if (sp_tokens.size() >= 2) {
    f_num = atol(sp_tokens.c_str(1));
  }
  if (f_num <= 0) {
    throw new AzException(AzInputError, eyec, "1st line of sparse data file must be \"sparse dd\" where dd is the feature dimensionality.");
  }
  if (expected_f_num > 0 && f_num != expected_f_num) {
    throw new AzException(AzInputError, eyec, "Conflict in feature dim: feature definition file vs. data file.");
  }
  return f_num;
}
/*-------------------------------------------------------------------------*/ void AzPrepText2::check_batch_id(const AzBytArr &s_batch_id) const { const char *eyec = "AzPrepText::check_batch_id"; if (s_batch_id.length() <= 0) return; AzBytArr s(kw_batch_id); s << " should look like \"1of5\""; const char *batch_id = s_batch_id.c_str(); const char *of_str = strstr(batch_id, "of"); AzX::throw_if((of_str == NULL), AzInputError, eyec, s.c_str()); for (const char *wp = batch_id; wp < batch_id+s_batch_id.length(); ++wp) { if (wp >= of_str && wp < of_str+2) continue; AzX::throw_if((*wp < '0' || *wp > '9'), AzInputError, eyec, s.c_str()); } int batch_no = atol(batch_id); int batch_num = atol(of_str+2); AzX::throw_if((batch_no < 1 || batch_no > batch_num), AzInputError, eyec, s.c_str(), " batch# must start with 1 and must not exceed the number of batches. "); }
inline void vLoss(const char *kw, AzLossType *out_loss) { if (param == NULL) return; AzBytArr s; vStr(kw, &s); if (s.length() > 0) { *out_loss = AzLoss::lossType(s.c_str()); } if (doCheck) sp_used_kw.put(kw); }
/* Print the data-processing parameter to out; prints nothing when it is unset. */
virtual void printParam(const AzOut &out) const {
  if (out.isNull()) return;

  AzPrint o(out);
  if (s_dataproc.length() <= 0) return; /* parameter not given: no section */

  o.ppBegin("AzDataForTrTree", "Data processing");
  o.printV(kw_dataproc, s_dataproc);
  o.ppEnd();
}
/*
 * Read parameters from azp under the prefix pfx.
 * The activation type and the truncation value are read only when this is
 * not a warm start; the do_stat switch is read in either case.
 * The prefix is reset (no argument) before returning.
 */
virtual void resetParam(AzParam &azp, const char *pfx, bool is_warmstart) {
  azp.reset_prefix(pfx);

  if (!is_warmstart) {
    azp.vStr(kw_activ_typ, &s_activ_typ);
    if (s_activ_typ.length() > 0) {
      typ = s_activ_typ.point()[0]; /* first character of the value */
    }
    azp.vFloat(kw_trunc, &trunc);
  }

  azp.swOn(&do_stat, kw_do_stat);
  azp.reset_prefix();
}
/*--- for parameters ---*/
/*
 * Read the data-processing parameter from p and set dataproc to match.
 * An empty value or "Auto" selects dataproc_Auto; "Sparse" and "Dense"
 * select dataproc_Sparse and dataproc_Dense.
 * Any other value raises AzInputNotValid.
 */
virtual void resetParam(AzParam &p) {
  p.vStr(kw_dataproc, &s_dataproc);

  dataproc = dataproc_Auto;
  if (s_dataproc.length() <= 0) return;
  if (s_dataproc.compare("Auto") == 0) return;

  if (s_dataproc.compare("Sparse") == 0) {
    dataproc = dataproc_Sparse;
    return;
  }
  if (s_dataproc.compare("Dense") == 0) {
    dataproc = dataproc_Dense;
    return;
  }

  throw new AzException(AzInputNotValid, kw_dataproc, "must be either \"Auto\", \"Sparse\", or \"Dense\".");
}
/*
 * Read the pooling parameters from azp under the prefix pfx.
 * Nothing is read on a warm start.  The prefix is reset (no argument)
 * before returning.
 */
void resetParam(AzParam &azp, const char *pfx, bool is_warmstart) {
  azp.reset_prefix(pfx);

  if (!is_warmstart) {
    azp.vStr(kw_pl_type, &s_pl_type);
    if (s_pl_type.length() <= 0) {
      ptyp = AzpPoolingDflt_None; /* no pooling type given */
    }
    else {
      ptyp = s_pl_type.point()[0]; /* first character of the value */
    }

    azp.vInt(kw_pl_num, &pl_num);
    azp.vInt(kw_pl_sz, &pl_sz);
    azp.vInt(kw_pl_step, &pl_step);
    azp.swOff(&do_pl_simple_grid, kw_no_pl_simple_grid);
  }

  azp.reset_prefix();
}
/*-------------------------------------------------------------------------*/ template <class Vmat> /* Vmat: AzSmatVar | AzSmatcVar */ int AzPrepText2::_write_XY(AzDataArr<AzSmat> &am_xy, /* destroyed to save memory */ int data_num, const AzBytArr &s_batch_id, const char *outnm, const char *xy_ext) const { Vmat mv_xy; mv_xy.transfer_from(&am_xy, data_num); am_xy.reset(); AzBytArr s_xy(": "); AzTools::show_smat_stat(*mv_xy.data(), s_xy); AzBytArr s_xy_fn(outnm, xy_ext); if (s_batch_id.length() > 0) s_xy_fn << "." << s_batch_id.c_str(); const char *xy_fn = s_xy_fn.c_str(); AzTimeLog::print(xy_fn, s_xy.c_str(), out); if (AzBytArr::endsWith(xy_ext, "smat")) mv_xy.write_matrix(xy_fn); else mv_xy.write(xy_fn); return mv_xy.rowNum(); }
/*------------------------------------------------------------------*/ int AzSvDataS::countFeatures(const AzByte *line, const AzByte *line_end) { const AzByte *wp = line; int count = 0; for ( ; wp < line_end; ) { AzBytArr s; AzTools::getString(&wp, line_end, &s); if (s.length() > 0) { ++count; } } return count; }
/*-------------------*/ void toplevel_header(const char *desc, AzByte dlm='*') { if (out.isNull()) return; newline(); AzBytArr s; s.fill(dlm, 3); int dlm_len = Az64::cstrlen(desc) + s.length()*2; dlm_len = MIN(line_width, dlm_len); AzBytArr s_long; s_long.fill(dlm, dlm_len); AzPrint::writeln(out, s_long.c_str()); AzPrint::write(out, s.c_str()); AzPrint::write(out, desc); AzPrint::writeln(out, s.c_str()); AzPrint::writeln(out, s_long.c_str()); }
/* Print kw followed by s as one item; skipped when o is NULL or s is empty. */
inline void printV_if_not_empty(const char *kw, const AzBytArr &s) {
  if (o == NULL || s.length() <= 0) return;

  itemBegin();
  *o << kw << s.c_str();
}
/* Throw AzInputError for keyword kw when its value s is empty. */
void throw_if_empty(const char *kw, const AzBytArr &s) const {
  if (s.length() > 0) return;
  throw new AzException(AzInputError, "AzsLinear::resetParam", kw, "must be specified.");
}