/*------------------------------------------------------------------*/
/*
 * Inspect a header line and decide whether it announces sparse data.
 *
 *   s_line          the line to inspect.
 *   expected_f_num  if positive, the feature dimensionality the header must
 *                   agree with; otherwise no cross-check is done.
 *   str             the keyword the first token is compared against.
 *
 * Returns -1 when the line has no tokens or its first token does not compare
 * equal to str.  Otherwise returns the value parsed (via atol) from the second
 * token.
 *
 * Throws AzException(AzInputError) when the keyword is present but the second
 * token is missing or not positive, or when it disagrees with a positive
 * expected_f_num.
 */
int AzSvDataS::if_sparse(AzBytArr &s_line,
                         int expected_f_num,
                         const char *str)
{
  const char *eyec = "AzSvDataS::if_sparse";

  AzBytArr s_sparse(str);
  AzStrPool sp_tok;
  AzTools::getStrings(s_line.point(), s_line.length(), &sp_tok);

  /* Not a sparse header: nothing to validate. */
  if (sp_tok.size() <= 0 || s_sparse.compare(sp_tok.c_str(0)) != 0) {
    return -1;
  }

  /* The keyword matched; the dimensionality is expected in the next token. */
  int sparse_f_num = -1;
  if (sp_tok.size() >= 2) {
    sparse_f_num = atol(sp_tok.c_str(1));
  }

  if (sparse_f_num <= 0) {
    throw new AzException(AzInputError, eyec,
          "1st line of sparse data file must be \"sparse dd\" where dd is the feature dimensionality.");
  }
  if (expected_f_num > 0 && sparse_f_num != expected_f_num) {
    throw new AzException(AzInputError, eyec,
          "Conflict in feature dim: feature definition file vs. data file.");
  }
  return sparse_f_num;
}
/*-------------------------------------------------------------------------*/ int AzTools_text::replace_utf8dashes(AzByte *data, int len) /* inout */ { const char *eyec = "AzTools_text:;replace_utf8dashes"; AzBytArr s_mydata; AzByte *mydata = s_mydata.reset(len*2, 0); AzByte *mywp = mydata; /* 0xe28093: en dash (often used as in 1900-1935) */ /* 0xe28094: em dash (long dash) */ /* 0xe2809c: double quote begin -> [ " ] */ /* 0xe2809d: duoble quote end -> [ " ] */ /* 0xe28098: single quote begin -> [ ' ]*/ /* 0xe28099: single quote end -> [ 's] if ending a token; [ ' ] otherwise */ const AzByte *data_end = data+len, *wp = data; for ( ; wp < data_end; ) { const AzByte *ptr = (AzByte *)memchr(wp, 0xE2, data_end-wp); if (ptr == NULL) { ptr = data_end; } int mvlen = Az64::ptr_diff(ptr-wp, eyec); if (mvlen > 0) { memcpy(mywp, wp, mvlen); mywp += mvlen; /* string before 0xE2 */ } if (ptr+3 <= data_end) { AzByte prevch = (ptr-1 >= data) ? *(ptr-1) : 0; AzByte nextch = (ptr+3 < data_end) ? *(ptr+3) : 0; AzBytArr s; if (*(ptr+1) == 0x80) { if (*(ptr+2) == 0x93) { /* en dash */ if (prevch>='0' && prevch<='9' && nextch>='0' && nextch<='9') s.c("-"); /* between digits */ else s.c(" - "); } else if (*(ptr+2) == 0x94) s.c(" - "); /* em dash */ else if (*(ptr+2) == 0x98) s.c(" ' "); else if (*(ptr+2) == 0x9c || *(ptr+2) == 0x9d) s.c(" \" "); /* double quote */ else if (*(ptr+2) == 0x99) { if (ptr+5<=data_end && memcmp(ptr+3, "s ", 2)==0) s.c(" '"); else s.c(" ' "); } else s.c(ptr, 3); } else s.c(ptr, 3); memcpy(mywp, s.point(), s.length()); mywp += s.length(); wp = ptr+3; } else { mvlen = Az64::ptr_diff(data_end-ptr, eyec); if (mvlen > 0) { memcpy(mywp, ptr, mvlen); mywp += mvlen; } wp = data_end; } } int mydata_len = Az64::ptr_diff(mywp-mydata); memcpy(data, mydata, mydata_len); mydata[mydata_len] = '\0'; return mydata_len; }
/*
 * Read this component's settings from azp, looking keywords up under the
 * prefix pfx.
 *
 * On a warm start (is_warmstart == true) the activation type string and the
 * truncation value are not re-read; only the do_stat switch is.  The prefix
 * is cleared again before returning.
 */
virtual void resetParam(AzParam &azp, const char *pfx, bool is_warmstart)
{
  azp.reset_prefix(pfx);

  if (!is_warmstart) {
    azp.vStr(kw_activ_typ, &s_activ_typ);
    /* The type code is the first byte of the string; typ keeps its current
       value when the string is empty. */
    if (s_activ_typ.length() > 0) {
      typ = s_activ_typ.point()[0];
    }
    azp.vFloat(kw_trunc, &trunc);
  }

  azp.swOn(&do_stat, kw_do_stat);

  azp.reset_prefix();
}
/*
 * Read the pooling settings from azp, looking keywords up under the prefix
 * pfx.
 *
 * Nothing is read on a warm start (is_warmstart == true); the prefix is
 * merely set and cleared.  Otherwise the pooling type string, the three
 * integer settings and the "no simple grid" switch are read, in that order.
 */
void resetParam(AzParam &azp, const char *pfx, bool is_warmstart)
{
  azp.reset_prefix(pfx);

  if (!is_warmstart) {
    azp.vStr(kw_pl_type, &s_pl_type);
    /* The type code is the first byte of the string; an empty string
       selects AzpPoolingDflt_None. */
    if (s_pl_type.length() <= 0) {
      ptyp = AzpPoolingDflt_None;
    }
    else {
      ptyp = s_pl_type.point()[0];
    }

    azp.vInt(kw_pl_num, &pl_num);
    azp.vInt(kw_pl_sz, &pl_sz);
    azp.vInt(kw_pl_step, &pl_step);
    azp.swOff(&do_pl_simple_grid, kw_no_pl_simple_grid);
  }

  azp.reset_prefix();
}