// [[Rcpp::export]] RObject tokenize_(List sourceSpec, List tokenizerSpec, int n_max) { Warnings warnings; SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); tokenizer->setWarnings(&warnings); std::vector<std::vector<std::string> > rows; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { if (n_max > 0 && t.row() >= (size_t) n_max) break; if (t.row() >= rows.size()) { rows.resize(t.row() + 1); } std::vector<std::string>& row = rows[t.row()]; if (t.col() >= row.size()) row.resize(t.col() + 1); row[t.col()] = t.asString(); } RObject out = wrap(rows); return warnings.addAsAttribute(out); }
// [[Rcpp::export]] SEXP parse_vector_(CharacterVector x, List collectorSpec, List locale_, const std::vector<std::string>& na) { Warnings warnings; int n = x.size(); LocaleInfo locale(locale_); boost::shared_ptr<Collector> col = Collector::create(collectorSpec, &locale); col->setWarnings(&warnings); col->resize(n); for (int i = 0; i < n; ++i) { Token t; if (x[i] == NA_STRING) { t = Token(TOKEN_MISSING, i, -1); } else { SEXP string = x[i]; t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1, false); t.trim(); t.flagNA(na); } col->setValue(i, t); } return warnings.addAsAttribute(col->vector()); }
// [[Rcpp::export]] RObject read_tokens(List sourceSpec, List tokenizerSpec, ListOf<List> colSpecs, CharacterVector colNames, List locale_, int n_max = -1, bool progress = true) { Warnings warnings; LocaleInfo locale(locale_); SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); tokenizer->setWarnings(&warnings); std::vector<CollectorPtr> collectors = collectorsCreate(colSpecs, &locale, &warnings); Progress progressBar; // Work out how many output columns we have size_t p = collectors.size(); size_t pOut = 0; for (size_t j = 0; j < p; ++j) { if (collectors[j]->skip()) continue; pOut++; } // Match colNames to with non-skipped collectors if (p != (size_t) colNames.size()) stop("colSpec and colNames must be same size"); CharacterVector outNames(pOut); int cj = 0; for (size_t j = 0; j < p; ++j) { if (collectors[j]->skip()) continue; outNames[cj] = colNames[j]; cj++; } size_t n = (n_max < 0) ? 1000 : n_max; collectorsResize(collectors, n); int i = -1, j = -1, cells = 0; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { if (progress && (cells++) % 250000 == 0) progressBar.show(tokenizer->progress()); if (t.col() == 0 && i != -1) checkColumns(&warnings, i, j, p); if (t.row() >= n) { if (n_max >= 0) break; // Estimate rows in full dataset n = (i / tokenizer->progress().first) * 1.2; collectorsResize(collectors, n); } if (t.col() < p) collectors[t.col()]->setValue(t.row(), t); i = t.row(); j = t.col(); } if (i != -1) checkColumns(&warnings, i, j, p); if (progress) progressBar.show(tokenizer->progress()); progressBar.stop(); if (i != (int) n - 1) { collectorsResize(collectors, i + 1); } // Save individual columns into a data frame List out(pOut); j = 0; for(CollectorItr cur = collectors.begin(); cur != collectors.end(); ++cur) { if ((*cur)->skip()) continue; out[j] = (*cur)->vector(); j++; } out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); out.attr("row.names") = IntegerVector::create(NA_INTEGER, -(i + 1)); out.attr("names") = outNames; return warnings.addAsAttribute(out); }
// [[Rcpp::export]] RObject read_tokens(List sourceSpec, List tokenizerSpec, ListOf<List> colSpecs, CharacterVector col_names, int n_max = -1, bool progress = true) { Warnings warnings; SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); tokenizer->setWarnings(&warnings); std::vector<CollectorPtr> collectors = collectorsCreate(colSpecs, &warnings); Progress progressBar; size_t p = collectors.size(); // Work out how many output columns we have size_t pOut = 0; for(CollectorItr cur = collectors.begin(); cur != collectors.end(); ++cur) { if (!(*cur)->skip()) pOut++; } // Allow either one name for column, or one name per output col if (p != pOut && (size_t) col_names.size() == p) { CharacterVector col_names2(pOut); int cj = 0; for (size_t j = 0; j < p; ++j) { if (collectors[j]->skip()) continue; col_names2[cj++] = col_names[j]; } col_names = col_names2; } if (pOut != (size_t) col_names.size()) { Rcpp::stop("You have %i column names, but %i columns", col_names.size(), pOut); } size_t n = (n_max < 0) ? 1000 : n_max; collectorsResize(collectors, n); size_t i = 0, cells = 0; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { if (progress && (cells++) % 250000 == 0) progressBar.show(tokenizer->progress()); if (t.col() >= p) { warnings.addWarning(t.row(), t.col(), tfm::format("Only %i columns", p), ""); continue; } if (t.row() >= n) { if (n_max >= 0) break; // Estimate rows in full dataset n = (i / tokenizer->progress().first) * 1.2; collectorsResize(collectors, n); } collectors[t.col()]->setValue(t.row(), t); i = t.row(); } progressBar.show(tokenizer->progress()); progressBar.stop(); if (i <= n) { collectorsResize(collectors, i + 1); } // Save individual columns into a data frame List out(pOut); int j = 0; for(CollectorItr cur = collectors.begin(); cur != collectors.end(); ++cur) { if ((*cur)->skip()) continue; out[j] = (*cur)->vector(); j++; } out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); out.attr("row.names") = IntegerVector::create(NA_INTEGER, -(i + 1)); out.attr("names") = col_names; return warnings.addAsAttribute(out); }