Example #1
0
// Tokenizes a source and returns the raw token strings as a list of rows.
//
// sourceSpec / tokenizerSpec are handed to Source::create and
// Tokenizer::create. When n_max is positive, tokens whose row index is
// n_max or greater end the scan; a non-positive n_max applies no limit.
// Warnings gathered by the tokenizer are attached to the returned object
// through Warnings::addAsAttribute.
// [[Rcpp::export]]
RObject tokenize_(List sourceSpec, List tokenizerSpec, int n_max) {
  Warnings warnings;

  SourcePtr source = Source::create(sourceSpec);
  TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
  tokenizer->tokenize(source->begin(), source->end());
  tokenizer->setWarnings(&warnings);

  // grid[r][c] holds the string form of the token at row r, column c.
  std::vector<std::vector<std::string> > grid;

  Token tok = tokenizer->nextToken();
  while (tok.type() != TOKEN_EOF) {
    if (n_max > 0 && tok.row() >= (size_t) n_max)
      break;

    // Grow the outer vector so the token's row exists.
    if (grid.size() <= tok.row()) {
      grid.resize(tok.row() + 1);
    }

    // Grow the row so the token's column exists, then store the text.
    std::vector<std::string>& cells = grid[tok.row()];
    if (cells.size() <= tok.col())
      cells.resize(tok.col() + 1);

    cells[tok.col()] = tok.asString();

    tok = tokenizer->nextToken();
  }

  RObject out = wrap(grid);
  return warnings.addAsAttribute(out);
}
Example #2
0
// Parses every element of a character vector with a single collector.
//
// x:             strings to parse; NA_STRING entries become TOKEN_MISSING.
// collectorSpec: specification passed to Collector::create.
// locale_:       used to construct the LocaleInfo given to the collector.
// na:            strings passed to Token::flagNA for each non-missing element.
//
// Returns the collector's vector with the accumulated warnings attached
// through Warnings::addAsAttribute.
// [[Rcpp::export]]
SEXP parse_vector_(CharacterVector x, List collectorSpec,
                   List locale_, const std::vector<std::string>& na) {
  Warnings warnings;
  int n = x.size();

  LocaleInfo locale(locale_);

  boost::shared_ptr<Collector> collector = Collector::create(collectorSpec, &locale);
  collector->setWarnings(&warnings);
  collector->resize(n);

  int i = 0;
  while (i < n) {
    const bool missing = (x[i] == NA_STRING);

    Token tok;
    if (missing) {
      tok = Token(TOKEN_MISSING, i, -1);
    } else {
      // Build a token spanning the element's character data, then trim it
      // and flag it as NA if it matches one of the `na` strings.
      SEXP elt = x[i];
      tok = Token(CHAR(elt), CHAR(elt) + Rf_length(elt), i, -1, false);
      tok.trim();
      tok.flagNA(na);
    }

    collector->setValue(i, tok);
    ++i;
  }

  return warnings.addAsAttribute(collector->vector());
}
Example #3
0
// Reads all tokens from a source into per-column collectors and returns the
// non-skipped columns as a data frame (classes "tbl_df", "tbl", "data.frame").
//
// sourceSpec, tokenizerSpec, colSpecs: specifications handed to
//   Source::create, Tokenizer::create and collectorsCreate.
// colNames: one name per collector, skipped collectors included; stop()s
//   when the sizes differ.
// locale_:  used to construct the LocaleInfo given to the collectors.
// n_max:    maximum number of rows to read; a negative value means no limit.
// progress: when true, progress is reported through a Progress object.
//
// Warnings accumulated while reading are attached to the result through
// Warnings::addAsAttribute.
// [[Rcpp::export]]
RObject read_tokens(List sourceSpec, List tokenizerSpec, ListOf<List> colSpecs,
                    CharacterVector colNames, List locale_, int n_max = -1,
                    bool progress = true) {

  Warnings warnings;
  LocaleInfo locale(locale_);

  SourcePtr source = Source::create(sourceSpec);

  TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
  tokenizer->tokenize(source->begin(), source->end());
  tokenizer->setWarnings(&warnings);

  std::vector<CollectorPtr> collectors = collectorsCreate(colSpecs, &locale, &warnings);

  Progress progressBar;

  // Work out how many output columns we have
  size_t p = collectors.size();
  size_t pOut = 0;
  for (size_t j = 0; j < p; ++j) {
    if (collectors[j]->skip())
      continue;
    pOut++;
  }

  // Match colNames with the non-skipped collectors
  if (p != (size_t) colNames.size())
    stop("colSpec and colNames must be same size");

  CharacterVector outNames(pOut);
  int cj = 0;
  for (size_t j = 0; j < p; ++j) {
    if (collectors[j]->skip())
      continue;

    outNames[cj] = colNames[j];
    cj++;
  }

  // Without a row limit, start with room for 1000 rows and grow on demand.
  size_t n = (n_max < 0) ? 1000 : n_max;
  collectorsResize(collectors, n);

  // i, j: row and column of the most recently stored token (-1 before any).
  int i = -1, j = -1;
  // Unsigned so the counter cannot overflow into undefined behaviour.
  size_t cells = 0;
  for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) {
    if (progress && (cells++) % 250000 == 0)
      progressBar.show(tokenizer->progress());

    // Row limit reached: stop before inspecting this token. The last kept row
    // is validated exactly once, by the check after the loop; testing the
    // limit after the in-loop check would validate that row twice.
    if (n_max >= 0 && t.row() >= n)
      break;

    // A token in column 0 starts a new row, so the previous row is complete.
    if (t.col() == 0 && i != -1)
      checkColumns(&warnings, i, j, p);

    if (t.row() >= n) {
      // Estimate rows in full dataset
      n = (i / tokenizer->progress().first) * 1.2;
      collectorsResize(collectors, n);
    }

    // Tokens beyond the last collector are not stored.
    if (t.col() < p)
      collectors[t.col()]->setValue(t.row(), t);

    i = t.row();
    j = t.col();
  }
  // Validate the final row, which no later column-0 token will trigger.
  if (i != -1)
    checkColumns(&warnings, i, j, p);

  if (progress)
    progressBar.show(tokenizer->progress());
  progressBar.stop();

  // Trim the collectors down to the rows actually read.
  if (i != (int) n - 1) {
    collectorsResize(collectors, i + 1);
  }

  // Save individual columns into a data frame
  List out(pOut);
  j = 0;
  for(CollectorItr cur = collectors.begin(); cur != collectors.end(); ++cur) {
    if ((*cur)->skip())
      continue;

    out[j] = (*cur)->vector();
    j++;
  }

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");
  // Compact row names: c(NA, -nrow)
  out.attr("row.names") = IntegerVector::create(NA_INTEGER, -(i + 1));
  out.attr("names") = outNames;

  return warnings.addAsAttribute(out);
}
Example #4
0
// Reads all tokens from a source into per-column collectors and returns the
// non-skipped columns as a data frame (classes "tbl_df", "tbl", "data.frame").
//
// sourceSpec, tokenizerSpec, colSpecs: specifications handed to
//   Source::create, Tokenizer::create and collectorsCreate.
// col_names: either one name per collector (names of skipped collectors are
//   dropped) or one name per output column; stop()s on any other length.
// n_max:    maximum number of rows to read; a negative value means no limit.
// progress: when true, progress is reported through a Progress object.
//
// Warnings accumulated while reading are attached to the result through
// Warnings::addAsAttribute.
// [[Rcpp::export]]
RObject read_tokens(List sourceSpec, List tokenizerSpec, ListOf<List> colSpecs,
                    CharacterVector col_names, int n_max = -1,
                    bool progress = true) {
  Warnings warnings;

  SourcePtr source = Source::create(sourceSpec);

  TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
  tokenizer->tokenize(source->begin(), source->end());
  tokenizer->setWarnings(&warnings);

  std::vector<CollectorPtr> collectors = collectorsCreate(colSpecs, &warnings);

  Progress progressBar;

  size_t p = collectors.size();
  // Work out how many output columns we have
  size_t pOut = 0;
  for(CollectorItr cur = collectors.begin(); cur != collectors.end(); ++cur) {
    if (!(*cur)->skip())
      pOut++;
  }

  // Allow either one name for column, or one name per output col
  if (p != pOut && (size_t) col_names.size() == p) {
    CharacterVector col_names2(pOut);
    int cj = 0;
    for (size_t j = 0; j < p; ++j) {
      if (collectors[j]->skip())
        continue;
      col_names2[cj++] = col_names[j];
    }
    col_names = col_names2;
  }

  if (pOut != (size_t) col_names.size()) {
    Rcpp::stop("You have %i column names, but %i columns",
      col_names.size(), pOut);
  }

  // Without a row limit, start with room for 1000 rows and grow on demand.
  size_t n = (n_max < 0) ? 1000 : n_max;
  collectorsResize(collectors, n);

  // i is the row of the most recently stored token. It is only meaningful
  // once anyRows is true; without the flag, "no rows" and "row 0 stored"
  // would be indistinguishable and an empty read would report one row.
  size_t i = 0, cells = 0;
  bool anyRows = false;
  for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) {
    if (progress && (cells++) % 250000 == 0)
      progressBar.show(tokenizer->progress());

    // Tokens beyond the last collector are reported and dropped.
    if (t.col() >= p) {
      warnings.addWarning(t.row(), t.col(), tfm::format("Only %i columns", p), "");
      continue;
    }

    if (t.row() >= n) {
      if (n_max >= 0)
        break;

      // Estimate rows in full dataset
      n = (i / tokenizer->progress().first) * 1.2;
      collectorsResize(collectors, n);
    }

    collectors[t.col()]->setValue(t.row(), t);
    i = t.row();
    anyRows = true;
  }
  // Honour progress = false for the final update as well.
  if (progress)
    progressBar.show(tokenizer->progress());
  progressBar.stop();

  // Trim the collectors to the rows actually read (zero when nothing was
  // stored, e.g. empty input or n_max == 0).
  const size_t nRows = anyRows ? i + 1 : 0;
  if (nRows != n) {
    collectorsResize(collectors, nRows);
  }

  // Save individual columns into a data frame
  List out(pOut);
  int j = 0;
  for(CollectorItr cur = collectors.begin(); cur != collectors.end(); ++cur) {
    if ((*cur)->skip())
      continue;

    out[j] = (*cur)->vector();
    j++;
  }

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");
  // Compact row names: c(NA, -nrow). Convert to int before negating so the
  // negation is not performed on an unsigned value.
  out.attr("row.names") = IntegerVector::create(NA_INTEGER, -static_cast<int>(nRows));
  out.attr("names") = col_names;

  return warnings.addAsAttribute(out);
}