Example #1
0
_Bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, int ncol)
{
  // use typeSize superfluously to avoid not-used warning; otherwise could move typeSize from fread.h into fread.c
  if (typeSize[CT_BOOL8]!=1) STOP("Internal error: typeSize[CT_BOOL8] != 1");
  if (typeSize[CT_STRING]!=8) STOP("Internal error: typeSize[CT_STRING] != 1");
  colNamesSxp = NULL;
  if (colNames!=NULL) {
    colNamesSxp = PROTECT(allocVector(STRSXP, ncol));
    protecti++;
    for (int i=0; i<ncol; i++) {
      SEXP this;
      if (colNames[i].len<=0) {
        char buff[10];
        sprintf(buff,"V%d",i+1);
        this = mkChar(buff);
      } else {
        this = mkCharLenCE(anchor+colNames[i].off, colNames[i].len, ienc);
      }
      SET_STRING_ELT(colNamesSxp, i, this);
    }
  }
  if (length(colClassesSxp)) {
    SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT));
    protecti++;
    for (int i=0; i<NUT; i++) SET_STRING_ELT(typeRName_sxp, i, mkChar(typeRName[i]));
    if (isString(colClassesSxp)) {
      SEXP typeEnum_idx = PROTECT(chmatch(colClassesSxp, typeRName_sxp, NUT, FALSE));
      protecti++;
      if (LENGTH(colClassesSxp)==1) {
        signed char newType = typeEnum[INTEGER(typeEnum_idx)[0]-1];
        if (newType == CT_DROP) STOP("colClasses='drop' is not permitted; i.e. to drop all columns and load nothing");
        for (int i=0; i<ncol; i++) type[i]=newType;   // freadMain checks bump up only not down
      } else if (LENGTH(colClassesSxp)==ncol) {
        for (int i=0; i<ncol; i++) {
          if (STRING_ELT(colClassesSxp,i)==NA_STRING) continue; // user is ok with inherent type for this column
          type[i] = typeEnum[INTEGER(typeEnum_idx)[i]-1];
        }
      } else {
        STOP("colClasses is an unnamed character vector but its length is %d. Must be length 1 or ncol (%d in this case) when unnamed. To specify types for a subset of columns you can either name the items with the column names or pass list() format to colClasses using column names or column numbers. See examples in ?fread.",
              LENGTH(colClassesSxp), ncol);
      }
    } else {
      if (!isNewList(colClassesSxp)) STOP("CfreadR: colClasses is not type list");
      if (!length(getAttrib(colClassesSxp, R_NamesSymbol))) STOP("CfreadR: colClasses is type list but has no names");
      SEXP typeEnum_idx = PROTECT(chmatch(getAttrib(colClassesSxp, R_NamesSymbol), typeRName_sxp, NUT, FALSE));
      protecti++;
      for (int i=0; i<LENGTH(colClassesSxp); i++) {
        SEXP items;
        signed char thisType = typeEnum[INTEGER(typeEnum_idx)[i]-1];
        items = VECTOR_ELT(colClassesSxp,i);
        if (thisType == CT_DROP) {
          if (!isNull(dropSxp) || !isNull(selectSxp)) STOP("Can't use NULL in colClasses when select or drop is used as well.");
          dropSxp = items;
          continue;
        }
        SEXP itemsInt;
        if (isString(items)) itemsInt = PROTECT(chmatch(items, colNamesSxp, NA_INTEGER, FALSE));
        else itemsInt = PROTECT(coerceVector(items, INTSXP));
        protecti++;
        for (int j=0; j<LENGTH(items); j++) {
          int k = INTEGER(itemsInt)[j];
          if (k==NA_INTEGER) {
            if (isString(items)) STOP("Column name '%s' in colClasses[[%d]] not found", CHAR(STRING_ELT(items, j)),i+1);
            else STOP("colClasses[[%d]][%d] is NA", i+1, j+1);
          } else {
            if (k<1 || k>ncol) STOP("Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]",k,i+1,j+1,ncol);
            k--;
            if (type[k]<0) STOP("Column '%s' appears more than once in colClasses", CHAR(STRING_ELT(colNamesSxp,k)));
            type[k] = -thisType;
            // freadMain checks bump up only not down.  Deliberately don't catch here to test freadMain; e.g. test 959
          }
        }
      }
      for (int i=0; i<ncol; i++) if (type[i]<0) type[i] *= -1;  // undo sign; was used to detect duplicates
    }
  }
  if (readInt64As != CT_INT64) {
    for (int i=0; i<ncol; i++) if (type[i]==CT_INT64) type[i] = readInt64As;
  }
  if (length(dropSxp)) {
    SEXP itemsInt;
    if (isString(dropSxp)) itemsInt = PROTECT(chmatch(dropSxp, colNamesSxp, NA_INTEGER, FALSE));
    else itemsInt = PROTECT(coerceVector(dropSxp, INTSXP));
    protecti++;
    for (int j=0; j<LENGTH(itemsInt); j++) {
      int k = INTEGER(itemsInt)[j];
      if (k==NA_INTEGER) {
        if (isString(dropSxp)) {
          DTWARN("Column name '%s' in 'drop' not found", CHAR(STRING_ELT(dropSxp, j)));
        } else {
          DTWARN("drop[%d] is NA", j+1);
        }
      } else {
        if (k<1 || k>ncol) {
          DTWARN("Column number %d (drop[%d]) is out of range [1,ncol=%d]",k,j+1,ncol);
        } else {
          if (type[k-1] == CT_DROP) STOP("Duplicates detected in drop");
          type[k-1] = CT_DROP;
        }
      }
    }
  } else if (length(selectSxp)) {
    SEXP tt;
    if (isString(selectSxp)) {
      // invalid cols check part of #1445 moved here (makes sense before reading the file)
      tt = PROTECT(chmatch(selectSxp, colNamesSxp, NA_INTEGER, FALSE));
      protecti++;
      for (int i=0; i<length(selectSxp); i++) if (INTEGER(tt)[i]==NA_INTEGER)
        DTWARN("Column name '%s' not found in column name header (case sensitive), skipping.", CHAR(STRING_ELT(selectSxp, i)));
    } else tt = selectSxp;
    for (int i=0; i<LENGTH(tt); i++) {
      int k = isInteger(tt) ? INTEGER(tt)[i] : (int)REAL(tt)[i];
      if (k == NA_INTEGER) continue;
      if (k<1 || k>ncol) STOP("Column number %d (select[%d]) is out of range [1,ncol=%d]",k,i+1,ncol);
      if (type[k-1]<0) STOP("Column number %d ('%s') has been selected twice by select=", k, STRING_ELT(colNames,k-1));
      type[k-1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all
    }
    for (int i=0; i<ncol; i++) {
      if (type[i]<0) type[i] *= -1;
      else type[i]=CT_DROP;
    }
  }
  return TRUE;  // continue
}
Example #2
0
_Bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, int ncol)
{
  // use typeSize superfluously to avoid not-used warning; otherwise could move typeSize from fread.h into fread.c
  if (typeSize[CT_BOOL8_N]!=1) STOP("Internal error: typeSize[CT_BOOL8_N] != 1"); // # nocov
  if (typeSize[CT_STRING]!=8) STOP("Internal error: typeSize[CT_STRING] != 1"); // # nocov
  colNamesSxp = R_NilValue;
  if (colNames!=NULL) {
    SET_VECTOR_ELT(RCHK, 1, colNamesSxp=allocVector(STRSXP, ncol));
    for (int i=0; i<ncol; i++) {
      SEXP elem;
      if (colNames[i].len<=0) {
        char buff[12];
        sprintf(buff,"V%d",i+1);
        elem = mkChar(buff);  // no PROTECT as passed immediately to SET_STRING_ELT
      } else {
        elem = mkCharLenCE(anchor+colNames[i].off, colNames[i].len, ienc);  // no PROTECT as passed immediately to SET_STRING_ELT
      }
      SET_STRING_ELT(colNamesSxp, i, elem);
    }
  }
  if (length(colClassesSxp)) {
    SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT));
    for (int i=0; i<NUT; i++) SET_STRING_ELT(typeRName_sxp, i, mkChar(typeRName[i]));
    if (isString(colClassesSxp)) {
      SEXP typeEnum_idx = PROTECT(chmatch(colClassesSxp, typeRName_sxp, NUT, FALSE));
      if (LENGTH(colClassesSxp)==1) {
        signed char newType = typeEnum[INTEGER(typeEnum_idx)[0]-1];
        if (newType == CT_DROP) STOP("colClasses='NULL' is not permitted; i.e. to drop all columns and load nothing");
        for (int i=0; i<ncol; i++) type[i]=newType;   // freadMain checks bump up only not down
      } else if (LENGTH(colClassesSxp)==ncol) {
        for (int i=0; i<ncol; i++) {
          if (STRING_ELT(colClassesSxp,i)==NA_STRING) continue; // user is ok with inherent type for this column
          type[i] = typeEnum[INTEGER(typeEnum_idx)[i]-1];
        }
      } else {
        STOP("colClasses is an unnamed character vector but its length is %d. Must be length 1 or ncol (%d in this case) when unnamed. To specify types for a subset of columns you can either name the items with the column names or pass list() format to colClasses using column names or column numbers. See examples in ?fread.",
              LENGTH(colClassesSxp), ncol);
      }
      UNPROTECT(1); // typeEnum_idx
    } else {
      if (!isNewList(colClassesSxp)) STOP("CfreadR: colClasses is not type list");
      if (!length(getAttrib(colClassesSxp, R_NamesSymbol))) STOP("CfreadR: colClasses is type list but has no names");
      SEXP typeEnum_idx = PROTECT(chmatch(PROTECT(getAttrib(colClassesSxp, R_NamesSymbol)), typeRName_sxp, NUT, FALSE));
      for (int i=0; i<LENGTH(colClassesSxp); i++) {
        SEXP items;
        signed char thisType = typeEnum[INTEGER(typeEnum_idx)[i]-1];
        items = VECTOR_ELT(colClassesSxp,i);
        if (thisType == CT_DROP) {
          if (!isNull(dropSxp) || !isNull(selectSxp)) {
            if (dropSxp!=items) DTWARN("Ignoring the NULL item in colClasses= because select= or drop= has been used.");
            // package damr has a nice workaround for when NULL didn't work before v1.12.0: it sets drop=col_class$`NULL`. So allow that unambiguous case with no warning.
          } else {
            dropSxp = items;
          }
          continue;
        }
        SEXP itemsInt;
        if (isString(items)) itemsInt = PROTECT(chmatch(items, colNamesSxp, NA_INTEGER, FALSE));
        else                 itemsInt = PROTECT(coerceVector(items, INTSXP));
        // UNPROTECTed directly just after this for loop. No protecti++ here is correct.
        for (int j=0; j<LENGTH(items); j++) {
          int k = INTEGER(itemsInt)[j];
          if (k==NA_INTEGER) {
            if (isString(items)) STOP("Column name '%s' in colClasses[[%d]] not found", CHAR(STRING_ELT(items, j)),i+1);
            else STOP("colClasses[[%d]][%d] is NA", i+1, j+1);
          } else {
            if (k<1 || k>ncol) STOP("Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]",k,i+1,j+1,ncol);
            k--;
            if (type[k]<0) STOP("Column '%s' appears more than once in colClasses", CHAR(STRING_ELT(colNamesSxp,k)));
            type[k] = -thisType;
            // freadMain checks bump up only not down.  Deliberately don't catch here to test freadMain; e.g. test 959
          }
        }
        UNPROTECT(1); // UNPROTECTing itemsInt inside loop to save protection stack
      }
      for (int i=0; i<ncol; i++) if (type[i]<0) type[i] *= -1;  // undo sign; was used to detect duplicates
      UNPROTECT(2);  // typeEnum_idx (+1 for its protect of getAttrib)
    }
    UNPROTECT(1);  // typeRName_sxp
  }
  if (readInt64As != CT_INT64) {
    for (int i=0; i<ncol; i++) if (type[i]==CT_INT64) type[i] = readInt64As;
  }
  if (length(dropSxp)) {
    SEXP itemsInt;
    if (isString(dropSxp)) itemsInt = PROTECT(chmatch(dropSxp, colNamesSxp, NA_INTEGER, FALSE));
    else                   itemsInt = PROTECT(coerceVector(dropSxp, INTSXP));
    for (int j=0; j<LENGTH(itemsInt); j++) {
      int k = INTEGER(itemsInt)[j];
      if (k==NA_INTEGER) {
        if (isString(dropSxp)) {
          DTWARN("Column name '%s' in 'drop' not found", CHAR(STRING_ELT(dropSxp, j)));
        } else {
          DTWARN("drop[%d] is NA", j+1);
        }
      } else {
        if (k<1 || k>ncol) {
          DTWARN("Column number %d (drop[%d]) is out of range [1,ncol=%d]",k,j+1,ncol);
        } else {
          // if (type[k-1] == CT_DROP) DTWARN("drop= contains duplicates");
          // NULL in colClasses didn't work between 1.11.0 and 1.11.8 so people have been using drop= to re-specify the NULL columns in colClasses. Now that NULL in colClasses works
          // from v1.12.0 there is no easy way to distinguish dups in drop= from drop overlapping with NULLs in colClasses. But it's unambiguous that it was intended to remove these
          // columns, so no need for warning.
          type[k-1] = CT_DROP;
        }
      }
    }
    UNPROTECT(1); // itemsInt
  } else if (length(selectSxp)) {
    SEXP tt;
    if (isString(selectSxp)) {
      // invalid cols check part of #1445 moved here (makes sense before reading the file)
      tt = PROTECT(chmatch(selectSxp, colNamesSxp, NA_INTEGER, FALSE));
      for (int i=0; i<length(selectSxp); i++) if (INTEGER(tt)[i]==NA_INTEGER)
        DTWARN("Column name '%s' not found in column name header (case sensitive), skipping.", CHAR(STRING_ELT(selectSxp, i)));
    } else {
      tt = PROTECT(selectSxp); // harmless superfluous PROTECT, for ease of balancing
    }
    for (int i=0; i<LENGTH(tt); i++) {
      int k = isInteger(tt) ? INTEGER(tt)[i] : (int)REAL(tt)[i];
      if (k == NA_INTEGER) continue;
      if (k<0) STOP("Column number %d (select[%d]) negative but should be in the range [1,ncol=%d]. Consider drop= for column exclusion.",k,i+1,ncol);
      if (k==0) STOP("select = 0 (select[%d]) has no meaning. All values of select should be in the range [1,ncol=%d].",i+1,ncol);
      if (k>ncol) STOP("Column number %d (select[%d]) is too large for this table, which only has %d columns.",k,i+1,ncol);
      if (type[k-1]<0) STOP("Column number %d ('%s') has been selected twice by select=", k, CHAR(STRING_ELT(colNamesSxp,k-1)));
      type[k-1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all
    }
    for (int i=0; i<ncol; i++) {
      if (type[i]<0) type[i] *= -1;
      else type[i]=CT_DROP;
    }
    UNPROTECT(1); // tt
  }
  return true;
}