//' Calculate pairwise equivalence between sequences //' //' \code{pairwiseEqual} determined pairwise equivalence between a pairs in a //' set of sequences, excluding ambiguous positions (Ns and gaps). //' //' @param seq character vector containing a DNA sequences. //' //' @return A logical matrix of equivalence between each entry in \code{seq}. //' Values are \code{TRUE} when sequences are equivalent and \code{FALSE} //' when they are not. //' //' @seealso Uses \link{seqEqual} for testing equivalence between pairs. //' See \link{pairwiseDist} for generating a sequence distance matrix. //' //' @examples //' # Gaps and Ns will match any character //' seq <- c(A="ATGGC", B="ATGGG", C="ATGGG", D="AT--C", E="NTGGG") //' d <- pairwiseEqual(seq) //' rownames(d) <- colnames(d) <- seq //' d //' //' @export // [[Rcpp::export]] LogicalMatrix pairwiseEqual (StringVector seq) { // allocate the matrix we will return LogicalMatrix rmat(seq.length(), seq.length()); for (int i = 0; i < rmat.nrow(); i++) { for (int j = 0; j <= i; j++) { // check seq equal std::string row_seq = as<std::string>(seq[i]); std::string col_seq = as<std::string>(seq[j]); bool is_equal = seqEqual(row_seq, col_seq); // write to output matrix rmat(i,j) = is_equal; rmat(j,i) = is_equal; } } // Add row and column names Rcpp::List dimnames = Rcpp::List::create(seq.attr("names"), seq.attr("names")); rmat.attr("dimnames") = dimnames; return rmat; }
//----------------------------------------------------------------------
// Function:    dump()
//
// Description: Dump the local entries of this scope into "buf".
//              The variables are dumped first, then the nested scopes.
//              Within each group the names are processed in the order
//              produced by StringVector::sort().
//----------------------------------------------------------------------
void
ConfigScope::dump(
    StringBuffer &      buf,
    bool                wantExpandedUidNames,
    int                 indentLevel) const
{
    StringVector        names;

    //--------
    // Pass 1: the variables.
    //--------
    listLocalNames(Configuration::CFG_VARIABLES, names);
    names.sort();
    for (int idx = 0, count = names.length(); idx < count; idx++) {
        ConfigItem * entry = findItem(names[idx]);
        assert(entry->type() & Configuration::CFG_VARIABLES);
        entry->dump(buf, entry->name(), wantExpandedUidNames, indentLevel);
    }

    //--------
    // Pass 2: the nested scopes. The same vector is handed to
    // listLocalNames() again, exactly as in pass 1.
    //--------
    listLocalNames(Configuration::CFG_SCOPE, names);
    names.sort();
    for (int idx = 0, count = names.length(); idx < count; idx++) {
        ConfigItem * entry = findItem(names[idx]);
        assert(entry->type() == Configuration::CFG_SCOPE);
        entry->dump(buf, entry->name(), wantExpandedUidNames, indentLevel);
    }
}
// pairwiseDist // [[Rcpp::export]] NumericMatrix pairwiseDistRcpp (StringVector seq, NumericMatrix dist_mat) { // allocate the matrix we will return NumericMatrix rmat(seq.length(), seq.length()); for (int i = 0; i < rmat.nrow(); i++) { for (int j = 0; j < i; j++) { // check seq equal std::string row_seq = as<std::string>(seq[i]); std::string col_seq = as<std::string>(seq[j]); double distance = seqDistRcpp(row_seq, col_seq, dist_mat); // write to output matrix rmat(i,j) = distance; rmat(j,i) = distance; } } // Add row and column names Rcpp::List dimnames = Rcpp::List::create(seq.attr("names"), seq.attr("names")); rmat.attr("dimnames") = dimnames; return rmat; }
void calculateRuleForName( const Configuration * cfg, const char * name, const char * uName, const StringVector & wildcardedNamesAndTypes, StringBuffer & rule) { int i; int len; const char * str; const char * keyword; const char * wildcardedName; const char * type; rule.empty(); len = wildcardedNamesAndTypes.length(); for (i = 0; i < len; i+=3) { keyword = wildcardedNamesAndTypes[i+0]; // @optional or @required wildcardedName = wildcardedNamesAndTypes[i+1]; type = wildcardedNamesAndTypes[i+2]; if (Configuration::patternMatch(uName, wildcardedName)) { rule << keyword << " " << uName << " = " << type; return; } } //-------- // We couldn's determine the type from the wildcarded_names_and_types // table. So we fall back to using heuristics to guess a good type. //-------- if (cfg->type("", name) == Configuration::CFG_SCOPE) { rule << uName << " = scope"; } else if (cfg->type("", name) == Configuration::CFG_LIST) { rule << uName << " = list[string]"; } else { str = cfg->lookupString("", name); if (cfg->isBoolean(str)) { rule << uName << " = boolean"; } else if (cfg->isInt(str)) { rule << uName << " = int"; } else if (cfg->isFloat(str)) { rule << uName << " = float"; } else if (cfg->isDurationSeconds(str)) { rule << uName << " = durationSeconds"; } else if (cfg->isDurationMilliseconds(str)) { rule << uName << " = durationMilliseconds"; } else if (cfg->isDurationMicroseconds(str)) { rule << uName << " = durationMicroseconds"; } else if (cfg->isMemorySizeBytes(str)) { rule << uName << " = memorySizeBytes"; } else if (cfg->isMemorySizeKB(str)) { rule << uName << " = memorySizeKB"; } else if (cfg->isMemorySizeMB(str)) { rule << uName << " = memorySizeMB"; } else { rule << uName << " = string"; } } }
bool ConfigScope::listFilter( const char * name, const StringVector & filterPatterns) const { int i; int len; const char * unexpandedName; const char * pattern; StringBuffer buf; UidIdentifierProcessor uidProc; len = filterPatterns.length(); if (len == 0) { return true; } unexpandedName = uidProc.unexpand(name, buf); for (i = 0; i < len; i++) { pattern = filterPatterns[i]; if (Configuration::patternMatch(unexpandedName, pattern)) { return true; } } return false; }
void checkForUnmatchedPatterns( const Configuration * cfg, const StringVector & namesList, const StringVector & wildcardedNamesAndTypes, StringVector & unmatchedPatterns) throw(ConfigurationException) { int i; int len; const char * wildcardedName; unmatchedPatterns.empty(); //-------- // Check if there is a wildcarded name that does not match anything //-------- len = wildcardedNamesAndTypes.length(); for (i = 0; i < len; i += 3) { wildcardedName = wildcardedNamesAndTypes[i+1]; if (!doesPatternMatchAnyUnexpandedNameInList(cfg, wildcardedName, namesList)) { unmatchedPatterns.add(wildcardedName); } } }
bool SchemaTypeInt::isA( const SchemaValidator * sv, const Configuration * cfg, const char * value, const char * typeName, const StringVector & typeArgs, int indentLevel, StringBuffer & errSuffix) const { int val; int min; int max; try { val = cfg->stringToInt("", "", value); } catch (const ConfigurationException & ex) { return false; } if (typeArgs.length() == 0) { return true; } min = cfg->stringToInt("", "", typeArgs[0]); max = cfg->stringToInt("", "", typeArgs[1]); if (val < min || val > max) { errSuffix << "the value is outside the permitted range [" << typeArgs[0] << ", " << typeArgs[1] << "]"; return false; } return true; }
bool SchemaTypeDurationSeconds::isA( const SchemaValidator * sv, const Configuration * cfg, const char * value, const char * typeName, const StringVector & typeArgs, int indentLevel, StringBuffer & errSuffix) const { bool ok; int min; int max; int val; try { val = cfg->stringToDurationSeconds("", "", value); } catch (const ConfigurationException & ex) { errSuffix << "the value should be in the format '<units> <float>' " << "where <units> is one of: " << "second, seconds, " << "minute, minutes, " << "hour, hours, " << "day, days, " << "week, weeks; " << "alternatively, you can use 'infinite'"; return false; } if (typeArgs.length() == 0) { return true; } min = cfg->stringToDurationSeconds("", "", typeArgs[0]); max = cfg->stringToDurationSeconds("", "", typeArgs[1]); //-------- // We want to test for "min <= val && val <= max", but this is // is complicated by using "-1" for the numerical value of "infinite". //-------- if (min == -1) { assert(max == -1); ok = (val == -1); } else if (val == -1 && max == -1) { ok = true; } else if (val >= min && (val <= max || max == -1)) { ok = true; } else { ok = false; } if (!ok) { errSuffix << "the value is outside the permitted range [" << typeArgs[0] << ", " << typeArgs[1] << "]"; return false; } return true; }
void SchemaTypeTuple::checkRule( const SchemaValidator * sv, const Configuration * cfg, const char * typeName, const StringVector & typeArgs, const char * rule) const throw(ConfigurationException) { (void) cfg; StringBuffer msg; int i; int len; const char * elemType; SchemaType * typeDef; //-------- // Check there is at least one pair of type and name arguments. //-------- len = typeArgs.length(); if ((len == 0) || (len % 2 != 0)) { msg << "the '" << typeName << "' type requires pairs of type and " << "name arguments in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } //-------- // Check that all the type arguments are valid types. //-------- for (i = 0; i < len; i+=2) { elemType = typeArgs[i+0]; typeDef = findType(sv, elemType); if (typeDef == 0) { msg << "unknown type '" << elemType << "' in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } switch (typeDef->cfgType()) { case Configuration::CFG_STRING: break; case Configuration::CFG_LIST: msg << "you cannot embed a list type ('" << elemType << "') inside a " << "tuple in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); case Configuration::CFG_SCOPE: msg << "you cannot embed a scope type ('" << elemType << "') inside a " << "tuple in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); default: assert(0); // Bug! } } }
//----------------------------------------------------------------------
// Function:    doesVectorcontainString()
//
// Description: Return true if some element of "vec" compares equal to
//              "str" under strcmp().
//----------------------------------------------------------------------
bool
doesVectorcontainString(const StringVector & vec, const char * str)
{
    const int count = vec.length();
    int idx = 0;
    while (idx < count && strcmp(vec[idx], str) != 0) {
        idx++;
    }
    // Stopping before the end means an equal string was found.
    return idx < count;
}
//----------------------------------------------------------------------
// Function:    add()
//
// Description: Append every string of "other" to this vector by calling
//              the single-string add() once per element. Capacity is
//              enlarged up front when the combined size would reach
//              the current maximum.
//----------------------------------------------------------------------
void
StringVector::add(const StringVector & other)
{
    const int numToAdd = other.length();
    const int combined = m_currSize + numToAdd;

    if (combined >= m_maxSize) {
        ensureCapacity(combined * 2);
    }

    for (int idx = 0; idx != numToAdd; ++idx) {
        add(other[idx]);
    }
}
//----------------------------------------------------------------------
// Function:    checkRule()
//
// Description: A user-defined (typedef) type takes no arguments.
//              Throws ConfigurationException if any are supplied.
//----------------------------------------------------------------------
void
SchemaTypeTypedef::checkRule(
    const SchemaValidator * sv,
    const Configuration *   cfg,
    const char *            typeName,
    const StringVector &    typeArgs,
    const char *            rule) const throw(ConfigurationException)
{
    if (typeArgs.length() == 0) {
        return;     // nothing to complain about
    }

    StringBuffer            msg;
    msg << "you cannot specify arguments when using user-defined type '"
        << typeName << "' in '" << rule << "'";
    throw ConfigurationException(msg.c_str());
}
//----------------------------------------------------------------------
// Function:    checkRule()
//
// Description: The scope type takes no arguments.
//              Throws ConfigurationException if any are supplied.
//----------------------------------------------------------------------
void
SchemaTypeScope::checkRule(
    const SchemaValidator * sv,
    const Configuration *   cfg,
    const char *            typeName,
    const StringVector &    typeArgs,
    const char *            rule) const throw(ConfigurationException)
{
    if (typeArgs.length() == 0) {
        return;     // nothing to complain about
    }

    StringBuffer            msg;
    msg << "the '" << typeName << "' type should not take arguments "
        << "in rule '" << rule << "'";
    throw ConfigurationException(msg.c_str());
}
void SchemaTypeInt::checkRule( const SchemaValidator * sv, const Configuration * cfg, const char * typeName, const StringVector & typeArgs, const char * rule) const throw(ConfigurationException) { StringBuffer msg; int len; int min; int max; len = typeArgs.length(); if (len == 0) { return; } if (len != 2) { msg << "the '" << typeName << "' type should take either no " << "arguments or 2 arguments (denoting min and max values) " << "in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } try { min = cfg->stringToInt("", "", typeArgs[0]); } catch (const ConfigurationException & ex) { msg << "non-integer value for the first ('min') argument in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } try { max = cfg->stringToInt("", "", typeArgs[1]); } catch (const ConfigurationException & ex) { msg << "non-integer value for the second ('max') argument in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } if (min > max) { msg << "the first ('min') value is larger than the second ('max') " << "argument " << "in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } }
bool doesPatternMatchAnyUnexpandedNameInList( const Configuration * cfg, const char * pattern, const StringVector & namesList) { int i; int len; const char * uName; StringBuffer buf; len = namesList.length(); for (i = 0; i < len; i++) { uName = cfg->unexpandUid(namesList[i], buf); if (Configuration::patternMatch(uName, pattern)) { return true; } } return false; }
//----------------------------------------------------------------------
// Function:    addWithOwnership()
//
// Description: Move every string pointer held by "other" to the end of
//              this vector without copying the strings. Afterwards
//              "other" has a size of 0 and a null first slot.
//
// Notes:       The array is kept terminated by a null pointer in the
//              slot directly after the last element, which is how
//              "other" is left (m_currSize == 0, m_array[0] == 0).
//              Because m_currSize has already been advanced past the
//              last moved element when the loop ends, the terminator
//              belongs at m_array[m_currSize], not one slot further.
//----------------------------------------------------------------------
void
StringVector::addWithOwnership(StringVector & other)
{
    int             i;
    int             otherLen;

    otherLen = other.length();
    if (m_currSize + otherLen >= m_maxSize) {
        ensureCapacity( (m_currSize + otherLen) * 2 );
    }

    for (i = 0; i < otherLen; i++) {
        m_array[m_currSize] = other.m_array[i];
        other.m_array[i] = 0;   // "other" no longer refers to this string
        m_currSize ++;
    }
    m_array[m_currSize] = 0;    // terminator right after the last element

    other.m_currSize = 0;
    other.m_array[0] = 0;
}
bool SchemaTypeTypedef::isA( const SchemaValidator * sv, const Configuration * cfg, const char * value, const char * typeName, const StringVector & typeArgs, int indentLevel, StringBuffer & errSuffix) const { (void) value; (void) typeName; assert(typeArgs.length() == 0); const char* baseTypeName = m_baseTypeName.c_str(); SchemaType* baseTypeDef = findType(sv, baseTypeName); assert(baseTypeDef != 0); bool result = callIsA(baseTypeDef, sv, cfg, value, baseTypeName, m_baseTypeArgs, indentLevel + 1, errSuffix); return result; }
void ConfigScope::listScopedNamesHelper( const char * prefix, Configuration::Type typeMask, bool recursive, const StringVector & filterPatterns, StringVector & vec) const { int i; ConfigScopeEntry * entry; StringBuffer scopedName; //-------- // Iterate over all the entries in the hash table and copy // their locally-scoped names into the StringVector //-------- vec.ensureCapacity(vec.length() + m_numEntries); for (i = 0; i < m_tableSize; i++) { entry = m_table[i].m_next; while (entry) { scopedName = prefix; if (prefix[0] != '\0') { scopedName.append("."); } scopedName.append(entry->name()); if ((entry->type() & typeMask) && listFilter(scopedName.c_str(), filterPatterns)) { vec.add(scopedName); } if (recursive && entry->type() == Configuration::CFG_SCOPE) { entry->item()->scopeVal()->listScopedNamesHelper( scopedName.c_str(), typeMask, true, filterPatterns, vec); } entry = entry->m_next; } } }
void SchemaTypeTypedef::validate( const SchemaValidator * sv, const Configuration * cfg, const char * scope, const char * name, const char * typeName, const char * origTypeName, const StringVector & typeArgs, int indentLevel) const throw(ConfigurationException) { StringBuffer msg; SchemaType * baseTypeDef; const char * baseTypeName; assert(typeArgs.length() == 0); baseTypeName = m_baseTypeName.c_str(); baseTypeDef = findType(sv, baseTypeName); callValidate(baseTypeDef, sv, cfg, scope, name, baseTypeName, origTypeName, m_baseTypeArgs, indentLevel + 1); }
void calculateSchema( const Configuration * cfg, const StringVector & namesList, const StringVector & recipeUserTypes, const StringVector & wildcardedNamesAndTypes, const StringVector & recipeIgnoreRules, StringVector & schema) throw(ConfigurationException) { int i; int len; StringBuffer rule; StringBuffer buf; const char * name; const char * uName; StringVector uidNames; schema.empty(); schema.add(recipeIgnoreRules); schema.add(recipeUserTypes); len = namesList.length(); for (i = 0; i < len; i++) { name = namesList[i]; if (strstr(name, "uid-") == 0) { calculateRuleForName(cfg, name, name, wildcardedNamesAndTypes,rule); schema.add(rule); } else { uName = cfg->unexpandUid(name, buf); if (!doesVectorcontainString(uidNames, uName)) { uidNames.add(uName); calculateRuleForName(cfg, name, uName, wildcardedNamesAndTypes, rule); schema.add(rule); } } } }
// Convert a data frame into a vector of Example objects, one per row.
//
// Column handling:
//   - "label"  : if present, each entry is converted to a string and parsed
//                with std::stoi into Example::label; otherwise label is 0.
//   - "weight" : if present, copied into Example::weight; otherwise every
//                example gets the uniform weight 1.0 / (number of rows).
//   - all other columns are read as numeric feature columns and appended to
//     Example::values in the order in which they appear in data.names().
//
// The examples are built as automatic (stack) objects and copied into the
// result vector, so nothing allocated here outlives the call.
vector<Example> createExampleVectorFromDataFrame(DataFrame data) {
    const int example_number = data.nrows();

    StringVector labels;
    NumericVector weights;
    bool labelsExist = false;
    bool weightsExist = false;

    // Split the columns into label / weight / features
    StringVector names = data.names();
    StringVector feature_names;
    for (int i = 0; i < names.length(); ++i) {
        if (names[i] == "label") {
            labels = data["label"];
            labelsExist = true;
        } else if (names[i] == "weight") {
            weights = data["weight"];
            weightsExist = true;
        } else {
            feature_names.push_back(names[i]);
        }
    }

    // One example per row; feature values are filled in afterwards
    vector<Example> examples;
    examples.reserve(example_number);
    for (int i = 0; i < example_number; ++i) {
        Example example;
        if (labelsExist) {
            example.label = std::stoi(as<std::string>(labels[i]));
        } else {
            example.label = 0;
        }
        if (weightsExist) {
            example.weight = weights[i];
        } else {
            example.weight = 1.0 / example_number;
        }
        example.values = vector<Value>();
        example.values.reserve(feature_names.size());
        examples.push_back(example);
    }

    // Append the features column by column
    for (int j = 0; j < feature_names.size(); ++j) {
        NumericVector current_feature = data[String(feature_names[j])];
        for (int i = 0; i < example_number; ++i) {
            examples[i].values.push_back(current_feature[i]);
        }
    }
    return examples;
}
int main(int argc, char ** argv) { bool ok; Configuration * cfg; Configuration * schemaCfg; Config2Cpp util("config2cpp"); StringVector namesList; StringVector recipeUserTypes; StringVector wildcardedNamesAndTypes; StringVector recipeIgnoreRules; StringVector unmatchedPatterns; StringVector schema; SchemaValidator sv; const char * scope; int i; int len; const char * overrideSchema[] = { "@typedef keyword = enum[\"@optional\", \"@required\"]", "user_types = list[string]", "wildcarded_names_and_types = table[keyword,keyword, " "string,wildcarded-name, string,type]", "ignore_rules = list[string]", 0 // null-terminated array }; ok = util.parseCmdLineArgs(argc, argv); cfg = Configuration::create(); schemaCfg = Configuration::create(); if (ok && util.wantSchema()) { try { cfg->parse(util.cfgFileName()); cfg->listFullyScopedNames("", "", Configuration::CFG_SCOPE_AND_VARS, true, namesList); if (util.schemaOverrideCfg() != 0) { schemaCfg->parse(util.schemaOverrideCfg()); scope = util.schemaOverrideScope(); sv.parseSchema(overrideSchema); sv.validate(schemaCfg, scope, ""); schemaCfg->lookupList(scope, "user_types", recipeUserTypes); schemaCfg->lookupList(scope, "wildcarded_names_and_types", wildcardedNamesAndTypes); schemaCfg->lookupList(scope, "ignore_rules", recipeIgnoreRules); } calculateSchema(cfg, namesList, recipeUserTypes, wildcardedNamesAndTypes, recipeIgnoreRules, schema); checkForUnmatchedPatterns(cfg, namesList, wildcardedNamesAndTypes, unmatchedPatterns); } catch(const ConfigurationException & ex) { fprintf(stderr, "%s\n", ex.c_str()); ok = false; } len = unmatchedPatterns.length(); if (len != 0) { fprintf(stderr, "%s %s\n", "Error: the following patterns in the schema", "recipe did not match anything"); for (i = 0; i < len; i++) { fprintf(stderr, "\t'%s'\n", unmatchedPatterns[i]); } ok = false; } } if (ok) { ok = util.generateFiles(schema.c_array(), schema.length()); } cfg->destroy(); if (ok) { return 0; } else { return 1; } }
void SchemaTypeTuple::validate( const SchemaValidator * sv, const Configuration * cfg, const char * scope, const char * name, const char * typeName, const char * origTypeName, const StringVector & typeArgs, int indentLevel) const throw(ConfigurationException) { (void) origTypeName; StringBuffer msg; StringBuffer errSuffix; StringBuffer fullyScopedName; const char ** list; const char * elemValue; const char * elemTypeName; int i; int listSize; int typeArgsSize; int elemNameIndex; int typeIndex; int numElems; SchemaType * elemTypeDef; StringVector emptyArgs; bool ok; const char * sep; //-------- // Check the length of the list matches the size of the tuple //-------- typeArgsSize = typeArgs.length(); assert(typeArgsSize != 0); assert(typeArgsSize % 2 == 0); numElems = typeArgsSize / 2; cfg->lookupList(scope, name, list, listSize); if (listSize != numElems) { cfg->mergeNames(scope, name, fullyScopedName); msg << cfg->fileName() << ": there should be " << numElems << " entries in the '" << fullyScopedName << "' " << typeName << "; entries denote"; for (i = 0; i < numElems; i++) { msg << " '" << typeArgs[i*2+0] << "'"; if (i < numElems-1) { msg << ","; } } throw ConfigurationException(msg.c_str()); } //-------- // Check each item is of the type specified in the tuple //-------- for (i = 0; i < listSize; i++) { typeIndex = (i * 2 + 0) % typeArgsSize; elemNameIndex = (i * 2 + 1) % typeArgsSize; elemValue = list[i]; elemTypeName = typeArgs[typeIndex]; elemTypeDef = findType(sv, elemTypeName); ok = callIsA(elemTypeDef, sv, cfg, elemValue, elemTypeName, emptyArgs, indentLevel + 1, errSuffix); if (!ok) { if (errSuffix.length() == 0) { sep = ""; } else { sep = "; "; } cfg->mergeNames(scope, name, fullyScopedName); msg << cfg->fileName() << ": bad " << elemTypeName << " value ('" << elemValue << "') for element " << i+1 << " ('" << typeArgs[elemNameIndex] << "') of the '" << fullyScopedName << "' " << typeName << sep << errSuffix; throw ConfigurationException(msg.c_str()); } } }
void SchemaTypeDurationSeconds::checkRule( const SchemaValidator * sv, const Configuration * cfg, const char * typeName, const StringVector & typeArgs, const char * rule) const throw(ConfigurationException) { StringBuffer msg; int len; int min; int max; len = typeArgs.length(); if (len == 0) { return; } if (len != 2) { msg << "The '" << typeName << "' type should take " << "either no arguments or 2 arguments (denoting " << "min and max durations) in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } try { min = cfg->stringToDurationSeconds("", "", typeArgs[0]); } catch (const ConfigurationException & ex) { msg << "Bad " << typeName << " value for the first ('min') " << "argument in rule '" << rule << "'; should be 'infinite' " << "or in the format '<float> <units>' where <units> is one of: " << "'second', 'seconds', " << "'minute', 'minutes', " << "'hour', 'hours', " << "'day', 'days', " << "'week', 'weeks"; throw ConfigurationException(msg.c_str()); } try { max = cfg->stringToDurationSeconds("", "", typeArgs[1]); } catch (const ConfigurationException & ex) { msg << "Bad " << typeName << " value for the second ('max') " << "argument in rule '" << rule << "'; should be 'infinite' " << "or in the format '<float> <units>' where <units> is one of: " << "'second', 'seconds', " << "'minute', 'minutes', " << "'hour', 'hours', " << "'day', 'days', " << "'week', 'weeks"; throw ConfigurationException(msg.c_str()); } if ((min < -1) || (max < -1)) { msg << "The 'min' and 'max' of a " << typeName << " cannot be negative in rule '" << rule << "'" << "; min=" << min << "; max=" << max; throw ConfigurationException(msg.c_str()); } if ((max != -1) && (min == -1 || min > max)) { msg << "The first ('min') argument is larger than the second " << "('max') argument in rule '" << rule << "'"; throw ConfigurationException(msg.c_str()); } }