/*
 * Reads text from standard input and reports the 1-based index of the first
 * word in which a vowel (a, o, e, i, u, y) occurs more than once.
 *
 * Input is lower-cased with toLower() before classification. Words are
 * delimited by characters for which isSpace() is true; every other character
 * (letter or not) marks the current word as non-empty. The per-word vowel
 * bookkeeping is reset on every space character.
 *
 * The whole input is always consumed, even after a match has been found.
 * Always returns 0.
 */
int main(void) {
  /* Vowels, in the order they are inserted into the set. */
  static const char vowels[] = "aoeiuy";

  int i, ch;
  int inWord = 0;         /* non-zero once the current word has any character */
  int wordsFinished = 0;  /* number of words already terminated by a space */
  int matchFound = 0;     /* non-zero once a repeated vowel has been seen */
  int firstMatchIndex = 0; /* 1-based index of the first matching word */
  unsigned int consonantSet = 0; /* letters a..z minus the vowels */
  unsigned int allVowels = 0;    /* pristine vowel set used to reset per word */
  unsigned int unseenVowels = 0; /* vowels not yet met in the current word */

  for (i = 0; i < 26; i++) addToSet(&consonantSet, 'a' + i);
  for (i = 0; vowels[i] != '\0'; i++) addToSet(&allVowels, vowels[i]);
  consonantSet = setSubtract(consonantSet, allVowels);
  unseenVowels = allVowels;

  while ((ch = getchar()) != EOF) {
    ch = toLower(ch);

    if (isLetter(ch)) {
      inWord = 1;
      if (isKeyExists(consonantSet, ch)) continue;

      if (isKeyExists(unseenVowels, ch)) {
        /* First occurrence of this vowel in the current word. */
        removeFromSet(&unseenVowels, ch);
      } else if (!matchFound) {
        /* Letter is neither a consonant nor an unseen vowel: a repeat.
         * Only the first such word is remembered. */
        matchFound = 1;
        firstMatchIndex = wordsFinished + 1;
      }
    } else if (isSpace(ch)) {
      /* Consecutive spaces must not be counted as empty words. */
      if (inWord) wordsFinished++;
      inWord = 0;
      unseenVowels = allVowels;
    } else {
      /* Non-letter, non-space characters still belong to a word. */
      inWord = 1;
    }
  }

  if (matchFound)
    printf("Слово #%d - содержит одинаковые гласные\n", firstMatchIndex);
  else
    printf("Не найдено ни одного слова с повторяющимися гласными\n");

  return 0;
}
int DataLoader::loadMultiplePhenotype(const std::string& multiplePhenotype, const std::string& pheno, const std::string& covar) { this->FLAG_pheno = pheno; this->FLAG_cov = covar; this->FLAG_multiplePheno = multiplePhenotype; if (covar.empty()) { this->FLAG_cov = this->FLAG_pheno; } // read in analysis template TextMatrix textMat; textMat.readFile(FLAG_multiplePheno, TextMatrix::HAS_HEADER); if (textMat.nrow() == 0 || which(textMat.header(), "pheno") < 0 || which(textMat.header(), "covar") < 0) { logger->warn( "Wrong multiple phenotype analysis file (no correct headers: \"pheno " "covar\")"); exit(1); } const int nTests = textMat.nrow(); const int phenoCol = which(textMat.getColName(), "pheno"); const int covCol = which(textMat.getColName(), "covar"); for (int i = 0; i < nTests; ++i) { formula.add(textMat[i][phenoCol], textMat[i][covCol]); } logger->info("Load [ %d ] test formulae", (int)formula.size()); std::vector<std::string> phenoLabel = formula.extractResponse(); std::vector<std::string> covarLabel = formula.extractPredictor(FormulaVector::NO_INTERCEPT); // std::vector<std::string> phenoLabel = textMat.extractCol("pheno"); // std::vector<std::string> covarLabel = // extractCovariate(textMat.extractCol("covar")); // read in ped TextMatrix pedMat; if (pedMat.readFile(FLAG_pheno, TextMatrix::HAS_HEADER)) { logger->error("Failed to load phenotype file [ %s ]!", FLAG_pheno.c_str()); exit(1); } if (pedMat.nrow() == 0 || pedMat.header()[0] != "fid" || pedMat.header()[1] != "iid") { logger->warn("Wrong phenotype file [ %s ]", pheno.c_str()); exit(1); } pedMat.setRowNameByCol("iid"); pedMat.keepCol(phenoLabel); if (pedMat.ncol() != (int)phenoLabel.size()) { logger->error( "Required responses [ %s ] cannot be found in [ %s ]", stringJoin(setSubtract(phenoLabel, pedMat.getColName()), ' ').c_str(), FLAG_pheno.c_str()); exit(1); } // read in cov TextMatrix covMat; if (covMat.readFile(FLAG_cov, TextMatrix::HAS_HEADER)) { logger->error("Failed to load covariate file [ %s 
]!", FLAG_cov.c_str()); exit(1); } if (covMat.nrow() == 0 || tolower(covMat.header()[0]) != "fid" || tolower(covMat.header()[1]) != "iid") { logger->warn( "Wrong covariate file - empty or unrecognized header line [ %s ]", covar.c_str()); exit(1); } covMat.setRowNameByCol("iid"); covMat.keepCol(covarLabel); if (covMat.ncol() != (int)covarLabel.size()) { logger->error( "Required covariates [ %s ] cannot be found in [ %s ]", stringJoin(setSubtract(covarLabel, covMat.getColName()), ' ').c_str(), FLAG_cov.c_str()); exit(1); } // orangize ped/cov pedMat.convert(&phenotype); covMat.convert(&covariate); // make sure covarite and phenotype have the sample sets of samples std::vector<std::string> commonSample = intersect((phenotype.getRowName()), (covariate.getRowName())); phenotype.keepRow(commonSample); covariate.keepRow(commonSample); // drop all missing rows std::vector<int> missing = intersect((phenotype.allMissingRows()), (covariate.allMissingRows())); phenotype.dropRow(missing); covariate.dropRow(missing); // NOTE: do not take imputePheno, imputeCov // NOTE: do not center, scale ... // Actual regression model will center phenotype and covariate return 0; }
/** * Rearrange phenotype rows in the order given by @param names * @param droppedNamed will store sample names who are in @param names but are * not listed in phenotype. * * NOTE: @param names usually is the VCF sample names * NOTE: Current phenotype data may have NAN values */ int DataLoader::arrangePhenotype(const std::vector<std::string>& names, std::vector<std::string>* droppedNames) { if (!isUnique(names)) { logger->error("VCF file have duplicated sample ids. Quitting!"); abort(); } // not impute phentoype if (!FLAG_imputePheno) { std::vector<std::string> noGenotypeSamples = setSubtract(phenotype.getRowName(), names); *droppedNames = setSubtract(names, phenotype.getRowName()); const int n = noGenotypeSamples.size(); if (n) { logger->info("Discard [ %d ] samples as they do not have genotypes", n); } phenotype.dropRow(noGenotypeSamples); phenotype.reorderRow(names); // TODO: print some info here return 0; } // imputation std::vector<std::string> noPhenotypeSamples = setSubtract(names, phenotype.getRowName()); const int n = noPhenotypeSamples.size(); if (n) { logger->info( "Impute [ %d ] phenotypes of [ %d ] samples to the mean values", phenotype.ncol(), n); phenotype.addRow(noPhenotypeSamples, NAN); phenotype.imputeMissingToMeanByCol(); } phenotype.reorderRow(names); return 0; #if 0 // phenotype names (vcf sample names) arranged in the same order as in VCF std::vector<std::string> phenotypeNameInOrder; std::vector<double> phenotypeValueInOrder; // phenotype arranged in the same order as in VCF // TODO: better support for multiple phenotypes std::map<std::string, double> phenoDict; for (int i = 0; i < phenotype.nrow(); ++i) { phenoDict[phenotype.getRowName()[i]] = phenotype[i][0]; } rearrange(phenoDict, names, droppedNames, &phenotypeNameInOrder, &phenotypeValueInOrder, FLAG_imputePheno); // rearrange(phenoDict, vcfSampleNames, &vcfSampleToDrop, // &phenotypeNameInOrder, // &phenotypeInOrder, FLAG_imputePheno); phenotype.resize(phenotypeNameInOrder.size(), 1); 
phenotype.setRowName(phenotypeNameInOrder); phenotype.setCol(0, phenotypeValueInOrder); if (phenotypeValueInOrder.size() != phenoDict.size()) { logger->warn( "Drop [ %d ] samples from phenotype file due to missing genotypes from " "VCF files", (int)(phenoDict.size() - phenotypeValueInOrder.size())); // We may output these samples by comparing keys of phenotype and // phenotypeNameInOrder } #endif return 0; }