// Strip the leading directories and // the last trailling suffix from a filename std::string stripFilename(const std::string& filename) { std::string out = stripDirectories(filename); // Remove the gzip extension if necessary if(isGzip(out)) out = stripExtension(out); return stripExtension(out); }
// // Handle command line arguments // void parseRmdupOptions(int argc, char** argv) { // Set defaults bool die = false; for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { case 'p': arg >> opt::prefix; break; case 'o': arg >> opt::outFile; break; case 'e': arg >> opt::errorRate; break; case 'd': arg >> opt::sampleRate; break; case 't': arg >> opt::numThreads; break; case 'v': opt::verbose++; break; case OPT_HELP: std::cout << RMDUP_USAGE_MESSAGE; exit(EXIT_SUCCESS); case OPT_VERSION: std::cout << RMDUP_VERSION_MESSAGE; exit(EXIT_SUCCESS); } } if (argc - optind < 1) { std::cerr << SUBPROGRAM ": missing arguments\n"; die = true; } else if (argc - optind > 1) { std::cerr << SUBPROGRAM ": too many arguments\n"; die = true; } if (die) { std::cerr << "Try `" << SUBPROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } // Parse the input filenames opt::readsFile = argv[optind++]; if(opt::prefix.empty()) { opt::prefix = stripExtension(opt::readsFile); } if(opt::outFile.empty()) { opt::outFile = stripExtension(opt::readsFile) + ".rmdup.fa"; } }
/*! Builds a namespace definition.
 *
 *  \a df, \a dl, \a dc and \a name are handed on to the Definition base.
 *  \a lref is stored through setReference() and also decides how \a fName
 *  is turned into the file name: with a reference the stripped name is
 *  used as is, without one it additionally goes through
 *  convertNameToFile(). When \a fName is null the file name is derived
 *  from \a name via setFileName().
 *  \a type selects the kind: "module", "constants" and "library" map to
 *  MODULE, CONSTANT_GROUP and LIBRARY; anything else (or null) is a plain
 *  NAMESPACE.
 */
NamespaceDef::NamespaceDef(const char *df,int dl,int dc,
                           const char *name,const char *lref,
                           const char *fName, const char*type,
                           bool isPublished) :
   Definition(df,dl,dc,name)
  ,m_isPublished(isPublished)
{
  // pick the output file name
  if (!fName)
  {
    setFileName(name);
  }
  else if (lref)
  {
    fileName = stripExtension(fName);
  }
  else
  {
    fileName = convertNameToFile(stripExtension(fName));
  }

  // containers for nested compounds
  classSDict       = new ClassSDict(17);
  namespaceSDict   = new NamespaceSDict(17);
  m_innerCompounds = new SDict<Definition>(17);

  // lists that start out unset
  usingDirList     = 0;
  usingDeclList    = 0;
  m_allMembersDict = 0;

  setReference(lref);

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  visited       = FALSE;
  m_subGrouping = Config_getBool(SUBGROUPING);

  // a plain namespace unless the type string names one of the special kinds
  m_type = NAMESPACE;
  if (type)
  {
    if (strcmp("module", type)==0)
    {
      m_type = MODULE;
    }
    else if (strcmp("constants", type)==0)
    {
      m_type = CONSTANT_GROUP;
    }
    else if (strcmp("library", type)==0)
    {
      m_type = LIBRARY;
    }
  }
}
/*! Builds a namespace definition.
 *
 *  \a df, \a dl and \a name are handed on to the Definition base and
 *  \a lref is stored through setReference(). The file name is \a fName
 *  with its extension stripped, or "namespace" followed by \a name when
 *  no \a fName is supplied.
 */
NamespaceDef::NamespaceDef(const char *df,int dl,
                           const char *name,const char *lref,
                           const char *fName) :
   Definition(df,dl,name)
{
  // pick the output file name
  if (!fName)
  {
    fileName  = "namespace";
    fileName += name;
  }
  else
  {
    fileName = stripExtension(fName);
  }

  // containers for nested compounds
  classSDict       = new ClassSDict(17);
  namespaceSDict   = new NamespaceSDict(17);
  m_innerCompounds = new SDict<Definition>(17);

  // lists that start out unset
  usingDirList     = 0;
  usingDeclList    = 0;
  m_allMembersDict = 0;

  setReference(lref);

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  visited       = FALSE;
  m_subGrouping = Config_getBool("SUBGROUPING");
}
/*! Builds a group definition.
 *
 *  \a df, \a dl and \a na are handed on to the Definition base and \a t is
 *  installed with setGroupTitle(). The file name is \a refFileName with
 *  its extension stripped, or "group_" followed by \a na when no
 *  \a refFileName is supplied.
 */
GroupDef::GroupDef(const char *df,int dl,const char *na,const char *t,
                   const char *refFileName) : Definition(df,dl,na)
{
  // containers for everything a group can hold
  fileList               = new FileList;
  classSDict             = new ClassSDict(17);
  groupList              = new GroupList;
  namespaceSDict         = new NamespaceSDict(17);
  pageDict               = new PageSDict(17);
  exampleDict            = new PageSDict(17);
  dirList                = new DirList;
  allMemberNameInfoSDict = new MemberNameInfoSDict(17);

  // pick the output file name
  if (!refFileName)
  {
    fileName = QCString("group_")+na;
  }
  else
  {
    fileName = stripExtension(refFileName);
  }

  setGroupTitle( t );

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  allMemberList = new MemberList(MemberList::allMembersList);

  visited    = 0;
  groupScope = 0;
}
// Produce the schema <include> line for one dependency: the descriptor's
// name with its extension replaced by ".xsd". The second argument is not
// used by this function.
string writeDependency(const FileDescriptor* desc, void* param)
{
	string includeLine = "\t\t\t<include schemaLocation=\"";
	includeLine += stripExtension(desc->name());
	includeLine += ".xsd\"/>\n";
	return includeLine;
}
/*! Builds a group definition.
 *
 *  \a df, \a dl and \a na are handed on to the Definition base (with a
 *  fixed third argument of 1) and \a t is installed with setGroupTitle().
 *  The file name is \a refFileName with its extension stripped, or the
 *  result of convertNameToFile() on "group_" followed by \a na when no
 *  \a refFileName is supplied.
 */
GroupDef::GroupDef(const char *df,int dl,const char *na,const char *t,
                   const char *refFileName) : Definition(df,dl,1,na)
{
  // containers for everything a group can hold
  fileList       = new FileList;
  classSDict     = new ClassSDict(17);
  groupList      = new GroupList;
  namespaceSDict = new NamespaceSDict(17);
  pageDict       = new PageSDict(17);
  exampleDict    = new PageSDict(17);
  dirList        = new DirList;

  allMemberNameInfoSDict = new MemberNameInfoSDict(17);
  allMemberNameInfoSDict->setAutoDelete(TRUE);

  // pick the output file name
  if (!refFileName)
  {
    fileName = convertNameToFile(QCString("group_")+na);
  }
  else
  {
    fileName = stripExtension(refFileName);
  }

  setGroupTitle( t );

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  allMemberList = new MemberList(MemberListType_allMembersList);

  visited       = 0;
  groupScope    = 0;
  m_subGrouping = Config_getBool(SUBGROUPING);
}
// // Handle command line arguments // void parseFMMergeOptions(int argc, char** argv) { bool die = false; for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { case 'm': arg >> opt::minOverlap; break; case 'p': arg >> opt::prefix; break; case 'o': arg >> opt::outFile; break; case 't': arg >> opt::numThreads; break; case '?': die = true; break; case 'v': opt::verbose++; break; case OPT_HELP: std::cout << FMMERGE_USAGE_MESSAGE; exit(EXIT_SUCCESS); case OPT_VERSION: std::cout << FMMERGE_VERSION_MESSAGE; exit(EXIT_SUCCESS); } } if (argc - optind < 1) { std::cerr << SUBPROGRAM ": missing arguments\n"; die = true; } else if (argc - optind > 1) { std::cerr << SUBPROGRAM ": too many arguments\n"; die = true; } if(opt::numThreads <= 0) { std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::numThreads << "\n"; die = true; } if (die) { std::cout << "\n" << FMMERGE_USAGE_MESSAGE; exit(EXIT_FAILURE); } // Parse the input filenames opt::readsFile = argv[optind++]; if(opt::prefix.empty()) { opt::prefix = stripExtension(opt::readsFile); } if(opt::outFile.empty()) { opt::outFile = opt::prefix + ".merged.fa"; } }
// creates a file with the right extension for the plugin type. // as of Aalto 1.6 / Kaivo this is always .mlpreset. // input: a file path relative to the presets root, without extension. // void MLPluginProcessor::saveStateToRelativePath(const std::string& path) { #if DEMO debug() << "DEMO version. Saving is disabled.\n"; #else // the Model param contains the file path relative to the root. std::string shortPath = stripExtension(path); setProperty("preset", shortPath); std::string extension (".mlpreset"); std::string extPath = shortPath + extension; const MLFilePtr f = mPresetFiles->createFile(extPath); if(!f->getJuceFile().exists()) { f->getJuceFile().create(); } f->getJuceFile().replaceWithText(getStateAsText()); // reset state stack and push current state for recall mpPatchState->clearStateStack(); mpPatchState->pushStateToStack(); #endif // DEMO }
// Loads the image stored in fileName and derives the image's name from
// the file name with directory and extension removed.
// Throws CannotOpenImageException when the loaded image is empty.
SmallImage::SmallImage(string fileName)
{
    mImage = imread(fileName);

    if (mImage.empty())
    {
        // Throw by value. The former `throw *(new ...)` copied a heap
        // object into the exception and never released the original.
        throw CannotOpenImageException(fileName);
    }

    mName = stripExtension(stripDirectory(fileName));
}
// Loads the state stored in the preset file registered under the given
// name and records that name, minus its extension, in the "preset"
// property. An empty path or an unknown file leaves everything untouched.
void MLPluginProcessor::loadStateFromPath(const std::string& path)
{
	if(path.empty())
	{
		return;
	}

	const MLFilePtr f = mPresetFiles->getFileByName(path);
	if(f != MLFilePtr())
	{
		loadStateFromFile(f->mFile);

		std::string presetName = stripExtension(path);
		setProperty("preset", presetName);
	}
}
bool WsdlGeneratorHelper::createOutputFile(void) { bool success = true; try { protoFilenameNoExt = stripExtension(proto->name()); protoFilenameNoExtNoPath = stripPath(protoFilenameNoExt); string wsdlName = protoFilenameNoExtNoPath; wsdlName.append(".wsdl"); wsdl = outputDirectory->Open(wsdlName); success = writeFileContents(); } catch (string s) { success = false; *error = s; } return success; }
void GameParamUserInterface::onActivate() { TNLAssert(getUIManager()->cameFrom<EditorUserInterface>(), "GameParamUserInterface should only be called from the editor!"); Level *level = getUIManager()->getUI<EditorUserInterface>()->getLevel(); const GameType *gameType = level->getGameType(); // Force rebuild of all params for current gameType; this will make sure we have the latest info if we've loaded a new level, // but will also preserve any values entered for gameTypes that are not current. clearCurrentGameTypeParams(gameType); // Load filename from editor only when we activate the menu mLevelFilename = stripExtension(getUIManager()->getUI<EditorUserInterface>()->getLevelFileName()); if(mLevelFilename == EditorUserInterface::UnnamedFile) mLevelFilename = ""; updateMenuItems(gameType); mOrigGameParams = level->toLevelCode(); // Save a copy of the params coming in for comparison when we leave to see what changed Cursor::disableCursor(); }
/*! Builds a group definition.
 *
 *  \a df, \a dl and \a na are handed on to the Definition base and \a t is
 *  installed with setGroupTitle(). The file name is \a refFileName with
 *  its extension stripped, or "group_" followed by \a na when no
 *  \a refFileName is supplied. Every per-kind member list is flagged as
 *  living inside a group.
 */
GroupDef::GroupDef(const char *df,int dl,const char *na,const char *t,
                   const char *refFileName) : Definition(df,dl,na)
{
  // containers for everything a group can hold
  fileList               = new FileList;
  classSDict             = new ClassSDict(17);
  groupList              = new GroupList;
  namespaceSDict         = new NamespaceSDict(17);
  pageDict               = new PageSDict(17);
  exampleDict            = new PageSDict(17);
  dirList                = new DirList;
  allMemberList          = new MemberList;
  allMemberNameInfoSDict = new MemberNameInfoSDict(17);

  // pick the output file name
  if (!refFileName)
  {
    fileName = QCString("group_")+na;
  }
  else
  {
    fileName = stripExtension(refFileName);
  }

  setGroupTitle( t );

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  // declaration lists
  decDefineMembers.setInGroup(TRUE);
  decProtoMembers.setInGroup(TRUE);
  decTypedefMembers.setInGroup(TRUE);
  decEnumMembers.setInGroup(TRUE);
  decEnumValMembers.setInGroup(TRUE);
  decFuncMembers.setInGroup(TRUE);
  decVarMembers.setInGroup(TRUE);

  // documentation lists
  docDefineMembers.setInGroup(TRUE);
  docProtoMembers.setInGroup(TRUE);
  docTypedefMembers.setInGroup(TRUE);
  docEnumMembers.setInGroup(TRUE);
  docFuncMembers.setInGroup(TRUE);
  docVarMembers.setInGroup(TRUE);

  visited = 0;
}
// Reacts to a property change. A change of "text" recomputes the
// "processed_text" property (the short name without extension when the
// "strip" property is non-zero, the raw text otherwise) and repaints.
// Every other property is passed to MLButton.
void MLMenuButton::doPropertyChangeAction(MLSymbol property, const MLProperty& val)
{
	// anything but "text" belongs to the base class
	if (!(property == "text"))
	{
		MLButton::doPropertyChangeAction(property, val);
		return;
	}

	// TODO this file-specific stuff should not be here.
	const std::string rawText = val.getStringValue();
	std::string processedText;
	if(getFloatProperty("strip"))
	{
		processedText = stripExtension(getShortName(rawText));
	}
	else
	{
		processedText = rawText;
	}

	setProperty("processed_text", processedText);
	repaint();
}
int variantDetectabilityMain(int argc, char** argv) { parseVarDetectOptions(argc, argv); // Load the reference BWT std::string bwt_name = stripExtension(opt::referenceFile) + BWT_EXT; BWTIndexSet ref_index; ref_index.pBWT = new BWT(bwt_name); ref_index.pCache = new BWTIntervalCache(11, ref_index.pBWT); // Read reference ReadTable ref_table(opt::referenceFile); // Convert to string vector StringVector ref_sequences; for(size_t i = 0; i < ref_table.getCount(); ++i) { ref_sequences.push_back(ref_table.getRead(i).seq.toString()); } computeDetectableSampling(ref_sequences, ref_index); delete ref_index.pBWT; delete ref_index.pCache; return 0; }
void rmdup() { StringVector hitsFilenames; BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate); BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate); OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, opt::errorRate, 0, 0, false); Timer* pTimer = new Timer(PROGRAM_IDENT); if(opt::numThreads <= 1) { printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT); computeRmdupHitsSerial(opt::prefix, opt::readsFile, pOverlapper, hitsFilenames); } else { printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads); computeRmdupHitsParallel(opt::numThreads, opt::prefix, opt::readsFile, pOverlapper, hitsFilenames); } delete pOverlapper; delete pBWT; delete pRBWT; delete pTimer; std::string out_prefix = stripExtension(opt::outFile); std::string dupsFile = parseDupHits(hitsFilenames, out_prefix); // Rebuild the indices without the duplicated sequences if(opt::bReindex) { std::cout << "Rebuilding indices without duplicated reads\n"; removeReadsFromIndices(opt::prefix, dupsFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads); removeReadsFromIndices(opt::prefix, dupsFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads); } }
int fillAaMain(int argc, char** argv) { parseFillAaOptions(argc, argv); string line; // for reading the input files std::istream* vcfFile = createReader(opt::vcfFile.c_str()); std::ifstream* ancSeqFile = new std::ifstream(opt::ancSeqFile.c_str()); string refFastaFileRoot; if (opt::out.empty()) { refFastaFileRoot = stripExtension(opt::vcfFile); } else { refFastaFileRoot = opt::out; } string outFN = refFastaFileRoot + "_AAfilled.vcf"; std::ofstream* outFile = new std::ofstream(outFN.c_str()); // Read in the whole ancestral sequence std::map<string, string> ancSeqs; getline(*ancSeqFile, line); string currentScaffold = line.substr(1,string::npos); ancSeqs[currentScaffold] = ""; ancSeqs[currentScaffold].reserve(50000000); while (getline(*ancSeqFile, line)) { if (line[0] != '>') { ancSeqs[currentScaffold].append(line); } else { // std::cerr << currentScaffold << " length: " << ancSeqs[currentScaffold].length() << std::endl; currentScaffold = line.substr(1,string::npos); ancSeqs[currentScaffold] = ""; ancSeqs[currentScaffold].reserve(50000000); } } // Now go through the vcf and add the AA fields int totalVariantNumber = 0; int aaDashCount = 0; int aaRefCount = 0; int aaAltCount = 0; int aaDiffCount = 0; int aaNcount = 0; while (getline(*vcfFile, line)) { if (line[0] == '#' && line[1] == '#') *outFile << line << std::endl; else if (line[0] == '#' && line[1] == 'C') { *outFile << "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral allele\">" << std::endl; *outFile << line << std::endl; } else { totalVariantNumber++; std::vector<std::string> fields = split(line, '\t'); std::vector<std::string> info = split(fields[7], ';'); if (info[0] != "INDEL") { assert(ancSeqs.find(fields[0]) != ancSeqs.end()); char AA; if (ancSeqs[fields[0]].length() == 0) { AA = 'N'; } else { AA = ancSeqs[fields[0]][atoi(fields[1].c_str())-1]; if (AA == '-') { aaDashCount++; } else if (AA == 'N') { aaNcount++; } else if (AA == fields[3][0]) { aaRefCount++; } else if (AA == fields[4][0]) { 
aaAltCount++; } else if (!((AA == fields[3][0]) || (AA == fields[4][0]))) { aaDiffCount++; // std::cerr << fields[0] << "\t" << fields[1] << "\t" << fields[3] << "\t" << fields[4] << "\t" << AA << std::endl; } // assert((AA == fields[3][0]) || (AA == fields[4][0])); } fields[7] += ";AA="; fields[7] += AA; print_vector(fields, *outFile, '\t'); } else { *outFile << line << std::endl; } if (totalVariantNumber % 100000 == 0) { double totalAAfilled = aaRefCount + aaAltCount + aaDashCount + aaDiffCount + aaNcount; std::cerr << totalVariantNumber << " variants processed. AA=Ref:" << aaRefCount << "("<< aaRefCount/totalAAfilled <<"%); AA=Alt:" << aaAltCount << "("<< 100*(aaAltCount/totalAAfilled) <<"%); AA='-':" << aaDashCount << "("<< 100*(aaDashCount/totalAAfilled) << "%); AA=?(Neither Ref nor Alt):" << aaDiffCount << "("<< 100*(aaDiffCount/totalAAfilled) <<"%); AA=N:" << aaNcount << "("<< 100*(aaNcount/totalAAfilled) << "%)" << std::endl; } } } // Final summary double totalAAfilled = aaRefCount + aaAltCount + aaDashCount + aaDiffCount; std::cerr << std::endl; std::cerr << "All " << totalVariantNumber << " variants processed. AA=Ref:" << aaRefCount << "("<< 100*(aaRefCount/totalAAfilled) <<"%); AA=Alt:" << aaAltCount << "("<< 100*(aaAltCount/totalAAfilled) <<"%); AA='-':" << aaDashCount << "("<< 100*(aaDashCount/totalAAfilled) << "%); AA=?(Neither Ref nor Alt):" << aaDiffCount << "("<< 100*(aaDiffCount/totalAAfilled) <<"%)" << std::endl; return 0; }
//! \brief creates a list of what to generate //! //! \param startDir The directory from which to start scanning bool TalkFileCreator::createTalkList(QDir startDir) { m_talkList.clear(); // create Iterator QDirIterator::IteratorFlags flags = QDirIterator::NoIteratorFlags; if(m_recursive) flags = QDirIterator::Subdirectories; QDirIterator it(startDir,flags); //create temp directory QDir tempDir(QDir::tempPath()+ "/talkfiles/"); if(!tempDir.exists()) tempDir.mkpath(QDir::tempPath()+ "/talkfiles/"); // read in Maps of paths - file/dirnames while (it.hasNext()) { it.next(); if(m_abort) { return false; } QFileInfo fileInf = it.fileInfo(); // its a dir if(fileInf.isDir()) { QDir dir = fileInf.dir(); // insert into List if(!dir.dirName().isEmpty() && m_talkFolders) { // check if we should ignore it if(m_generateOnlyNew && QFileInfo(dir.path() + "/_dirname.talk").exists()) { continue; } //generate entry TalkGenerator::TalkEntry entry; entry.toSpeak = dir.dirName(); entry.wavfilename = QDir::tempPath() + "/talkfiles/" + QCryptographicHash::hash(entry.toSpeak.toUtf8(), QCryptographicHash::Md5).toHex() + ".wav"; entry.talkfilename = QDir::tempPath() + "/talkfiles/" + QCryptographicHash::hash(entry.toSpeak.toUtf8(), QCryptographicHash::Md5).toHex() + ".talk"; entry.target = dir.path() + "/_dirname.talk"; entry.voiced = false; entry.encoded = false; qDebug() << "[TalkFileCreator] toSpeak:" << entry.toSpeak << "target:" << entry.target << "intermediates:" << entry.wavfilename << entry.talkfilename; m_talkList.append(entry); } } else // its a File { // insert into List if( !fileInf.fileName().isEmpty() && !fileInf.fileName().endsWith(".talk") && m_talkFiles) { //test if we should ignore this file bool match = false; for(int i=0; i < m_ignoreFiles.size();i++) { QRegExp rx(m_ignoreFiles[i].trimmed()); rx.setPatternSyntax(QRegExp::Wildcard); if(rx.exactMatch(fileInf.fileName())) match = true; } if(match) continue; // check if we should ignore it if(m_generateOnlyNew && 
QFileInfo(fileInf.path() + "/" + fileInf.fileName() + ".talk").exists()) { continue; } //generate entry TalkGenerator::TalkEntry entry; if(m_stripExtensions) entry.toSpeak = stripExtension(fileInf.fileName()); else entry.toSpeak = fileInf.fileName(); entry.wavfilename = QDir::tempPath() + "/talkfiles/" + QCryptographicHash::hash(entry.toSpeak.toUtf8(), QCryptographicHash::Md5).toHex() + ".wav"; entry.talkfilename = QDir::tempPath() + "/talkfiles/" + QCryptographicHash::hash(entry.toSpeak.toUtf8(), QCryptographicHash::Md5).toHex() + ".talk"; entry.target = fileInf.path() + "/" + fileInf.fileName() + ".talk"; entry.voiced = false; entry.encoded = false; qDebug() << "[TalkFileCreator] toSpeak:" << entry.toSpeak << "target:" << entry.target << "intermediates:" << entry.wavfilename << entry.talkfilename; m_talkList.append(entry); } } QCoreApplication::processEvents(); } return true; }
int reorderMain(int argc, char** argv) { parseReorderOptions(argc, argv); string fileRoot = stripExtension(opt::vcfFile); std::cerr << "Reordering columns in: " << opt::vcfFile << std::endl; std::cerr << "using ordering in: " << opt::newOrderFile << std::endl; // Open connection to read from the vcf file std::ifstream* vcfFile = new std::ifstream(opt::vcfFile.c_str()); string reorderedFileName = fileRoot + opt::runName + "_reordered.vcf"; std::ofstream* pReordered = new std::ofstream(reorderedFileName.c_str()); int numChromosomes; int totalVariantNumber = 0; string line; std::vector<string> sampleNames; std::vector<string> newOrder = readSampleNamesFromTextFile(opt::newOrderFile); std::vector<string> fields; std::map<string, size_t> link; while (getline(*vcfFile, line)) { if (line[0] == '#' && line[1] == '#') { *pReordered << line << std::endl; } else if (line[0] == '#' && line[1] == 'C') { std::vector<std::string> fields = split(line, '\t'); const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS; numChromosomes = (int)numSamples * 2; // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl; if (opt::sampleNameFile.empty()) { for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) { sampleNames.push_back(fields[i]); } } else { sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile); } assert(sampleNames.size() == newOrder.size()); link = linkVectors(newOrder, sampleNames); for (std::vector<std::string>::size_type i = 0; i != NUM_NON_GENOTYPE_COLUMNS; i++) { *pReordered << fields[i] << "\t"; } for (std::vector<std::string>::size_type i = 0; i != sampleNames.size() - 1; i++) { *pReordered << newOrder[i] << "\t"; } *pReordered << newOrder[newOrder.size()-1] << std::endl; } else { totalVariantNumber++; std::vector<std::string> fields = split(line, '\t'); for (std::vector<std::string>::size_type i = 0; i != NUM_NON_GENOTYPE_COLUMNS; i++) { *pReordered << fields[i] << 
"\t"; } for (std::vector<std::string>::size_type i = 0; i != sampleNames.size() - 1; i++) { *pReordered << fields[link[sampleNames[i]]+NUM_NON_GENOTYPE_COLUMNS] << "\t"; } *pReordered << fields[link[sampleNames[sampleNames.size()-1]]+NUM_NON_GENOTYPE_COLUMNS] << std::endl; } } return 0; }
int linkGNMain(int argc, char** argv) { linkGNOptions(argc, argv); string gpFileRoot = stripExtension(opt::gpFile); std::ofstream* gpOutFile; std::ofstream* refLinkFile; std::ofstream* goBedFile; std::ofstream* fullBedFile; if (opt::NtoN) { goBedFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_GOBed.txt"); fullBedFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_FullBed.txt"); gpOutFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_RefGene.gp"); refLinkFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_RefLink.gp"); } else { goBedFile = new std::ofstream(gpFileRoot + opt::out + "_GOBed.txt"); fullBedFile = new std::ofstream(gpFileRoot + opt::out + "_FullBed.txt"); gpOutFile = new std::ofstream(gpFileRoot + opt::out + "_RefGene.gp"); refLinkFile = new std::ofstream(gpFileRoot + opt::out + "_RefLink.gp"); } string line; int geneNum = 1; // Load David Brawand's assignment of orthologs // Mapping from cichlid IDs to a zebrafish homolog (or medaka // stickleback, tetraodon, if zebrafish not available) std::map<string,string> cichlidHomolog; std::map<string,string> cichlidDanRerCopyNum; if (opt::v2orthologsFile != "") { std::cerr << "Reading the v2 full orthologs file:" << std::endl; std::ifstream* ocFile = new std::ifstream(opt::v2orthologsFile); while (getline(*ocFile, line)) { std::vector<string> orthVec = split(line, '\t'); int c = getSpeciesColumn(opt::species); if (orthVec[c] != "NA") { if (orthVec[8] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[8]; cichlidDanRerCopyNum[orthVec[c]] = "1-1"; } // Zebrafish else if (orthVec[5] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[5]; } // Medaka else if (orthVec[7] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[7]; } // Stickleback else if (orthVec[6] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[6]; } // Tetraodon else { cichlidHomolog[orthVec[c]] = "novelCichlidGene"; } } else { continue; } } ocFile->close(); } if (opt::v1orthologousClustersFile != "") { int copiesInCichlid = 0; int 
copiesInDanRer = 0; string cichlidGene = ""; string homologGene = ""; std::cerr << "Reading the v1 orthologous cluster file: " << std::endl; std::ifstream* ocFile = new std::ifstream(opt::v1orthologousClustersFile); while (getline(*ocFile, line)) { std::vector<string> idAndNum = split(line, '\t'); string thisLineGeneID = idAndNum[0]; int thisLineGeneClusterNumber = atoi(idAndNum[1].c_str()); // Another line for the same cluster if (thisLineGeneClusterNumber == geneNum) { if (thisLineGeneID.substr(0,2) == opt::species) { if (cichlidGene == "") { // First copy in the cichlid species (e.g. mz) cichlidGene = thisLineGeneID; } else { // There is more than one copy in the cichlid if (homologGene != "") { if (copiesInDanRer <= 1 || opt::NtoN) { attemptMappingUpdate(cichlidHomolog, cichlidGene, homologGene + "/" + numToString(copiesInCichlid)); if (copiesInDanRer == 1) cichlidDanRerCopyNum[cichlidGene] = "N-1"; else if (copiesInDanRer > 1) cichlidDanRerCopyNum[cichlidGene] = "N-N"; } cichlidGene = thisLineGeneID; } } copiesInCichlid++; } else if (thisLineGeneID.substr(0,6) == "ENSDAR") { copiesInDanRer++; if (homologGene == "") { homologGene = thisLineGeneID; } else { if (rand() < 0.5) homologGene = thisLineGeneID; // 50% chance of using this zfish copy (hacky!!!) 
} } else if (thisLineGeneID.substr(0,6) == "ENSGAC") { if (homologGene == "") { homologGene = thisLineGeneID; } } else if (thisLineGeneID.substr(0,6) == "ENSORL") { if (homologGene == "" || homologGene.substr(0,6) == "ENSGAC") { homologGene = thisLineGeneID; } } else if (thisLineGeneID.substr(0,6) == "ENSTNI") { if (homologGene == "") { homologGene = thisLineGeneID; } } // std::cerr << atoi(idAndNum[1].c_str()) << "\t" << geneNum << std::endl; } else { // First line for a new cluster read // so first add the mapping for the previous cluster if (cichlidGene != "" && homologGene != "") { assert(copiesInCichlid > 0); if (copiesInDanRer == 1) { if (copiesInCichlid == 1) { cichlidDanRerCopyNum[cichlidGene] = "1-1"; attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene); } else if (copiesInCichlid > 1) { cichlidDanRerCopyNum[cichlidGene] = "N-1"; attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene + "/" + numToString(copiesInCichlid)); } } else if (copiesInDanRer > 1) { if (copiesInCichlid == 1) { cichlidDanRerCopyNum[cichlidGene] = "1-N"; if (opt::NtoN) attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene); } else if (copiesInCichlid > 1) { cichlidDanRerCopyNum[cichlidGene] = "N-N"; if (opt::NtoN) attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene + "/" + numToString(copiesInCichlid)); } } else { if (copiesInCichlid == 1) { attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene); } else if (copiesInCichlid > 1) { attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene + "/" + numToString(copiesInCichlid)); } } } // then start looking through the next cluster cichlidGene = ""; homologGene = ""; copiesInDanRer = 0; copiesInCichlid = 0; geneNum = thisLineGeneClusterNumber; if (thisLineGeneID.substr(0,2) == opt::species) { cichlidGene = thisLineGeneID; } else if (thisLineGeneID.substr(0,6) == "ENSDAR") { copiesInDanRer++; homologGene = thisLineGeneID; } else if (thisLineGeneID.substr(0,6) == "ENSGAC") { homologGene = 
thisLineGeneID; } else if (thisLineGeneID.substr(0,6) == "ENSORL") { homologGene = thisLineGeneID; } else if (thisLineGeneID.substr(0,6) == "ENSTNI") { homologGene = thisLineGeneID; } } } ocFile->close(); } if (opt::sepByCopyNumberPrefix != "") { std::ofstream* OneOneFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_1-1.txt"); std::ofstream* NOneFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_N-1.txt"); std::ofstream* OneNFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_1-N.txt"); std::ofstream* NNFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_N-N.txt"); for (std::map<string, string>::iterator it = cichlidDanRerCopyNum.begin(); it != cichlidDanRerCopyNum.end(); it++) { if (it->second == "1-1") { *OneOneFile << it->first << std::endl; } else if (it->second == "N-1") { *NOneFile << it->first << std::endl; } else if (it->second == "1-N") { *OneNFile << it->first << std::endl; } else if (it->second == "N-N") { *NNFile << it->first << std::endl; } } } // Load gene names and descriptions from ENSEMBL std::map<string,string> ensGeneMap; std::map<string,string> ensGeneDescriptionMap; std::map<string,string> ensEntrezMap; if (!opt::ensGeneFile.empty()) { std::ifstream* egFile = new std::ifstream(opt::ensGeneFile); while (getline(*egFile, line)) { std::vector<string> ensGene = split(line, '\t'); if (ensGene.size() == 4) { ensGeneMap[ensGene[0]] = ensGene[3]; ensGeneDescriptionMap[ensGene[0]] = ensGene[2]; // Sometimes there are two Entrez records for one Ensembl gene, the first Entrez record tends to be the more informative one if ( ensEntrezMap.find(ensGene[0]) == ensEntrezMap.end() ) { if (ensGene[1] != "") {ensEntrezMap[ensGene[0]] = ensGene[1]; } else { ensEntrezMap[ensGene[0]] = "0"; } } } else if (ensGene.size() == 3) { ensGeneMap[ensGene[0]] = "NA"; if (ensGene[2] != "") { ensGeneDescriptionMap[ensGene[0]] = ensGene[2]; } else { ensGeneDescriptionMap[ensGene[0]] = "no description: " + ensGene[0]; } // Sometimes there are two Entrez 
records for one Ensembl gene, the first Entrez record tends to be the more informative one if ( ensEntrezMap.find(ensGene[0]) == ensEntrezMap.end() ) { if (ensGene[1] != "") {ensEntrezMap[ensGene[0]] = ensGene[1]; } else { ensEntrezMap[ensGene[0]] = "0"; } } } else { //std::cerr << ensGene.size() << std::endl; print_vector_stream(ensGene, std::cerr); } // std::cout << ensGene[0] << "\t" << ensGene[2] << std::endl; } } // Go through the gene prediction file and generate the final outputs std::ifstream* gpFile = new std::ifstream(opt::gpFile); int countNovel = 1; int countUnknown = 1; int countNotInEnsembl = 1; while (getline(*gpFile, line)) { std::vector<string> gpVec = split(line, '\t'); if ( cichlidHomolog.count(gpVec[0]) == 1) { std::vector<string> ensembl = split(cichlidHomolog[gpVec[0]], '/'); std::vector<string> myNameVec = split(gpVec[0], '.'); std::string nameWdots = gpVec[0]; gpVec[0] = myNameVec[0] + "_" + myNameVec[1] + "_" + myNameVec[2] + "_" + myNameVec[3]; if ( ensGeneMap.find(ensembl[0]) != ensGeneMap.end() ) { if (ensembl.size() == 1) { std::cout << nameWdots << "\t" << ensembl[0] << "\t" << ensEntrezMap[ensembl[0]] << "\t" << ensGeneMap[ensembl[0]] << std::endl; gpVec[11] = ensGeneMap[ensembl[0]]; print_vector(gpVec, *gpOutFile); *refLinkFile << ensGeneMap[ensembl[0]] << "\t" << ensGeneDescriptionMap[ensembl[0]] << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << ensEntrezMap[ensembl[0]] << "\t0" << std::endl; *fullBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl; if (ensEntrezMap[ensembl[0]] != "0") { *goBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl; } } else { std::cout << nameWdots << "\t" << ensembl[0] << "\t" << ensEntrezMap[ensembl[0]] << "\t" << ensGeneMap[ensembl[0]] << "/" << ensembl[1] << std::endl; gpVec[11] = ensGeneMap[ensembl[0]]+"/"+ensembl[1]; print_vector(gpVec, 
*gpOutFile); *refLinkFile << ensGeneMap[ensembl[0]] << "/" << ensembl[1] << "\t" << ensGeneDescriptionMap[ensembl[0]] << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << ensEntrezMap[ensembl[0]] << "\t0" << std::endl; *fullBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl; if (ensEntrezMap[ensembl[0]] != "0") { *goBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl; } } } else if (ensembl[0] == "novelCichlidGene") { std::cout << nameWdots << "\t" << ensembl[0] << "\t0" << "\t" << opt::species + ".novel." + numToString(countNovel) << std::endl; gpVec[11] = opt::species + ".novel." + numToString(countNovel); countNovel++; print_vector(gpVec, *gpOutFile); *refLinkFile << opt::species + ".novel." + numToString(countNovel) << "\t" << "novel gene found only in cichlids" << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << "0" << "\t0" << std::endl; } else { std::cout << nameWdots << "\t" << "noOrthologAssigned" << "\t" << "0" << "\t" << opt::species + ".orthologNotInEnsembl." + numToString(countNotInEnsembl) << std::endl; *refLinkFile << opt::species + ".orthologNotInEnsembl." + numToString(countUnknown) << "\t" << "ortholog from Brawand data not foud in Ensembl v75" << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << "0" << "\t0" << std::endl; gpVec[11] = opt::species + ".orthologNotInEnsembl." + numToString(countNotInEnsembl); print_vector(gpVec, *gpOutFile); //std::cerr << ensembl[0] << std::endl; } //std::cout << "hello" << std::endl; } else { std::vector<string> myNameVec = split(gpVec[0], '.'); std::string nameWdots = gpVec[0] + ".1"; gpVec[0] = myNameVec[0] + "_" + myNameVec[1] + "_" + myNameVec[2] + "_" + myNameVec[3]; std::cout << nameWdots << "\t" << "noOrthologAssigned" << "\t" << "0" << "\t" << opt::species + ".unknown." + numToString(countUnknown) << std::endl; *refLinkFile << opt::species + ".unknown." 
+ numToString(countUnknown) << "\t" << "unknown - no ortholog from Brawand data" << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << "0" << "\t0" << std::endl; gpVec[11] = opt::species + ".unknown." + numToString(countUnknown); print_vector(gpVec, *gpOutFile); countUnknown++; } } return 0; }
void getFstFromVCF() { std::cerr << "Calculating Fst using variants from: " << opt::vcfFile << std::endl; std::cerr << "Between the two 'populations' defined in: " << opt::sampleSets << std::endl; if (opt::windowSize > 0) { std::cerr << "also using a sliding window of size: " << opt::windowSize << " variants and sliding in steps of: " << opt::windowStep << std::endl; } string fileRoot = stripExtension(opt::sampleSets); //std::cerr << "Still alive: " << std::endl; // Open connection to read from the vcf file std::istream* vcfFile = createReader(opt::vcfFile.c_str()); //std::cerr << "Hello: " << std::endl; std::ifstream* setsFile = new std::ifstream(opt::sampleSets.c_str()); std::ifstream* annotFile; std::ofstream* snpCategoryFstFile; std::ofstream* regionsAboveFstFile; bool inRegAbove = false; std::ofstream* fstDxyFixedWindowFile; std::ifstream* ancSetsFile; std::ofstream* ancSetsOutFile; std::vector<string> ancSet1; std::vector<string> ancSet2; Annotation wgAnnotation; if (!opt::annotFile.empty()) { annotFile = new std::ifstream(opt::annotFile.c_str()); Annotation Annot(annotFile, false); // Does not use transcripts annotated as 5' or 3' partial wgAnnotation = Annot; string snpCategoryFstFileName = fileRoot + "_" + opt::runName + "SNPcategory_fst.txt"; snpCategoryFstFile = new std::ofstream(snpCategoryFstFileName.c_str()); *snpCategoryFstFile << "SNPcategory" << "\t" << "thisSNPFst" << "\t" << "thisSNPDxy" << "\t" << "scaffold" << "\t" << "position" << std::endl; } if (!opt::ancSets.empty()) { ancSetsFile = new std::ifstream(opt::ancSets); string ancOutFileName = fileRoot + "_" + opt::runName + "ancestralSNPs_fst.txt"; ancSetsOutFile = new std::ofstream(ancOutFileName); *ancSetsOutFile << "scaffold" << "\t" << "position" << "\t" << "AncAllelePopulation" << "\t" << "Fst" << "\t" << "ancSet1_segregating" << "\t" << "ancSet2_segregating" << std::endl; string ancSet1String; string ancSet2String; getline(*ancSetsFile, ancSet1String); getline(*ancSetsFile, 
ancSet2String); ancSet1 = split(ancSet1String, ','); ancSet2 = split(ancSet2String, ','); std::sort(ancSet1.begin(),ancSet1.end()); std::sort(ancSet2.begin(),ancSet2.end()); } if (opt::regAbove > 0) { string regionsAboveFstFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst_above" + numToString(opt::regAbove) + ".txt"; regionsAboveFstFile = new std::ofstream(regionsAboveFstFileName.c_str()); } string FstResultsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst.txt"; std::ofstream* pFst = new std::ofstream(FstResultsFileName.c_str()); string fstDxyFixedWindowFileName = fileRoot + "dXY_fixedWindow.txt"; fstDxyFixedWindowFile = new std::ofstream(fstDxyFixedWindowFileName.c_str()); string heterozygositySetsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_heterozygosity.txt"; *fstDxyFixedWindowFile << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy" << "\t" << "Set1_pi" << "\t" << "Set2_pi" << std::endl; std::ofstream* pHetSets = new std::ofstream(heterozygositySetsFileName.c_str()); //std::cerr << "Still alive: " << std::endl; string set1String; string set2String; getline(*setsFile, set1String); getline(*setsFile, set2String); std::vector<string> set1 = split(set1String, ','); std::vector<string> set2 = split(set2String, ','); std::sort(set1.begin(),set1.end()); std::sort(set2.begin(),set2.end()); int numChromosomes; int totalVariantNumber = 0; int countedVariantNumber = 0; string windowMiddleVariant = "first\tWindow"; string windowStartEnd = "scaffold_0\t0"; int windowStart = 0; int windowEnd; int fixedWindowStart = 0; std::vector<double> fixedWindowDxyVector; std::vector<double> fixedWindowFstNumVector; std::vector<double> fixedWindowFstDenomVector; std::vector<double> fixedWindowHet1Vector; std::vector<double> fixedWindowHet2Vector; std::vector<double> fixedWindowPi1Vector; std::vector<double> fixedWindowPi2Vector; std::vector<string> 
sampleNames; std::vector<string> fields; std::vector<size_t> set1Loci; std::vector<size_t> set2Loci; std::vector<size_t> ancSet1Loci; std::vector<size_t> ancSet2Loci; short n1; short n2; short n1anc; short n2anc; string line; std::map<std::string, double> loc_pval; std::vector<double> fstNumerators; fstNumerators.reserve(30000000); std::vector<double> fstDenominators; fstDenominators.reserve(30000000); std::vector<double> DxyVector; DxyVector.reserve(30000000); std::vector<std::vector<double> > heterozygositiesVector; heterozygositiesVector.reserve(30000000); std::vector<double> set1heterozygositiesSimple; set1heterozygositiesSimple.reserve(30000000); std::vector<double> set2heterozygositiesSimple; set2heterozygositiesSimple.reserve(30000000); std::vector<double> set1heterozygositiesNei; set1heterozygositiesNei.reserve(30000000); std::vector<double> set2heterozygositiesNei; set2heterozygositiesNei.reserve(30000000); std::vector<double> set1heterozygositiesPi; set1heterozygositiesPi.reserve(30000000); std::vector<double> set2heterozygositiesPi; set2heterozygositiesPi.reserve(30000000); while (getline(*vcfFile, line)) { if (line[0] == '#' && line[1] == '#') { } else if (line[0] == '#' && line[1] == 'C') { std::vector<std::string> fields = split(line, '\t'); const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS; numChromosomes = (int)numSamples * 2; // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl; if (opt::sampleNameFile.empty()) { for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) { sampleNames.push_back(fields[i]); } } else { sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile); } set1Loci = locateSet(sampleNames, set1); set2Loci = locateSet(sampleNames, set2); n1 = set1Loci.size()*2; n2 = set2Loci.size()*2; std::cerr << "Set1 loci: " << std::endl; print_vector_stream(set1Loci, std::cerr); std::cerr << "Set2 loci: " << std::endl; 
print_vector_stream(set2Loci, std::cerr); if (!opt::ancSets.empty()) { ancSet1Loci = locateSet(sampleNames, ancSet1); ancSet2Loci = locateSet(sampleNames, ancSet2); std::cerr << "Ancestral Set1 loci: " << std::endl; print_vector_stream(ancSet1Loci, std::cerr); std::cerr << "Ancestral Set2 loci: " << std::endl; print_vector_stream(ancSet2Loci, std::cerr); n1anc = ancSet1Loci.size() * 2; n2anc = ancSet2Loci.size() * 2; } if (opt::windowSize > 0) { if (opt::windowSize == opt::windowStep) { *pHetSets << "scaffold" << "\t" << "Start" << "\t" << "End" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl; *pFst << "var_num" << "\t" << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy_onlyVaiants" << "\t" << "Dxy_AllSites" << "\t" << "windowSize" << std::endl; if (opt::regAbove > 0) *regionsAboveFstFile << "scaffold" << "\t" << "Start" << "\t" << "End" << std::endl; } else { *pHetSets << "Middle_SNP_position" << "\t" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl; } } } else { totalVariantNumber++; std::vector<std::string> fields = split(line, '\t'); std::vector<std::string> info = split(fields[7], ';'); if (info[0] != "INDEL") { // Without indels SetCounts counts = getVariantCountsForFst(fields,set1Loci,set2Loci); //std::cerr << "Still here: " << counts.set1HaplotypeVariant.size() << "\t" << counts.set1individualsWithVariant.size() << "\t" << n1 << std::endl; //std::cerr << "Still here: " << counts.set2HaplotypeVariant.size() << "\t" << counts.set2individualsWithVariant.size() << "\t" << n2 << std::endl; //print_vector_stream(counts.set1HaplotypeVariant, std::cerr); 
//print_vector_stream(counts.set1individualsWithVariant, std::cerr); //print_vector_stream(counts.set2HaplotypeVariant, std::cerr); if ((counts.set1Count > 0 || counts.set2Count > 0) && (counts.set1Count < n1 || counts.set2Count < n2)) { countedVariantNumber++; double FstNumerator = calculateFstNumerator(counts, n1, n2); fstNumerators.push_back(FstNumerator); fixedWindowFstNumVector.push_back(FstNumerator); double FstDenominator = calculateFstDenominator(counts, n1, n2); fstDenominators.push_back(FstDenominator); fixedWindowFstDenomVector.push_back(FstDenominator); assert(FstDenominator != 0); double thisSNPDxy = calculateDxy(counts, n1, n2); DxyVector.push_back(thisSNPDxy); fixedWindowDxyVector.push_back(thisSNPDxy); std::vector<double> thisSNPhet = getSetHeterozygozities(counts, n1, n2); heterozygositiesVector.push_back(thisSNPhet); std::vector<double> thisSNPpis = calculatePiTwoSets(counts, n1, n2); fixedWindowPi1Vector.push_back(thisSNPpis[0]); fixedWindowPi2Vector.push_back(thisSNPpis[1]); set1heterozygositiesPi.push_back(thisSNPpis[0]); set2heterozygositiesPi.push_back(thisSNPpis[1]); // std::cerr << "Still here: " << thisSNPpis[0] << std::endl; set1heterozygositiesSimple.push_back(thisSNPhet[0]); set2heterozygositiesSimple.push_back(thisSNPhet[1]); fixedWindowHet1Vector.push_back(thisSNPhet[0]); set1heterozygositiesNei.push_back(thisSNPhet[2]); set2heterozygositiesNei.push_back(thisSNPhet[3]); fixedWindowHet2Vector.push_back(thisSNPhet[1]); if (!opt::annotFile.empty()) { string scaffold = fields[0]; string loc = fields[1]; // Scaffold string SNPcategory = wgAnnotation.getCategoryOfSNP(scaffold, loc); double thisSNPFst = FstNumerator/FstDenominator; *snpCategoryFstFile << SNPcategory << "\t" << thisSNPFst << "\t" << thisSNPDxy << "\t" << scaffold << "\t" << loc << std::endl; } if (!opt::ancSets.empty()) { double thisSNPFst = FstNumerator/FstDenominator; if (thisSNPFst < 0) { thisSNPFst = 0; } string AA = split(info[info.size()-1],'=')[1]; //std::cerr << "AA=" 
<< " " << AA << std::endl; FourSetCounts c; if (AA == fields[3]) { c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"ref"); *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t"; if (c.set3daAF > 0 & c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; } if (c.set4daAF > 0 & c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; } } else if (AA == fields[4]) { c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"alt"); *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t"; if (c.set3daAF > 0 & c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; } if (c.set4daAF > 0 & c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; } // std::cerr << "AA=alt" << " " << c.set1daAF << " " << c.set2daAF << std::endl; } else { c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"N"); *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << "-888" << "\t" << thisSNPFst << "\t"; if (c.set3AltAF > 0 & c.set3AltAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; } if (c.set4AltAF > 0 & c.set4AltAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; } } } std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { if (atoi(fields[1].c_str()) > (fixedWindowStart+10000)) { double thisFixedWindowDxy = vector_average_withRegion(fixedWindowDxyVector, 10000); double thisFixedWindowFst = calculateFst(fixedWindowFstNumVector, fixedWindowFstDenomVector); //double thisFixedWindowHet1 = vector_average_withRegion(fixedWindowHet1Vector, 10000); //double thisFixedWindowHet2 = vector_average_withRegion(fixedWindowHet2Vector, 10000); double 
thisFixedWindowPi1 = vector_average_withRegion(fixedWindowPi1Vector, 10000); double thisFixedWindowPi2 = vector_average_withRegion(fixedWindowPi2Vector, 10000); *fstDxyFixedWindowFile << fields[0] << "\t" << fixedWindowStart << "\t" << fixedWindowStart+10000 << "\t" << thisFixedWindowFst << "\t" << thisFixedWindowDxy << "\t" << thisFixedWindowPi1 << "\t" << thisFixedWindowPi2 << std::endl; fixedWindowDxyVector.clear(); fixedWindowFstNumVector.clear(); fixedWindowFstDenomVector.clear(); fixedWindowHet1Vector.clear(); fixedWindowHet2Vector.clear(); fixedWindowPi1Vector.clear(); fixedWindowPi2Vector.clear(); fixedWindowStart= fixedWindowStart+10000; } } else { fixedWindowStart = 0; } if (opt::windowSize == 1) { double Fst = FstNumerator/FstDenominator; if (Fst < 0) Fst = 0; *pFst << countedVariantNumber << "\t" << fields[0] + "\t" + fields[1] << "\t" << Fst << "\t" << thisSNPDxy << std::endl; } else if ((opt::windowSize > 0) && (countedVariantNumber % opt::windowStep == 0) && countedVariantNumber >= opt::windowSize) { std::vector<double> windowFstNumerators(fstNumerators.end()-opt::windowSize, fstNumerators.end()); std::vector<double> windowFstDenominators(fstDenominators.end()-opt::windowSize, fstDenominators.end()); double windowFst = calculateFst(windowFstNumerators, windowFstDenominators); if (windowFst < 0) windowFst = 0; std::vector<double> windowDxyVec(DxyVector.end()-opt::windowSize, DxyVector.end()); double windowDxy = vector_average(windowDxyVec); if (opt::windowSize == opt::windowStep) { std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { windowStartEnd = windowStartEnd + "\t" + fields[1]; windowEnd = atoi(fields[1].c_str()); double windowDxyIncNonSeg = vector_average_withRegion(windowDxyVec, windowEnd-windowStart); *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowStartEnd << "\t" << windowFst << "\t" << windowDxy << "\t" << windowDxyIncNonSeg << "\t" << windowFstDenominators.size() << std::endl; if (opt::regAbove > 
0) { if (windowFst >= opt::regAbove && !inRegAbove) { inRegAbove = true; *regionsAboveFstFile << s[0] << "\t" << s[1] << "\t"; } else if (windowFst < opt::regAbove && inRegAbove) { inRegAbove = false; *regionsAboveFstFile << s[1] << std::endl; } } } } else { *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowMiddleVariant << "\t" << windowFst << "\t" << windowDxy << "\t" << windowFstDenominators.size() << std::endl; } // Now calculate and output expected heterozygosities for this window std::vector<double> windowHetS1Vec(set1heterozygositiesSimple.end()-opt::windowSize, set1heterozygositiesSimple.end()); double windowHetS1 = vector_average(windowHetS1Vec); std::vector<double> windowHetS2Vec(set2heterozygositiesSimple.end()-opt::windowSize, set2heterozygositiesSimple.end()); double windowHetS2 = vector_average(windowHetS2Vec); std::vector<double> windowHetNei1Vec(set1heterozygositiesNei.end()-opt::windowSize, set1heterozygositiesNei.end()); double windowHetNei1 = vector_average(windowHetNei1Vec); std::vector<double> windowHetNei2Vec(set2heterozygositiesNei.end()-opt::windowSize, set2heterozygositiesNei.end()); double windowHetNei2 = vector_average(windowHetNei2Vec); std::vector<double> windowHetPi1Vec(set1heterozygositiesPi.end()-opt::windowSize, set1heterozygositiesPi.end()); double windowHetPi1 = vector_average_withRegion(windowHetPi1Vec, windowEnd-windowStart); std::vector<double> windowHetPi2Vec(set2heterozygositiesPi.end()-opt::windowSize, set2heterozygositiesPi.end()); double windowHetPi2 = vector_average_withRegion(windowHetPi2Vec, windowEnd-windowStart); if (opt::windowSize == opt::windowStep) { std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { *pHetSets << windowStartEnd << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << "\t" << windowHetPi1 << "\t" << windowHetPi2 << std::endl; windowStartEnd = fields[0] + "\t" + fields[1]; windowStart = atoi(fields[1].c_str()); } 
else { windowStartEnd = fields[0] + "\t0"; windowStart = 0; } } else { *pHetSets << windowMiddleVariant << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << std::endl; windowMiddleVariant = fields[0] + "\t" + fields[1]; // works only if STEP is half SIZE for the window } } } } if (totalVariantNumber % 100000 == 0) { double Fst = calculateFst(fstNumerators, fstDenominators); std::cerr << totalVariantNumber << " variants processed... Fst: " << Fst << std::endl; } } } double Fst = calculateFst(fstNumerators, fstDenominators); double overallHetS1 = vector_average(set1heterozygositiesSimple); double overallHetS2 = vector_average(set2heterozygositiesSimple); double overallHetNei1 = vector_average(set1heterozygositiesNei); double overallHetNei2 = vector_average(set2heterozygositiesNei); std::cerr << "Fst: " << Fst << std::endl; std::cerr << "Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2" << overallHetNei2 << std::endl; *pHetSets << "#Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2" << overallHetNei2 << std::endl; }
void doAbbaBaba() { string line; // for reading the input files std::istream* vcfFile = createReader(opt::vcfFile.c_str()); std::ifstream* setsFile = new std::ifstream(opt::setsFile.c_str()); string setsFileRoot = stripExtension(opt::setsFile); std::ofstream* outFile = new std::ofstream(setsFileRoot+ "_" + opt::runName + "_abbaBaba.txt"); string windowStartEnd = "scaffold_0\t0"; // Get the sample sets string outgroupString; std::vector<size_t> Opos; std::vector<string> outgroup; if (!opt::bAaEqO) { getline(*setsFile, outgroupString); outgroup = split(outgroupString, ','); } else { outgroupString = "VCF AA field"; } string P3string; getline(*setsFile, P3string); std::vector<string> P3 = split(P3string, ','); std::vector<size_t> P3pos; string P2string; getline(*setsFile, P2string); std::vector<string> P2 = split(P2string, ','); std::vector<size_t> P2pos; string P1string; getline(*setsFile, P1string); std::vector<string> P1 = split(P1string, ','); std::vector<size_t> P1pos; if (!opt::bFrequency && (P1.size() > 1 || P2.size() > 1 || P3.size() > 1)) { std::cerr << "There are more than one individual on some line of the SETS.txt file" << std::endl; std::cerr << "Perhaps you want to use the -f option?" 
<< std::endl; exit(1); } // Now go through the vcf and calculate D int totalVariantNumber = 0; ABBA_BABA_Freq_allResults r; int lastPrint = 0; int lastWindowVariant = 0; std::vector<double> regionDs; std::vector<double> region_f_Gs; std::vector<double> region_f_Ds; std::vector<double> region_f_DMs; std::vector<string> sampleNames; while (getline(*vcfFile, line)) { if (line[0] == '#' && line[1] == '#') continue; else if (line[0] == '#' && line[1] == 'C') { std::vector<std::string> fields = split(line, '\t'); if (opt::sampleNameFile.empty()) { for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) { sampleNames.push_back(fields[i]); } } else { sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile); } if (!opt::bAaEqO) { Opos = locateSet(sampleNames, outgroup); } P3pos = locateSet(sampleNames, P3); P2pos = locateSet(sampleNames, P2); P1pos = locateSet(sampleNames, P1); if (!opt::bAaEqO) { std::cerr << "Outgroup: "; print_vector_stream(outgroup, std::cerr); } else { std::cerr << "Outgroup: " << outgroupString << std::endl; } std::cerr << "P3: "; print_vector_stream(P3, std::cerr); std::cerr << "P2: "; print_vector_stream(P2, std::cerr); std::cerr << "P1: "; print_vector_stream(P1, std::cerr); } else { totalVariantNumber++; std::vector<std::string> fields = split(line, '\t'); std::vector<std::string> info = split(fields[7], ';'); if (info[0] != "INDEL") { if (!opt::bAaEqO) { ThreeSetCounts c; if (opt::bNoAaO) { c = getThreeSetVariantCountsAA4(fields,P1pos,P2pos,P3pos,Opos); if (opt::bFrequency) { incrementDnumDdenomFrequency(c, r); } else { incrementDnumDdenomSingleSequence(c, r); } } else { FourSetCounts c; string AA = getAAfromInfo(info); if (AA == fields[3]) { c = getFourSetVariantCounts(fields,P1pos,P2pos,P3pos,Opos,"ref"); } else if (AA == fields[4]) { c = getFourSetVariantCounts(fields,P1pos,P2pos,P3pos,Opos,"alt"); } r.Dnumerator += ((1-c.set1daAF)*c.set2daAF*c.set3daAF*(1-c.set4daAF)) - 
(c.set1daAF*(1-c.set2daAF)*c.set3daAF*(1-c.set4daAF)); r.Ddenominator += ((1-c.set1daAF)*c.set2daAF*c.set3daAF*(1-c.set4daAF)) + (c.set1daAF*(1-c.set2daAF)*c.set3daAF*(1-c.set4daAF)); if (c.set2daAF > c.set3daAF) { r.f_d_denominator += ((1-c.set1daAF)*c.set2daAF*c.set2daAF*(1-c.set4daAF)) - (c.set1daAF*(1-c.set2daAF)*c.set2daAF*(1-c.set4daAF)); } else { r.f_d_denominator += ((1-c.set1daAF)*c.set3daAF*c.set3daAF*(1-c.set4daAF)) - (c.set1daAF*(1-c.set3daAF)*c.set3daAF*(1-c.set4daAF)); } } } else { string AA = getAAfromInfo(info); ThreeSetCounts c; if (AA == fields[3]) { c = getThreeSetVariantCounts(fields,P1pos,P2pos,P3pos,"ref"); } else if (AA == fields[4]) { c = getThreeSetVariantCounts(fields,P1pos,P2pos,P3pos,"alt"); } if (opt::bFrequency) { incrementDnumDdenomFrequency(c, r); } else { incrementDnumDdenomSingleSequence(c, r); } } // if (totalVariantNumber % 100000 == 0) { std::cerr << Dnumerator << std::endl; } } else { ABBABABAcounts::indels++; } if (ABBABABAcounts::usedVariantsCounter % opt::windowSize == 0 && ABBABABAcounts::usedVariantsCounter != lastWindowVariant) { std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { windowStartEnd = windowStartEnd + "\t" + fields[1]; if ((double)r.windowDnum/r.window_f_dM_denominator > 1) { std::cerr << "D num" << r.windowDnum << std::endl; std::cerr << "f_dM denom" << r.window_f_dM_denominator << std::endl; } *outFile << windowStartEnd << "\t" << (double)r.windowDnum/r.windowDdenom << "\t" << (double)r.window_f_d_num/r.window_f_d_denominator << "\t" << (double)r.windowDnum/r.window_f_dM_denominator << std::endl; windowStartEnd = fields[0] + "\t" + fields[1]; } else { windowStartEnd = fields[0] + "\t0"; } r.windowDnum = 0; r.windowDdenom = 0; r.window_f_d_num = 0; r.window_f_d_denominator = 0; r.window_f_dM_denominator = 0; lastWindowVariant = ABBABABAcounts::usedVariantsCounter; } if (ABBABABAcounts::usedVariantsCounter % opt::jackKniveWindowSize == 0 && ABBABABAcounts::usedVariantsCounter != 
lastPrint) { //if (totalVariantNumber % 100000 == 0) { if (opt::bFrequency) assert(ABBABABAcounts::XXAA + ABBABABAcounts::AABA + ABBABABAcounts::BBBA + ABBABABAcounts::indels + ABBABABAcounts::noDafInfo + ABBABABAcounts::usedVariantsCounter + ABBABABAcounts::p1p2 == totalVariantNumber); if (ABBABABAcounts::usedVariantsCounter > (6 * opt::jackKniveWindowSize)) { double Dstd_err = jackknive_std_err(regionDs); double f_Gstd_err = jackknive_std_err(region_f_Gs); double f_Dstd_err = jackknive_std_err(region_f_Ds); double f_DMstd_err = jackknive_std_err(region_f_DMs); std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tD=" << (double)r.Dnumerator/r.Ddenominator << " std_err=" << Dstd_err << std::endl; std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_G=" << (double)r.f_G_num/r.f_G_denom << " std_err=" << f_Gstd_err << std::endl; std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_d=" << (double)r.f_d_num/r.f_d_denominator << " std_err=" << f_Dstd_err << std::endl; std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_dM=" << (double)r.Dnumerator/r.f_dM_denominator << " std_err=" << f_DMstd_err << std::endl; } else { std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tD=" << (double)r.Dnumerator/r.Ddenominator << std::endl; std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_G=" << (double)r.f_G_num/r.f_G_denom << std::endl; std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_d=" << (double)r.f_d_num/r.f_d_denominator << std::endl; std::cerr << totalVariantNumber << " variants processed. 
" << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_dM=" << (double)r.Dnumerator/r.f_dM_denominator << std::endl; } std::cerr << "Last used "<< opt::jackKniveWindowSize << " variants \t\t\t\tD=" << r.lastVarsDnum/r.lastVarsDdenom << std::endl; // std::cerr << "AAAA=" << XXAA << "; AABA=" << AABA << "; BBBA=" << BBBA << std::endl; regionDs.push_back(r.lastVarsDnum/r.lastVarsDdenom); region_f_Gs.push_back(r.lastVarsF_G_num/r.lastVarsF_G_denom); region_f_Ds.push_back(r.lastVarsF_d_num/r.lastVarsF_d_denom); region_f_DMs.push_back(r.lastVarsDnum/r.lastVarsF_dM_denom); r.lastVarsDnum = 0; r.lastVarsDdenom = 0; r.lastVarsF_d_num = 0; r.lastVarsF_d_denom = 0; r.lastVarsF_G_num = 0; r.lastVarsF_G_denom = 0; r.lastVarsF_dM_denom = 0; lastPrint = ABBABABAcounts::usedVariantsCounter; } } } double Dstd_err = jackknive_std_err(regionDs); double f_Gstd_err = jackknive_std_err(region_f_Gs); double f_Dstd_err = jackknive_std_err(region_f_Ds); double f_DMstd_err = jackknive_std_err(region_f_DMs); std::cerr << std::endl; std::cerr << totalVariantNumber << " variants processed. D=" << (double)r.Dnumerator/r.Ddenominator << " std_err=" << Dstd_err << std::endl; std::cerr << totalVariantNumber << " variants processed. f_G=" << (double)r.f_G_num/r.f_G_denom << " std_err=" << f_Gstd_err << std::endl; std::cerr << totalVariantNumber << " variants processed. f_d=" << (double)r.f_d_num/r.f_d_denominator << " std_err=" << f_Dstd_err << std::endl; std::cerr << totalVariantNumber << " variants processed. f_dM=" << (double)r.Dnumerator/r.f_dM_denominator << " std_err=" << f_DMstd_err << std::endl; }
int polymorphicMain(int argc, char** argv) { parsePolymorphicOptions(argc, argv); string fileRoot = stripExtension(opt::sampleSets); std::cerr << "Filtering a VCF file: " << opt::vcfFile << std::endl; std::cerr << "so that only sites that are ploymorphic in sample sets defined in: " << opt::sampleSets << " are output" << std::endl; // Open connection to read from the vcf file std::ifstream* vcfFile = new std::ifstream(opt::vcfFile.c_str()); std::ifstream* setsFile = new std::ifstream(opt::sampleSets.c_str()); string PolymorphicFileName = fileRoot + "_" + opt::runName + "_polymorphic.vcf"; std::ofstream* pPolymorphicVCF = new std::ofstream(PolymorphicFileName.c_str()); std::vector<std::vector<string> > sets; string setString; while (getline(*setsFile, setString)) { std::vector<string> thisSet = split(setString, ','); std::sort(thisSet.begin(),thisSet.end()); sets.push_back(thisSet); } int numChromosomes; int totalVariantNumber = 0; string line; std::vector<string> sampleNames; std::vector<string> fields; std::vector<std::vector<size_t> > setsLoci; while (getline(*vcfFile, line)) { if (line[0] == '#' && line[1] == '#') { *pPolymorphicVCF << line << std::endl; } else if (line[0] == '#' && line[1] == 'C') { std::vector<std::string> fields = split(line, '\t'); const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS; numChromosomes = (int)numSamples * 2; // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl; if (opt::sampleNameFile.empty()) { for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) { sampleNames.push_back(fields[i]); } } else { sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile); } for (std::vector<std::vector<string> >::size_type i = 0; i != sets.size(); i++) { std::vector<size_t> thisSetLoci = locateSet(sampleNames, sets[i]); setsLoci.push_back(thisSetLoci); std::cerr << "Set" << i << " loci: " << std::endl; print_vector_stream(thisSetLoci, 
std::cerr); } *pPolymorphicVCF << line << std::endl; } else { totalVariantNumber++; std::vector<std::string> fields = split(line, '\t'); bool polymorphicInSets = true; for (std::vector<std::vector<size_t> >::size_type i = 0; i != setsLoci.size(); i++) { if(!findIfPolymorhicInSet(fields, setsLoci[i])) { polymorphicInSets = false; } } if (polymorphicInSets) *pPolymorphicVCF << line << std::endl; } } return 0; }
void getFstFromMs() { std::cerr << "Calculating Fst using variants from: " << opt::msFile << std::endl; std::cerr << "and outputting chi-sq test p-vals < " << opt::msPvalCutoff << std::endl; std::ifstream* msFile = new std::ifstream(opt::msFile.c_str()); string fileRoot = stripExtension(opt::msFile); string PvalFileName = fileRoot + "_" + opt::runName + "_pvals.txt"; std::ofstream* pValFile; if (opt::msPvalCutoff > 0) { pValFile = new std::ofstream(PvalFileName.c_str()); *pValFile << "Fisher p-val" << "\t" << "chi-sq pval" << "\t" << "set1Alt" << "\t" << "set1Ref" << "\t" << "set2Alt" << "\t" << "set2Ref" << "\t" << "Fst" << std::endl; } std::vector<int> set1_loci; std::vector<int> set2_loci; srand((int)time(NULL)); if (opt::msSet1FstSample == 0) { opt::msSet1FstSample = opt::msSet1Size; for (int i = 0; i != opt::msSet1FstSample; i++) { set1_loci.push_back(i); } } else { // Randomly sample individuals from population 1 for Fst calculation for (int i = 0; i != opt::msSet1FstSample; i++) { int rand_sample = (rand()%opt::msSet1Size); while (std::find(set1_loci.begin(),set1_loci.end(),rand_sample) != set1_loci.end()) { rand_sample = (rand()%opt::msSet1Size); } set1_loci.push_back(rand_sample); } } // Do the same for set2 if (opt::msSet2FstSample == 0) { opt::msSet2FstSample = opt::msSet2Size; for (int i = 0; i != opt::msSet2FstSample; i++) { set2_loci.push_back(i+opt::msSet1Size); } } else { // Randomly sample individuals from population 2 for Fst calculation for (int i = 0; i != opt::msSet2FstSample; i++) { int rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size; while (std::find(set2_loci.begin(),set2_loci.end(),rand_sample) != set2_loci.end()) { rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size; } set2_loci.push_back(rand_sample); } } std::cerr << "Selected population 1 individuals: "; print_vector_stream(set1_loci, std::cerr); std::cerr << "Selected population 2 individuals: "; print_vector_stream(set2_loci, std::cerr); if (opt::msSet1Size != 
opt::msSet1FstSample || opt::msSet2Size != opt::msSet2FstSample) { std::cerr << "Warning: the Fst column is going to contain '-1' values where the site is not a segregating site in the sampled individuals for Fst calcultation" << std::endl; } std::vector<double> fstNumerators; fstNumerators.reserve(500000000); std::vector<double> fstDenominators; fstDenominators.reserve(500000000); string line; int numFixedSites = 0; int numNearlyFixedSites = 0; std::vector<double> nullForChisq; std::vector<int> moreSet1; std::vector<int> lessSet1; std::vector<int> moreSet2; std::vector<int> lessSet2; SetCounts counts; while (getline(*msFile, line)) { counts.reset(); double thisFst = -1; for (std::vector<int>::iterator it = set1_loci.begin(); it != set1_loci.end(); it++) { // std::cerr << line[*it] << std::endl; if (line[*it] == '1') { counts.set1Count++; } } for (std::vector<int>::iterator it = set2_loci.begin(); it != set2_loci.end(); it++) { if (line[*it] == '1') { counts.set2Count++; } } //std::cerr << "counts.set1Count" << counts.set1Count << "\t" << "counts.set2Count" << counts.set2Count << std::endl; if (counts.set1Count > 0 || counts.set2Count > 0) { double FstNum = calculateFstNumerator(counts, opt::msSet1FstSample, opt::msSet2FstSample); double FstDenom = calculateFstDenominator(counts, opt::msSet1FstSample, opt::msSet2FstSample); thisFst = FstNum/FstDenom; if (thisFst < 0) thisFst = 0; fstNumerators.push_back(FstNum); fstDenominators.push_back(FstDenom); } if ((counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 0)) { numFixedSites++; } if ((counts.set1Count == 1 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample-1) || (counts.set1Count == opt::msSet1FstSample-1 && counts.set2Count == 0) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 1)) { numNearlyFixedSites++; } int set1WithoutVariant = 
opt::msSet1FstSample-counts.set1Count; int set2WithoutVariant = opt::msSet2FstSample-counts.set2Count; if (counts.set1Count >= set1WithoutVariant) { moreSet1.push_back(counts.set1Count); lessSet1.push_back(set1WithoutVariant); moreSet2.push_back(counts.set2Count); lessSet2.push_back(set2WithoutVariant); } else { moreSet1.push_back(set1WithoutVariant); lessSet1.push_back(counts.set1Count); moreSet2.push_back(set2WithoutVariant); lessSet2.push_back(counts.set2Count); } // std::cerr << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << std::endl; if ((counts.set1Count != 0 || counts.set2Count != 0) && (set1WithoutVariant != 0 || set2WithoutVariant != 0)) { if (opt::msSet1FstSample + opt::msSet2FstSample <= 60) { counts.fisher_pval = fisher_exact(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); // std::cerr << "Fisher: " << counts.fisher_pval << std::endl; counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); } else { counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); } } if (counts.fisher_pval < opt::msPvalCutoff || counts.chi_sq_pval < opt::msPvalCutoff) { *pValFile << counts.fisher_pval << "\t" << counts.chi_sq_pval << "\t" << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << "\t" << thisFst << std::endl; } } double Fst = calculateFst(fstNumerators, fstDenominators); std::cerr << "Fst: " << Fst << std::endl; std::cerr << "Fixed sites: " << numFixedSites << std::endl; std::cerr << "Tier2 sites: " << numNearlyFixedSites << std::endl; std::cerr << "Null ChiSq 1:" << vector_average(moreSet1)/opt::msSet1FstSample << "\t" << vector_average(lessSet1)/opt::msSet1FstSample << std::endl; std::cerr << "Null ChiSq 2:" << vector_average(moreSet2)/opt::msSet2FstSample << "\t" << 
vector_average(lessSet2)/opt::msSet2FstSample << std::endl; }
void summariseEigensoft() { std::ifstream* eigenFile = new std::ifstream(opt::eigensoftFile.c_str()); string fileRoot = stripExtension(opt::eigensoftFile); string FstResultsFileName = fileRoot + "_" + opt::runName + "_fst_matrix.forR"; std::ofstream* pFst = new std::ofstream(FstResultsFileName.c_str()); std::vector<std::vector<std::string> > fst_matrix; string line; getline(*eigenFile, line); // Get the first description line short type; if (line == "##") { type = 1; } else { type = 2; } std::cerr << "It is type: " << type << std::endl; if (type == 1) { getline(*eigenFile, line); std::vector<std::string> fields = split(line, '\t'); std::vector<std::string> this_indiv_fst; std::vector<std::string> all_indiv; string this_indiv = fields[0]; this_indiv_fst.push_back(fields[2]); while (getline(*eigenFile, line)) { fields = split(line, '\t'); std::cerr << "Indiv: " << fields[0] << std::endl; if (this_indiv == fields[0]) { this_indiv_fst.push_back(fields[2]); } else { fst_matrix.push_back(this_indiv_fst); all_indiv.push_back(this_indiv); this_indiv = fields[0]; this_indiv_fst.clear(); this_indiv_fst.push_back(fields[2]); } } all_indiv.push_back(this_indiv); fst_matrix.push_back(this_indiv_fst); this_indiv_fst.clear(); this_indiv_fst.push_back("0"); all_indiv.push_back(fields[1]); fst_matrix.push_back(this_indiv_fst); for (std::vector<std::vector<std::string> >::size_type i = 0; i != fst_matrix.size(); i++) { std::reverse(fst_matrix[i].begin(), fst_matrix[i].end()); fst_matrix[i].insert(fst_matrix[i].end(), "0"); while (fst_matrix[i].size() != fst_matrix[0].size()) { fst_matrix[i].insert(fst_matrix[i].end(), "0"); } } std::reverse(fst_matrix.begin(), fst_matrix.end()); std::reverse(all_indiv.begin(), all_indiv.end()); print_vector(all_indiv, *pFst); print_matrix(fst_matrix, *pFst); } else if (type == 2) { std::cerr << "type2" << std::endl; std::vector<std::string> fields = split(line, '\t'); std::vector<std::string> all_indiv(fields.begin()+1,fields.end()); 
getline(*eigenFile, line); getline(*eigenFile, line); std::vector<std::string> this_indiv_fst; while (getline(*eigenFile, line)) { fields = split(line, '\t'); std::copy(fields.begin()+1,fields.end(),std::back_inserter(this_indiv_fst)); for (std::vector<std::string>::size_type i = 0; i != this_indiv_fst.size(); i++) { double fst = convertToDouble(this_indiv_fst[i]) / 1000; this_indiv_fst[i] = numToString(fst); } fst_matrix.push_back(this_indiv_fst); this_indiv_fst.clear(); } print_vector(all_indiv, *pFst); print_matrix(fst_matrix, *pFst); } }