Ejemplo n.º 1
0
// Reduce a path to its bare name: drop the leading directories and
// then the last trailing suffix. When the name carries a gzip
// extension, that extension is peeled off first so the suffix
// underneath it is the one removed by the final strip.
std::string stripFilename(const std::string& filename)
{
    const std::string base = stripDirectories(filename);
    // Only gzip-named files lose an additional (compression) suffix
    const std::string unzipped = isGzip(base) ? stripExtension(base) : base;
    return stripExtension(unzipped);
}
Ejemplo n.º 2
0
// 
// Handle command line arguments for the rmdup subprogram.
//
// Options are read with getopt_long and stored in the opt:: variables.
// Exactly one positional argument (the reads file) is required; when it is
// missing, when extra arguments are given, or when an option is not
// recognised, a hint is written to stderr and the process exits with
// EXIT_FAILURE. --help and --version print their message and exit with
// EXIT_SUCCESS. If no prefix / output file was supplied they are derived
// from the reads file name.
//
void parseRmdupOptions(int argc, char** argv)
{
    bool die = false;

    // getopt_long returns an int. Holding it in a char breaks the comparison
    // with -1 on platforms where char is unsigned and truncates any option
    // code that does not fit in a char.
    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) 
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c) 
        {
            case 'p': arg >> opt::prefix; break;
            case 'o': arg >> opt::outFile; break;
            case 'e': arg >> opt::errorRate; break;
            case 'd': arg >> opt::sampleRate; break;
            case 't': arg >> opt::numThreads; break;
            // getopt_long reports an unrecognised option as '?'
            case '?': die = true; break;
            case 'v': opt::verbose++; break;
            case OPT_HELP:
                std::cout << RMDUP_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
            case OPT_VERSION:
                std::cout << RMDUP_VERSION_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }

    // Exactly one positional argument is expected
    if (argc - optind < 1) 
    {
        std::cerr << SUBPROGRAM ": missing arguments\n";
        die = true;
    } 
    else if (argc - optind > 1) 
    {
        std::cerr << SUBPROGRAM ": too many arguments\n";
        die = true;
    }

    if (die) 
    {
        std::cerr << "Try `" << SUBPROGRAM << " --help' for more information.\n";
        exit(EXIT_FAILURE);
    }

    // Parse the input filenames
    opt::readsFile = argv[optind++];

    // Fall back to names derived from the reads file
    if(opt::prefix.empty())
    {
        opt::prefix = stripExtension(opt::readsFile);
    }

    if(opt::outFile.empty())
    {
        opt::outFile = stripExtension(opt::readsFile) + ".rmdup.fa";
    }
}
Ejemplo n.º 3
0
// Construct a namespace definition.
//
// df, dl, dc and name are handed to the Definition base class unchanged.
// lref is stored through setReference() and also decides how fName is
// turned into the output file name. type selects the kind of namespace:
// "module", "constants" and "library" map to MODULE, CONSTANT_GROUP and
// LIBRARY; anything else (including a null pointer) yields NAMESPACE.
NamespaceDef::NamespaceDef(const char *df,int dl,int dc,
                           const char *name,const char *lref,
                           const char *fName, const char*type,
                           bool isPublished) :
   Definition(df,dl,dc,name)
  ,m_isPublished(isPublished)
{
  // Choose the output file name
  if (!fName)
  {
    // no explicit file name: derive it from the namespace name
    setFileName(name);
  }
  else if (lref)
  {
    // an lref was given: use the stripped name as is
    fileName = stripExtension(fName);
  }
  else
  {
    fileName = convertNameToFile(stripExtension(fName));
  }

  // Member containers
  classSDict       = new ClassSDict(17);
  namespaceSDict   = new NamespaceSDict(17);
  m_innerCompounds = new SDict<Definition>(17);
  usingDirList     = 0;
  usingDeclList    = 0;
  m_allMembersDict = 0;
  setReference(lref);
  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);
  visited       = FALSE;
  m_subGrouping = Config_getBool(SUBGROUPING);

  // Map the textual type onto the enum; NAMESPACE is the fallback
  m_type = NAMESPACE;
  if (type)
  {
    if (strcmp("module", type)==0)
    {
      m_type = MODULE;
    }
    else if (strcmp("constants", type)==0)
    {
      m_type = CONSTANT_GROUP;
    }
    else if (strcmp("library", type)==0)
    {
      m_type = LIBRARY;
    }
  }
}
Ejemplo n.º 4
0
// Construct a namespace definition.
//
// df, dl and name are handed to the Definition base class; lref is stored
// through setReference(). When fName is given the output file name is
// fName without its extension, otherwise it is "namespace" followed by
// the namespace name.
NamespaceDef::NamespaceDef(const char *df,int dl,
                           const char *name,const char *lref,
                           const char *fName) : 
   Definition(df,dl,name)
{
  // Choose the output file name
  if (!fName)
  {
    fileName="namespace";
    fileName+=name;
  }
  else
  {
    fileName = stripExtension(fName);
  }

  // Member containers
  classSDict       = new ClassSDict(17);
  namespaceSDict   = new NamespaceSDict(17);
  m_innerCompounds = new SDict<Definition>(17);
  usingDirList     = 0;
  usingDeclList    = 0;
  m_allMembersDict = 0;
  setReference(lref);
  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);
  visited       = FALSE;
  m_subGrouping = Config_getBool("SUBGROUPING");
}
Ejemplo n.º 5
0
// Construct a group definition.
//
// df, dl and na are handed to the Definition base class and t becomes the
// group title. When refFileName is given the output file name is that name
// without its extension, otherwise it is "group_" followed by the group name.
GroupDef::GroupDef(const char *df,int dl,const char *na,const char *t,
                   const char *refFileName) : Definition(df,dl,na)
{
  // Containers for everything a group can hold
  fileList               = new FileList;
  classSDict             = new ClassSDict(17);
  groupList              = new GroupList;
  namespaceSDict         = new NamespaceSDict(17);
  pageDict               = new PageSDict(17);
  exampleDict            = new PageSDict(17);
  dirList                = new DirList;
  allMemberNameInfoSDict = new MemberNameInfoSDict(17);

  // Choose the output file name
  if (!refFileName)
  {
    fileName = QCString("group_")+na;
  }
  else
  {
    fileName = stripExtension(refFileName);
  }
  setGroupTitle( t );

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  allMemberList = new MemberList(MemberList::allMembersList);

  visited    = 0;
  groupScope = 0;
}
Ejemplo n.º 6
0
// Build the XSD <include> element for one dependency: the schema location
// is the descriptor's name with its extension replaced by ".xsd".
// param is not used.
string
writeDependency(const FileDescriptor* desc, void* param)
{
    string element("\t\t\t<include schemaLocation=\"");
    element += stripExtension(desc->name());
    element += ".xsd\"/>\n";
    return element;
}
Ejemplo n.º 7
0
// Construct a group definition.
//
// df, dl and na are handed to the Definition base class (with a fixed
// third argument of 1) and t becomes the group title. When refFileName is
// given the output file name is that name without its extension, otherwise
// it is derived from "group_" + na through convertNameToFile().
GroupDef::GroupDef(const char *df,int dl,const char *na,const char *t,
                   const char *refFileName) : Definition(df,dl,1,na)
{
  // Containers for everything a group can hold
  fileList               = new FileList;
  classSDict             = new ClassSDict(17);
  groupList              = new GroupList;
  namespaceSDict         = new NamespaceSDict(17);
  pageDict               = new PageSDict(17);
  exampleDict            = new PageSDict(17);
  dirList                = new DirList;
  allMemberNameInfoSDict = new MemberNameInfoSDict(17);
  allMemberNameInfoSDict->setAutoDelete(TRUE);

  // Choose the output file name
  if (!refFileName)
  {
    fileName = convertNameToFile(QCString("group_")+na);
  }
  else
  {
    fileName = stripExtension(refFileName);
  }
  setGroupTitle( t );

  memberGroupSDict = new MemberGroupSDict;
  memberGroupSDict->setAutoDelete(TRUE);

  allMemberList = new MemberList(MemberListType_allMembersList);

  visited       = 0;
  groupScope    = 0;
  m_subGrouping = Config_getBool(SUBGROUPING);
}
Ejemplo n.º 8
0
// 
// Handle command line arguments for the fm-merge subprogram.
//
// Options are read with getopt_long and stored in the opt:: variables.
// Exactly one positional argument (the reads file) and a positive thread
// count are required; otherwise, or when an option is not recognised, the
// usage message is printed and the process exits with EXIT_FAILURE.
// --help and --version print their message and exit with EXIT_SUCCESS.
// If no prefix / output file was supplied they are derived from the reads
// file name and the prefix respectively.
//
void parseFMMergeOptions(int argc, char** argv)
{
    bool die = false;

    // getopt_long returns an int. Holding it in a char breaks the comparison
    // with -1 on platforms where char is unsigned and truncates any option
    // code that does not fit in a char.
    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) 
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c) 
        {
            case 'm': arg >> opt::minOverlap; break;
            case 'p': arg >> opt::prefix; break;
            case 'o': arg >> opt::outFile; break;
            case 't': arg >> opt::numThreads; break;
            case '?': die = true; break;
            case 'v': opt::verbose++; break;
            case OPT_HELP:
                std::cout << FMMERGE_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
            case OPT_VERSION:
                std::cout << FMMERGE_VERSION_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }

    // Exactly one positional argument is expected
    if (argc - optind < 1) 
    {
        std::cerr << SUBPROGRAM ": missing arguments\n";
        die = true;
    } 
    else if (argc - optind > 1) 
    {
        std::cerr << SUBPROGRAM ": too many arguments\n";
        die = true;
    }

    if(opt::numThreads <= 0)
    {
        std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::numThreads << "\n";
        die = true;
    }

    if (die) 
    {
        std::cout << "\n" << FMMERGE_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }

    // Parse the input filenames
    opt::readsFile = argv[optind++];

    // Fall back to names derived from the reads file / prefix
    if(opt::prefix.empty())
    {
        opt::prefix = stripExtension(opt::readsFile);
    }

    if(opt::outFile.empty())
    {
        opt::outFile = opt::prefix + ".merged.fa";
    }
}
Ejemplo n.º 9
0
// creates a file with the right extension for the plugin type.
// as of Aalto 1.6 / Kaivo this is always .mlpreset.
// input: a file path relative to the presets root, without extension.
//
void MLPluginProcessor::saveStateToRelativePath(const std::string& path)
{
#if DEMO
	debug() << "DEMO version. Saving is disabled.\n";
#else
    
    // the Model param contains the file path relative to the root.
    std::string shortPath = stripExtension(path);
    setProperty("preset", shortPath);
	
    std::string extension (".mlpreset");
    std::string extPath = shortPath + extension;
    const MLFilePtr f = mPresetFiles->createFile(extPath);
    if(!f->getJuceFile().exists())
    {
        f->getJuceFile().create();
    }
	
    f->getJuceFile().replaceWithText(getStateAsText());
	
	// reset state stack and push current state for recall
	mpPatchState->clearStateStack();
	mpPatchState->pushStateToStack();
#endif // DEMO
    
}
Ejemplo n.º 10
0
// Load the image stored at fileName and name it after the file (directory
// and extension removed).
// Throws CannotOpenImageException when the loaded image is empty.
SmallImage::SmallImage(string fileName)
{
    mImage = imread(fileName);
    if (mImage.empty()) {
        // Throw by value: dereferencing a freshly new'd exception throws a
        // copy of it and leaks the heap-allocated original.
        throw CannotOpenImageException(fileName);
    }
    mName = stripExtension(stripDirectory(fileName));
}
Ejemplo n.º 11
0
void MLPluginProcessor::loadStateFromPath(const std::string& path)
{    
    if(path != std::string())
    {
        const MLFilePtr f = mPresetFiles->getFileByName(path);
        if(f != MLFilePtr())
        {
            loadStateFromFile(f->mFile);
            std::string shortPath = stripExtension(path);
            setProperty("preset", shortPath);
        }
    }
}
Ejemplo n.º 12
0
bool
WsdlGeneratorHelper::createOutputFile(void)
{
    bool success = true;
    try {
        protoFilenameNoExt = stripExtension(proto->name());
        protoFilenameNoExtNoPath = stripPath(protoFilenameNoExt);
        string wsdlName = protoFilenameNoExtNoPath;
        wsdlName.append(".wsdl");
        wsdl = outputDirectory->Open(wsdlName);
        success = writeFileContents();
    } catch (string s) {
        success = false;
        *error = s;
    }
    return success;
}
Ejemplo n.º 13
0
// Called when this menu is activated. Asserts that we arrived here from the
// editor, rebuilds the parameters for the level's current game type, picks up
// the level file name from the editor, refreshes the menu items, snapshots
// the level code for later comparison, and disables the cursor.
void GameParamUserInterface::onActivate()
{
   TNLAssert(getUIManager()->cameFrom<EditorUserInterface>(), "GameParamUserInterface should only be called from the editor!");

   // The level being edited and its game type drive everything below
   Level *level = getUIManager()->getUI<EditorUserInterface>()->getLevel();
   const GameType *gameType = level->getGameType();

   // Force rebuild of all params for current gameType; this will make sure we have the latest info if we've loaded a new level,
   // but will also preserve any values entered for gameTypes that are not current.
   clearCurrentGameTypeParams(gameType);
   
   // Load filename from editor only when we activate the menu
   mLevelFilename = stripExtension(getUIManager()->getUI<EditorUserInterface>()->getLevelFileName());
   // The editor's placeholder name for unnamed files is stored here as an empty name
   if(mLevelFilename == EditorUserInterface::UnnamedFile)
      mLevelFilename = "";

   updateMenuItems(gameType);   
   mOrigGameParams = level->toLevelCode();   // Save a copy of the params coming in for comparison when we leave to see what changed
   Cursor::disableCursor();
}
Ejemplo n.º 14
0
// Construct a group definition.
//
// df, dl and na are handed to the Definition base class and t becomes the
// group title. When refFileName is given the output file name is that name
// without its extension, otherwise it is "group_" followed by the group
// name. All declaration and documentation member lists are flagged as
// belonging to a group.
GroupDef::GroupDef(const char *df,int dl,const char *na,const char *t,
                   const char *refFileName) : Definition(df,dl,na)
{
    // Containers for everything a group can hold
    fileList               = new FileList;
    classSDict             = new ClassSDict(17);
    groupList              = new GroupList;
    namespaceSDict         = new NamespaceSDict(17);
    pageDict               = new PageSDict(17);
    exampleDict            = new PageSDict(17);
    dirList                = new DirList;
    allMemberList          = new MemberList;
    allMemberNameInfoSDict = new MemberNameInfoSDict(17);

    // Choose the output file name
    if (!refFileName)
    {
        fileName = QCString("group_")+na;
    }
    else
    {
        fileName = stripExtension(refFileName);
    }
    setGroupTitle( t );

    memberGroupSDict = new MemberGroupSDict;
    memberGroupSDict->setAutoDelete(TRUE);

    // Declaration lists
    decDefineMembers.setInGroup(TRUE);
    decProtoMembers.setInGroup(TRUE);
    decTypedefMembers.setInGroup(TRUE);
    decEnumMembers.setInGroup(TRUE);
    decEnumValMembers.setInGroup(TRUE);
    decFuncMembers.setInGroup(TRUE);
    decVarMembers.setInGroup(TRUE);

    // Documentation lists
    docDefineMembers.setInGroup(TRUE);
    docProtoMembers.setInGroup(TRUE);
    docTypedefMembers.setInGroup(TRUE);
    docEnumMembers.setInGroup(TRUE);
    docFuncMembers.setInGroup(TRUE);
    docVarMembers.setInGroup(TRUE);

    visited = 0;
}
Ejemplo n.º 15
0
// Reacts to a property change. A change of "text" recomputes the
// "processed_text" property (the short name without extension when the
// "strip" property is non-zero, the raw text otherwise) and repaints;
// every other property is forwarded to MLButton.
void MLMenuButton::doPropertyChangeAction(MLSymbol property, const MLProperty& val)
{
	if (!(property == "text"))
	{
		MLButton::doPropertyChangeAction(property, val);
		return;
	}

	// TODO this file-specific stuff should not be here. 
	std::string displayText;
	const std::string rawText = val.getStringValue();
	if(!getFloatProperty("strip"))
	{
		displayText = rawText;
	}
	else
	{
		displayText = stripExtension(getShortName(rawText));
	}
	setProperty("processed_text", displayText);
	repaint();
}
Ejemplo n.º 16
0
// Entry point of the variant-detectability subprogram: parses the options,
// loads the BWT of the reference (reference file name with its extension
// replaced by BWT_EXT) together with an interval cache, reads all reference
// sequences into a string vector and passes both to
// computeDetectableSampling(). Always returns 0.
int variantDetectabilityMain(int argc, char** argv)
{
    parseVarDetectOptions(argc, argv);

    // Load the reference BWT
    std::string bwt_name = stripExtension(opt::referenceFile) + BWT_EXT;
    BWTIndexSet ref_index;
    ref_index.pBWT = new BWT(bwt_name);
    ref_index.pCache = new BWTIntervalCache(11, ref_index.pBWT);

    // Read reference
    ReadTable ref_table(opt::referenceFile);

    // Convert to string vector
    StringVector ref_sequences;
    for(size_t i = 0; i < ref_table.getCount(); ++i) {
        ref_sequences.push_back(ref_table.getRead(i).seq.toString());
    }
    computeDetectableSampling(ref_sequences, ref_index);

    // Release in reverse order of construction: the cache was built with a
    // pointer to the BWT, so it must not outlive it.
    delete ref_index.pCache;
    delete ref_index.pBWT;
    return 0;
}
Ejemplo n.º 17
0
void rmdup()
{
    StringVector hitsFilenames;
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT, opt::sampleRate);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT, opt::sampleRate);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, 
                                                         opt::errorRate, 0, 
                                                         0, false);
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT);
        computeRmdupHitsSerial(opt::prefix, opt::readsFile, pOverlapper, hitsFilenames);
    }
    else
    {
        printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        computeRmdupHitsParallel(opt::numThreads, opt::prefix, opt::readsFile, pOverlapper, hitsFilenames);
    }

    delete pOverlapper;
    delete pBWT; 
    delete pRBWT;
    delete pTimer;
    
    std::string out_prefix = stripExtension(opt::outFile);
    std::string dupsFile = parseDupHits(hitsFilenames, out_prefix);

    // Rebuild the indices without the duplicated sequences
    if(opt::bReindex)
    {
        std::cout << "Rebuilding indices without duplicated reads\n";
        removeReadsFromIndices(opt::prefix, dupsFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads);
        removeReadsFromIndices(opt::prefix, dupsFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads);
    }
}
Ejemplo n.º 18
0
// Write one tally line for the ancestral-allele categories to stderr.
// `lead` is printed verbatim in front of the variant count. Every share is
// a percentage of the sum of all five categories.
static void reportAaCounts(const std::string& lead, int totalVariantNumber,
                           int aaRefCount, int aaAltCount, int aaDashCount,
                           int aaDiffCount, int aaNcount) {
    const double totalAAfilled = aaRefCount + aaAltCount + aaDashCount + aaDiffCount + aaNcount;
    // Guard the divisions: with nothing counted yet every share is reported as 0
    const double denom = (totalAAfilled > 0) ? totalAAfilled : 1.0;
    std::cerr << lead << totalVariantNumber << " variants processed."
              << " AA=Ref:" << aaRefCount << "(" << 100*(aaRefCount/denom) << "%);"
              << " AA=Alt:" << aaAltCount << "(" << 100*(aaAltCount/denom) << "%);"
              << " AA='-':" << aaDashCount << "(" << 100*(aaDashCount/denom) << "%);"
              << " AA=?(Neither Ref nor Alt):" << aaDiffCount << "(" << 100*(aaDiffCount/denom) << "%);"
              << " AA=N:" << aaNcount << "(" << 100*(aaNcount/denom) << "%)" << std::endl;
}

// Entry point of the fill-AA subprogram: copies the VCF to
// "<root>_AAfilled.vcf", adding an ";AA=<base>" entry to the INFO column of
// every record whose first INFO entry is not "INDEL". The base is looked up
// in the ancestral sequence at the record's scaffold and position. <root> is
// opt::out, or the VCF file name without extension when opt::out is empty.
// Progress and a final summary are written to stderr.
// Returns 0 on success and 1 when no header line can be read from the
// ancestral sequence file.
int fillAaMain(int argc, char** argv) {
    parseFillAaOptions(argc, argv);
    string line; // for reading the input files

    // NOTE(review): ownership of the stream returned by createReader is not
    // visible from here, so it is deliberately not deleted - confirm.
    std::istream* vcfFile = createReader(opt::vcfFile.c_str());
    // Automatic storage so the stream is closed on every return path
    std::ifstream ancSeqFile(opt::ancSeqFile.c_str());

    string refFastaFileRoot;
    if (opt::out.empty()) {
        refFastaFileRoot = stripExtension(opt::vcfFile);
    } else {
        refFastaFileRoot = opt::out;
    }
    string outFN = refFastaFileRoot + "_AAfilled.vcf";

    // Read in the whole ancestral sequence
    std::map<string, string> ancSeqs;
    if (!getline(ancSeqFile, line) || line.empty()) {
        // Without this check substr(1) below would throw on an empty line
        std::cerr << "Could not read a sequence header from: " << opt::ancSeqFile << std::endl;
        return 1;
    }
    string currentScaffold = line.substr(1,string::npos);
    ancSeqs[currentScaffold] = ""; ancSeqs[currentScaffold].reserve(50000000);
    while (getline(ancSeqFile, line)) {
        if (line[0] != '>') {
            ancSeqs[currentScaffold].append(line);
        } else {
            // A new header line starts the next scaffold
            currentScaffold = line.substr(1,string::npos);
            ancSeqs[currentScaffold] = ""; ancSeqs[currentScaffold].reserve(50000000);
        }
    }

    // Automatic storage so the output is flushed and closed when we return
    std::ofstream outFile(outFN.c_str());

    // Now go through the vcf and add the AA fields
    int totalVariantNumber = 0;
    int aaDashCount = 0; int aaRefCount = 0; int aaAltCount = 0; int aaDiffCount = 0; int aaNcount = 0;
    while (getline(*vcfFile, line)) {
        if (line[0] == '#' && line[1] == '#')
            outFile << line << std::endl;
        else if (line[0] == '#' && line[1] == 'C') {
            // Declare the new INFO field right before the column header line
            outFile << "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral allele\">" << std::endl;
            outFile << line << std::endl;
        } else {
            totalVariantNumber++;
            std::vector<std::string> fields = split(line, '\t');
            std::vector<std::string> info = split(fields[7], ';');
            if (info[0] != "INDEL") {
                assert(ancSeqs.find(fields[0]) != ancSeqs.end());
                char AA;
                if (ancSeqs[fields[0]].length() == 0) {
                    AA = 'N';
                } else {
                    // The position column is 1-based
                    AA = ancSeqs[fields[0]][atoi(fields[1].c_str())-1];
                    if (AA == '-') { aaDashCount++; }
                    else if (AA == 'N') { aaNcount++; }
                    else if (AA == fields[3][0]) { aaRefCount++; }
                    else if (AA == fields[4][0]) { aaAltCount++; }
                    else { aaDiffCount++; } // neither Ref nor Alt
                }
                fields[7] += ";AA="; fields[7] += AA;
                print_vector(fields, outFile, '\t');
            } else {
                outFile << line << std::endl;
            }
            if (totalVariantNumber % 100000 == 0) {
                reportAaCounts("", totalVariantNumber, aaRefCount, aaAltCount,
                               aaDashCount, aaDiffCount, aaNcount);
            }
        }
    }
    // Final summary, over the same categories as the progress lines
    std::cerr << std::endl;
    reportAaCounts("All ", totalVariantNumber, aaRefCount, aaAltCount,
                   aaDashCount, aaDiffCount, aaNcount);

    return 0;
}
Ejemplo n.º 19
0
//! \brief creates a list of what to generate
//!
//! Walks the directory tree below startDir (recursively when m_recursive is
//! set) and appends one entry to m_talkList for every folder and file that
//! should get a talk file.
//!
//! \param startDir The directory from which to start scanning
//! \return false when the scan was aborted through m_abort, true otherwise
bool TalkFileCreator::createTalkList(QDir startDir)
{
    m_talkList.clear();

    // create Iterator
    const QDirIterator::IteratorFlags iterFlags =
        m_recursive ? QDirIterator::Subdirectories : QDirIterator::NoIteratorFlags;
    QDirIterator it(startDir,iterFlags);

    //create temp directory
    const QString tempRoot = QDir::tempPath() + "/talkfiles/";
    QDir tempDir(tempRoot);
    if(!tempDir.exists())
        tempDir.mkpath(tempRoot);

    // read in Maps of paths - file/dirnames
    while (it.hasNext())
    {
        it.next();
        if(m_abort)
        {
            return false;
        }

        QFileInfo info = it.fileInfo();

        if(info.isDir())
        {
            // its a dir
            QDir folder = info.dir();

            // insert into List
            if(!folder.dirName().isEmpty() && m_talkFolders)
            {
                const QString folderTarget = folder.path() + "/_dirname.talk";

                // check if we should ignore it
                if(m_generateOnlyNew && QFileInfo(folderTarget).exists())
                {
                    continue;
                }

                //generate entry; intermediates are named after the MD5 of the spoken text
                TalkGenerator::TalkEntry entry;
                entry.toSpeak = folder.dirName();
                const QByteArray digest = QCryptographicHash::hash(
                    entry.toSpeak.toUtf8(), QCryptographicHash::Md5).toHex();
                entry.wavfilename = tempRoot + digest + ".wav";
                entry.talkfilename = tempRoot + digest + ".talk";
                entry.target = folderTarget;
                entry.voiced = false;
                entry.encoded = false;
                qDebug() << "[TalkFileCreator] toSpeak:" << entry.toSpeak
                         << "target:" << entry.target
                         << "intermediates:" << entry.wavfilename << entry.talkfilename;
                m_talkList.append(entry);
            }
        }
        else
        {
            // its a File
            const QString name = info.fileName();

            // insert into List
            if( !name.isEmpty() && !name.endsWith(".talk") && m_talkFiles)
            {
                //test if we should ignore this file
                bool ignored = false;
                for(int i=0; i < m_ignoreFiles.size();i++)
                {
                    QRegExp pattern(m_ignoreFiles[i].trimmed());
                    pattern.setPatternSyntax(QRegExp::Wildcard);
                    if(pattern.exactMatch(name))
                        ignored = true;
                }
                if(ignored)
                    continue;

                const QString fileTarget = info.path() + "/" + name + ".talk";

                // check if we should ignore it
                if(m_generateOnlyNew && QFileInfo(fileTarget).exists())
                {
                    continue;
                }

                //generate entry; intermediates are named after the MD5 of the spoken text
                TalkGenerator::TalkEntry entry;
                if(m_stripExtensions)
                    entry.toSpeak = stripExtension(name);
                else
                    entry.toSpeak = name;
                const QByteArray digest = QCryptographicHash::hash(
                    entry.toSpeak.toUtf8(), QCryptographicHash::Md5).toHex();
                entry.wavfilename = tempRoot + digest + ".wav";
                entry.talkfilename = tempRoot + digest + ".talk";
                entry.target = fileTarget;
                entry.voiced = false;
                entry.encoded = false;
                qDebug() << "[TalkFileCreator] toSpeak:" << entry.toSpeak
                         << "target:" << entry.target
                         << "intermediates:" <<
                            entry.wavfilename << entry.talkfilename;
                m_talkList.append(entry);
            }
        }
        // keep the application responsive while scanning
        QCoreApplication::processEvents();
    }
    return true;
}
Ejemplo n.º 20
0
// Entry point of the reorder subprogram: copies the VCF named by
// opt::vcfFile to "<root><runName>_reordered.vcf" with the genotype columns
// rearranged according to the sample order read from opt::newOrderFile.
// "##" lines are copied unchanged; the "#C..." header line gets the new
// sample order; every other line has its genotype columns permuted through
// the mapping built by linkVectors(). Sample names come from the header
// line unless opt::sampleNameFile is set. Always returns 0.
int reorderMain(int argc, char** argv) {
    parseReorderOptions(argc, argv);
    string fileRoot = stripExtension(opt::vcfFile);

    std::cerr << "Reordering columns in: " << opt::vcfFile << std::endl;
    std::cerr << "using ordering in: " << opt::newOrderFile << std::endl;

    // Both streams live on the stack so they are closed (and the output
    // flushed) when the function returns.
    std::ifstream vcfFile(opt::vcfFile.c_str());
    string reorderedFileName = fileRoot + opt::runName + "_reordered.vcf";
    std::ofstream reordered(reorderedFileName.c_str());

    string line;
    std::vector<string> sampleNames;
    std::vector<string> newOrder = readSampleNamesFromTextFile(opt::newOrderFile);
    std::map<string, size_t> link;
    while (getline(vcfFile, line)) {
        if (line[0] == '#' && line[1] == '#') {
            reordered << line << std::endl;
        } else if (line[0] == '#' && line[1] == 'C') {
            std::vector<std::string> fields = split(line, '\t');
            if (opt::sampleNameFile.empty()) {
                // Take the sample names from the header line itself
                for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) {
                    sampleNames.push_back(fields[i]);
                }
            } else {
                sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile);
            }
            assert(sampleNames.size() == newOrder.size());
            link = linkVectors(newOrder, sampleNames);

            // Fixed columns first, then the sample names in their new order
            for (std::vector<std::string>::size_type i = 0; i != NUM_NON_GENOTYPE_COLUMNS; i++) {
                reordered << fields[i] << "\t";
            }
            for (std::vector<std::string>::size_type i = 0; i != newOrder.size(); i++) {
                if (i != 0) reordered << "\t";
                reordered << newOrder[i];
            }
            reordered << std::endl;
        } else {
            std::vector<std::string> fields = split(line, '\t');
            for (std::vector<std::string>::size_type i = 0; i != NUM_NON_GENOTYPE_COLUMNS; i++) {
                reordered << fields[i] << "\t";
            }
            // Genotype columns, looked up through the name -> index mapping
            for (std::vector<std::string>::size_type i = 0; i != sampleNames.size(); i++) {
                if (i != 0) reordered << "\t";
                reordered << fields[link[sampleNames[i]]+NUM_NON_GENOTYPE_COLUMNS];
            }
            reordered << std::endl;
        }
    }

    return 0;
}
Ejemplo n.º 21
0
int linkGNMain(int argc, char** argv) {
    linkGNOptions(argc, argv);
    string gpFileRoot = stripExtension(opt::gpFile);
    std::ofstream* gpOutFile;
    std::ofstream* refLinkFile;
    std::ofstream* goBedFile;
    std::ofstream* fullBedFile;
    
    if (opt::NtoN) {
        goBedFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_GOBed.txt");
        fullBedFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_FullBed.txt");
        gpOutFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_RefGene.gp");
        refLinkFile = new std::ofstream(gpFileRoot + opt::out + "_NtoN_RefLink.gp");
    } else {
        goBedFile = new std::ofstream(gpFileRoot + opt::out + "_GOBed.txt");
        fullBedFile = new std::ofstream(gpFileRoot + opt::out + "_FullBed.txt");
        gpOutFile = new std::ofstream(gpFileRoot + opt::out + "_RefGene.gp");
        refLinkFile = new std::ofstream(gpFileRoot + opt::out + "_RefLink.gp");
    }
    
    string line;
    int geneNum = 1;
    
    // Load David Brawand's assignment of orthologs
    // Mapping from cichlid IDs to a zebrafish homolog (or medaka
    // stickleback, tetraodon, if zebrafish not available)
    std::map<string,string> cichlidHomolog;
    
    std::map<string,string> cichlidDanRerCopyNum;
    if (opt::v2orthologsFile != "") {
        std::cerr << "Reading the v2 full orthologs file:" << std::endl;
        std::ifstream* ocFile = new std::ifstream(opt::v2orthologsFile);
        while (getline(*ocFile, line)) {
            std::vector<string> orthVec = split(line, '\t');
            int c = getSpeciesColumn(opt::species);
            if (orthVec[c] != "NA") {
                if (orthVec[8] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[8]; cichlidDanRerCopyNum[orthVec[c]] = "1-1"; } // Zebrafish
                else if (orthVec[5] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[5]; } // Medaka
                else if (orthVec[7] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[7]; } // Stickleback
                else if (orthVec[6] != "NA") { cichlidHomolog[orthVec[c]] = orthVec[6]; } // Tetraodon
                else { cichlidHomolog[orthVec[c]] = "novelCichlidGene"; }
            } else { continue; }
        } ocFile->close();
    }
    
    
    if (opt::v1orthologousClustersFile != "") {
        int copiesInCichlid = 0; int copiesInDanRer = 0;
        string cichlidGene = ""; string homologGene = "";
        
        std::cerr << "Reading the v1 orthologous cluster file: " << std::endl;
        std::ifstream* ocFile = new std::ifstream(opt::v1orthologousClustersFile);
        while (getline(*ocFile, line)) {
            std::vector<string> idAndNum = split(line, '\t');
            string thisLineGeneID = idAndNum[0];  int thisLineGeneClusterNumber = atoi(idAndNum[1].c_str());
            // Another line for the same cluster
            if (thisLineGeneClusterNumber == geneNum) {
                if (thisLineGeneID.substr(0,2) == opt::species) {
                    if (cichlidGene == "") {            // First copy in the cichlid species (e.g. mz)
                        cichlidGene = thisLineGeneID;
                    } else {                            // There is more than one copy in the cichlid
                        if (homologGene != "") {
                            if (copiesInDanRer <= 1 || opt::NtoN) {
                                attemptMappingUpdate(cichlidHomolog, cichlidGene, homologGene + "/" + numToString(copiesInCichlid));
                                if (copiesInDanRer == 1)
                                    cichlidDanRerCopyNum[cichlidGene] = "N-1";
                                else if (copiesInDanRer > 1)
                                    cichlidDanRerCopyNum[cichlidGene] = "N-N";
                            }
                            cichlidGene = thisLineGeneID;
                        }
                    }
                    copiesInCichlid++;
                } else if (thisLineGeneID.substr(0,6) == "ENSDAR") {
                    copiesInDanRer++;
                    if (homologGene == "") { homologGene = thisLineGeneID; }
                    else {
                        if (rand() < 0.5) homologGene = thisLineGeneID;  // 50% chance of using this zfish copy (hacky!!!)
                    }
                } else if (thisLineGeneID.substr(0,6) == "ENSGAC") {
                    if (homologGene == "") { homologGene = thisLineGeneID; }
                } else if (thisLineGeneID.substr(0,6) == "ENSORL") {
                    if (homologGene == "" || homologGene.substr(0,6) == "ENSGAC") { homologGene = thisLineGeneID; }
                } else if (thisLineGeneID.substr(0,6) == "ENSTNI") {
                    if (homologGene == "") { homologGene = thisLineGeneID; }
                }
                // std::cerr << atoi(idAndNum[1].c_str()) << "\t" << geneNum << std::endl;
            } else { // First line for a new cluster read
                // so first add the mapping for the previous cluster
                if (cichlidGene != "" && homologGene != "") {
                    assert(copiesInCichlid > 0);
                    if (copiesInDanRer == 1) {
                        if (copiesInCichlid == 1) {
                            cichlidDanRerCopyNum[cichlidGene] = "1-1";
                            attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene);
                        } else if (copiesInCichlid > 1) {
                            cichlidDanRerCopyNum[cichlidGene] = "N-1";
                            attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene + "/" + numToString(copiesInCichlid));
                        }
                    } else if (copiesInDanRer > 1) {
                        if (copiesInCichlid == 1) {
                            cichlidDanRerCopyNum[cichlidGene] = "1-N";
                            if (opt::NtoN)
                                attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene);
                        } else if (copiesInCichlid > 1) {
                            cichlidDanRerCopyNum[cichlidGene] = "N-N";
                            if (opt::NtoN)
                                attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene + "/" + numToString(copiesInCichlid));
                        }
                    } else {
                        if (copiesInCichlid == 1) {
                            attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene);
                        } else if (copiesInCichlid > 1) {
                            attemptMappingUpdate(cichlidHomolog, cichlidGene,homologGene + "/" + numToString(copiesInCichlid));
                        }
                    }
                }
                
                // then start looking through the next cluster
                cichlidGene = ""; homologGene = ""; copiesInDanRer = 0; copiesInCichlid = 0;
                geneNum = thisLineGeneClusterNumber;
                if (thisLineGeneID.substr(0,2) == opt::species) {
                    cichlidGene = thisLineGeneID;
                } else if (thisLineGeneID.substr(0,6) == "ENSDAR") {
                    copiesInDanRer++; homologGene = thisLineGeneID;
                } else if (thisLineGeneID.substr(0,6) == "ENSGAC") {
                    homologGene = thisLineGeneID;
                } else if (thisLineGeneID.substr(0,6) == "ENSORL") {
                    homologGene = thisLineGeneID;
                } else if (thisLineGeneID.substr(0,6) == "ENSTNI") {
                    homologGene = thisLineGeneID;
                }
            }
        } ocFile->close();
    }
    
    if (opt::sepByCopyNumberPrefix != "") {
        std::ofstream* OneOneFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_1-1.txt");
        std::ofstream* NOneFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_N-1.txt");
        std::ofstream* OneNFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_1-N.txt");
        std::ofstream* NNFile = new std::ofstream(opt::sepByCopyNumberPrefix + "_N-N.txt");
        
        for (std::map<string, string>::iterator it = cichlidDanRerCopyNum.begin(); it != cichlidDanRerCopyNum.end(); it++) {
            if (it->second == "1-1") {
                *OneOneFile << it->first << std::endl;
            } else if (it->second == "N-1") {
                *NOneFile << it->first << std::endl;
            } else if (it->second == "1-N") {
                *OneNFile << it->first << std::endl;
            } else if (it->second == "N-N") {
                *NNFile << it->first << std::endl;
            }
        }
    
    }
    
    
    // Load gene names and descriptions from ENSEMBL
    std::map<string,string> ensGeneMap;
    std::map<string,string> ensGeneDescriptionMap;
    std::map<string,string> ensEntrezMap;
    if (!opt::ensGeneFile.empty()) {
        std::ifstream* egFile = new std::ifstream(opt::ensGeneFile);
        while (getline(*egFile, line)) {
            std::vector<string> ensGene = split(line, '\t');
            if (ensGene.size() == 4) {
                ensGeneMap[ensGene[0]] = ensGene[3];
                ensGeneDescriptionMap[ensGene[0]] = ensGene[2];
                // Sometimes there are two Entrez records for one Ensembl gene, the first Entrez record tends to be the more informative one
                if ( ensEntrezMap.find(ensGene[0]) == ensEntrezMap.end() ) {
                    if (ensGene[1] != "") {ensEntrezMap[ensGene[0]] = ensGene[1]; }
                    else { ensEntrezMap[ensGene[0]] = "0"; }
                }
            } else if (ensGene.size() == 3) {
                ensGeneMap[ensGene[0]] = "NA";
                if (ensGene[2] != "") { ensGeneDescriptionMap[ensGene[0]] = ensGene[2]; }
                else { ensGeneDescriptionMap[ensGene[0]] = "no description: " + ensGene[0]; }
                // Sometimes there are two Entrez records for one Ensembl gene, the first Entrez record tends to be the more informative one
                if ( ensEntrezMap.find(ensGene[0]) == ensEntrezMap.end() ) {
                    if (ensGene[1] != "") {ensEntrezMap[ensGene[0]] = ensGene[1]; }
                    else { ensEntrezMap[ensGene[0]] = "0"; }
                }
            } else {
                //std::cerr << ensGene.size() << std::endl;
                print_vector_stream(ensGene, std::cerr);
            }
           // std::cout << ensGene[0] << "\t" << ensGene[2] << std::endl;
        }
    }
    
  
    
    // Go through the gene prediction file and generate the final outputs
    std::ifstream* gpFile = new std::ifstream(opt::gpFile);
    int countNovel = 1; int countUnknown = 1; int countNotInEnsembl = 1;
    while (getline(*gpFile, line)) {
        std::vector<string> gpVec = split(line, '\t');
        if ( cichlidHomolog.count(gpVec[0]) == 1) {
            std::vector<string> ensembl = split(cichlidHomolog[gpVec[0]], '/');
            std::vector<string> myNameVec = split(gpVec[0], '.');
            std::string nameWdots = gpVec[0];
            gpVec[0] = myNameVec[0] + "_" + myNameVec[1] + "_" + myNameVec[2] + "_" + myNameVec[3];
            
            if ( ensGeneMap.find(ensembl[0]) != ensGeneMap.end() ) {
                if (ensembl.size() == 1) {
                    std::cout << nameWdots << "\t" << ensembl[0] << "\t" << ensEntrezMap[ensembl[0]] << "\t" << ensGeneMap[ensembl[0]] << std::endl;
                    gpVec[11] = ensGeneMap[ensembl[0]];
                    print_vector(gpVec, *gpOutFile);
                    *refLinkFile << ensGeneMap[ensembl[0]] << "\t" << ensGeneDescriptionMap[ensembl[0]] << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << ensEntrezMap[ensembl[0]] << "\t0" << std::endl;
                    *fullBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl;
                    if (ensEntrezMap[ensembl[0]] != "0") {
                        *goBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl;
                    }
                } else {
                    std::cout << nameWdots << "\t" << ensembl[0] << "\t" << ensEntrezMap[ensembl[0]] << "\t" << ensGeneMap[ensembl[0]] << "/" << ensembl[1] << std::endl;
                    gpVec[11] = ensGeneMap[ensembl[0]]+"/"+ensembl[1];
                    print_vector(gpVec, *gpOutFile);
                    *refLinkFile << ensGeneMap[ensembl[0]] << "/" << ensembl[1] << "\t" << ensGeneDescriptionMap[ensembl[0]] << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << ensEntrezMap[ensembl[0]] << "\t0" << std::endl;
                    *fullBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl;
                    if (ensEntrezMap[ensembl[0]] != "0") {
                        *goBedFile << gpVec[1] << "\t" << gpVec[3] << "\t" << gpVec[4] << "\t" << ensEntrezMap[ensembl[0]] << "\t0\t" << gpVec[2] << std::endl;
                    }
                }
            } else if (ensembl[0] == "novelCichlidGene") {
                std::cout << nameWdots << "\t" << ensembl[0] << "\t0" << "\t" << opt::species + ".novel." + numToString(countNovel) << std::endl;
                gpVec[11] = opt::species + ".novel." + numToString(countNovel);
                countNovel++;
                print_vector(gpVec, *gpOutFile);
                *refLinkFile << opt::species + ".novel." + numToString(countNovel) << "\t" << "novel gene found only in cichlids" << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << "0" << "\t0" << std::endl;
            } else {
                std::cout << nameWdots << "\t" << "noOrthologAssigned" << "\t" << "0" << "\t" << opt::species + ".orthologNotInEnsembl." + numToString(countNotInEnsembl) << std::endl;
                *refLinkFile << opt::species + ".orthologNotInEnsembl." + numToString(countUnknown) << "\t" << "ortholog from Brawand data not foud in Ensembl v75" << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << "0" << "\t0" << std::endl;
                gpVec[11] = opt::species + ".orthologNotInEnsembl." + numToString(countNotInEnsembl);
                print_vector(gpVec, *gpOutFile);
                //std::cerr << ensembl[0] << std::endl;
            }
            //std::cout << "hello" << std::endl;
        } else {
            std::vector<string> myNameVec = split(gpVec[0], '.');
            std::string nameWdots = gpVec[0] + ".1";
            gpVec[0] = myNameVec[0] + "_" + myNameVec[1] + "_" + myNameVec[2] + "_" + myNameVec[3];
            std::cout << nameWdots << "\t" << "noOrthologAssigned" << "\t" << "0" << "\t" << opt::species + ".unknown." + numToString(countUnknown) << std::endl;
            *refLinkFile << opt::species + ".unknown." + numToString(countUnknown) << "\t" << "unknown - no ortholog from Brawand data" << "\t" << gpVec[0] << "\tNP_X\t77\t88\t" << "0" << "\t0" << std::endl;
            gpVec[11] = opt::species + ".unknown." + numToString(countUnknown);
            print_vector(gpVec, *gpOutFile);
            countUnknown++;
        }
    }
    
    return 0;
}
Ejemplo n.º 22
0
void getFstFromVCF() {
    std::cerr << "Calculating Fst using variants from: " << opt::vcfFile << std::endl;
    std::cerr << "Between the two 'populations' defined in: " << opt::sampleSets << std::endl;
    if (opt::windowSize > 0) {
        std::cerr << "also using a sliding window of size: " << opt::windowSize << " variants and sliding in steps of: " << opt::windowStep << std::endl;
    }
    string fileRoot = stripExtension(opt::sampleSets);
    //std::cerr << "Still alive: " << std::endl;
    // Open connection to read from the vcf file
    std::istream* vcfFile = createReader(opt::vcfFile.c_str());
    //std::cerr << "Hello: " << std::endl;
    std::ifstream* setsFile = new std::ifstream(opt::sampleSets.c_str());
    std::ifstream* annotFile;
    std::ofstream* snpCategoryFstFile;
    std::ofstream* regionsAboveFstFile; bool inRegAbove = false;
    std::ofstream* fstDxyFixedWindowFile;
    std::ifstream* ancSetsFile; std::ofstream* ancSetsOutFile;
    std::vector<string> ancSet1; std::vector<string> ancSet2;
    Annotation wgAnnotation;
    if (!opt::annotFile.empty()) {
        annotFile = new std::ifstream(opt::annotFile.c_str());
        Annotation Annot(annotFile, false); // Does not use transcripts annotated as 5' or 3' partial
        wgAnnotation = Annot;
        string snpCategoryFstFileName = fileRoot + "_" + opt::runName + "SNPcategory_fst.txt";
        snpCategoryFstFile = new std::ofstream(snpCategoryFstFileName.c_str());
        *snpCategoryFstFile << "SNPcategory" << "\t" << "thisSNPFst" << "\t" << "thisSNPDxy" << "\t" << "scaffold" << "\t" << "position" << std::endl;
    }
    if (!opt::ancSets.empty()) {
        ancSetsFile = new std::ifstream(opt::ancSets);
        string ancOutFileName = fileRoot + "_" + opt::runName + "ancestralSNPs_fst.txt";
        ancSetsOutFile = new std::ofstream(ancOutFileName);
        *ancSetsOutFile << "scaffold" << "\t" << "position" << "\t" << "AncAllelePopulation" << "\t" << "Fst" << "\t" << "ancSet1_segregating" << "\t" << "ancSet2_segregating" << std::endl;
        string ancSet1String; string ancSet2String;
        getline(*ancSetsFile, ancSet1String);
        getline(*ancSetsFile, ancSet2String);
        ancSet1 = split(ancSet1String, ','); ancSet2 = split(ancSet2String, ',');
        std::sort(ancSet1.begin(),ancSet1.end()); std::sort(ancSet2.begin(),ancSet2.end());
    }
    
    if (opt::regAbove > 0) {
        string regionsAboveFstFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst_above" + numToString(opt::regAbove) + ".txt";
        regionsAboveFstFile = new std::ofstream(regionsAboveFstFileName.c_str());
    }
    
    string FstResultsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst.txt";
    std::ofstream* pFst = new std::ofstream(FstResultsFileName.c_str());
    string fstDxyFixedWindowFileName = fileRoot + "dXY_fixedWindow.txt";
    fstDxyFixedWindowFile = new std::ofstream(fstDxyFixedWindowFileName.c_str());
    string heterozygositySetsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_heterozygosity.txt";
    *fstDxyFixedWindowFile << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy" << "\t" << "Set1_pi" << "\t" << "Set2_pi" << std::endl;
    std::ofstream* pHetSets = new std::ofstream(heterozygositySetsFileName.c_str());
    //std::cerr << "Still alive: " << std::endl;
    
    string set1String; string set2String;
    getline(*setsFile, set1String);
    getline(*setsFile, set2String);
    std::vector<string> set1 = split(set1String, ',');
    std::vector<string> set2 = split(set2String, ',');
    std::sort(set1.begin(),set1.end());
    std::sort(set2.begin(),set2.end());
    
    int numChromosomes;
    int totalVariantNumber = 0;
    int countedVariantNumber = 0;
    string windowMiddleVariant = "first\tWindow";
    string windowStartEnd = "scaffold_0\t0";
    int windowStart = 0; int windowEnd;
    int fixedWindowStart = 0; std::vector<double> fixedWindowDxyVector; std::vector<double> fixedWindowFstNumVector; std::vector<double> fixedWindowFstDenomVector;
    std::vector<double> fixedWindowHet1Vector; std::vector<double> fixedWindowHet2Vector; std::vector<double> fixedWindowPi1Vector; std::vector<double> fixedWindowPi2Vector;
    std::vector<string> sampleNames;
    std::vector<string> fields;
    std::vector<size_t> set1Loci; std::vector<size_t> set2Loci;
    std::vector<size_t> ancSet1Loci; std::vector<size_t> ancSet2Loci;
    short n1; short n2; short n1anc; short n2anc;
    string line;
    std::map<std::string, double> loc_pval;
    std::vector<double> fstNumerators; fstNumerators.reserve(30000000);
    std::vector<double> fstDenominators; fstDenominators.reserve(30000000);
    std::vector<double> DxyVector; DxyVector.reserve(30000000);
    std::vector<std::vector<double> > heterozygositiesVector; heterozygositiesVector.reserve(30000000);
    std::vector<double> set1heterozygositiesSimple; set1heterozygositiesSimple.reserve(30000000);
    std::vector<double> set2heterozygositiesSimple; set2heterozygositiesSimple.reserve(30000000);
    std::vector<double> set1heterozygositiesNei; set1heterozygositiesNei.reserve(30000000);
    std::vector<double> set2heterozygositiesNei; set2heterozygositiesNei.reserve(30000000);
    std::vector<double> set1heterozygositiesPi; set1heterozygositiesPi.reserve(30000000);
    std::vector<double> set2heterozygositiesPi; set2heterozygositiesPi.reserve(30000000);
    while (getline(*vcfFile, line)) {
        if (line[0] == '#' && line[1] == '#') {
            
        } else if (line[0] == '#' && line[1] == 'C') {
            std::vector<std::string> fields = split(line, '\t');
            const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS;
            numChromosomes = (int)numSamples * 2;
            // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl;
            
            if (opt::sampleNameFile.empty()) {
                for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) {
                    sampleNames.push_back(fields[i]);
                }
            } else {
                sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile);
            }
            set1Loci = locateSet(sampleNames, set1);
            set2Loci = locateSet(sampleNames, set2);
            n1 = set1Loci.size()*2; n2 = set2Loci.size()*2;
            std::cerr << "Set1 loci: " << std::endl;
            print_vector_stream(set1Loci, std::cerr);
            std::cerr << "Set2 loci: " << std::endl;
            print_vector_stream(set2Loci, std::cerr);
            
            if (!opt::ancSets.empty()) {
                ancSet1Loci = locateSet(sampleNames, ancSet1);
                ancSet2Loci = locateSet(sampleNames, ancSet2);
                std::cerr << "Ancestral Set1 loci: " << std::endl;
                print_vector_stream(ancSet1Loci, std::cerr);
                std::cerr << "Ancestral Set2 loci: " << std::endl;
                print_vector_stream(ancSet2Loci, std::cerr);
                n1anc = ancSet1Loci.size() * 2; n2anc = ancSet2Loci.size() * 2;
            }
            
            if (opt::windowSize > 0) {
                if (opt::windowSize == opt::windowStep) {
                    *pHetSets << "scaffold" << "\t" << "Start" << "\t" << "End" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl;
                    *pFst << "var_num" << "\t" << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy_onlyVaiants" << "\t" << "Dxy_AllSites" << "\t" << "windowSize" << std::endl;
                    if (opt::regAbove > 0) *regionsAboveFstFile << "scaffold" << "\t" << "Start" << "\t" << "End" << std::endl;
                } else {
                    *pHetSets << "Middle_SNP_position" << "\t" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl;
                }
            }
        } else {
            totalVariantNumber++;
            
            std::vector<std::string> fields = split(line, '\t');
            std::vector<std::string> info = split(fields[7], ';');
            if (info[0] != "INDEL") {  // Without indels
                SetCounts counts = getVariantCountsForFst(fields,set1Loci,set2Loci);
                //std::cerr << "Still here: " << counts.set1HaplotypeVariant.size() << "\t" << counts.set1individualsWithVariant.size() << "\t" << n1 << std::endl;
                //std::cerr << "Still here: " << counts.set2HaplotypeVariant.size() << "\t" << counts.set2individualsWithVariant.size() << "\t" << n2 << std::endl;
                //print_vector_stream(counts.set1HaplotypeVariant, std::cerr);
                //print_vector_stream(counts.set1individualsWithVariant, std::cerr);
                //print_vector_stream(counts.set2HaplotypeVariant, std::cerr);
                if ((counts.set1Count > 0 || counts.set2Count > 0) && (counts.set1Count < n1 || counts.set2Count < n2)) {
                    countedVariantNumber++;
                    double FstNumerator = calculateFstNumerator(counts, n1, n2); fstNumerators.push_back(FstNumerator); fixedWindowFstNumVector.push_back(FstNumerator);
                    double FstDenominator = calculateFstDenominator(counts, n1, n2); fstDenominators.push_back(FstDenominator); fixedWindowFstDenomVector.push_back(FstDenominator);
                    assert(FstDenominator != 0);
                    double thisSNPDxy = calculateDxy(counts, n1, n2); DxyVector.push_back(thisSNPDxy); fixedWindowDxyVector.push_back(thisSNPDxy);
                    std::vector<double> thisSNPhet = getSetHeterozygozities(counts, n1, n2); heterozygositiesVector.push_back(thisSNPhet);
                    std::vector<double> thisSNPpis = calculatePiTwoSets(counts, n1, n2); fixedWindowPi1Vector.push_back(thisSNPpis[0]); fixedWindowPi2Vector.push_back(thisSNPpis[1]);
                    set1heterozygositiesPi.push_back(thisSNPpis[0]); set2heterozygositiesPi.push_back(thisSNPpis[1]);
                   // std::cerr << "Still here: " << thisSNPpis[0] << std::endl;
                    set1heterozygositiesSimple.push_back(thisSNPhet[0]); set2heterozygositiesSimple.push_back(thisSNPhet[1]); fixedWindowHet1Vector.push_back(thisSNPhet[0]);
                    set1heterozygositiesNei.push_back(thisSNPhet[2]); set2heterozygositiesNei.push_back(thisSNPhet[3]); fixedWindowHet2Vector.push_back(thisSNPhet[1]);
                    if (!opt::annotFile.empty()) {
                        string scaffold = fields[0]; string loc = fields[1]; // Scaffold
                        string SNPcategory = wgAnnotation.getCategoryOfSNP(scaffold, loc);
                        double thisSNPFst = FstNumerator/FstDenominator;
                        *snpCategoryFstFile << SNPcategory << "\t" << thisSNPFst << "\t" << thisSNPDxy << "\t" << scaffold << "\t" << loc << std::endl;
                    }
                    if (!opt::ancSets.empty()) {
                        double thisSNPFst = FstNumerator/FstDenominator;
                        if (thisSNPFst < 0) { thisSNPFst = 0; }
                        string AA = split(info[info.size()-1],'=')[1];
                        //std::cerr << "AA=" << " " << AA << std::endl;
                        FourSetCounts c;
                        if (AA == fields[3]) {
                            c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"ref");
                            *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t";
                            if (c.set3daAF > 0 & c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; }
                            if (c.set4daAF > 0 & c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; }
                        } else if (AA == fields[4]) {
                            c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"alt");
                            *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t";
                            if (c.set3daAF > 0 & c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; }
                            if (c.set4daAF > 0 & c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; }
                            // std::cerr << "AA=alt" << " " << c.set1daAF << " " << c.set2daAF << std::endl;
                        } else {
                            c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"N");
                            *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << "-888" << "\t" << thisSNPFst << "\t";
                            if (c.set3AltAF > 0 & c.set3AltAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; }
                            if (c.set4AltAF > 0 & c.set4AltAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; }
                        }
                        
                        
                    }
                    
                    std::vector<string> s = split(windowStartEnd, '\t');
                    if (s[0] == fields[0]) {
                        if (atoi(fields[1].c_str()) > (fixedWindowStart+10000)) {
                            double thisFixedWindowDxy = vector_average_withRegion(fixedWindowDxyVector, 10000);
                            double thisFixedWindowFst = calculateFst(fixedWindowFstNumVector, fixedWindowFstDenomVector);
                            //double thisFixedWindowHet1 = vector_average_withRegion(fixedWindowHet1Vector, 10000);
                            //double thisFixedWindowHet2 = vector_average_withRegion(fixedWindowHet2Vector, 10000);
                            double thisFixedWindowPi1 = vector_average_withRegion(fixedWindowPi1Vector, 10000);
                            double thisFixedWindowPi2 = vector_average_withRegion(fixedWindowPi2Vector, 10000);
                            *fstDxyFixedWindowFile << fields[0] << "\t" << fixedWindowStart << "\t" << fixedWindowStart+10000 << "\t" << thisFixedWindowFst << "\t" << thisFixedWindowDxy << "\t" << thisFixedWindowPi1 << "\t" << thisFixedWindowPi2 << std::endl;
                            fixedWindowDxyVector.clear(); fixedWindowFstNumVector.clear(); fixedWindowFstDenomVector.clear();
                            fixedWindowHet1Vector.clear(); fixedWindowHet2Vector.clear(); fixedWindowPi1Vector.clear(); fixedWindowPi2Vector.clear();
                            fixedWindowStart= fixedWindowStart+10000;
                        }
                    } else {
                        fixedWindowStart = 0;
                    }
                    
                    
                    
                    if (opt::windowSize == 1) {
                        double Fst = FstNumerator/FstDenominator;
                        if (Fst < 0) Fst = 0;
                        *pFst << countedVariantNumber << "\t" << fields[0] + "\t" + fields[1] << "\t" << Fst << "\t" << thisSNPDxy << std::endl;
                        
                    } else if ((opt::windowSize > 0) && (countedVariantNumber % opt::windowStep == 0) && countedVariantNumber >= opt::windowSize) {
                        std::vector<double> windowFstNumerators(fstNumerators.end()-opt::windowSize, fstNumerators.end());
                        std::vector<double> windowFstDenominators(fstDenominators.end()-opt::windowSize, fstDenominators.end());
                        double windowFst = calculateFst(windowFstNumerators, windowFstDenominators); if (windowFst < 0) windowFst = 0;
                        std::vector<double> windowDxyVec(DxyVector.end()-opt::windowSize, DxyVector.end());
                        double windowDxy = vector_average(windowDxyVec);
                        if (opt::windowSize == opt::windowStep) {
                            std::vector<string> s = split(windowStartEnd, '\t');
                            if (s[0] == fields[0]) {
                                windowStartEnd = windowStartEnd + "\t" + fields[1];
                                windowEnd = atoi(fields[1].c_str());
                                double windowDxyIncNonSeg = vector_average_withRegion(windowDxyVec, windowEnd-windowStart);
                                *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowStartEnd << "\t" << windowFst << "\t" << windowDxy << "\t" << windowDxyIncNonSeg << "\t" << windowFstDenominators.size() << std::endl;
                                if (opt::regAbove > 0) {
                                    if (windowFst >= opt::regAbove && !inRegAbove) {
                                        inRegAbove = true;
                                        *regionsAboveFstFile << s[0] << "\t" << s[1] << "\t";
                                    } else if (windowFst < opt::regAbove && inRegAbove) {
                                        inRegAbove = false;
                                        *regionsAboveFstFile << s[1] << std::endl;
                                    }
                                }
                            }
                        } else {
                            *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowMiddleVariant << "\t" << windowFst << "\t" << windowDxy << "\t" << windowFstDenominators.size() << std::endl;
                        }
                        // Now calculate and output expected heterozygosities for this window
                        std::vector<double> windowHetS1Vec(set1heterozygositiesSimple.end()-opt::windowSize, set1heterozygositiesSimple.end());
                        double windowHetS1 = vector_average(windowHetS1Vec);
                        std::vector<double> windowHetS2Vec(set2heterozygositiesSimple.end()-opt::windowSize, set2heterozygositiesSimple.end());
                        double windowHetS2 = vector_average(windowHetS2Vec);
                        std::vector<double> windowHetNei1Vec(set1heterozygositiesNei.end()-opt::windowSize, set1heterozygositiesNei.end());
                        double windowHetNei1 = vector_average(windowHetNei1Vec);
                        std::vector<double> windowHetNei2Vec(set2heterozygositiesNei.end()-opt::windowSize, set2heterozygositiesNei.end());
                        double windowHetNei2 = vector_average(windowHetNei2Vec);
                        std::vector<double> windowHetPi1Vec(set1heterozygositiesPi.end()-opt::windowSize, set1heterozygositiesPi.end());
                        double windowHetPi1 = vector_average_withRegion(windowHetPi1Vec, windowEnd-windowStart);
                        std::vector<double> windowHetPi2Vec(set2heterozygositiesPi.end()-opt::windowSize, set2heterozygositiesPi.end());
                        double windowHetPi2 = vector_average_withRegion(windowHetPi2Vec, windowEnd-windowStart);
                        if (opt::windowSize == opt::windowStep) {
                            std::vector<string> s = split(windowStartEnd, '\t');
                            if (s[0] == fields[0]) {
                                *pHetSets << windowStartEnd << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << "\t" << windowHetPi1 << "\t" << windowHetPi2 << std::endl;
                                windowStartEnd = fields[0] + "\t" + fields[1];
                                windowStart = atoi(fields[1].c_str());
                            } else {
                                windowStartEnd = fields[0] + "\t0";
                                windowStart = 0;
                            }
                        } else {
                            *pHetSets << windowMiddleVariant << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << std::endl;
                            windowMiddleVariant = fields[0] + "\t" + fields[1];     // works only if STEP is half SIZE for the window
                        }
                    }
                }
            }
            if (totalVariantNumber % 100000 == 0) {
                double Fst = calculateFst(fstNumerators, fstDenominators);
                std::cerr << totalVariantNumber << " variants processed... Fst: " << Fst << std::endl;
            }
                
        }
    }
    double Fst = calculateFst(fstNumerators, fstDenominators);
    double overallHetS1 = vector_average(set1heterozygositiesSimple);
    double overallHetS2 = vector_average(set2heterozygositiesSimple);
    double overallHetNei1 = vector_average(set1heterozygositiesNei);
    double overallHetNei2 = vector_average(set2heterozygositiesNei);
    
    std::cerr << "Fst: " << Fst << std::endl;
    std::cerr << "Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2" << overallHetNei2 << std::endl;
    *pHetSets << "#Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2" << overallHetNei2 << std::endl;
}
Ejemplo n.º 23
0
// Scan the VCF named by opt::vcfFile and accumulate ABBA-BABA statistics
// (D, f_G, f_d and f_dM) for the sample sets listed in opt::setsFile.
//
// The sets file is consumed line by line: an outgroup line (only read when
// opt::bAaEqO is false), followed by the P3, P2 and P1 lines, each a
// comma-separated list of sample names.
//
// Output:
//   - "<setsFile without extension>_<opt::runName>_abbaBaba.txt": one row per
//     window of opt::windowSize used variants (window start/end, D, f_d, f_dM).
//   - std::cerr: the set composition, progress lines every
//     opt::jackKniveWindowSize used variants, and the final totals with the
//     standard errors returned by jackknive_std_err.
//
// NOTE(review): setsFile and outFile are allocated with new and never deleted
// in this function; ownership of the stream returned by createReader is not
// visible here.
// NOTE(review): ABBABABAcounts::usedVariantsCounter and most accumulators in r
// are not modified in this function except where shown; presumably the
// incrementDnumDdenom* helpers update them -- confirm.
void doAbbaBaba() {
    string line; // for reading the input files
    
    std::istream* vcfFile = createReader(opt::vcfFile.c_str());
    std::ifstream* setsFile = new std::ifstream(opt::setsFile.c_str());
    string setsFileRoot = stripExtension(opt::setsFile);
    std::ofstream* outFile = new std::ofstream(setsFileRoot+ "_" + opt::runName + "_abbaBaba.txt");
    // "<chromosome>\t<start>" of the window currently being accumulated; the end
    // coordinate is appended just before the window is written out.
    string windowStartEnd = "scaffold_0\t0";
    
    // Get the sample sets
    // Each *pos vector receives the result of locateSet() for the corresponding
    // name list once the VCF header line has been seen.
    string outgroupString; std::vector<size_t> Opos; std::vector<string> outgroup;
    if (!opt::bAaEqO) { getline(*setsFile, outgroupString); outgroup = split(outgroupString, ','); } else { outgroupString = "VCF AA field"; }
    string P3string; getline(*setsFile, P3string); std::vector<string> P3 = split(P3string, ','); std::vector<size_t> P3pos;
    string P2string; getline(*setsFile, P2string); std::vector<string> P2 = split(P2string, ','); std::vector<size_t> P2pos;
    string P1string; getline(*setsFile, P1string); std::vector<string> P1 = split(P1string, ','); std::vector<size_t> P1pos;
    // Without -f (opt::bFrequency) every set must hold exactly one individual.
    if (!opt::bFrequency && (P1.size() > 1 || P2.size() > 1 || P3.size() > 1)) {
        std::cerr << "There are more than one individual on some line of the SETS.txt file" << std::endl;
        std::cerr << "Perhaps you want to use the -f option?" << std::endl;
        exit(1);
    }
    
    // Now go through the vcf and calculate D
    int totalVariantNumber = 0;
    ABBA_BABA_Freq_allResults r;
    // Values of ABBABABAcounts::usedVariantsCounter at the last progress report
    // and at the last window written; they stop the same count from triggering
    // a report/window twice while the counter does not advance.
    int lastPrint = 0; int lastWindowVariant = 0;
    // One entry per jackknife block of opt::jackKniveWindowSize used variants.
    std::vector<double> regionDs; std::vector<double> region_f_Gs; std::vector<double> region_f_Ds; std::vector<double> region_f_DMs;
    std::vector<string> sampleNames;
    while (getline(*vcfFile, line)) {
        // "##" meta-information lines carry nothing needed here.
        if (line[0] == '#' && line[1] == '#')
            continue;
        // "#C..." is taken to be the column header line holding the sample names.
        else if (line[0] == '#' && line[1] == 'C') {
            std::vector<std::string> fields = split(line, '\t');
            if (opt::sampleNameFile.empty()) {
                for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) {
                    sampleNames.push_back(fields[i]);
                }
            } else {
                // Sample names supplied in a separate file replace the header's.
                sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile);
            }
            if (!opt::bAaEqO) { Opos = locateSet(sampleNames, outgroup); }
            P3pos = locateSet(sampleNames, P3);
            P2pos = locateSet(sampleNames, P2); P1pos = locateSet(sampleNames, P1);
            
            
            // Echo the set composition so the user can verify it.
            if (!opt::bAaEqO) { std::cerr << "Outgroup: "; print_vector_stream(outgroup, std::cerr); } else { std::cerr << "Outgroup: " << outgroupString << std::endl; }
            std::cerr << "P3: "; print_vector_stream(P3, std::cerr);
            std::cerr << "P2: "; print_vector_stream(P2, std::cerr);
            std::cerr << "P1: "; print_vector_stream(P1, std::cerr);
        } else {
            // Any other line is treated as a variant record.
            totalVariantNumber++;
            std::vector<std::string> fields = split(line, '\t');
            // fields[7] is split on ';' and its first entry compared with "INDEL";
            // NOTE(review): no check that the line has at least 8 columns.
            std::vector<std::string> info = split(fields[7], ';');
            if (info[0] != "INDEL") {
                if (!opt::bAaEqO) {
                    // An explicit outgroup set was given in the sets file.
                    ThreeSetCounts c;
                    if (opt::bNoAaO) {
                        c = getThreeSetVariantCountsAA4(fields,P1pos,P2pos,P3pos,Opos);
                        if (opt::bFrequency) {
                            incrementDnumDdenomFrequency(c, r);
                        } else {
                            incrementDnumDdenomSingleSequence(c, r);
                        }
                    } else {
                        // This FourSetCounts c shadows the ThreeSetCounts c declared above.
                        // NOTE(review): when AA matches neither REF (fields[3]) nor ALT
                        // (fields[4]) c is left as default-constructed yet still used below.
                        FourSetCounts c;
                        string AA = getAAfromInfo(info);
                        if (AA == fields[3]) {
                            c = getFourSetVariantCounts(fields,P1pos,P2pos,P3pos,Opos,"ref");
                        } else if (AA == fields[4]) {
                            c = getFourSetVariantCounts(fields,P1pos,P2pos,P3pos,Opos,"alt");
                        }
                        // Only the genome-wide Dnumerator, Ddenominator and f_d_denominator
                        // are updated in this branch; the first product in each sum uses
                        // (1-set1)*set2*set3*(1-set4) and the second set1*(1-set2)*set3*(1-set4).
                        r.Dnumerator += ((1-c.set1daAF)*c.set2daAF*c.set3daAF*(1-c.set4daAF)) - (c.set1daAF*(1-c.set2daAF)*c.set3daAF*(1-c.set4daAF));
                        r.Ddenominator += ((1-c.set1daAF)*c.set2daAF*c.set3daAF*(1-c.set4daAF)) + (c.set1daAF*(1-c.set2daAF)*c.set3daAF*(1-c.set4daAF));
                        // The f_d denominator substitutes the larger of set2daAF and set3daAF
                        // for both the set2 and set3 terms.
                        if (c.set2daAF > c.set3daAF) {
                            r.f_d_denominator += ((1-c.set1daAF)*c.set2daAF*c.set2daAF*(1-c.set4daAF)) - (c.set1daAF*(1-c.set2daAF)*c.set2daAF*(1-c.set4daAF));
                        } else {
                            r.f_d_denominator += ((1-c.set1daAF)*c.set3daAF*c.set3daAF*(1-c.set4daAF)) - (c.set1daAF*(1-c.set3daAF)*c.set3daAF*(1-c.set4daAF));
                        }
                    }
                } else {
                    // No outgroup set: polarise by the allele getAAfromInfo() extracts
                    // from the INFO column.
                    // NOTE(review): c stays default-constructed when AA matches neither
                    // REF nor ALT, and is still passed to the increment helper.
                    string AA = getAAfromInfo(info);
                    ThreeSetCounts c;
                    if (AA == fields[3]) {
                        c = getThreeSetVariantCounts(fields,P1pos,P2pos,P3pos,"ref");
                    } else if (AA == fields[4]) {
                        c = getThreeSetVariantCounts(fields,P1pos,P2pos,P3pos,"alt");
                    }
                    if (opt::bFrequency) {
                        incrementDnumDdenomFrequency(c, r);
                    } else {
                        incrementDnumDdenomSingleSequence(c, r);
                    }
                }
                // if (totalVariantNumber % 100000 == 0) { std::cerr << Dnumerator << std::endl; }
            } else {
                ABBABABAcounts::indels++;
            }
            
            // Emit a window row each time the used-variant counter reaches a new
            // multiple of opt::windowSize.
            // NOTE(review): opt::windowSize == 0 would be a modulo by zero here.
            if (ABBABABAcounts::usedVariantsCounter % opt::windowSize == 0 && ABBABABAcounts::usedVariantsCounter != lastWindowVariant) {
                std::vector<string> s = split(windowStartEnd, '\t');
                if (s[0] == fields[0]) {
                    // Still on the chromosome the window started on: close the window
                    // at this variant's position and write it out.
                    windowStartEnd = windowStartEnd + "\t" + fields[1];
                    // Diagnostic dump of the two components whenever the windowed
                    // f_dM ratio exceeds 1.
                    if ((double)r.windowDnum/r.window_f_dM_denominator > 1) {
                        std::cerr << "D num" << r.windowDnum << std::endl;
                        std::cerr << "f_dM denom" << r.window_f_dM_denominator << std::endl;
                    }
                    *outFile << windowStartEnd << "\t" << (double)r.windowDnum/r.windowDdenom << "\t" << (double)r.window_f_d_num/r.window_f_d_denominator << "\t" << (double)r.windowDnum/r.window_f_dM_denominator << std::endl;
                    windowStartEnd = fields[0] + "\t" + fields[1];
                } else {
                    // The chromosome changed inside the window: nothing is written and
                    // the next window starts at position 0 of the new chromosome.
                    windowStartEnd = fields[0] + "\t0";
                }
                // Reset the per-window accumulators in both cases.
                r.windowDnum = 0; r.windowDdenom = 0; r.window_f_d_num = 0; r.window_f_d_denominator = 0; r.window_f_dM_denominator = 0; lastWindowVariant = ABBABABAcounts::usedVariantsCounter;
            }
            
            
            // Progress report and jackknife block bookkeeping every
            // opt::jackKniveWindowSize used variants.
            if (ABBABABAcounts::usedVariantsCounter % opt::jackKniveWindowSize == 0 && ABBABABAcounts::usedVariantsCounter != lastPrint) {
            //if (totalVariantNumber % 100000 == 0) {
                // Sanity check: every variant read so far falls into exactly one of
                // these counters (only asserted in frequency mode).
                if (opt::bFrequency)
                    assert(ABBABABAcounts::XXAA + ABBABABAcounts::AABA + ABBABABAcounts::BBBA + ABBABABAcounts::indels + ABBABABAcounts::noDafInfo + ABBABABAcounts::usedVariantsCounter + ABBABABAcounts::p1p2 == totalVariantNumber);
                // Standard errors are only reported once more than six blocks' worth
                // of variants have been used.
                if (ABBABABAcounts::usedVariantsCounter > (6 * opt::jackKniveWindowSize)) {
                    double Dstd_err = jackknive_std_err(regionDs); double f_Gstd_err = jackknive_std_err(region_f_Gs);
                    double f_Dstd_err = jackknive_std_err(region_f_Ds); double f_DMstd_err = jackknive_std_err(region_f_DMs);
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tD=" << (double)r.Dnumerator/r.Ddenominator << " std_err=" << Dstd_err << std::endl;
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_G=" << (double)r.f_G_num/r.f_G_denom << " std_err=" << f_Gstd_err << std::endl;
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_d=" << (double)r.f_d_num/r.f_d_denominator << " std_err=" << f_Dstd_err << std::endl;
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_dM=" << (double)r.Dnumerator/r.f_dM_denominator << " std_err=" << f_DMstd_err << std::endl;
                } else {
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tD=" << (double)r.Dnumerator/r.Ddenominator << std::endl;
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_G=" << (double)r.f_G_num/r.f_G_denom << std::endl;
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_d=" << (double)r.f_d_num/r.f_d_denominator << std::endl;
                    std::cerr << totalVariantNumber << " variants processed. " << ABBABABAcounts::usedVariantsCounter << " variants used. \tf_dM=" << (double)r.Dnumerator/r.f_dM_denominator << std::endl;
                }
                std::cerr << "Last used "<< opt::jackKniveWindowSize << " variants \t\t\t\tD=" << r.lastVarsDnum/r.lastVarsDdenom << std::endl;
                // std::cerr << "AAAA=" << XXAA << "; AABA=" << AABA << "; BBBA=" << BBBA << std::endl;
                // Store this block's statistics for the jackknife, then reset the
                // per-block accumulators.
                regionDs.push_back(r.lastVarsDnum/r.lastVarsDdenom); region_f_Gs.push_back(r.lastVarsF_G_num/r.lastVarsF_G_denom);
                region_f_Ds.push_back(r.lastVarsF_d_num/r.lastVarsF_d_denom); region_f_DMs.push_back(r.lastVarsDnum/r.lastVarsF_dM_denom);
                r.lastVarsDnum = 0; r.lastVarsDdenom = 0; r.lastVarsF_d_num = 0; r.lastVarsF_d_denom = 0; r.lastVarsF_G_num = 0; r.lastVarsF_G_denom = 0; r.lastVarsF_dM_denom = 0;
                lastPrint = ABBABABAcounts::usedVariantsCounter;
            }
        }
    }
    
    // Final genome-wide values with standard errors over all completed blocks.
    double Dstd_err = jackknive_std_err(regionDs); double f_Gstd_err = jackknive_std_err(region_f_Gs);
    double f_Dstd_err = jackknive_std_err(region_f_Ds); double f_DMstd_err = jackknive_std_err(region_f_DMs);
    std::cerr << std::endl;
    std::cerr << totalVariantNumber << " variants processed. D=" << (double)r.Dnumerator/r.Ddenominator << " std_err=" << Dstd_err << std::endl;
    std::cerr << totalVariantNumber << " variants processed. f_G=" << (double)r.f_G_num/r.f_G_denom << " std_err=" << f_Gstd_err << std::endl;
    std::cerr << totalVariantNumber << " variants processed. f_d=" << (double)r.f_d_num/r.f_d_denominator << " std_err=" << f_Dstd_err << std::endl;
    std::cerr << totalVariantNumber << " variants processed. f_dM=" << (double)r.Dnumerator/r.f_dM_denominator << " std_err=" << f_DMstd_err << std::endl;
    
}
int polymorphicMain(int argc, char** argv) {
    parsePolymorphicOptions(argc, argv);
    string fileRoot = stripExtension(opt::sampleSets);
    
    std::cerr << "Filtering a VCF file: " << opt::vcfFile << std::endl;
    std::cerr << "so that only sites that are ploymorphic in sample sets defined in: " << opt::sampleSets << " are output" << std::endl;
    
    // Open connection to read from the vcf file
    std::ifstream* vcfFile = new std::ifstream(opt::vcfFile.c_str());
    std::ifstream* setsFile = new std::ifstream(opt::sampleSets.c_str());
    string PolymorphicFileName = fileRoot + "_" + opt::runName + "_polymorphic.vcf";
    std::ofstream* pPolymorphicVCF = new std::ofstream(PolymorphicFileName.c_str());
    
    
    std::vector<std::vector<string> > sets;
    string setString;
    while (getline(*setsFile, setString)) {
        std::vector<string> thisSet = split(setString, ',');
        std::sort(thisSet.begin(),thisSet.end());
        sets.push_back(thisSet);
    }
    
    int numChromosomes;
    int totalVariantNumber = 0;
    string line;
    std::vector<string> sampleNames;
    std::vector<string> fields;
    std::vector<std::vector<size_t> > setsLoci;
    while (getline(*vcfFile, line)) {
        if (line[0] == '#' && line[1] == '#') {
            *pPolymorphicVCF << line << std::endl;
        } else if (line[0] == '#' && line[1] == 'C') {
            std::vector<std::string> fields = split(line, '\t');
            const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS;
            numChromosomes = (int)numSamples * 2;
            // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl;
            
            if (opt::sampleNameFile.empty()) {
                for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) {
                    sampleNames.push_back(fields[i]);
                }
            } else {
                sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile);
            }
            
            for (std::vector<std::vector<string> >::size_type i = 0; i != sets.size(); i++) {
                std::vector<size_t> thisSetLoci = locateSet(sampleNames, sets[i]);
                setsLoci.push_back(thisSetLoci);
                std::cerr << "Set" << i << " loci: " << std::endl;
                print_vector_stream(thisSetLoci, std::cerr);
            }
            
            *pPolymorphicVCF << line << std::endl;
        } else {
            totalVariantNumber++;
            
            std::vector<std::string> fields = split(line, '\t');
            
            bool polymorphicInSets = true;
            
            for (std::vector<std::vector<size_t> >::size_type i = 0; i != setsLoci.size(); i++) {
                if(!findIfPolymorhicInSet(fields, setsLoci[i])) {
                    polymorphicInSets = false;
                }
            }
            
            if (polymorphicInSets)
                *pPolymorphicVCF << line << std::endl;
                
        }
    }
    
    
    
    return 0;
}
Ejemplo n.º 25
0
void getFstFromMs() {
    std::cerr << "Calculating Fst using variants from: " << opt::msFile << std::endl;
    std::cerr << "and outputting chi-sq test p-vals < " << opt::msPvalCutoff << std::endl;
    
    std::ifstream* msFile = new std::ifstream(opt::msFile.c_str());
    string fileRoot = stripExtension(opt::msFile);
    string PvalFileName = fileRoot + "_" + opt::runName + "_pvals.txt";
    std::ofstream* pValFile;
    if (opt::msPvalCutoff > 0) {
        pValFile = new std::ofstream(PvalFileName.c_str());
        *pValFile << "Fisher p-val" << "\t" << "chi-sq pval" << "\t" << "set1Alt" << "\t" << "set1Ref" << "\t" << "set2Alt" << "\t" << "set2Ref" << "\t" << "Fst" << std::endl;
    }
    
    
    std::vector<int> set1_loci;
    std::vector<int> set2_loci;
    srand((int)time(NULL));
    if (opt::msSet1FstSample == 0) {
        opt::msSet1FstSample = opt::msSet1Size;
        for (int i = 0; i != opt::msSet1FstSample; i++) {
            set1_loci.push_back(i);
        }
    } else { // Randomly sample individuals from population 1 for Fst calculation
        for (int i = 0; i != opt::msSet1FstSample; i++) {
            int rand_sample = (rand()%opt::msSet1Size);
            while (std::find(set1_loci.begin(),set1_loci.end(),rand_sample) != set1_loci.end()) {
                rand_sample = (rand()%opt::msSet1Size);
            }
            set1_loci.push_back(rand_sample);
        }
    }
    // Do the same for set2
    if (opt::msSet2FstSample == 0) {
        opt::msSet2FstSample = opt::msSet2Size;
        for (int i = 0; i != opt::msSet2FstSample; i++) {
            set2_loci.push_back(i+opt::msSet1Size);
        }
    } else { // Randomly sample individuals from population 2 for Fst calculation
        for (int i = 0; i != opt::msSet2FstSample; i++) {
            int rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size;
            while (std::find(set2_loci.begin(),set2_loci.end(),rand_sample) != set2_loci.end()) {
                rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size;
            }
            set2_loci.push_back(rand_sample);
        }
    }

    std::cerr << "Selected population 1 individuals: "; print_vector_stream(set1_loci, std::cerr);
    std::cerr << "Selected population 2 individuals: "; print_vector_stream(set2_loci, std::cerr);
    
    if (opt::msSet1Size != opt::msSet1FstSample || opt::msSet2Size != opt::msSet2FstSample) {
        std::cerr << "Warning: the Fst column is going to contain '-1' values where the site is not a segregating site in the sampled individuals for Fst calcultation" << std::endl;
    }
    
    std::vector<double> fstNumerators; fstNumerators.reserve(500000000);
    std::vector<double> fstDenominators; fstDenominators.reserve(500000000);
    
    
    string line;
    int numFixedSites = 0;
    int numNearlyFixedSites = 0;
    std::vector<double> nullForChisq;
    std::vector<int> moreSet1;
    std::vector<int> lessSet1;
    std::vector<int> moreSet2;
    std::vector<int> lessSet2;
    SetCounts counts;
    while (getline(*msFile, line)) {
        counts.reset();
        double thisFst = -1;
        for (std::vector<int>::iterator it = set1_loci.begin(); it != set1_loci.end(); it++) {
            // std::cerr << line[*it] << std::endl;
            if (line[*it] == '1') {
                counts.set1Count++;
            }
        }
        for (std::vector<int>::iterator it = set2_loci.begin(); it != set2_loci.end(); it++) {
            if (line[*it] == '1') {
                counts.set2Count++;
            }
        }
        
        //std::cerr << "counts.set1Count" << counts.set1Count << "\t" << "counts.set2Count" << counts.set2Count << std::endl;
        
        if (counts.set1Count > 0 || counts.set2Count > 0) {
            double FstNum = calculateFstNumerator(counts, opt::msSet1FstSample, opt::msSet2FstSample);
            double FstDenom = calculateFstDenominator(counts, opt::msSet1FstSample, opt::msSet2FstSample);
            thisFst = FstNum/FstDenom; if (thisFst < 0) thisFst = 0;
            fstNumerators.push_back(FstNum);
            fstDenominators.push_back(FstDenom);
            
        }
        
        if ((counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 0)) {
            numFixedSites++;
        }
        
        if ((counts.set1Count == 1 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample-1) ||
            (counts.set1Count == opt::msSet1FstSample-1 && counts.set2Count == 0) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 1)) {
            numNearlyFixedSites++;
        }
        
        
        int set1WithoutVariant = opt::msSet1FstSample-counts.set1Count;
        int set2WithoutVariant = opt::msSet2FstSample-counts.set2Count;
        
        if (counts.set1Count >= set1WithoutVariant) {
            moreSet1.push_back(counts.set1Count);
            lessSet1.push_back(set1WithoutVariant);
            moreSet2.push_back(counts.set2Count);
            lessSet2.push_back(set2WithoutVariant);
        } else {
            moreSet1.push_back(set1WithoutVariant);
            lessSet1.push_back(counts.set1Count);
            moreSet2.push_back(set2WithoutVariant);
            lessSet2.push_back(counts.set2Count);
        }
        
       // std::cerr << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << std::endl;
        if ((counts.set1Count != 0 || counts.set2Count != 0) && (set1WithoutVariant != 0 || set2WithoutVariant != 0)) {
            if (opt::msSet1FstSample + opt::msSet2FstSample <= 60) {
                counts.fisher_pval = fisher_exact(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
   //             std::cerr << "Fisher: " << counts.fisher_pval << std::endl;
                counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
            } else {
                counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
            }
        }
        
        if (counts.fisher_pval < opt::msPvalCutoff || counts.chi_sq_pval < opt::msPvalCutoff) {
            *pValFile << counts.fisher_pval << "\t" << counts.chi_sq_pval << "\t" << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << "\t" << thisFst << std::endl;
        }
    }
    
    double Fst = calculateFst(fstNumerators, fstDenominators);
    std::cerr << "Fst: " << Fst << std::endl;
    std::cerr << "Fixed sites: " << numFixedSites << std::endl;
    std::cerr << "Tier2 sites: " << numNearlyFixedSites << std::endl;
    std::cerr << "Null ChiSq 1:" << vector_average(moreSet1)/opt::msSet1FstSample << "\t" << vector_average(lessSet1)/opt::msSet1FstSample << std::endl;
    std::cerr << "Null ChiSq 2:" << vector_average(moreSet2)/opt::msSet2FstSample << "\t" << vector_average(lessSet2)/opt::msSet2FstSample << std::endl;
}
Ejemplo n.º 26
0
void summariseEigensoft() {
    std::ifstream* eigenFile = new std::ifstream(opt::eigensoftFile.c_str());
    string fileRoot = stripExtension(opt::eigensoftFile);
    string FstResultsFileName = fileRoot + "_" + opt::runName + "_fst_matrix.forR";
    std::ofstream* pFst = new std::ofstream(FstResultsFileName.c_str());
    std::vector<std::vector<std::string> > fst_matrix;
    
    string line;
    getline(*eigenFile, line); // Get the first description line
    short type;
    if (line == "##") {
        type = 1;
    } else {
        type = 2;
    }
    std::cerr << "It is type: " << type << std::endl;
    if (type == 1) {
        getline(*eigenFile, line);
        std::vector<std::string> fields = split(line, '\t');
        std::vector<std::string> this_indiv_fst;
        std::vector<std::string> all_indiv;
        string this_indiv = fields[0];
        this_indiv_fst.push_back(fields[2]);
        while (getline(*eigenFile, line)) {
            fields = split(line, '\t');
            std::cerr << "Indiv: " << fields[0] << std::endl;
            if (this_indiv == fields[0]) {
                this_indiv_fst.push_back(fields[2]);
            } else {
                fst_matrix.push_back(this_indiv_fst);
                all_indiv.push_back(this_indiv);
                this_indiv = fields[0];
                this_indiv_fst.clear();
                this_indiv_fst.push_back(fields[2]);
            }
        }
        all_indiv.push_back(this_indiv);
        fst_matrix.push_back(this_indiv_fst);
        this_indiv_fst.clear(); this_indiv_fst.push_back("0"); all_indiv.push_back(fields[1]);
        fst_matrix.push_back(this_indiv_fst);
        
        for (std::vector<std::vector<std::string> >::size_type i = 0; i != fst_matrix.size(); i++) {
            std::reverse(fst_matrix[i].begin(), fst_matrix[i].end());
            fst_matrix[i].insert(fst_matrix[i].end(), "0");
            while (fst_matrix[i].size() != fst_matrix[0].size()) {
                fst_matrix[i].insert(fst_matrix[i].end(), "0");
            }
        }
        std::reverse(fst_matrix.begin(), fst_matrix.end());
        std::reverse(all_indiv.begin(), all_indiv.end());
        
        print_vector(all_indiv, *pFst);
        print_matrix(fst_matrix, *pFst);
    } else if (type == 2) {
        std::cerr << "type2" << std::endl;
        std::vector<std::string> fields = split(line, '\t');
        std::vector<std::string> all_indiv(fields.begin()+1,fields.end());
        getline(*eigenFile, line); getline(*eigenFile, line);
        std::vector<std::string> this_indiv_fst;
        while (getline(*eigenFile, line)) {
            fields = split(line, '\t');
            std::copy(fields.begin()+1,fields.end(),std::back_inserter(this_indiv_fst));
            for (std::vector<std::string>::size_type i = 0; i != this_indiv_fst.size(); i++) {
                double fst = convertToDouble(this_indiv_fst[i]) / 1000;
                this_indiv_fst[i] = numToString(fst);
            }
            fst_matrix.push_back(this_indiv_fst);
            this_indiv_fst.clear();
        }
        print_vector(all_indiv, *pFst);
        print_matrix(fst_matrix, *pFst);
    }
}