Ejemplo n.º 1
0
/**
 * Program entry point.
 *
 * Parses the command line into globalOpts, validates the required options
 * (genotype format, input file, target individuals, region), then reads every
 * variant in the requested region.  Sites with more than two alleles, or whose
 * target-population allele frequency / allele counts are too low, are skipped;
 * the remaining sites are appended to the per-individual haplotype strings,
 * which are finally handed to calc().
 *
 * Terminates the process with status 1 on any validation or input failure and
 * with status 0 on success.
 */
int main(int argc, char** argv) {

  globalOpts.threads = 1   ;
  globalOpts.af      = 0.05;

  // zero based indices (VCF sample columns) of the target individuals
  map<int, int> it;

  const struct option longopts[] =
    {
      {"version"   , 0, 0, 'v'},
      {"help"      , 0, 0, 'h'},
      {"file"      , 1, 0, 'f'},
      {"target"    , 1, 0, 't'},
      {"region"    , 1, 0, 'r'},
      {"gen"       , 1, 0, 'g'},
      {"type"      , 1, 0, 'y'},
      {"threads"   , 1, 0, 'x'},
      {"af"        , 1, 0, 'a'},
      {"pos"       , 1, 0, 'p'},
      {0,0,0,0}
    };

  int findex;
  int iarg = 0;

  // getopt_long returns -1 once all options are consumed; that value (and any
  // unrecognised option) falls through to the default label below.
  while(iarg != -1)
    {
      iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:p:hv", longopts, &findex);

      switch (iarg)
        {
        case 'p':
          {
            globalOpts.pos = atoi(optarg);
            break;
          }
        case 'a':
          {
            globalOpts.af = atof(optarg);
            break;
          }
        case 'x':
          {
            globalOpts.threads = atoi(optarg);
            break;
          }
        case 'g':
          {
            globalOpts.geneticMapFile = optarg;
            break;
          }
        case 'h':
          {
            printHelp();
            break;
          }
        case 'v':
          {
            printVersion();
            break;
          }
        case 'y':
          {
            globalOpts.type = optarg;
            break;
          }
        case 't':
          {
            loadIndices(it, optarg);
            cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
            cerr << "INFO: target ids: " << optarg << endl;
            break;
          }
        case 'f':
          {
            cerr << "INFO: file: " << optarg  <<  endl;
            globalOpts.filename = optarg;
            break;
          }
        case 'r':
          {
            cerr << "INFO: set seqid region to : " << optarg << endl;
            globalOpts.region = optarg;
            break;
          }
        default:
          break;
        }
    }

#if defined HAS_OPENMP
  omp_set_num_threads(globalOpts.threads);
#endif

  // genotype formats accepted by the --type option
  map<string, int> okayGenotypeLikelihoods;
  okayGenotypeLikelihoods["PL"] = 1;
  okayGenotypeLikelihoods["GL"] = 1;
  okayGenotypeLikelihoods["GP"] = 1;
  okayGenotypeLikelihoods["GT"] = 1;

  if(globalOpts.type.empty()){
    cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
    printHelp();
    exit(1);
  }
  if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){
    cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
    printHelp();
    exit(1);
  }

  if(globalOpts.filename.empty()){
    cerr << "FATAL: did not specify a file" << endl;
    printHelp();
    exit(1);
  }

  if(it.size() < 2){
    cerr << "FATAL: target option is required -- or -- less than two individuals in target\n";
    printHelp();
    exit(1);
  }

  // using vcflib; thanks Erik

  VariantCallFile variantFile;

  variantFile.open(globalOpts.filename);

  // Verify the file before it is used; this check used to run only after
  // setRegion() had already operated on the (possibly unopened) file.
  if (!variantFile.is_open()) {
    cerr << "FATAL: unable to open file: " << globalOpts.filename << endl;
    exit(1);
  }

  if(globalOpts.region.empty()){
    cerr << "FATAL: region required" << endl;
    exit(1);
  }
  if(! variantFile.setRegion(globalOpts.region)){
    cerr <<"FATAL: unable to set region" << endl;
    exit(1);
  }

  Variant var( variantFile );
  vector<int> target_h, background_h;

  vector<string> samples = variantFile.sampleNames;
  int nsamples = samples.size();

  // target_h[k] is the haplotype row assigned to the k-th target individual,
  // numbered in VCF column order.
  int haplotypeRow = 0;
  for(int column = 0; column < nsamples; column++){
    if(it.find(column) != it.end()){
      target_h.push_back(haplotypeRow);
      haplotypeRow++;
    }
  }

  vector<long int> positions;

  vector<double> afs;

  // two haplotype strings per target individual; released by clearHaplotypes()
  string **haplotypes = new string*[target_h.size()];
  for (size_t i = 0; i < target_h.size(); i++) {
    haplotypes[i] = new string[2];
  }

  while (variantFile.getNextVariant(var)) {

    globalOpts.seqid = var.sequenceName;

    if(!var.isPhased()){
      cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
      exit(1);
    }

    // only biallelic sites are used
    if(var.alleles.size() > 2){
      continue;
    }

    // per-sample FORMAT data of the target individuals, in VCF column order
    vector < map< string, vector<string> > > target;

    for(int nsamp = 0; nsamp < nsamples; nsamp++){
      if(it.find(nsamp) != it.end()){
        target.push_back(var.samples[ samples[nsamp] ]);
      }
    }

    // globalOpts.type was validated above, so exactly one branch is taken
    genotype * populationTarget = NULL;

    if(globalOpts.type == "PL"){
      populationTarget     = new pl();
    }
    else if(globalOpts.type == "GL"){
      populationTarget     = new gl();
    }
    else if(globalOpts.type == "GP"){
      populationTarget     = new gp();
    }
    else if(globalOpts.type == "GT"){
      populationTarget     = new gt();
    }

    populationTarget->loadPop(target, var.sequenceName, var.position);

    if(populationTarget->af <= globalOpts.af
       || populationTarget->nref < 2
       || populationTarget->nalt < 2){
      delete populationTarget;
      continue;
    }
    positions.push_back(var.position);
    afs.push_back(populationTarget->af);
    loadPhased(haplotypes, populationTarget, populationTarget->gts.size());

    // Release the object before dropping the pointer; the previous order
    // (null first, then delete) leaked one object per accepted variant.
    delete populationTarget;
    populationTarget = NULL;
  }

  if(!globalOpts.geneticMapFile.empty()){
    if(positions.empty()){
      // front()/back() on an empty vector is undefined behaviour
      cerr << "WARNING: no variants passed the filters; genetic map not loaded" << endl;
    }
    else{
      cerr << "INFO: loading genetics map" << endl;
      loadGeneticMap(positions.front(), positions.back());
      cerr << "INFO: finished loading genetics map" << endl;
    }
  }

  calc(haplotypes, target_h.size(), afs, positions,
       target_h, background_h, globalOpts.seqid);
  clearHaplotypes(haplotypes, target_h.size());

  exit(0);

}
Ejemplo n.º 2
0
/**
 * Entry point of the VCF filtering tool.
 *
 * Parses the command line, opens the VCF (the single positional argument, or
 * standard input when none is given), records the filter specification in the
 * header, and then streams the variants to standard output:
 *   - genotype filters (-g) remove failing genotypes from each record;
 *   - info filters (-f) either filter/tag whole sites (-s) or individual
 *     alternate alleles, optionally inverted (-v) or OR-combined (-o);
 *   - with a tag (-t) records are annotated instead of dropped, and with an
 *     allele tag (-a) the annotation is written per allele into INFO.
 *
 * Returns 0 on success and 1 when the input cannot be opened.
 */
int main(int argc, char** argv) {

    int c;
    bool invert = false;
    bool logicalOr = false;
    bool filterSites = false;
    vector<string> infofilterStrs;
    vector<VariantFilter> infofilters;
    vector<string> genofilterStrs;
    vector<VariantFilter> genofilters;
    string tag = "";
    string filterSpec;
    string alleleTag;
    vector<string> regions;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"filter-sites", no_argument, 0, 's'},
            {"info-filter",  required_argument, 0, 'f'},
            {"genotype-filter",  required_argument, 0, 'g'},
            {"tag", required_argument, 0, 't'},
            {"allele-tag", required_argument, 0, 'a'},
            {"invert", no_argument, 0, 'v'},
            {"or", no_argument, 0, 'o'},
            {"region", required_argument, 0, 'r'},
            //{"length",  no_argument, &printLength, true},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hvsof:g:t:r:a:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            filterSpec += " " + string(optarg);
            infofilterStrs.push_back(string(optarg));
            break;

        case 's':
            filterSites = true;
            break;

        case 'a':
            alleleTag = optarg;
            break;

        case 'g':
            filterSpec += " genotypes filtered with: " + string(optarg);
            genofilterStrs.push_back(string(optarg));
            break;

        case 't':
            tag = optarg;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case 'v':
            invert = true;
            break;

        case 'o':
            logicalOr = true;
            break;

        case 'r':
            regions.push_back(optarg);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // Strip the leading " ".  filterSpec is empty when neither -f nor -g was
    // given, and substr(1) on an empty string throws std::out_of_range.
    if (!filterSpec.empty()) {
        filterSpec = filterSpec.substr(1);
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) {
        infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes));
    }

    for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) {
        genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes));
    }

    // The header rewrite below clears filterSpec once the ##filter line has
    // been emitted; keep a copy so the allele-tag description can still name
    // the filters.
    const string filterDescription = filterSpec;

    // Re-assemble the header, inserting the ##filter line before the first
    // line mentioning INFO (or before the last line if there is none).
    vector<string> headerlines = split(variantFile.header, "\n");
    variantFile.header.clear();
    for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) {
        const bool lastLine = (l + 1 == headerlines.end());
        if (!filterSpec.empty() && (l->find("INFO") != string::npos || lastLine)) {
            variantFile.header += "##filter=\"" + filterSpec + "\"\n";
            filterSpec.clear();
        }
        variantFile.header += *l + (lastLine ? "" : "\n");
    }

    if (!alleleTag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tag + " if this allele passes the filters, '.' if not, filters are: " + filterDescription + ".\">");
    }

    cout << variantFile.header << endl;

    /*
    if (genofilters.empty() && tag.empty()) {
        variantFile.parseSamples = false;
    }
    */

    Variant var(variantFile);

    vector<string>::iterator regionItr = regions.begin();

    // Regions can only be applied to a named input file.  When they are not in
    // use the body must run exactly once; looping on regionItr alone never
    // terminated for stdin input combined with -r, because the iterator was
    // never advanced.
    const bool useRegions = !inputFilename.empty() && !regions.empty();

    do {

        if (useRegions) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {
            if (!genofilters.empty()) {
                for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) {
                    f->removeFilteredGenotypes(var);
                }
            }

            if (infofilters.empty()) {
                // No record-level filtering: echo the raw line unless genotype
                // filters may have modified the record.
                if (genofilters.empty()) {
                    cout << variantFile.line << endl;
                } else {
                    cout << var << endl;
                }
                continue;
            }

            if (filterSites) {
                bool passes = passesFilters(var, infofilters, logicalOr);
                if (invert) {
                    passes = !passes;
                }
                if (passes && !tag.empty()) {
                    if (alleleTag.empty()) {
                        var.addFilter(tag);
                    } else {
                        var.info[alleleTag].clear();
                        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                            var.info[alleleTag].push_back(tag);
                        }
                    }
                }
                // Passing sites are always printed; failing sites are printed
                // (untagged) only when a tag was requested.
                if (passes || !tag.empty()) {
                    cout << var << endl;
                }
            } else {
                // Evaluate every alternate allele on its own.
                vector<string> failingAlts;
                vector<string> passingAlts;
                vector<bool> passes;
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    if (!passesFilters(var, infofilters, logicalOr, *a)) {
                        failingAlts.push_back(*a);
                        passes.push_back(false);
                    } else {
                        passingAlts.push_back(*a);
                        passes.push_back(true);
                    }
                }
                if (tag.empty()) { // if there is no specified tag, just remove the failing alts
                    // records whose alternates all fail are dropped entirely
                    if (failingAlts.size() < var.alt.size()) {
                        for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) {
                            var.removeAlt(*a);
                        }
                        cout << var << endl;
                    }
                } else { // otherwise, apply the tag
                    if (alleleTag.empty()) {
                        if (!passingAlts.empty()) {
                            var.addFilter(tag);
                        }
                    } else {
                        // one entry per alternate: the tag if it passed, "." if not
                        var.info[alleleTag].clear();
                        for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) {
                            if (*p) {
                                var.info[alleleTag].push_back(tag);
                            } else {
                                var.info[alleleTag].push_back(".");
                            }
                        }
                    }
                    cout << var << endl;
                }
            }
        }

    } while (useRegions && regionItr != regions.end());

    return 0;

}
Ejemplo n.º 3
0
int main(int argc, char** argv) {

    vector<string> regions;

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"region", required_argument, 0, 'r'},
            //{"length",  no_argument, &printLength, true},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hr:",
                         long_options, &option_index);

      /* Detect the end of the options. */
          if (c == -1)
            break;
 
          switch (c)
            {
            case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
              break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
              printf (" with arg %s", optarg);
            printf ("\n");
            break;

          case 'h':
            printSummary(argv);
            exit(0);
            break;

          case 'r':
            regions.push_back(optarg);
            break;
          
          default:
            abort ();
          }
      }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant var(variantFile);

    vector<string>::iterator regionItr = regions.begin();

    int variantAlleles = 0;
    int variantSites = 0;
    int snps = 0;
    int transitions = 0;
    int transversions = 0;
    int totalinsertions = 0;
    int totaldeletions = 0;
    int insertedbases = 0;
    int deletedbases = 0;
    int totalmnps = 0;
    int totalcomplex = 0;
    map<int, int> insertions;
    map<int, int> deletions;
    map<int, int> mnps;
    map<int, int> complexsubs;

    do {

        if (!inputFilename.empty() && !regions.empty()) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {
            ++variantSites;
            map<string, vector<VariantAllele> > alternates = var.parsedAlternates();
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                ++variantAlleles;
                string& alternate = *a;
                if (var.ref.size() == alternate.size()) {
                    if (var.ref.size() == 1) {
                        ++snps;
                        if (isTransition(var.ref, alternate)) {
                            ++transitions;
                        } else {
                            ++transversions;
                        }
                    } else {
                        ++totalmnps;
                        if (alternates[alternate].size() > 1) {
                        } else {
                            VariantAllele& va = alternates[alternate].front();
                            ++mnps[va.alt.size()]; // not entirely correct
                        }
                    }
                } else if (var.ref.size() > alternate.size()) {
                    int diff = var.ref.size() - alternate.size();
                    deletedbases += diff;
                    if (alternates[alternate].size() > 1) {
                        ++totalcomplex;
                        ++complexsubs[-diff];
                    } else {
                        ++totaldeletions;
                        ++deletions[diff];
                    }
                } else {
                    int diff = alternate.size() - var.ref.size();
                    insertedbases += diff;
                    if (alternates[alternate].size() > 1) {
                        ++totalcomplex;
                        ++complexsubs[diff];
                    } else {
                        ++totalinsertions;
                        ++insertions[diff];
                    }
                }
            }
        }

    } while (regionItr != regions.end());

    // find the maximum indel size
    int maxindel = 0;
    for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }
    for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }

    // and maximum mnp
    int maxmnp = 0;
    for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
        if (i->first > maxmnp) {
            maxmnp = i->first;
        }
    }

    // now print the results

    cout << "total variant sites:\t" << variantSites << endl
         << "total variant alleles:\t" << variantAlleles << endl
         << endl
         << "snps:\t" << snps << endl
         << "indels:\t" << totalinsertions + totaldeletions << endl
         << "mnps:\t" << totalmnps << endl
         << "complex:\t" << totalcomplex << endl
         << endl
         << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
         << endl
         << "ins/del length frequency distribution" << endl
         << "length\tins\tdel\tins/del" << endl;
    for (int i = 1; i <= maxindel; ++i) {
        int ins = insertions[i];
        int del = deletions[i];
        cout << i << "\t"
             << (ins > 0 ? convert(ins) : "" ) << "\t"
             << (del > 0 ? convert(del) : "") << "\t"
             << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
             << endl;
    }
    cout << endl
         << "insertion alleles / deletion alleles:\t" << (double) totalinsertions / (double) totaldeletions << endl
         << "inserted bases / deleted bases:\t" << (double) insertedbases / (double) deletedbases << endl
         << endl
         << "mnp length frequency distribution" << endl
         << "length\tcount" << endl;
    for (int i = 2; i <= maxmnp; ++i) {
        int mnp = mnps[i];
        cout << i << "\t"
             << (mnp > 0 ? convert(mnp) : "")
             << endl;
    }
    cout << endl;

    cout << "complex event frequency distribution" << endl
         << "length\tcount" << endl;
    for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
        cout << i->first << "\t" << i->second << endl;
    }

    return 0;

}
Ejemplo n.º 4
0
int main(int argc, char** argv) {

  // set the random seed for MCMC

  srand((unsigned)time(NULL));

  // the filename

  string filename = "NA";

  // set region to scaffold

  string region = "NA"; 

  // using vcflib; thanks to Erik Garrison 

  VariantCallFile variantFile;

  // zero based index for the target and background indivudals 
  
  map<int, int> it, ib;
  
  // deltaaf is the difference of allele frequency we bother to look at 

  // ancestral state is set to zero by default

  string mut = "1";

  int counts = 0;
  
  // phased 

  int phased = 0;

    const struct option longopts[] = 
      {
	{"version"   , 0, 0, 'v'},
	{"help"      , 0, 0, 'h'},
        {"file"      , 1, 0, 'f'},
	{"target"    , 1, 0, 't'},
	{"background", 1, 0, 'b'},
	{"deltaaf"   , 1, 0, 'd'},
	{"region"    , 1, 0, 'r'},
	{"mutation"  , 1, 0, 'm'},
	{"phased"    , 1, 0, 'p'},
	{0,0,0,0}
      };

    int findex;
    int iarg=0;

    while(iarg != -1)
      {
	iarg = getopt_long(argc, argv, "p:m:r:d:t:b:f:hv", longopts, &findex);
	
	switch (iarg)
	  {
	  case 'h':
	    cerr << endl << endl;
	    cerr << "INFO: help" << endl;
	    cerr << "INFO: description:" << endl;
            cerr << "     gl-XPEHH estimates haplotype decay between the target and background populations.  SNVs are integrated                           " << endl;
	    cerr << "     until EHH in the target and background is less than 0.05. The score is the itegrated EHH (target) / integrated EHH (background). " << endl;
	    cerr << "     gl-XPEHH does NOT integrate over genetic distance, as genetic maps are not availible for most non-model organisms. 		   " << endl;
	    cerr << "     gl-XPEHH phases genotypes, imuputes missing genotypes, and changes poor quality genotypes. Phasing is done in a sliding window   " << endl;
	    cerr << "     with a stochastic search, therefore, every time gl-XPEHH is run it will generate slightly different results.                     " << endl;

	    cerr << "Output : 4 columns :     "    << endl;
	    cerr << "     1. seqid            "    << endl;
	    cerr << "     2. position         "    << endl;
	    cerr << "     3. xp-ehh           "    << endl;
	    cerr << "     4. iHS              "    << endl  << endl;

	    cerr << "INFO: gl-XPEHH  --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --ancestral 0        " << endl;
	    cerr << endl;
	    cerr << "INFO: required: r,region     -- a genomice range to calculate gl-XPEHH on in the format : \"seqid:start-end]\" or \"seqid\" " << endl;
	    cerr << "INFO: required: t,target     -- a zero base comma seperated list of target individuals corrisponding to VCF columns        " << endl;
	    cerr << "INFO: required: b,background -- a zero base comma seperated list of background individuals corrisponding to VCF columns    " << endl;
	    cerr << "INFO: required: f,file a     -- proper formatted VCF.  the FORMAT field MUST contain \"PL\" if option phased == 0           " << endl; 
	    cerr << "INFO: optional: m,mutation   -- which state is derived in vcf [0,1] default is 1                                            " << endl;
	    cerr << "INFO: optional: p,phased     -- phasing flag [0,1] 0 = phase vcf, 1 = vcf is already phased                                 " << endl;
	    cerr << endl; 
	    cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] " << endl;
	    cerr << endl << endl;
	    return 0;
	  case 'v':
	    cerr << endl << endl;
	    cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] "  << endl;
	    return 0;
	  case 'p':
	    phased = atoi(optarg);
	    cerr << "INFO: setting phase to: " << phased << endl;
	    break;
	  case 'm':
	    mut = optarg;
	    cerr << "INFO: derived state set to " << mut << endl;
	    break;
	  case 't':
	    loadIndices(it, optarg);
	    cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
	    cerr << "INFO: target ids: " << optarg << endl;
	    break;
	  case 'b':
	    loadIndices(ib, optarg);
	    cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
	    cerr << "INFO: background ids: " << optarg << endl;
	    break;
	  case 'f':
	    cerr << "INFO: file: " << optarg  <<  endl;
	    filename = optarg;
	    break;
	  case 'r':
            cerr << "INFO: set seqid region to : " << optarg << endl;
	    region = optarg; 
	    break;
	  default:
	    break;
	  }

      }

    if(filename == "NA"){
      cerr << "FATAL: did not specify a file" << endl;
      cerr << "INFO: please use gl-XPEHH --help" << endl;
      return(1);
    }


    variantFile.open(filename);
    
    if(region == "NA"){
      cerr << "FATAL: did not specify a region"  << endl;
      cerr << "INFO: please use gl-XPEHH --help" << endl;
    }

   if(region != "NA"){
     variantFile.setRegion(region); 
   }
    
    if (!variantFile.is_open()) {
        return 1;
    }
    
    Variant var(variantFile);

    vector<string> samples = variantFile.sampleNames;
    vector<int>    target_h, background_h;

    int index, indexi = 0;

    cerr << "INFO: there are " << samples.size() << " individuals in the VCF" << endl;

    if(samples.size() == 0){
      cerr << "FATAL: too few samples or no VCF header"    << endl;
      cerr << "INFO: please use gl-XPEHH --help"           << endl;
      return(1);
    }

    for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
     
      if(it.find(index) != it.end() ){
	target_h.push_back(indexi);
	indexi++;
      }
      if(ib.find(index) != ib.end()){
	background_h.push_back(indexi);
	indexi++;
      }
      index++;
    }
    

    list< pop > tdat, bdat, zdat;

    vector<long int> positions;

    string haplotypes [it.size() + ib.size()][2];    
    
    string seqid;

    while (variantFile.getNextVariant(var)) {
        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin(); 
        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();
        
	// biallelic sites naturally 

	if(var.alt.size() > 1){
	  continue;
	}

	vector < map< string, vector<string> > > target, background, total;
	        
	int sindex = 0;

        for (; s != sEnd; s++) {	  
	  
	  map<string, vector<string> >& sample = s->second;
	  
	  if(it.find(sindex) != it.end() ){
	    target.push_back(sample);
	    total.push_back(sample);	
	  }
	  if(ib.find(sindex) != ib.end()){
	    background.push_back(sample);
	    total.push_back(sample);
	  }  
	  sindex += 1;
	}
	
	seqid = var.sequenceName;

	pop popt, popb, popz;

	initPop(popt);
	initPop(popb);
	initPop(popz);

	loadPop(target,     popt, var.sequenceName, var.position, phased );
	loadPop(background, popb, var.sequenceName, var.position, phased );
	loadPop(total,      popz, var.sequenceName, var.position, phased );

	if(popt.af == -1 || popb.af == -1){
	  continue;
	}
	if(popz.af > 0.95 || popz.af < 0.05){
	  continue;
	}
	if(popt.af == 0 && popb.af == 1){
	  continue;
	}
	if(popt.af == 1 && popb.af == 0){
	  continue;
	}
		
	tdat.push_back(popt);
	bdat.push_back(popb);
	zdat.push_back(popz);
       
	positions.push_back(var.position);
	
	counts += 1;
	if(counts >= 1000){
	  cerr << "INFO: processed " << haplotypes[0][0].size() << " SNPs; current location : " << var.position << endl;
	  counts = 0;
	}

	while(zdat.size() >= 15 && !zdat.empty()){
          if(phased == 0){	    
            localPhase(haplotypes, zdat, (it.size() + ib.size()));
          }
          else{
            loadPhased(haplotypes, zdat, (it.size() + ib.size()));
          }
          while(!zdat.empty()){
            zdat.pop_front();
          }
	}
    }

    if(phased == 0){
      localPhase(haplotypes, zdat, (it.size() + ib.size()));
    }
    else{
      loadPhased(haplotypes, zdat, (it.size() + ib.size()));
    }
    while(!zdat.empty()){
      zdat.pop_front();
    }


    cerr << "INFO: phasing done" << endl;
   
    calc(haplotypes, (it.size() + ib.size()), positions, target_h, background_h,  mut, seqid);

    cerr << "INFO: gl-XPEHH finished" << endl;

    return 0;		    
}
Ejemplo n.º 5
0
int main(int argc, char** argv) {

    vector<string> regions;
    bool addTags = false;
    bool addType = false;
    bool lengthFrequency = true;

    // constants for SmithWaterman algorithm
    float matchScore = 10.0f;
    float mismatchScore = -9.0f;
    float gapOpenPenalty = 15.0f;
    float gapExtendPenalty = 6.66f;

    bool useReferenceAlignment = false;

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"region", required_argument, 0, 'r'},
                {"add-info", no_argument, 0, 'a'},
                {"add-type", no_argument, 0, 't'},
                {"no-length-frequency", no_argument, 0, 'l'},
                {"match-score", required_argument, 0, 'm'},
                {"mismatch-score", required_argument, 0, 'x'},
                {"gap-open-penalty", required_argument, 0, 'o'},
                {"gap-extend-penalty", required_argument, 0, 'e'},
                //{"length",  no_argument, &printLength, true},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hlatr:m:x:o:e:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

	    case 'h':
            printSummary(argv);
            exit(0);
            break;
		
	    case 'r':
            regions.push_back(optarg);
            break;
		
	    case 'l':
            lengthFrequency = false;
            break;
		
	    case 'a':
            addTags = true;
            break;

	    case 't':
            addType = true;
            break;

	    case 'm':
            matchScore = atof(optarg);
	        break;

	    case 'x':
            mismatchScore = atof(optarg);
	        break;

	    case 'o':
            gapOpenPenalty = atof(optarg);
	        break;

	    case 'e':
            gapExtendPenalty = atof(optarg);
	        break;
		
	    default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    if (addType && !addTags) {
        variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
        variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
        cout << variantFile.header << endl;
    }

    if (addTags) {
        variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
        variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
        variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">");
        variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">");
        cout << variantFile.header << endl;
    }

    Variant var(variantFile);

    vector<string>::iterator regionItr = regions.begin();

    int variantAlleles = 0;
    int uniqueVariantAlleles = 0;
    int variantSites = 0;
    int snps = 0;
    int transitions = 0;
    int transversions = 0;
    int deaminations = 0;
    int aminations = 0;
    int totalinsertions = 0;
    int totaldeletions = 0;
    int insertedbases = 0;
    int deletedbases = 0;
    int totalmnps = 0;
    int totalcomplex = 0;
    int mismatchbases = 0;
    int mnpbases = 0;
    int biallelics = 0;
    int multiallelics = 0;
    map<int, int> insertions;
    map<int, int> deletions;
    map<int, int> mnps;
    map<int, int> complexsubs;

    bool includePreviousBaseForIndels = false;
    bool useMNPs = true;
    bool useEntropy = false;

    AlleleStats biallelicSNPs;

    // todo, add biallelic snp dialog to output and ts/tv for snps and mnps

    do {

        if (!inputFilename.empty() && !regions.empty()) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {
            ++variantSites;
            if (var.alt.size() > 1) {
                ++multiallelics;
            } else {
                ++biallelics;
            }
            map<string, vector<VariantAllele> > alternates 
	      = var.parsedAlternates(includePreviousBaseForIndels,
				     useMNPs,
				     useEntropy,
				     matchScore,
				     mismatchScore,
				     gapOpenPenalty,
				     gapExtendPenalty);

            map<VariantAllele, vector<string> > uniqueVariants;
	    
            vector<string> cigars;
	    
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                string& alternate = *a;
                if (addTags)
                    var.info["altlen"].push_back(convert(alternate.size()));
                vector<VariantAllele>& vav = alternates[alternate];
                if (vav.size() > 1) {
                    // check that there are actually multiple non-reference alleles
                    int nonRefAlleles = 0;
                    for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) {
                        if (z->ref != z->alt)
                            ++nonRefAlleles;
                    }
                    if (nonRefAlleles > 1)
                        ++totalcomplex;
                }
                for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
                    uniqueVariants[*v].push_back(alternate);
                }

                if (addTags || addType) {
                    string cigar;
                    pair<int, string> element;
                    for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
                        VariantAllele& va = *v;
                        if (va.ref != va.alt) {
                            if (element.second == "M") {
                                cigar += convert(element.first) + element.second;
                                element.second = ""; element.first = 0;
                            }
                            if (va.ref.size() == va.alt.size()) {
                                cigar += convert(va.ref.size()) + "X";
                            } else if (va.ref.size() > va.alt.size()) {
                                cigar += convert(va.ref.size() - va.alt.size()) + "D";
                            } else {
                                cigar += convert(va.alt.size() - va.ref.size()) + "I";
                            }
                        } else {
                            if (element.second == "M") {
                                element.first += va.ref.size();
                            } else {
                                element = make_pair(va.ref.size(), "M");
                            }
                        }
                    }
                    if (element.second == "M") {
                        cigar += convert(element.first) + element.second;
                    }
                    element.second = ""; element.first = 0;
                    cigars.push_back(cigar);
                }
            }

            if (addTags) {
                var.info["cigar"] = cigars;
                var.info["reflen"].push_back(convert(var.ref.size()));
            } else if (addType) {
                var.info["cigar"] = cigars;
            }

            variantAlleles += var.alt.size();
            map<string, AlleleStats> alleleStats;

            for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) {
                const VariantAllele& va = v->first;
                vector<string>& alternates = v->second;

                if (!(addTags || addType)) { // don't add any tag information if we're not going to output it
                    alternates.clear();
                }

                if (va.ref != va.alt) {
                    ++uniqueVariantAlleles;
                    if (va.ref.size() == va.alt.size()) {
                        if (va.ref.size() == 1) {
                            ++snps;
                            ++mismatchbases;
                            for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                ++alleleStats[*a].mismatches;
                            }
                            if (isTransition(va.ref, va.alt)) {
                                ++transitions;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].transitions;
                                }
                            } else {
                                ++transversions;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].transversions;
                                }
                            }
                            if (isAmination(va.ref, va.alt)) {
                                ++aminations;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].aminations;
                                }
                            }
                            if (isDeamination(va.ref, va.alt)) {
                                ++deaminations;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].deaminations;
                                }
                            }
                        } else {
                            ++totalmnps;
                            ++mnps[va.alt.size()]; // not entirely correct
                            for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                alleleStats[*a].mismatches += va.alt.size();
                            }
                            string::const_iterator r = va.ref.begin();
                            for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) {
                                string rstr = string(1, *r);
                                string astr = string(1, *a);
                                if (rstr == astr) {
                                    continue;
                                }
                                if (isTransition(rstr, astr)) {
                                    ++transitions;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].transitions;
                                    }
                                } else {
                                    ++transversions;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].transversions;
                                    }
                                }
                                if (isAmination(rstr, astr)) {
                                    ++aminations;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].aminations;
                                    }
                                }
                                if (isDeamination(rstr, astr)) {
                                    ++deaminations;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].deaminations;
                                    }
                                }
                                ++mismatchbases;
                                ++mnpbases;
                            }
                        }
                    } else if (va.ref.size() > va.alt.size()) {
                        int diff = va.ref.size() - va.alt.size();
                        deletedbases += diff;
                        ++totaldeletions;
                        ++deletions[diff];
                        for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                            alleleStats[*a].deletedbases += diff;
                            alleleStats[*a].deletions += 1;
                        }
                    } else {
                        int diff = va.alt.size() - va.ref.size();
                        insertedbases += diff;
                        ++totalinsertions;
                        ++insertions[diff];
                        for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                            alleleStats[*a].insertedbases += diff;
                            alleleStats[*a].insertions += 1;
                        }
                    }
                }
            }
            if (addTags || addType) {
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    string vartype;
                    if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) {
                        if (alleleStats[*a].mismatches == 1) {
                            vartype = "snp";
                        } else if (alleleStats[*a].mismatches > 1) {
                            vartype = "complex";
                        } else {
                            vartype = "ref";
                        }
                    } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) {
                        if (alleleStats[*a].insertions == 1) {
                            vartype = "ins";
                        } else {
                            vartype = "del";
                        }
                    } else {
                        vartype = "complex";
                    }
                    if (addTags) {
                        var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches));
                        var.info["insertions"].push_back(convert(alleleStats[*a].insertions));
                        var.info["deletions"].push_back(convert(alleleStats[*a].deletions));
                        var.info["transitions"].push_back(convert(alleleStats[*a].transitions));
                        var.info["transversions"].push_back(convert(alleleStats[*a].transversions));
                        var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations));
                        var.info["aminations"].push_back(convert(alleleStats[*a].aminations));
                    }
                    var.info["type"].push_back(vartype);
                }
                cout << var << endl;
            }
            // biallelic SNP case
            if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) {
                if (isTransition(var.ref, var.alt.front())) {
                    biallelicSNPs.transitions++;
                } else {
                    biallelicSNPs.transversions++;
                }
                biallelicSNPs.mismatches++;
            }
        }

    } while (regionItr != regions.end());


    // find the maximum indel size
    int maxindel = 0;
    for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }
    for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }

    // and maximum mnp
    int maxmnp = 0;
    for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
        if (i->first > maxmnp) {
            maxmnp = i->first;
        }
    }

    // now print the results

    if (!addTags && !addType) {
        cout << "total variant sites:\t" << variantSites << endl
             << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and "
                            << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl
             << "total variant alleles:\t" << variantAlleles << endl
             << "unique variant alleles:\t" << uniqueVariantAlleles << endl
             << endl
             << "snps:\t" << snps << endl
             << "mnps:\t" << totalmnps << endl
             << "indels:\t" << totalinsertions + totaldeletions << endl
             << "complex:\t" << totalcomplex << endl
             << endl
             << "mismatches:\t" << mismatchbases << endl
             << endl
             << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
             << "deamination ratio:\t" << (double) deaminations / aminations << endl
             << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ "
             << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl;

        if (lengthFrequency) {
            cout << endl
                 << "ins/del length frequency distribution" << endl
                 << "length\tins\tdel\tins/del" << endl;
            for (int i = 1; i <= maxindel; ++i) {
                int ins = insertions[i];
                int del = deletions[i];
                cout << i << "\t"
                     << (ins > 0 ? convert(ins) : "" ) << "\t"
                     << (del > 0 ? convert(del) : "") << "\t"
                     << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
                     << endl;
            }
        }

        cout << endl
             << "insertion alleles / deletion alleles:\t"
             << (double) totalinsertions / (double) totaldeletions << endl
             << "inserted bases / deleted bases:\t"
             << (double) insertedbases / (double) deletedbases << endl
             << endl;

        if (lengthFrequency) {
            cout << "mnp length frequency distribution" << endl
                 << "length\tcount" << endl;
            for (int i = 2; i <= maxmnp; ++i) {
                int mnp = mnps[i];
                cout << i << "\t"
                     << (mnp > 0 ? convert(mnp) : "")
                     << endl;
            }
        }

        cout << "total bases in mnps:\t" << mnpbases << endl;

        /*
          cout << "complex event frequency distribution" << endl
          << "length\tcount" << endl;
          for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
          cout << i->first << "\t" << i->second << endl;
          }
        */
    }

    return 0;

}
Ejemplo n.º 6
0
int main(int argc, char** argv) {

  bool snp = false;

  // set the random seed for MCMC

  srand((unsigned)time(NULL));

  // the filename

  string filename;

  // open standardout 

  // set region to scaffold

  string region = "NA";

  // using vcflib; thanks to Erik Garrison

 VariantCallFile variantFile;

  // zero based index for the target and background indivudals

  map<int, int> it, ib;

  // genotype likelihood format

  string type = "NA";

  // are we polarizing the counts relative to the ancestral allele?
  bool use_ancestral_state = false;
  set<char> allowed_ancestral_bases = { 'A', 'T', 'C', 'G' };

    const struct option longopts[] =
      {
	{"version"   , 0, 0, 'v'},
	{"help"      , 0, 0, 'h'},
        {"file"      , 1, 0, 'f'},
	{"target"    , 1, 0, 't'},
	{"region"    , 1, 0, 'r'},
	{"type"      , 1, 0, 'y'},
	{"snp"       , 0, 0, 's'},
	{"ancestral" , 0, 0, 'a'},
	{0,0,0,0}
      };

    int index;
    int iarg=0;

    while(iarg != -1)
      {
	iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chvsa", longopts, &index);

	switch (iarg)
	  {
    case 'a':
      {
        use_ancestral_state = true;
        break;
      }
	  case 's':
	    {
	      snp = true;
	      break;
	    }
	  case 'h':
	    {
	      printHelp();
	      return 0;
	    }
	  case 'v':
	    {
	      printVersion();
	      return 0;
	    }
	  case 't':
	    {
	      loadIndices(it, optarg);
	      cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
	      cerr << "INFO: target ids: " << optarg << endl;
	      break;
	    }
	  case 'b':
	    {
	      loadIndices(ib, optarg);
	      cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
	      cerr << "INFO: background ids: " << optarg << endl;
	      break;
	    }
	  case 'f':
	    {
	      cerr << "INFO: file: " << optarg  <<  endl;
	      filename = optarg;
	      break;
	    }
	  case 'r':
	    {
	      cerr << "INFO: set seqid region to : " << optarg << endl;
	      region = optarg;
	      break;
	    }
	  case 'y':
	    {
	      type = optarg;
	      cerr << "INFO: set genotype likelihood to: " << type << endl;
	      break;
	    }
	  default:
	    break;
	  }

      }

    if(filename.empty()){
      cerr << "FATAL: failed to specify a file" << endl;
      printHelp();
    }

    bool is_open; 

    if (filename == "-") {

        is_open=variantFile.open(std::cin);

    } else {

    	is_open=variantFile.open(filename); 
	
     }
    
    if (!is_open)  {
          cerr << "FATAL: could not open file for reading" << endl;
          printHelp();

    }


    if(region != "NA"){
      if(! variantFile.setRegion(region)){
	cerr <<"FATAL: unable to set region" << endl;
	return 1;
      }
    }

    if (!variantFile.is_open()) {
      cerr << "FATAL: could not open VCF for reading" << endl;
      printHelp();
      return 1;
    }

    map<string, int> okayGenotypeLikelihoods;
    okayGenotypeLikelihoods["PL"] = 1;
    okayGenotypeLikelihoods["GL"] = 1;
    okayGenotypeLikelihoods["GP"] = 1;
    okayGenotypeLikelihoods["GT"] = 1;

    if(type == "NA"){
      cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
      printHelp();
      return 1;
    }
    if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){
      cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
      printHelp();
      return 1;
    }

    Variant var(variantFile);

    vector<string> samples = variantFile.sampleNames;
    int nsamples = samples.size();

    vector<indv *> countData;
    vector<string > countDataSampleName;

    for ( map<int ,int>::iterator x=it.begin(); x!=it.end(); ++x) {

        countDataSampleName.push_back(samples[x->first] ); 
    }


    for(int i = 0; i < it.size(); i++){
      indv * dip = new indv;

      dip->nhet   = 0;
      dip->nhom   = 0;
      dip->nalt   = 0;
      dip->nocall = 0;

      countData.push_back(dip);

    }


    while (variantFile.getNextVariant(var)) {

	// biallelic sites naturally

	if(var.alt.size() > 1){
	  continue;
	}
	if(snp){
	  bool hit =false;

	  for(vector<string>::iterator it = var.alleles.begin(); it != var.alleles.end(); it++){
	    if((*it).size() > 1){
	      hit = true;
	    }
	  }
	  if(hit){
	    continue;
	  }
	}

  // decide if we can polarize the site if we are using the ancestral allele
  bool ref_is_ancestral_allele = true;
  if (use_ancestral_state) {
    // we need the ancestral allele to decide what to do at this site
    if (var.info.find("AA") == var.info.end()) continue;
    string ancestral_allele = var.info["AA"].front();
    // if we do not have a polarized site with only allowed bases in the ancestral allele, skip it
    bool allowed = true;
    for (string::iterator c = ancestral_allele.begin(); c != ancestral_allele.end(); ++c) {
      if (!allowed_ancestral_bases.count(*c)) {
        allowed = false;
        break;
      }
    }
    if (!allowed) continue;
    ref_is_ancestral_allele = (ancestral_allele == var.ref);
  }

	vector < map< string, vector<string> > > target, background, total;

	int index = 0;

	for(int nsamp = 0; nsamp < nsamples; nsamp++){

	    if(it.find(index) != it.end() ){
	        const map<string, vector<string> >& sample = var.samples[ samples[nsamp]];
		target.push_back(sample);
	    }
	    index += 1;
	}

	genotype * populationTarget      ;

	if(type == "PL"){
	  populationTarget     = new pl();
	}
	if(type == "GL"){
	  populationTarget     = new gl();
	}
	if(type == "GP"){
	  populationTarget     = new gp();
	}
	if(type == "GT"){
          populationTarget     = new gt();
	}

	populationTarget->loadPop(target, var.sequenceName, var.position);

	for(int i = 0; i < populationTarget->genoIndex.size() ; i++){
	  if(populationTarget->genoIndex[i] == -1){
	    countData[i]->nocall += 1;
	  }
	  else if (populationTarget->genoIndex[i] == 0) {
            if (!use_ancestral_state || ref_is_ancestral_allele) {
	      countData[i]->nhom += 1;
            } else {
	      countData[i]->nalt += 1;
            }
	  }
	  else if (populationTarget->genoIndex[i] == 1){
	    countData[i]->nhet += 1;
	  }
	  else if (populationTarget->genoIndex[i] == 2) {
            if (!use_ancestral_state || ref_is_ancestral_allele) {
	      countData[i]->nalt += 1;
            } else {
	      countData[i]->nhom += 1;
            }
	  }
	  else{
	    std::cerr << "FATAL: unkown genotype index" << std::endl;
cerr << populationTarget->genoIndex[i] << endl;
cerr << var << endl;
	    exit(1);
	  }
	}
	delete populationTarget;

    }

    if (!use_ancestral_state) {
        std::cout << "#sample-id\tn-nocall\tn-hom-ref\tn-het\tn-hom-alt" << std::endl;
    } else {
        std::cout << "#sample-id\tn-nocall\tn-hom-ancestral\tn-het\tn-hom-derived" << std::endl;
    }
    for(int i = 0; i < countData.size(); i++){
        std::cout << countDataSampleName[i]
                  << "\t" << countData[i]->nocall
                  << "\t" << countData[i]->nhom
                  << "\t" << countData[i]->nhet
                  << "\t" << countData[i]->nalt
                  << std::endl;
    }


    return 0;
}
Ejemplo n.º 7
0
/*
 * Entry point: for every biallelic site in a VCF, load the called target
 * individuals into a genotype population and print one tab-separated row of
 * per-site summary statistics to stdout.
 *
 * Options (parsed with getopt_long):
 *   -f/--file    VCF file to read (required)
 *   -t/--target  target individuals, parsed by loadIndices (zero based)
 *   -b           background individuals; parsed and reported, not used further
 *   -r/--region  restrict processing to a seqid region
 *   -y/--type    genotype field to use: PL, GL, GP or GT (required)
 *   -h/--help, -v/--version
 *
 * Returns 0 on success (and after --help/--version), 1 on any fatal
 * command-line or file error.
 */
int main(int argc, char** argv) {

  // set the random seed for MCMC
  srand((unsigned)time(NULL));

  // the filename ("NA" means: not supplied)
  string filename = "NA";

  // region to restrict to ("NA" means: whole file)
  string region = "NA";

  // using vcflib; thanks to Erik Garrison
  VariantCallFile variantFile;

  // zero based index for the target and background individuals
  map<int, int> it, ib;

  // genotype likelihood format ("NA" means: not supplied)
  string type = "NA";

  const struct option longopts[] =
    {
      {"version"   , 0, 0, 'v'},
      {"help"      , 0, 0, 'h'},
      {"file"      , 1, 0, 'f'},
      {"target"    , 1, 0, 't'},
      {"region"    , 1, 0, 'r'},
      {"type"      , 1, 0, 'y'},
      {0,0,0,0}
    };

  int optionIndex = 0;
  int iarg = 0;

  // The option string is kept as-is: it still accepts -d <arg> and -c, which
  // are silently ignored by the default case below.
  while((iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chv", longopts, &optionIndex)) != -1)
    {
      switch (iarg)
        {
        case 'h':
          printHelp();
          return 0;
        case 'v':
          printVersion();
          return 0;
        case 't':
          loadIndices(it, optarg);
          cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
          cerr << "INFO: target ids: " << optarg << endl;
          break;
        case 'b':
          loadIndices(ib, optarg);
          cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
          cerr << "INFO: background ids: " << optarg << endl;
          break;
        case 'f':
          cerr << "INFO: file: " << optarg  <<  endl;
          filename = optarg;
          break;
        case 'r':
          cerr << "INFO: set seqid region to : " << optarg << endl;
          region = optarg;
          break;
        case 'y':
          type = optarg;
          cerr << "INFO: set genotype likelihood to: " << type << endl;
          break;
        default:
          break;
        }
    }

  // Every fatal condition returns immediately so that one mistake produces
  // one diagnostic and nothing below runs against an unopened file.
  if(filename == "NA"){
    cerr << "FATAL: failed to specify a file" << endl;
    printHelp();
    return 1;
  }

  if(!variantFile.open(filename)){
    cerr << "FATAL: could not open file for reading" << endl;
    printHelp();
    return 1;
  }

  if(!variantFile.is_open()){
    cerr << "FATAL: could not open VCF for reading" << endl;
    printHelp();
    return 1;
  }

  if(region != "NA"){
    if(!variantFile.setRegion(region)){
      cerr << "FATAL: unable to set region" << endl;
      return 1;
    }
  }

  if(type == "NA"){
    cerr << "FATAL: failed to specify genotype likelihood format : PL, GL, GP or GT" << endl;
    printHelp();
    return 1;
  }

  const bool typeIsSupported =
    (type == "PL" || type == "GL" || type == "GP" || type == "GT");

  if(!typeIsSupported){
    cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL, GL, GP or GT" << endl;
    printHelp();
    return 1;
  }

  Variant var(variantFile);

  const vector<string>& samples = variantFile.sampleNames;
  const int nsamples = samples.size();

  while (variantFile.getNextVariant(var)) {

    // biallelic sites naturally
    if(var.alt.size() > 1){
      continue;
    }

    vector < map< string, vector<string> > > target;

    for(int nsamp = 0; nsamp < nsamples; nsamp++){

      // only individuals listed with -t contribute
      if(it.find(nsamp) == it.end()){
        continue;
      }

      // bind by reference: the per-sample field map is only copied when the
      // sample is actually kept
      const map<string, vector<string> >& sample = var.samples[samples[nsamp]];

      // A sample without a GT entry is treated like a no-call. Looking the
      // key up with find() avoids calling front() on an empty vector.
      map<string, vector<string> >::const_iterator gtField = sample.find("GT");
      if(gtField == sample.end() || gtField->second.empty()){
        continue;
      }
      if(gtField->second.front() == "./."){
        continue;
      }

      target.push_back(sample);
    }

    // type was validated above, so exactly one branch assigns the pointer
    genotype * populationTarget = NULL;

    if(type == "PL"){
      populationTarget = new pl();
    }
    else if(type == "GL"){
      populationTarget = new gl();
    }
    else if(type == "GP"){
      populationTarget = new gp();
    }
    else{
      populationTarget = new gt();
    }

    populationTarget->loadPop(target, var.sequenceName, var.position);

    // af == -1 is skipped without output; presumably loadPop's marker for
    // "no usable data at this site" -- TODO confirm against genotype::loadPop
    if(populationTarget->af == -1){
      delete populationTarget;
      continue;
    }

    const double ehet = 2*(populationTarget->af * (1 - populationTarget->af));

    // Output columns:
    //   1. sequence name
    //   2. position
    //   3. target allele frequency
    //   4. expected heterozygosity
    //   5. observed heterozygosity
    //   6. number of hets
    //   7. number of homozygous ref
    //   8. number of homozygous alt
    //   9. target Fis
    // "\n" instead of endl: no stdout flush per site; the stream is flushed
    // at normal program exit.
    cout << var.sequenceName << "\t"  << var.position << "\t"
         << populationTarget->af  << "\t"
         << ehet << "\t"
         << populationTarget->hfrq  << "\t"
         << populationTarget->nhet  << "\t"
         << populationTarget->nhomr << "\t"
         << populationTarget->nhoma << "\t"
         << populationTarget->fis   << "\n";

    delete populationTarget;

  }
  return 0;
}