Example #1
0
int HaploTree::Traverse(IntArray & pointer, IntArray & state) const
   {
   // Resumes a depth-first walk of the tree.
   //
   //   pointer - stack of branches on the path currently being explored
   //   state   - per level, the allele most recently tried at that level;
   //             a level is reset to -1 when the walk backs out of it
   //
   // Returns the value PeekBranch() yields for a branch sitting at the
   // last level (depth - 1), or -1 once the stack has been emptied.
   while (pointer.Length() > 0)
      {
      const int node = pointer.Peek();
      const int level = levels[node];
      int & current = state[level];
      int child = -1;

      // Step to the next allele at this level for which a branch exists
      for (++current; current < alleleCounts[level]; ++current)
         {
         child = PeekBranch(node, current);

         if (child != -1)
            break;
         }

      // Every allele has been tried: reset this level and backtrack
      if (child == -1)
         {
         current = -1;
         pointer.Pop();
         continue;
         }

      // At the last level the hit is handed straight back to the caller
      if (level == depth - 1)
         return child;

      // Otherwise descend into the branch that was found
      pointer.Push(child);
      }

   return -1;
   }
Example #2
0
void PDF::WriteReferenceArray(const IntArray & array)
   {
   // Writes " [ ", then one WriteReference() call per entry of the array
   // (in order), then " ] " to the output file.
   const int entries = array.Length();

   fprintf(file, " [ ");

   for (int index = 0; index != entries; ++index)
      WriteReference(array[index]);

   fprintf(file, " ] ");
   }
Example #3
0
void Pedigree::Filter(IntArray & filter)
{
    // The filter is expected to hold exactly one entry per person
    if (filter.Length() != count)
        error("Pedigree:Size of pedigree filter doesn't match number of persons in pedigree");

    // Entries equal to 1 select the people whose phenotypes are wiped and
    // who get flagged as filtered; any other value leaves the person as is
    for (int who = 0; who < count; ++who)
    {
        if (filter[who] != 1)
            continue;

        persons[who]->WipePhenotypes();
        persons[who]->filter = true;
    }
}
Example #4
0
void Family::ShowInvalidCycles()
   {
   // Try and identify key individuals responsible for
   // pedigree mess-up ... when this function is called
   // pedigree has been traversed top-down and individuals
   // that are correctly specified have IDs of >= 0.

   // This routine traverses the pedigree bottom up to
   // identify a subset of individuals likely to be causing
   // the problem
   IntArray descendants(ped.count);
   descendants.Zero();

   for (int i = first; i <= last; i++)
      if (ped[i].traverse == -1)
         {
         descendants[ped[i].father->serial]++;
         descendants[ped[i].mother->serial]++;
         }

   IntArray stack;

   for (int i = first; i <= last; i++)
      if (ped[i].traverse == -1 && descendants[i] == 0)
         {
         stack.Push(i);

         do {
            int j = stack.Pop();

            if (ped[j].traverse != -1) continue;

            ped[j].traverse = 9999;

            if (--descendants[ped[j].father->serial] == 0)
               stack.Push(ped[j].father->serial);
            if (--descendants[ped[j].mother->serial] == 0)
               stack.Push(ped[j].mother->serial);
         } while (stack.Length());
         }

   printf("The structure of family %s requires\n"
          "an individual to be his own ancestor.\n\n"
          "To identify the problem(s), examine the\n"
          "following key individuals:\n\n",
          (const char *) famid);

   for (int i = first; i <= last; i++)
      if (ped[i].traverse == -1)
         printf("Problem Person: %s\n", (const char *) ped[i].pid);

   error("Invalid pedigree structure.");
   }
void InverseNormalTransform(Pedigree & ped)
   {
   // Replaces every observed value of every trait by ninv() of its
   // rank-based quantile within that trait. Entries equal to _NAN_ are
   // left untouched, and tied observations all receive the value computed
   // from the midpoint of the ranks they span.
   Vector     values;
   IntArray   owners;
   QuickIndex order;

   // Dimension both buffers for the whole pedigree once; they are emptied
   // and refilled for each trait below
   values.Dimension(ped.count);
   owners.Dimension(ped.count);

   for (int trait = 0; trait < ped.traitCount; trait++)
      {
      values.Dimension(0);
      owners.Dimension(0);

      // Collect the observed values together with who they belong to
      for (int person = 0; person < ped.count; person++)
         {
         if (ped[person].traits[trait] == _NAN_) continue;

         values.Push(ped[person].traits[trait]);
         owners.Push(person);
         }

      int observed = owners.Length();

      if (observed == 0) continue;

      order.Index(values);

      double scale = 1.0 / observed;

      // Walk the ordering one run of tied values at a time
      int head = 0;

      while (head < order.Length())
         {
         int tail = head;

         // Extend the run while the value at tail still matches the head
         while (tail + 1 < order.Length() &&
                ped[owners[order[head]]].traits[trait] ==
                ped[owners[order[tail]]].traits[trait])
            tail++;

         // The scan stops one past the run unless it hit the end of the list
         if (ped[owners[order[head]]].traits[trait] !=
             ped[owners[order[tail]]].traits[trait])
            tail--;

         // Midpoint of ranks head..tail, shifted by one half and scaled
         double z = ninv(((head + tail) * 0.5 + 0.5) * scale);

         for (int k = head; k <= tail; k++)
            ped[owners[order[k]]].traits[trait] = z;

         head = tail + 1;
         }
      }
   }
Example #6
0
void HaploTree::MakeEmptyTree(const IntArray & counts)
   {
   // Records the per-level allele counts, takes the number of levels
   // from the length of that list and resets the leaf counter to zero.
   leafs = 0;
   depth = counts.Length();
   alleleCounts = counts;
   }
Example #7
0
// Program entry point. In order, it:
//   1. parses the command line options,
//   2. reads the reference marker list and reference haplotypes,
//   3. reads the target (framework) marker list and cross-references it
//      against the reference panel,
//   4. refines the per-marker error and recombination parameters for the
//      requested number of rounds,
//   5. writes a draft .info file, imputes every target haplotype and
//      writes the dosage (and optionally phased) output,
//   6. writes the final .info file with per-marker quality statistics.
int main(int argc, char ** argv)
   {
   setbuf(stdout, NULL);

   time_t start = time(NULL);

   printf("MiniMac - Imputation into phased haplotypes\n"
          "(c) 2011 Goncalo Abecasis\n");
#ifdef __VERSION__
   printf("VERSION 5.0\n");
#else
   printf("UNDOCUMENTED RELEASE\n");
#endif

   int rounds = 5, states = 200, cpus = 0;
   bool em = false, gzip = false, phased = false;

   String referenceHaplotypes, referenceSnps;
   String haplotypes, snps;
   String prefix("minimac");
   String firstMarker, lastMarker;

   String recombinationRates, errorRates;

   BEGIN_LONG_PARAMETERS(longParameters)
      LONG_PARAMETER_GROUP("Reference Haplotypes")
         LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes)
         LONG_STRINGPARAMETER("refSnps", &referenceSnps)
      LONG_PARAMETER_GROUP("Target Haplotypes")
         LONG_STRINGPARAMETER("haps", &haplotypes)
         LONG_STRINGPARAMETER("snps", &snps)
      LONG_PARAMETER_GROUP("Starting Parameters")
         LONG_STRINGPARAMETER("rec", &recombinationRates)
         LONG_STRINGPARAMETER("erate", &errorRates)
      LONG_PARAMETER_GROUP("Parameter Fitting")
         LONG_INTPARAMETER("rounds", &rounds)
         LONG_INTPARAMETER("states", &states)
         LONG_PARAMETER("em", &em)
      LONG_PARAMETER_GROUP("Output Files")
         LONG_STRINGPARAMETER("prefix", &prefix)
         LONG_PARAMETER("phased", &phased)
         LONG_PARAMETER("gzip", &gzip)
//    LONG_PARAMETER_GROUP("Clipping Window")
//      LONG_STRINGPARAMETER("start", &firstMarker)
//      LONG_STRINGPARAMETER("stop", &lastMarker)
#ifdef _OPENMP
      LONG_PARAMETER_GROUP("Multi-Threading")
         LONG_INTPARAMETER("cpus", &cpus)
#endif
   END_LONG_PARAMETERS();

   ParameterList pl;

   pl.Add(new LongParameters("Command Line Options", longParameters));
   pl.Read(argc, argv);
   pl.Status();

#ifdef _OPENMP
   if (cpus > 0)
      omp_set_num_threads(cpus);
#endif

   // Read marker list
   printf("Reading Reference Marker List ...\n");

   StringArray refMarkerList;
   refMarkerList.Read(referenceSnps);

   // Index markers
   StringIntHash referenceHash;
   for (int i = 0; i < refMarkerList.Length(); i++)
      referenceHash.Add(refMarkerList[i].Trim(), i);

   printf("  %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());

   // Load reference haplotypes
   printf("Loading reference haplotypes ...\n");
   HaplotypeSet reference;

   reference.markerCount = refMarkerList.Length();
   reference.LoadHaplotypes(referenceHaplotypes);

   printf("  %d Reference Haplotypes Loaded ...\n\n", reference.count);

   // Read framework marker list
   printf("Reading Framework Marker List ...\n");
   StringArray markerList;
   markerList.Read(snps);

   ClipReference(reference, refMarkerList, referenceHash, markerList,
                 firstMarker, lastMarker);

   // Crossref Marker Names to Reference Panel Positions
   // (markers that are not matched end up with a negative index and are
   // skipped by the ">= 0" tests used throughout the rest of this function)
   IntArray markerIndex;
   markerIndex.Dimension(markerList.Length());

   int matches = 0;

   for (int i = 0; i < markerList.Length(); i++)
      {
      markerIndex[i] = referenceHash.Integer(markerList[i].Trim());

      if (markerIndex[i] >= 0) matches++;
      }

   printf("  %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);

   if (matches == 0)
      error("No markers overlap between target and reference\n"
            "Please check correct reference is being used and markers are named consistently");

   printf("  %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);

   // Check for flips in reference vs. target haplotypes
   //
   // 'previous' holds the reference position of the most recent matched
   // target marker. It has to be refreshed for every matched marker: if it
   // were only updated when a flip is seen it would stay at -1 forever and
   // no flip could ever be detected.
   int flips = 0;
   int previous = -1;
   for (int i = 0; i < markerIndex.Length(); i++)
      if (markerIndex[i] >= 0)
         {
         // The current marker sits before the previous one in the reference
         // even though it comes after it in the target
         if (markerIndex[i] < previous)
            if (flips++ < 10)
               printf("  -> Marker %s precedes %s in reference, but follows it in target\n",
                     (const char *) markerList[i],
                     (const char *) refMarkerList[previous]);

         previous = markerIndex[i];
         }
   if (flips > 10)
      printf("  -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
   if (flips)
      printf("  %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);

   // Load target haplotypes
   printf("Loading target haplotypes ...\n");
   HaplotypeSet target;

   target.markerCount = markerList.Length();
   target.LoadHaplotypes(haplotypes, true);

   reference.CalculateFrequencies();
   target.CalculateFrequencies();
   target.CompareFrequencies(reference, markerIndex, markerList);

   printf("  %d Target Haplotypes Loaded ...\n\n", target.count);

   // Output window: the whole reference panel unless clipping endpoints
   // were supplied
   int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker);
   int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker);

   if (startIndex < 0 || stopIndex < 0)
      error("Clipping requested, but no position available for one of the endpoints");

   printf("Setting up Markov Model...\n\n");

   // Setup Markov Model
   MarkovParameters mp;

   mp.Allocate(reference.markerCount);

   if (rounds > 0)
      printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n",
             em ? "E-M" : "MCMC", states);

   // Simple initial estimates of error and recombination rate
   for (int i = 0; i < reference.markerCount; i++)
      mp.E[i] = 0.01;

   for (int i = 0; i < reference.markerCount - 1; i++)
      mp.R[i] = 0.001;

   if (mp.ReadErrorRates(errorRates))
      printf("  Updated error rates using data in %s ...\n", (const char *) errorRates);

   if (mp.ReadCrossoverRates(recombinationRates))
      printf("  Updated recombination rates using %s ...\n", (const char *) recombinationRates);

   // Parameter estimation loop
   for (int round = 0; round < rounds; round++)
      {
      printf("  Round %d of Parameter Refinement ...\n", round + 1);

      int iterations = states < reference.count ? states : reference.count;

      // Snapshot of the parameters at the start of the round; every worker
      // starts from this copy while results are accumulated into mp
      MarkovModel original;
      original.CopyParameters(mp);

      #pragma omp parallel for
      for (int i = 0; i < iterations; i++)
         {
         MarkovModel mm;

         mm.Allocate(reference.markerCount, reference.count - 1);
         mm.CopyParameters(original);

         // Reference leave one out (loo) panel
         char ** reference_loo = new char * [reference.count - 1];
         for (int in = 0, out = 0; in < reference.count; in++)
            if (in != i)
               reference_loo[out++] = reference.haplotypes[in];

         mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);

         if (em)
            mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
         else
            {
            #pragma omp critical
            { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); }
            }

         delete [] reference_loo;

         // mp is shared between threads, so accumulate one model at a time
         #pragma omp critical
         mp += mm;
         }

      // From the halfway round onwards, target haplotypes also contribute
      if (round >= rounds / 2)
         {
         int iterations = states < target.count ? states : target.count;

         #pragma omp parallel for
         for (int i = 0; i < iterations; i++)
            {
            MarkovModel mm;

            mm.Allocate(reference.markerCount, reference.count);
            mm.CopyParameters(original);

            // Padded version of target haplotype, including missing sites
            char * padded = new char [reference.markerCount];
            for (int k = 0; k < reference.markerCount; k++)
               padded[k] = 0;

            // Copy current haplotype into padded vector
            for (int j = 0; j < target.markerCount; j++)
               if (markerIndex[j] >= 0)
                  padded[markerIndex[j]] = target.haplotypes[i][j];

            mm.WalkLeft(padded, reference.haplotypes, reference.freq);

            if (em)
               mm.CountExpected(padded, reference.haplotypes, reference.freq);
            else
               {
               #pragma omp critical
               { mm.ProfileModel(padded, reference.haplotypes, reference.freq); }
               }

            delete [] padded;

            #pragma omp critical
            mp += mm;
            }
         }

      mp.UpdateModel();

      // Summary statistics for this round
      double crossovers = 0;
      for (int i = 0; i < reference.markerCount - 1; i++)
         crossovers += mp.R[i];

      double errors = 0;
      for (int i = 0; i < reference.markerCount; i++)
         {
         double heterozygosity = 1.0 - square(reference.freq[1][i])
                                     - square(reference.freq[2][i])
                                     - square(reference.freq[3][i])
                                     - square(reference.freq[4][i]);

         errors += mp.E[i] * heterozygosity;
         }
      // The tiny offset keeps the division defined when there are no markers
      errors /= reference.markerCount + 1e-30;

      printf("      %.0f mosaic crossovers expected per haplotype\n", crossovers);
      printf("      %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.);
      printf("      %.3g errors in mosaic expected per marker\n", errors);
      }

   if (rounds > 0)
      {
      printf("  Saving estimated parameters for future use ...\n");
      mp.WriteParameters(refMarkerList, prefix, gzip);
      }

   printf("\n");

   // List the major allele at each location
   reference.ListMajorAlleles();

   // Flag, per reference marker, whether it was genotyped in the target.
   // Both .info files use these flags. A running cursor into markerIndex
   // is not reliable here because markerIndex may contain unmatched
   // (negative) or out-of-order entries, on which such a cursor stalls.
   char * genotyped = new char [reference.markerCount];
   for (int k = 0; k < reference.markerCount; k++)
      genotyped[k] = 0;

   for (int j = 0; j < target.markerCount; j++)
      if (markerIndex[j] >= 0)
          genotyped[markerIndex[j]] = 1;

   printf("Generating Draft .info File ...\n\n");

   // Output some basic information
   IFILE info = ifopen(prefix + ".info.draft", "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");

   for (int i = startIndex; i <= stopIndex; i++)
      ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
         (const char *) refMarkerList[i],
         reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
         reference.freq[reference.major[i]][i],
         genotyped[i] ? "Genotyped" : "-");

   ifclose(info);

   printf("Imputing Genotypes ...\n");

   IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt");
   IFILE hapdose, haps;

   // The per-haplotype output files are only opened (and only used) when
   // phased output was requested
   if (phased)
      {
      hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt");
      haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt");
      }

   ImputationStatistics stats(reference.markerCount);

   // Impute each haplotype
   #pragma omp parallel for
   for (int i = 0; i < target.count; i++)
      {
      // Consecutive haplotypes that share a label are handled together by
      // the iteration that owns the first of them
      if (i != 0 && target.labels[i] == target.labels[i-1])
         continue;

      MarkovModel mm;

      mm.Allocate(reference.markerCount, reference.count);
      mm.ClearImputedDose();
      mm.CopyParameters(mp);

      // Padded version of target haplotype, including missing sites
      char * padded = new char [reference.markerCount];
      for (int j = 0; j < reference.markerCount; j++)
         padded[j] = 0;

      int k = i;

      do {
         printf("  Processing Haplotype %d of %d ...\n", k + 1, target.count);

         // Copy current haplotype into padded vector
         for (int j = 0; j < target.markerCount; j++)
            if (markerIndex[j] >= 0)
               padded[markerIndex[j]] = target.haplotypes[k][j];

         mm.WalkLeft(padded, reference.haplotypes, reference.freq);
         mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);

         // stats and the output files are shared between threads
         #pragma omp critical
         { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }

         #pragma omp critical
         if (phased)
            {
            ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            for (int j = startIndex; j <= stopIndex; j++)
               {
               ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
               ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
               }
            ifprintf(hapdose, "\n");
            ifprintf(haps, "\n");
            }

         k++;
      } while (k < target.count && target.labels[k] == target.labels[i]);

      printf("    Outputting Individual %s ...\n", (const char *) target.labels[i]);

      #pragma omp critical
         {
         ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
         for (int j = startIndex; j <= stopIndex; j++)
            ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
         ifprintf(dosages, "\n");
         }

      delete [] padded;
      }

   ifclose(dosages);

   if (phased)
      {
      ifclose(hapdose);
      ifclose(haps);
      }

   // Output some basic information
   info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n");

   for (int i = startIndex; i <= stopIndex; i++)
      {
      ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i),
            reference.MinorAlleleLabel(i),
            stats.AlleleFrequency(i),
            stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i),
            stats.AverageCallScore(i),
            stats.Rsq(i));

      // The leave-one-out columns are only filled in for genotyped markers
      if (genotyped[i])
         ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n",
                  stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i),
                  stats.LooMajorDose(i), stats.LooMinorDose(i));
      else
         ifprintf(info, "-\t-\t-\t-\t-\t-\n");
      }

   ifclose(info);

   delete [] genotyped;

   time_t stop = time(NULL);
   int seconds = stop - start;

   printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
          seconds / 3600, (seconds % 3600) / 60, seconds % 60,
          ctime(&stop));
   }
Example #8
0
bool Pedigree::TwinCheck()
   {
   // Checks identical (MZ) twins for consistency and merges their marker
   // genotypes, one family at a time.
   //
   // First pass:  the first member of each twin set becomes its "original".
   //              Every later member is compared with the original;
   //              genotypes the original lacks are copied from the twin,
   //              and conflicting genotypes or differing sexes are
   //              reported.
   // Second pass: the merged genotypes of each original are copied to
   //              all of its twins.
   //
   // Returns true when at least one inconsistency was reported.
   bool fail = false;
   IntArray mzTwins;

   for (int f = 0; f < familyCount; f++)
      {
      mzTwins.Clear();

      for (int i = families[f]->first, j; i <= families[f]->last; i++)
         // Is this person an identical twin?
         if (persons[i]->isMzTwin( *persons[i] ))
            {
            // Have we got another identical sib yet?
            for ( j = 0; j < mzTwins.Length(); j++)
               if ( persons[i]->isMzTwin( *persons[mzTwins[j]] ) )
                  break;

            // If not, add to list of twins
            if (j == mzTwins.Length())
               {
               mzTwins.Push(i);
               continue;
               }

            // Check that their genotypes are compatible and
            // merge new twin's genotypes into original twin...
            Person * original = persons[mzTwins[j]];
            Person * twin = persons[i];

            for (int m = 0; m < Person::markerCount; m++)
               {
               if (!original->markers[m].isKnown())
                  original->markers[m] = twin->markers[m];
               else
                  if (twin->markers[m].isKnown() &&
                      twin->markers[m] != original->markers[m])
                      {
                      printf("MZ Twins %s and %s in family %s have "
                             "different %s genotypes\n",
                             (const char *) original->pid,
                             (const char *) twin->pid,
                             (const char *) original->famid,
                             (const char *) Person::markerNames[m]);
                      fail = true;
                      }
               }

            // Sex does not depend on the marker, so it is compared once
            // per twin pair rather than inside the marker loop. That way
            // the warning is printed a single time and the check still
            // runs when there are no markers at all.
            if (twin->sex != original->sex)
               {
               printf("MZ Twins %s and %s in family %s have "
                      "different sexes\n",
                      (const char *) original->pid,
                      (const char *) twin->pid,
                      (const char *) original->famid);
               fail = true;
               }
            }

      if (mzTwins.Length() == 0) continue;

      // In the second pass we copy merged twin genotypes
      // from original twin to other twins
      for (int i = families[f]->first, j; i <= families[f]->last; i++)
         if (persons[i]->isMzTwin( *persons[i] ))
            {
            for ( j = 0; j < mzTwins.Length(); j++)
               if ( persons[i]->isMzTwin( *persons[mzTwins[j]] ) )
                  break;

            // The original already holds the merged genotypes
            if (mzTwins[j] == i) continue;

            Person * original = persons[mzTwins[j]];
            Person * twin = persons[i];

            for (int m = 0; m < Person::markerCount; m++)
               twin->markers[m] = original->markers[m];
            }
      }
   return fail;
   }