int main(int argc, char *argv[]) { int i; pbt_high_level_vars *HLV = (pbt_high_level_vars *)malloc(sizeof(pbt_high_level_vars)); FB_Vars *CPFB; /* "Compat Pairs FB_vars" this is going to be heavily re-used space, each time we are dealing with an offspring and all of his non-excluded parent pairs. It will contain a Colls array that is linear with the Specified pedigrees. */ HLV->PBUO = GetPBT_UserOpts(argc, argv); /* open up a file stream for the basic summary data */ if( (HLV->BasicSummaries_File = fopen("snppit_output_BasicDataSummary.txt","w"))==NULL) { fprintf(stderr,"Error! Failed to open file snppit_output_BasicDataSummary.txt to write to it. You may have it open and locked in another application. Exiting...\n"); exit(1); } printf("\n\n"); HLV->PFR = FirstPassThroughData(HLV->PBUO->DataFileName); #ifdef VERBOSE_PRINT_FIRST_PASS_SUMMARY PrintFirstPassSummaryOfPopsCollsAndIndivs(HLV->PFR); #endif CollectDataOnSecondPass(HLV->PFR, HLV->PBUO->DataFileName); NegotiatePiVectors(HLV->PBUO, HLV->PFR); PrintSummaryOfInputData(HLV->BasicSummaries_File,HLV->PFR); SummarizeLocusNameAndGtypRates(HLV->BasicSummaries_File, HLV->PFR); SummarizeAllelicTypes(HLV->BasicSummaries_File,HLV->PFR); CountAlleles(HLV->PFR); ComputeAlleleFreqsFromCounts(HLV->BasicSummaries_File, HLV->PFR); PrintSummaryOfAllelicCountsAndFreqs(HLV->BasicSummaries_File,HLV->PFR); fflush(stdout); fclose(HLV->BasicSummaries_File); if(HLV->PBUO->DryRun>0) { printf("\n\nData have been read in and summaries compiled on this dry run.\n"); printf("Please check the data summaries in file \"snppit_output_BasicDataSummary.txt\"\n"); printf("to confirm that the program is running correctly. If it all looks good,\n"); printf("then try a full-blown run by removing the --dry-run option from the\n"); printf("command line.\n\n"); return(0); } else { printf("\n\n\nDATA HAVE BEEN READ. SUMMARIES APPEAR IN: snppit_output_BasicDataSummary.txt\n\n\n"); } /* now compute the parental trios forwards probs using the Big Smax and select the smax to use for the future analyses */ printf("COMPUTING AN APPROPRIATE S-MAX\n"); SelectAnSmax3(HLV); printf("\n\n"); for(i=0;i<HLV->PFR->NumOffColls;i++) { #ifdef VERBOSE_SINGLE_PARENT_COMPAT_WITH_OFFSPRING printf("EXCLUDING SINGLE PARENTS. COLLECTION %d %s with %d indivs in collection. \nDone with individual index:\n",i+1,HLV->PFR->OffCollNames[i],HLV->PFR->NumInOffColls[i]); #endif AssignMatchingSingleParents(HLV, i); } printf("\n\n"); for(i=0;i<HLV->PFR->NumOffColls;i++) { printf("FINDING NON EXCLUDED PARENT PAIRS. COLLECTION %d %s with %d indivs in collection. \nDone with individual index:\n",i+1,HLV->PFR->OffCollNames[i], HLV->PFR->NumInOffColls[i]); fflush(stdout); AssignMatchingParentPairs(HLV, i, HLV->smax_to_use[2]); } printf("\n\n"); printf("COMPUTING THE FORWARD STEP AND PREPARING FOR BACKWARD STEP FOR ALL POPULATIONS\n"); fflush(stdout); ComputePurePopTrioColls(HLV); /* this should do the forward step AND prepare for the backward step in all these */ //ComputeCrossPopTrioColls(HLV); /* this can only be done AFTER computing all the PurePopTrioColls */ /* open up a file stream for the posteriors */ if( (HLV->TrioPosteriors_File = fopen("snppit_output_TrioPosteriors.txt","w"))==NULL) { fprintf(stderr,"Error! Failed to open file snppit_output_TrioPosteriors.txt to write to it. You may have it open and locked in another application. Exiting...\n"); exit(1); } fprintf(HLV->TrioPosteriors_File,"OffspCollection\tKid\tPa\tMa\tRank\tLOD"); {int k; for(k=0;k<NUM_SPEC_PEDS;k++) { fprintf(HLV->TrioPosteriors_File,"\tP.Pr.%s",SpecPedIdx2Str(k)); } } fprintf(HLV->TrioPosteriors_File,"\tKidMiss\tPaMiss\tMaMiss\tMI.Kid.Pa\tMI.Kid.Ma\tMI.Trio\n"); printf("\n\n"); for(i=0;i<HLV->PFR->NumOffColls;i++) { printf("COMPUTING POSTERIORS: COLLECTION %d %s with %d indivs in collection. \nDone with individual index:\n",i+1,HLV->PFR->OffCollNames[i],HLV->PFR->NumInOffColls[i]); fflush(stdout); CalculateTrioPosteriorProbs(HLV, i); } fclose(HLV->TrioPosteriors_File); HLV->KidsWithMaxLOD_Parents = (inds_with_max_lod_parents_from_this_pop **)calloc(HLV->PFR->NumPops,sizeof(inds_with_max_lod_parents_from_this_pop *)); for(i=0;i<HLV->PFR->NumPops;i++) { HLV->KidsWithMaxLOD_Parents[i] = RecordNonExcParentPairsFromPop(i,HLV); } /* allocate space to the areas where we will do FB algorithm successively, for each offspring with compatible parents */ CPFB = (FB_Vars *)malloc(sizeof(FB_Vars)); CPFB->RP = HLV->PurePopTrioColls->RP; CPFB->NumColls = NUM_SPEC_PEDS; CPFB->Colls = (Collection **)calloc(CPFB->NumColls, sizeof(Collection *)); for(i=0;i<CPFB->NumColls;i++) { CPFB->Colls[i] = AllocToCollection(CPFB->RP); } /* open up a file stream where we will store the max LOD parents */ /*if( (HLV->MaxLodNonExpPar_File = fopen("snppit_output_MaxLodNonExParents.txt","w"))==NULL) { fprintf(stderr,"Error! Failed to open file snppit_output_MaxLodNonExParents.txt to write to it. You may have it open and locked in another application. Exiting...\n"); exit(1); } */ /*fprintf(HLV->MaxLodNonExpPar_File,"Kid\tPa\tMa\tPvalue\tLOD\tP.Pr.C_Se_Se\tP.Pr.Max\tMaxP.Pr.Relat\tTotPaNonExc\tTotMaNonExc\tTotUnkNonExc\tTotPairsNonExc\tKidMiss\tPaMiss\tMaMiss\tMI.Kid.Pa\tMI.Kid.Ma\tMI.Trio\tMendIncLoci\n"); */ printf("\n\n"); SeedFromFile("snppit_seeds"); printf("\n\n"); for(i=0;i<HLV->PFR->NumOffColls;i++) { printf("COMPUTING P-VALUES BY SIMULATION: COLLECTION %d %s with %d indivs in collection\nDone with individual index:\n",i+1,HLV->PFR->OffCollNames[i],HLV->PFR->NumInOffColls[i]); AssessFPR_and_FNR_ByBackwardSimulation(HLV, i, CPFB); } /*fclose(HLV->MaxLodNonExpPar_File);*/ if( (HLV->FDR_Summaries_File = fopen("snppit_output_FDR_Summary.txt","w"))==NULL) { fprintf(stderr,"Error! Failed to open file \"snppit_output_FDR_Summary.txt\" to write to it. You may have it open and locked in another application. Exiting...\n"); exit(1); } printf("\n\n"); printf("PERFORMING FALSE DISCOVERY RATE CORRECTIONS\n"); fprintf(HLV->FDR_Summaries_File,"PopName\tRankInFDR\tKid\tPa\tMa\tFDR\tFDC.est.to.pop\tPvalue\n"); for(i=0;i<HLV->PFR->NumPops;i++) { DoFDR_ForAPop(i,HLV); //printf("Done with FDR for Pop= %d\n",i); } fclose(HLV->FDR_Summaries_File); printf("\n\n"); SeedToFile("snppit_seeds"); printf("\n\n"); printf("PRINTING FINAL PARENTAGE REPORT\n"); PrintFinalIndivReportWithFDRs(HLV); printf("\n\n"); printf("SNPPIT PROGRAM EXECUTION COMPLETED.\n"); printf("\nOutput is in the following files:\n"); printf("\tsnppit_output_ParentageAssignments.txt -- Main output file that gives false discovery rates for all offspring with the most likely parents\n"); printf("\tsnppit_output_BasicDataSummary.txt -- Basic information about the data that got read in.\n"); printf("\tsnppit_output_ChosenSMAXes.txt -- Information about the smax vectors used in the analysis.\n"); printf("\tsnppit_output_FDR_Summary.txt -- Offspring assigned to parents in each population, ranked by false discovery rate.\n"); printf("\tsnppit_output_PopSizesAnPiVectors.txt -- Information about the sizes of the populations and the expected fraction of different trios thereby implied.\n"); printf("\tsnppit_output_TrioPosteriors.txt -- Posterior probs for all non-excluded parent pairs of every offspring in the data file.\n"); printf("\n\n"); printf("Questions, etc.? Send them to [email protected]\n\n"); return(0); }
bool Pedigree::AutosomalCheck() { // Arrays indicating which alleles and homozygotes occur IntArray haplos, genos, counts, failedFamilies; bool fail = false; // For each marker ... for (int m = 0; m < markerCount; m++) { MarkerInfo * info = GetMarkerInfo(m); // Summary for marker int alleleCount = CountAlleles(m); int genoCount = alleleCount * (alleleCount + 1) / 2; // Initialize arrays haplos.Dimension(alleleCount + 1); haplos.Set(-1); genos.Dimension(genoCount + 1); genos.Set(-1); failedFamilies.Dimension(familyCount); failedFamilies.Zero(); counts.Dimension(alleleCount + 1); for (int f = 0; f < familyCount; f++) for (int i = families[f]->first; i <= families[f]->last; i++) if (!persons[i]->isFounder() && persons[i]->sibs[0] == persons[i]) { // This loop runs once per sibship Alleles fat = persons[i]->father->markers[m]; Alleles mot = persons[i]->mother->markers[m]; bool fgeno = fat.isKnown(); bool mgeno = mot.isKnown(); // Number of alleles, homozygotes and genotypes in this sibship int haplo = 0, h**o = 0, diplo = 0; // No. of different genotypes per allele counts.Zero(); // In general, there should be no more than 3 genotypes per allele bool too_many_genos = false; for (int j = 0; j < persons[i]->sibCount; j++) if (persons[i]->sibs[j]->isGenotyped(m)) { Alleles geno = persons[i]->sibs[j]->markers[m]; int fat1 = fat.hasAllele(geno.one); int fat2 = fat.hasAllele(geno.two); int mot1 = mot.hasAllele(geno.one); int mot2 = mot.hasAllele(geno.two); if ((fgeno && mgeno && !((fat1 && mot2) || (fat2 && mot1))) || (fgeno && !(fat1 || fat2)) || (mgeno && !(mot1 || mot2))) { printf("%s - Fam %s: Child %s [%s/%s] has ", (const char *) markerNames[m], (const char *) persons[i]->sibs[j]->famid, (const char *) persons[i]->sibs[j]->pid, (const char *) info->GetAlleleLabel(geno.one), (const char *) info->GetAlleleLabel(geno.two)); if (!fgeno || !mgeno) printf("%s [%s/%s]\n", fgeno ? "father" : "mother", (const char *) info->GetAlleleLabel(fgeno ? fat.one : mot.one), (const char *) info->GetAlleleLabel(fgeno ? fat.two : mot.two)); else printf("parents [%s/%s]*[%s/%s]\n", (const char *) info->GetAlleleLabel(fat.one), (const char *) info->GetAlleleLabel(fat.two), (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); fail = true; failedFamilies[f] = true; } else { if (haplos[geno.one] != i) { haplo++; haplos[geno.one] = i; }; if (haplos[geno.two] != i) { haplo++; haplos[geno.two] = i; }; int index = geno.SequenceCoded(); if (genos[index] != i) { genos[index] = i; diplo++; counts[geno.one]++; if (geno.isHomozygous()) h**o++; else counts[geno.two]++; if (counts[geno.one] > 2) too_many_genos = true; if (counts[geno.two] > 2) too_many_genos = true; } } } if (fgeno) { if (haplos[fat.one] != i) { haplo++; haplos[fat.one] = i; } if (haplos[fat.two] != i) { haplo++; haplos[fat.two] = i; } h**o += fat.isHomozygous(); } if (mgeno) { if (haplos[mot.one] != i) { haplo++; haplos[mot.one] = i; } if (haplos[mot.two] != i) { haplo++; haplos[mot.two] = i; } h**o += mot.isHomozygous(); } if (diplo > 4 || haplo + h**o > 4 || (haplo == 4 && too_many_genos)) { printf("%s - Fam %s: ", (const char *) markerNames[m], (const char *) persons[i]->famid); if (persons[i]->father->markers[m].isKnown()) printf("Father %s [%s/%s] has children [", (const char *) persons[i]->father->pid, (const char *) info->GetAlleleLabel(fat.one), (const char *) info->GetAlleleLabel(fat.two)); else if (persons[i]->mother->markers[m].isKnown()) printf("Mother %s [%s/%s] has children [", (const char *) persons[i]->mother->pid, (const char *) info->GetAlleleLabel(mot.one), (const char *) info->GetAlleleLabel(mot.two)); else printf("Couple %s * %s has children [", (const char *) persons[i]->mother->pid, (const char *) persons[i]->father->pid); for (int j = 0; j < persons[i]->sibCount; j++) printf("%s%s/%s", j == 0 ? "" : " ", (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].one), (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].two)); printf("]\n"); fail = true; failedFamilies[f] = true; } } for (int f = 0; f < familyCount; f++) if (!failedFamilies[f] && (families[f]->count > families[f]->founders + 1) && !families[f]->isNuclear()) fail |= !GenotypeList::EliminateGenotypes(*this, families[f], m); } if (fail) printf("\nMendelian inheritance errors detected\n"); return fail; }