/// The thread entry for the calculation of LD measure static void Entry_LD_Matrix(PdThread Thread, int ThreadIndex, void *Param) { double *base = (double*)Param; IdMatTri I = Array_Thread_MatIdx[ThreadIndex]; C_UInt8 *pGeno = &PackedGeno[0]; for (C_Int64 n=Array_Thread_MatCnt[ThreadIndex]; n > 0; n--) { const int i = I.Row(), j = I.Column(); double &p1 = base[i*NumSNP + j]; double &p2 = base[j*NumSNP + i]; C_UInt8 *s1 = pGeno + i*nPackedSamp; C_UInt8 *s2 = pGeno + j*nPackedSamp; switch (LD_Method) { case 1: p1 = p2 = PairComposite(s1, s2); break; case 2: p1 = p2 = PairR(s1, s2); break; case 3: p1 = p2 = PairDPrime(s1, s2); break; case 4: p1 = p2 = PairCorr(s1, s2); break; case 5: p1 = p2 = PairCov(s1, s2); break; default: p1 = p2 = R_NaN; } ++ I; } }
/// Compute IBD estimator in KING-h**o static void _Do_KING_Homo_Compute(int ThreadIndex, long Start, long SNP_Cnt, void* Param) { long Cnt = IBS_Thread_MatCnt[ThreadIndex]; IdMatTri I = IBS_Thread_MatIdx[ThreadIndex]; TKINGHomoFlag *p = ((TKINGHomoFlag*)Param) + I.Offset(); long _PackSNPLen = (SNP_Cnt / 4) + (SNP_Cnt % 4 ? 1 : 0); for (; Cnt > 0; Cnt--, ++I, p++) { UInt8 *p1 = GenoPacked.get() + I.Row()*_PackSNPLen; UInt8 *p2 = GenoPacked.get() + I.Column()*_PackSNPLen; for (long k=0; k < _PackSNPLen; k++, p1++, p2++) { size_t t = (size_t(*p1) << 8) | (*p2); p->IBS0 += IBS0_Num_SNP[t]; p->SumSq += Gen_KING_SqDiff[t]; UInt8 flag = Gen_Both_Valid[t]; if (flag & 0x01) { double f = GenoAlleleFreq.get()[4*k + 0]; p->SumAFreq += f; p->SumAFreq2 += f*f; } if (flag & 0x02) { double f = GenoAlleleFreq.get()[4*k + 1]; p->SumAFreq += f; p->SumAFreq2 += f*f; } if (flag & 0x04) { double f = GenoAlleleFreq.get()[4*k + 2]; p->SumAFreq += f; p->SumAFreq2 += f*f; } if (flag & 0x08) { double f = GenoAlleleFreq.get()[4*k + 3]; p->SumAFreq += f; p->SumAFreq2 += f*f; } } } }
/// Compute the pairwise IBS matrix static void _Do_IBS_Compute(int ThreadIndex, long Start, long SNP_Cnt, void* Param) { long Cnt = IBS_Thread_MatCnt[ThreadIndex]; IdMatTri I = IBS_Thread_MatIdx[ThreadIndex]; TIBS_Flag *p = ((TIBS_Flag*)Param) + I.Offset(); long _PackSNPLen = (SNP_Cnt / 4) + (SNP_Cnt % 4 ? 1 : 0); for (; Cnt > 0; Cnt--, ++I, p++) { UInt8 *p1 = GenoPacked.get() + I.Row()*_PackSNPLen; UInt8 *p2 = GenoPacked.get() + I.Column()*_PackSNPLen; for (long k=_PackSNPLen; k > 0; k--, p1++, p2++) { size_t t = (size_t(*p1) << 8) | (*p2); p->IBS0 += IBS0_Num_SNP[t]; p->IBS1 += IBS1_Num_SNP[t]; p->IBS2 += IBS2_Num_SNP[t]; } } }
/// Compute IBD estimator in KING-robust static void _Do_KING_Robust_Compute(int ThreadIndex, long Start, long SNP_Cnt, void* Param) { long Cnt = IBS_Thread_MatCnt[ThreadIndex]; IdMatTri I = IBS_Thread_MatIdx[ThreadIndex]; TKINGRobustFlag *p = ((TKINGRobustFlag*)Param) + I.Offset(); long _PackSNPLen = (SNP_Cnt / 4) + (SNP_Cnt % 4 ? 1 : 0); for (; Cnt > 0; Cnt--, ++I, p++) { UInt8 *p1 = GenoPacked.get() + I.Row()*_PackSNPLen; UInt8 *p2 = GenoPacked.get() + I.Column()*_PackSNPLen; for (long k=0; k < _PackSNPLen; k++, p1++, p2++) { size_t t = (size_t(*p1) << 8) | (*p2); p->IBS0 += IBS0_Num_SNP[t]; p->nLoci += Gen_KING_Num_Loci[t]; p->SumSq += Gen_KING_SqDiff[t]; p->N1_Aa += Gen_KING_N1_Aa[t]; p->N2_Aa += Gen_KING_N2_Aa[t]; } } }
/// Compute the covariate matrix static void _Do_Diss_Compute(int ThreadIndex, long Start, long SNP_Cnt, void* Param) { long Cnt = IBS_Thread_MatCnt[ThreadIndex]; IdMatTri I = IBS_Thread_MatIdx[ThreadIndex]; TDissflag *p = ((TDissflag*)Param) + I.Offset(); long _PackSNPLen = (SNP_Cnt / 4) + (SNP_Cnt % 4 ? 1 : 0); for (; Cnt > 0; Cnt--, ++I, p++) { UInt8 *p1 = GenoPacked.get() + I.Row()*_PackSNPLen; UInt8 *p2 = GenoPacked.get() + I.Column()*_PackSNPLen; for (long k=0; k < _PackSNPLen; k++, p1++, p2++) { size_t t = (size_t(*p1) << 8) | (*p2); p->SumGeno += Gen_Dist_SNP[t]; UInt8 flag = Gen_Both_Valid[t]; if (flag & 0x01) p->SumAFreq += GenoAlleleFreq.get()[4*k]; if (flag & 0x02) p->SumAFreq += GenoAlleleFreq.get()[4*k+1]; if (flag & 0x04) p->SumAFreq += GenoAlleleFreq.get()[4*k+2]; if (flag & 0x08) p->SumAFreq += GenoAlleleFreq.get()[4*k+3]; } } }
/// Accumulate IBS0, SumSq, SumAFreq and SumAFreq2 for the sample pairs
/// assigned to one thread.
///
/// \param i  thread index; selects the starting pair Array_Thread_MatIdx[i]
///           and the number of pairs Array_Thread_MatCnt[i]
/// \param n  not referenced in this function
///
/// Each sample occupies npack2 = 2*npack bytes starting at Base: two bit
/// arrays of npack bytes each, the second one located npack bytes after the
/// first. Results are added to ptrKING[I.Offset() ...], one record per pair.
///
/// NOTE(review): the loops consume 16 (SSE2) or 8 (scalar) bytes per step and
/// use aligned loads, so npack is presumably padded to a multiple of 16 and
/// the buffers 16-byte aligned -- confirm against the allocation code.
void thread_ibs_num(size_t i, size_t n)
{
	const size_t npack = nBlock >> 3;
	const size_t npack2 = npack * 2;
	C_UInt8 *Base = Geno1b.Get();
	IdMatTri I = Array_Thread_MatIdx[i];
	C_Int64 N = Array_Thread_MatCnt[i];
	TS_KINGHomo *p = ptrKING + I.Offset();
	for (; N > 0; N--, ++I, p++)
	{
		// first bit array of each sample; the second one is at +npack
		C_UInt8 *p1 = Base + I.Row() * npack2;
		C_UInt8 *p2 = Base + I.Column() * npack2;
		// per-position frequency terms, restarted from the beginning for every pair
		double *pAF = AF_1_AF.Get();
		double *pAF2 = AF_1_AF_2.Get();
		// remaining bytes of the bit array still to be processed
		ssize_t m = npack;
#if defined(COREARRAY_SIMD_SSE2)
	{
		// SSE2 path: 128 bit positions per iteration
		POPCNT_SSE2_HEAD
		__m128i ibs0_sum, sumsq_sum;
		ibs0_sum = sumsq_sum = _mm_setzero_si128();
		__m128d sq_sum, sq_sum2;
		sq_sum = sq_sum2 = _mm_setzero_pd();
		for (; m > 0; m-=16)
		{
			__m128i g1_1 = _mm_load_si128((__m128i*)p1);
			__m128i g1_2 = _mm_load_si128((__m128i*)(p1 + npack));
			__m128i g2_1 = _mm_load_si128((__m128i*)p2);
			__m128i g2_2 = _mm_load_si128((__m128i*)(p2 + npack));
			p1 += 16; p2 += 16;
			// mask: positions used for this pair (presumably those where
			// neither sample is missing -- confirm against the bit encoding)
			__m128i mask = (g1_1 | ~g1_2) & (g2_1 | ~g2_2);
			// ibs0: masked positions where both bit arrays differ between the samples
			__m128i ibs0 = (~((g1_1 ^ ~g2_1) | (g1_2 ^ ~g2_2))) & mask;
			__m128i het = ((g1_1 ^ g1_2) ^ (g2_1 ^ g2_2)) & mask;
			// NOTE(review): POPCNT_SSE2_RUN is defined elsewhere; it appears to
			// replace its argument with per-lane population counts -- confirm
			POPCNT_SSE2_RUN(ibs0)
			ibs0_sum = _mm_add_epi32(ibs0_sum, ibs0);
			POPCNT_SSE2_RUN(het)
			// SumSq contribution: count(het) + 4*count(ibs0), as in the scalar path
			sumsq_sum = _mm_add_epi32(_mm_add_epi32(sumsq_sum, het), _mm_slli_epi32(ibs0, 2));
			// m1 = low 64 bits of mask, m2 = high 64 bits (64-bit halves swapped by the shuffle)
			C_UInt64 m1 = _mm_cvtsi128_si64(mask);
			C_UInt64 m2 = _mm_cvtsi128_si64(_mm_shuffle_epi32(mask, _MM_SHUFFLE(1,0,3,2)));
			// consume the mask two bits at a time: bit 0 selects pAF[0] (low lane),
			// bit 1 selects pAF[1] (high lane); 32 steps cover 64 positions
			for (size_t k=32; k > 0; k--)
			{
				switch (m1 & 0x03)
				{
				case 3:
					sq_sum = _mm_add_pd(sq_sum, _mm_load_pd(pAF));
					sq_sum2 = _mm_add_pd(sq_sum2, _mm_load_pd(pAF2));
					break;
				case 1:
					sq_sum = _mm_add_pd(sq_sum, _mm_set_pd(0, pAF[0]));
					sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(0, pAF2[0]));
					break;
				case 2:
					sq_sum = _mm_add_pd(sq_sum, _mm_set_pd(pAF[1], 0));
					sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(pAF2[1], 0));
					break;
				}
				pAF += 2; pAF2 += 2;
				m1 >>= 2;
			}
			// same for the upper 64 positions
			for (size_t k=32; k > 0; k--)
			{
				switch (m2 & 0x03)
				{
				case 3:
					sq_sum = _mm_add_pd(sq_sum, _mm_load_pd(pAF));
					sq_sum2 = _mm_add_pd(sq_sum2, _mm_load_pd(pAF2));
					break;
				case 1:
					sq_sum = _mm_add_pd(sq_sum, _mm_set_pd(0, pAF[0]));
					sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(0, pAF2[0]));
					break;
				case 2:
					sq_sum = _mm_add_pd(sq_sum, _mm_set_pd(pAF[1], 0));
					sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(pAF2[1], 0));
					break;
				}
				pAF += 2; pAF2 += 2;
				m2 >>= 2;
			}
		}
		// fold the vector accumulators into the pair's record
		p->IBS0 += vec_sum_i32(ibs0_sum);
		p->SumSq += vec_sum_i32(sumsq_sum);
		p->SumAFreq += vec_sum_f64(sq_sum);
		p->SumAFreq2 += vec_sum_f64(sq_sum2);
	}
#else
		// scalar path: 64 bit positions per iteration
		for (; m > 0; m-=8)
		{
			C_UInt64 g1_1 = *((C_UInt64*)p1);
			C_UInt64 g1_2 = *((C_UInt64*)(p1 + npack));
			C_UInt64 g2_1 = *((C_UInt64*)p2);
			C_UInt64 g2_2 = *((C_UInt64*)(p2 + npack));
			p1 += 8; p2 += 8;
			// same bit formulas as the SSE2 path
			C_UInt64 mask = (g1_1 | ~g1_2) & (g2_1 | ~g2_2);
			C_UInt64 ibs0 = (~((g1_1 ^ ~g2_1) | (g1_2 ^ ~g2_2))) & mask;
			C_UInt64 het = ((g1_1 ^ g1_2) ^ (g2_1 ^ g2_2)) & mask;
			p->IBS0 += POPCNT_U64(ibs0);
			p->SumSq += POPCNT_U64(het) + POPCNT_U64(ibs0)*4;
			// add the frequency terms of every position whose mask bit is set
			double sum=0, sum2=0;
			for (size_t k=64; k > 0; k--)
			{
				if (mask & 0x01)
					{ sum += (*pAF); sum2 += (*pAF2); }
				pAF ++; pAF2 ++;
				mask >>= 1;
			}
			p->SumAFreq += sum;
			p->SumAFreq2 += sum2;
		}
#endif
	}
}