// Processes complex 16-bit input samples into 16-bit output samples.
//
// Every input sample is combined with the sample that preceded it via
// multiply_conjugate_s16_s32(); the result is fed to angle_approx_0deg27(),
// scaled by ks16 and saturated to a signed 16-bit value. The sample preceding
// the first input is taken from z_, and the last input sample is written back
// to z_ so that consecutive calls continue from one another.
//
// src: input buffer; src.count samples are read starting at src.p.
// dst: output buffer; results are written starting at dst.p.
//
// The loop reads two input samples and writes two output samples per
// iteration, so src.count is expected to be even and dst must have room for
// src.count outputs. NOTE(review): an odd src.count would make the final
// iteration touch one element past the end of both buffers -- confirm that
// callers always pass an even count.
//
// Returns a buffer descriptor built from dst.p, src.count and
// src.sampling_rate.
buffer_s16_t FM::execute(
	const buffer_c16_t& src,
	const buffer_s16_t& dst
) {
	auto z = z_;

	// src_p must be a modifiable object: __SIMD32(src_p)++ advances it through
	// a cast, and doing that to a const-qualified pointer is undefined
	// behaviour (the compiler may treat the loop condition as invariant).
	auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		// Fetch two consecutive samples, each as one packed 32-bit word.
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		// Pair each sample with its predecessor (z holds the sample before s0).
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		const int32_t theta0_int = angle_approx_0deg27(t0) * ks16;
		const int32_t theta0_sat = __SSAT(theta0_int, 16);
		const int32_t theta1_int = angle_approx_0deg27(t1) * ks16;
		const int32_t theta1_sat = __SSAT(theta1_int, 16);
		// Pack both results into one 32-bit store: theta0 in the low halfword,
		// theta1 in the high halfword.
		*__SIMD32(dst_p)++ = __PKHBT(
			theta0_sat,
			theta1_sat,
			16
		);
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}
// Processes complex 16-bit input samples into floating-point output samples.
//
// Every input sample is combined with the sample that preceded it via
// multiply_conjugate_s16_s32(); the result is fed to angle_precise() and
// scaled by kf. The sample preceding the first input is taken from z_, and
// the last input sample is written back to z_ so that consecutive calls
// continue from one another.
//
// src: input buffer; src.count samples are read starting at src.p.
// dst: output buffer; results are written starting at dst.p.
//
// The loop reads two input samples and writes two output samples per
// iteration, so src.count is expected to be even and dst must have room for
// src.count outputs. NOTE(review): an odd src.count would make the final
// iteration touch one element past the end of both buffers -- confirm that
// callers always pass an even count.
//
// Returns a buffer descriptor built from dst.p, src.count and
// src.sampling_rate.
buffer_f32_t FM::execute(
	const buffer_c16_t& src,
	const buffer_f32_t& dst
) {
	auto z = z_;

	// src_p must be a modifiable object: __SIMD32(src_p)++ advances it through
	// a cast, and doing that to a const-qualified pointer is undefined
	// behaviour (the compiler may treat the loop condition as invariant).
	auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		// Fetch two consecutive samples, each as one packed 32-bit word.
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		// Pair each sample with its predecessor (z holds the sample before s0).
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		*(dst_p++) = angle_precise(t0) * kf;
		*(dst_p++) = angle_precise(t1) * kf;
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}
// Processes n complex samples from src into n 16-bit values in dst.
//
// Each sample is combined with the one before it through
// multiply_conjugate_s16_s32(); both components of the result are shifted
// right by 12 bits, passed to fxpt_atan2(), and the returned value is shifted
// right by one more bit before being stored. state->z1 supplies the sample
// that precedes src[0] and receives the last sample processed, so that
// successive calls chain together. Nothing is read or written when n <= 0.
void fm_demodulate_s16_s16(
	fm_demodulate_s16_s16_state_t* const state,
	const complex_s16_t* const src,
	int16_t* dst,
	int32_t n
) {
	// TODO: Gain compensation based on ratio of sampling rate and deviation?
	// Earlier sketch of that idea, kept for reference:
	//   const int32_t decimation_rate = 1;
	//   const float k = state->k * 4096.0f / decimation_rate;
	complex_s16_t previous = state->z1;
	for(int32_t index = 0; index < n; ++index) {
		const complex_s16_t current = src[index];
		const complex_s32_t product = multiply_conjugate_s16_s32(current, previous);
		previous = current;
		dst[index] = fxpt_atan2(product.q >> 12, product.i >> 12) >> 1;
	}
	state->z1 = previous;
}