// Stores three consecutive DataReadU32() results at the current vertex
// buffer write pointer, then advances that pointer by 12 bytes.
void LOADERDECL Pos_ReadDirect_Float3()
{
	// The values are moved as raw 32-bit words; no floating-point math is involved.
	for (int word = 0; word < 3; ++word)
		((u32 *)VertexManager::s_pCurBufferPointer)[word] = DataReadU32();

	LOG_VTX();
	VertexManager::s_pCurBufferPointer += 12;
}
// Reads an index of type I, loads 16 bytes from the matching ARRAY_POSITION
// entry, permutes them with a byte-shuffle mask selected by `three`, and
// stores the result at the vertex buffer write pointer.
// NOTE(review): I and three are not declared in this function; presumably
// they are template parameters declared just above it.
void LOADERDECL Pos_ReadIndex_Float_SSSE3()
{
	auto const index = DataRead<I>();
	auto const byteOffset = index * arraystrides[ARRAY_POSITION];
	const u32* source = reinterpret_cast<const u32*>(cached_arraybases[ARRAY_POSITION] + byteOffset);

	GC_ALIGNED128(const __m128i raw = _mm_loadu_si128((__m128i*)source));
	GC_ALIGNED128(__m128i shuffled = _mm_shuffle_epi8(raw, three ? kMaskSwap32_3 : kMaskSwap32_2));

	// The store covers a full 16 bytes, while the pointer only moves past three floats.
	_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, shuffled);
	VertexManager::s_pCurBufferPointer += sizeof(float) * 3;
	LOG_VTX();
}
void Pos_ReadDirect()
{
	((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(T)DataRead<T>() * posScale;
	((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(T)DataRead<T>() * posScale;
	if (three)
		((float*)VertexManager::s_pCurBufferPointer)[2] = (float)(T)DataRead<T>() * posScale;
	else
		((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
	LOG_VTX();
	VertexManager::s_pCurBufferPointer += 12;
}
void Pos_ReadIndex_Float_SSSE3(int Index)
{
	if(Index < MaxSize)
	{
		const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
		const __m128i a = _mm_loadu_si128((__m128i*)pData);
		__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2);
		_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b);
		LOG_VTX();
		VertexManager::s_pCurBufferPointer += 12;
	}
}
// Emits exactly three values through a DataWriter: the first N are read as T
// from a DataReader and passed through PosScale() with posScale; the
// remaining ones are written as 0.f.
// NOTE(review): T and N are not declared in this function; presumably they
// are template parameters declared just above it.
void LOADERDECL Pos_ReadDirect()
{
	static_assert(N <= 3, "N > 3 is not sane!");
	auto const positionScale = posScale;
	DataWriter writer;
	DataReader reader;

	for (int component = 0; component < 3; ++component)
	{
		// Only components below N consume input.
		writer.Write(component < N ? PosScale(reader.Read<T>(), positionScale) : 0.f);
	}

	LOG_VTX();
}
// Emits exactly three values starting at g_vertex_manager_write_ptr: the
// first N are read as T starting at g_video_buffer_read_ptr and passed through
// PosScale() with loader->m_posScale; the remaining ones are written as 0.f.
// Both global pointers are updated from the helper objects afterwards.
// NOTE(review): T and N are not declared in this function; presumably they
// are template parameters declared just above it.
void LOADERDECL Pos_ReadDirect(VertexLoader* loader)
{
	static_assert(N <= 3, "N > 3 is not sane!");
	auto const positionScale = loader->m_posScale;
	// A DataReader is used for the output side as well; it is the object Write() is called on.
	DataReader output(g_vertex_manager_write_ptr, nullptr);
	DataReader input(g_video_buffer_read_ptr, nullptr);

	for (int component = 0; component < 3; ++component)
	{
		// Only components below N consume input.
		output.Write(component < N ? PosScale(input.Read<T>(), positionScale) : 0.f);
	}

	// Publish the advanced positions back to the globals.
	g_vertex_manager_write_ptr = output.GetPointer();
	g_video_buffer_read_ptr = input.GetPointer();
	LOG_VTX();
}
void Pos_ReadIndex_Float(int Index)
{
	if(Index < MaxSize)
	{
		const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
		((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
		((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
		if (three)
			((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
		else
			((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
		LOG_VTX();
		VertexManager::s_pCurBufferPointer += 12;
	}
}
inline void Pos_ReadIndex_Short(int Index)
{
	if(Index < MaxSize)
	{
		const u16* pData = (const u16 *)(cached_arraybases[ARRAY_POSITION] + ((u32)Index * arraystrides[ARRAY_POSITION]));
		((float*)VertexManager::s_pCurBufferPointer)[0] = ((float)(T)Common::swap16(pData[0])) * posScale;
		((float*)VertexManager::s_pCurBufferPointer)[1] = ((float)(T)Common::swap16(pData[1])) * posScale;
		if (three)
			((float*)VertexManager::s_pCurBufferPointer)[2] = ((float)(T)Common::swap16(pData[2])) * posScale;
		else
			((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
		LOG_VTX();
		VertexManager::s_pCurBufferPointer += 12;
	}
}
// Reads an index of type I, locates the matching ARRAY_POSITION entry and
// emits exactly three values through a DataWriter: the first N elements are
// passed through Common::FromBigEndian and PosScale() with posScale; the
// remaining ones are written as 0.f.
// NOTE(review): I, T and N are not declared in this function; presumably
// they are template parameters declared just above it.
void LOADERDECL Pos_ReadIndex()
{
	static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");
	static_assert(N <= 3, "N > 3 is not sane!");

	auto const index = DataRead<I>();
	auto const byteOffset = index * arraystrides[ARRAY_POSITION];
	auto const elements = reinterpret_cast<const T*>(cached_arraybases[ARRAY_POSITION] + byteOffset);
	auto const positionScale = posScale;
	DataWriter writer;

	for (int component = 0; component < 3; ++component)
	{
		// Only components below N are read from the array.
		writer.Write(component < N ? PosScale(Common::FromBigEndian(elements[component]), positionScale) : 0.f);
	}

	LOG_VTX();
}
// Reads an index of type I, records in loader->m_vertexSkip whether it equals
// the maximum value of I, and emits exactly three values starting at
// g_vertex_manager_write_ptr: the first N elements of the indexed
// ARRAY_POSITION entry are passed through Common::FromBigEndian and PosScale()
// with loader->m_posScale; the remaining ones are written as 0.f.
// NOTE(review): I, T and N are not declared in this function; presumably
// they are template parameters declared just above it.
void LOADERDECL Pos_ReadIndex(VertexLoader* loader)
{
	static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
	static_assert(N <= 3, "N > 3 is not sane!");

	auto const index = DataRead<I>();
	// The flag is set but the position is still written below.
	loader->m_vertexSkip = (index == std::numeric_limits<I>::max());

	auto const byteOffset = index * g_main_cp_state.array_strides[ARRAY_POSITION];
	auto const elements = reinterpret_cast<const T*>(cached_arraybases[ARRAY_POSITION] + byteOffset);
	auto const positionScale = loader->m_posScale;
	// A DataReader is used for the output side; it is the object Write() is called on.
	DataReader output(g_vertex_manager_write_ptr, nullptr);

	for (int component = 0; component < 3; ++component)
	{
		// Only components below N are read from the array.
		output.Write(component < N ? PosScale(Common::FromBigEndian(elements[component]), positionScale) : 0.f);
	}

	// Publish the advanced write position back to the global.
	g_vertex_manager_write_ptr = output.GetPointer();
	LOG_VTX();
}