// Construct the suffix array for the string S and store it into SA.
//
// N  - number of symbols in S.
// S  - input symbols; S[N] and S[N+1] must be set to zero (the triple sort
//      is keyed on S, S+1 and S+2 for sample positions up to N-1).
// SA - output; receives N+1 entries. SA[0] is always N, followed by the
//      N0 non-sample and N12 sample positions in the order produced by the
//      merge below (N0 + N12 == N).
//
// Scratch buffers are heap-allocated on every call (the function recurses on
// the renamed sample). The return type is void, so an allocation failure
// cannot be reported to the caller; the process is aborted instead of
// dereferencing a null pointer.
void suffix_array(int N, int *S, int *SA) {
	int N0 = (N+2)/3, N1 = (N+1)/3, N2 = N/3, N12 = N1+N2;
	int *smpl = malloc((N12+2) * sizeof *smpl);
	int *tmp = malloc((N12+2) * sizeof *tmp);
	int *SA12 = malloc((N12+2) * sizeof *SA12);
	int *rank = malloc((N+3) * sizeof *rank);
	int *S0 = malloc(N0 * sizeof *S0);

	// malloc(0) may legitimately return NULL, so S0 is only required to be
	// non-null when it actually has elements.
	if (!smpl || !tmp || !SA12 || !rank || (N0 > 0 && !S0)) abort();

	// Create sample (positions i % 3 == 1, then i % 3 == 2) and sort triples
	int t = 0;
	for (int i = 1; i < N; i+=3) smpl[t++] = i;
	for (int i = 2; i < N; i+=3) smpl[t++] = i;

	// Three passes, least significant symbol of the triple first; the
	// sorted positions end up in tmp.
	radixsort_pass(smpl, tmp, S+2, N12, N);
	radixsort_pass(tmp, smpl, S+1, N12, N);
	radixsort_pass(smpl, tmp, S, N12, N);

	// Rename triples: equal triples (per eq) share a name, names start at 1
	for (int i = 0; i < N12 + 2; i++) smpl[i] = 0;
	int names = 1;
	// With N <= 1 the sample is empty and tmp[0] was never written, so it
	// must not be read (it would be used as a store index into smpl).
	if (N12 > 0) smpl[global_to_sample(N,tmp[0])] = 1;
	for (int i = 1; i < N12; i++) {
		if (!eq(S, tmp[i], tmp[i-1])) names++;
		smpl[global_to_sample(N,tmp[i])] = names;
	}
	// Zero padding required by the recursive call (see the contract on S)
	smpl[N12] = smpl[N12+1] = 0;

	// Create suffix array of sample (SA12)
	if (names < N12) {
		// Some triples collide: sort the renamed sample recursively
		suffix_array(N12, smpl, SA12);
	} else {
		// All names are distinct, so the names themselves give the order
		SA12[0] = N12;
		for (int i = 1; i <= N12; i++)
			SA12[smpl[i-1]] = i-1;
	}

	// rank[x] = index within SA12 of the sample suffix at global position x
	for (int i = 0; i <= N12; i++)
		rank[sample_to_global(N,SA12[i])] = i;
	rank[N+1] = rank[N+2] = 0;
	t = 0;
	for (int i = 0; i < N; i+=3) S0[t++] = i;

	// Sort nonsample suffixes
	// i <= j <=> (S[i], rank[i+1]) <= (S[j], rank[j+1])
	radixsort_pass(S0, tmp, rank+1, N0, N+2);
	radixsort_pass(tmp, S0, S, N0, N+2);

	// Merge sample and non-sample suffixes.
	// SA12[0] is skipped (p2 starts at 1); its slot in SA is the leading N.
	int p1 = 0, p2 = 1, p = 1;
	SA[0] = N;
	while (p1 < N0 && p2 <= N12) {
		int j = S0[p1];
		int i = sample_to_global(N,SA12[p2]);
		// Do the comparison
		// i % 3 == 1:
		// i <= j <=> (S[i],rank[i+1]) <= (S[j],rank[j+1])
		// i % 3 == 2:
		// i <= j <=> (S[i],S[i+1],rank[i+2]) <= (S[j],S[j+1],rank[j+2])
		int sample_first;
		if (i % 3 == 1)
			sample_first = leq2(S[i], rank[i+1], S[j], rank[j+1]);
		else
			sample_first = leq3(S[i], S[i+1], rank[i+2],
					    S[j], S[j+1], rank[j+2]);

		if (sample_first) {
			SA[p++] = i;
			p2++;
		} else {
			SA[p++] = j;
			p1++;
		}
	}
	// Flush whichever list still has entries
	while (p1 < N0)
		SA[p++] = S0[p1++];
	while (p2 <= N12)
		SA[p++] = sample_to_global(N,SA12[p2++]);

	free(smpl);
	free(tmp);
	free(SA12);
	free(rank);
	free(S0);
}
/*
 * Compute the suffix array of s[0..length-1] and store it in suffix_array
 * (which receives exactly `length` entries).
 *
 * s          - input symbols. The triple sort and renaming read up to
 *              s[length + 2], so three extra entries must be readable;
 *              presumably they must be 0, as the recursive call pads its
 *              own input that way -- confirm against the callers.
 * max_val    - forwarded to radix_pass as the key bound (presumably the
 *              largest symbol value in s -- verify against radix_pass).
 * length     - number of symbols. Inputs shorter than 2 are handled up
 *              front: the merge below starts with t = L0 - L1, which for
 *              length == 1 already equals L02, so it would read the padding
 *              slot suffix_array12[L02] instead of a real sample.
 *
 * Scratch buffers are heap-allocated on every call (the function recurses
 * on the renamed sample). The return type is void, so an allocation failure
 * cannot be reported; the process is aborted instead of dereferencing a
 * null pointer.
 */
void compute_suffix_array(const size_t *const s, size_t *const suffix_array, const size_t length, const unsigned int max_val) {
	if (length < 2) {
		if (length == 1) suffix_array[0] = 0;
		return;
	}

	const size_t L0 = (length + 2) / 3, L1 = (length + 1) / 3, L2 = length / 3, L02 = L0 + L2;
	size_t i, j, label, p, t, k, c0, c1, c2;
	size_t *const s12 = malloc((L02 + 3) * sizeof *s12);
	size_t *const suffix_array12 = malloc((L02 + 3) * sizeof *suffix_array12);
	size_t *const s0 = malloc(L0 * sizeof *s0);
	size_t *const suffix_array0 = malloc(L0 * sizeof *suffix_array0);

	if (!s12 || !suffix_array12 || !s0 || !suffix_array0) abort();

	s12[L02] = s12[L02 + 1] = s12[L02 + 2] = 0;  /* padding with 0s */
	suffix_array12[L02] = suffix_array12[L02 + 1] = suffix_array12[L02 + 2] = 0;

	/* Collect positions with i % 3 != 0; when length % 3 == 1 the bound
	 * length + L0 - L1 also admits position `length` itself. */
	for (i = 0, j = 0; i < length + L0 - L1; ++i) {
		if (i % 3) s12[j++] = i;
	}

	/* Sort the triples, least significant symbol first; the result is in
	 * suffix_array12. */
	radix_pass(s12, suffix_array12, s + 2, L02, max_val);
	radix_pass(suffix_array12, s12, s + 1, L02, max_val);
	radix_pass(s12, suffix_array12, s, L02, max_val);

	/* Give each distinct triple a label (1-based). The first triple always
	 * opens a new label, so no sentinel value is needed for c0..c2. */
	label = 0;
	c0 = c1 = c2 = 0;
	for (i = 0; i < L02; ++i) {
		const size_t pos = suffix_array12[i];
		if (i == 0 || s[pos] != c0 || s[pos + 1] != c1 || s[pos + 2] != c2) {
			++label;
			c0 = s[pos];
			c1 = s[pos + 1];
			c2 = s[pos + 2];
		}
		/* positions with pos % 3 == 1 fill the left half of s12,
		 * the others the right half (offset L0) */
		if (1 == pos % 3) s12[pos / 3] = label;
		else s12[pos / 3 + L0] = label;
	}

	if (label < L02) {
		/* Labels are not unique: recurse on the label string, then turn
		 * the resulting order into ranks stored back in s12.
		 * NOTE(review): label is narrowed to unsigned int here, exactly
		 * as in the implicit conversion of the original call. */
		compute_suffix_array(s12, suffix_array12, L02, (unsigned int)label);
		for (i = 0; i < L02; ++i) s12[suffix_array12[i]] = i + 1;
	} else {
		/* Labels are unique, so they already are the ranks */
		for (i = 0; i < L02; ++i) suffix_array12[s12[i] - 1] = i;
	}

	/* Positions 3*x taken in suffix_array12 order, then one pass keyed on
	 * their first symbol. */
	for (i = 0, j = 0; i < L02; ++i) {
		if (suffix_array12[i] < L0) s0[j++] = 3 * suffix_array12[i];
	}
	radix_pass(s0, suffix_array0, s, L0, max_val);

	/* Merge suffix_array0 with the sample order; t starts at L0 - L1,
	 * skipping the first sample entry when length % 3 == 1. */
	for (p = 0, t = L0 - L1, k = 0; k < length; ++k) {
		const size_t r = suffix_array12[t];
		const int in_left_half = r < L0;
		i = in_left_half ? r * 3 + 1 : (r - L0) * 3 + 2;
		j = suffix_array0[p];
		if (in_left_half
			? leq2(s[i], s12[r + L0], s[j], s12[j / 3])
			: leq3(s[i], s[i + 1], s12[r - L0 + 1], s[j], s[j + 1], s12[j / 3 + L0])) {
			suffix_array[k] = i;
			++t;
			if (t == L02) {
				/* sample exhausted: copy the remaining suffix_array0 entries */
				for (++k; p < L0; ++p, ++k) suffix_array[k] = suffix_array0[p];
			}
		} else {
			suffix_array[k] = j;
			++p;
			if (p == L0) {
				/* suffix_array0 exhausted: copy the remaining sample entries */
				for (++k; t < L02; ++t, ++k) {
					const size_t q = suffix_array12[t];
					suffix_array[k] = (q < L0 ? q * 3 + 1 : (q - L0) * 3 + 2);
				}
			}
		}
	}

	free(s12);
	free(suffix_array12);
	free(suffix_array0);
	free(s0);
}