static void apply_buf(const plan *ego_, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
     plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
     INT i, j, ms = ego->ms, v = ego->v;
     INT batchsz = compute_batchsize(ego->r);
     R *buf;
     INT mb = 1, me = (ego->m+1) / 2;

     STACK_MALLOC(R *, buf, ego->r * batchsz * 2 * sizeof(R));

     for (i = 0; i < v; ++i, cr += ego->vs, ci += ego->vs) {
	  R *Rp = cr;
	  R *Ip = ci;
	  R *Rm = cr + ego->m * ms;
	  R *Im = ci + ego->m * ms;

	  cld0->apply((plan *) cld0, Rp, Ip, Rp, Ip);

	  for (j = mb; j + batchsz < me; j += batchsz) 
	       dobatch(ego, Rp, Ip, Rm, Im, j, j + batchsz, 0, buf);

	  dobatch(ego, Rp, Ip, Rm, Im, j, me, ego->extra_iter, buf);

	  cldm->apply((plan *) cldm, 
		      Rp + me * ms, Ip + me * ms,
		      Rp + me * ms, Ip + me * ms);

     }

     STACK_FREE(buf);
}
static void initSystem(LaplacianDeformModifierData *lmd, Object *ob, DerivedMesh *dm,
                       float (*vertexCos)[3], int numVerts)
{
	int i;
	int defgrp_index;
	int total_anchors;
	float wpaint;
	MDeformVert *dvert = NULL;
	MDeformVert *dv = NULL;
	LaplacianSystem *sys;
	if (isValidVertexGroup(lmd, ob, dm)) {
		int *index_anchors = MEM_mallocN(sizeof(int) * numVerts, __func__);  /* over-alloc */
		MFace *tessface;
		STACK_DECLARE(index_anchors);
		STACK_INIT(index_anchors);

		modifier_get_vgroup(ob, dm, lmd->anchor_grp_name, &dvert, &defgrp_index);
		BLI_assert(dvert != NULL);
		dv = dvert;
		for (i = 0; i < numVerts; i++) {
			wpaint = defvert_find_weight(dv, defgrp_index);
			dv++;
			if (wpaint > 0.0f) {
				STACK_PUSH(index_anchors, i);
			}
		}
		DM_ensure_tessface(dm);
		total_anchors = STACK_SIZE(index_anchors);
		lmd->cache_system = initLaplacianSystem(numVerts, dm->getNumEdges(dm), dm->getNumTessFaces(dm),
		                                       total_anchors, lmd->anchor_grp_name, lmd->repeat);
		sys = (LaplacianSystem *)lmd->cache_system;
		memcpy(sys->index_anchors, index_anchors, sizeof(int) * total_anchors);
		memcpy(sys->co, vertexCos, sizeof(float[3]) * numVerts);
		MEM_freeN(index_anchors);
		STACK_FREE(index_anchors);
		lmd->vertexco = MEM_mallocN(sizeof(float[3]) * numVerts, "ModDeformCoordinates");
		memcpy(lmd->vertexco, vertexCos, sizeof(float[3]) * numVerts);
		lmd->total_verts = numVerts;

		createFaceRingMap(
		            dm->getNumVerts(dm), dm->getTessFaceArray(dm), dm->getNumTessFaces(dm),
		            &sys->ringf_map, &sys->ringf_indices);
		createVertRingMap(
		            dm->getNumVerts(dm), dm->getEdgeArray(dm), dm->getNumEdges(dm),
		            &sys->ringv_map, &sys->ringv_indices);


		tessface = dm->getTessFaceArray(dm);

		for (i = 0; i < sys->total_faces; i++) {
			memcpy(&sys->faces[i], &tessface[i].v1, sizeof(*sys->faces));
		}
	}
}
static void apply_slow(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     int n = ego->n, m = ego->m;
     R buf[4];
     int move_size = (n + m) / 2;
     char *move;

     UNUSED(ii); UNUSED(ro); UNUSED(io);
     STACK_MALLOC(char *, move, move_size);
     X(transpose_slow)(ri + ego->offset, n, m, 2, move, move_size, buf);
     STACK_FREE(move);
}
Example #4
0
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe)
{
     if (XM(idle_process)(sz, k, which_pe))
	  return 0;
     else {
	  int i;
	  INT N = 1, *coords;
	  STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk);
	  XM(block_coords)(sz, k, which_pe, coords);
	  for (i = 0; i < sz->rnk; ++i)
	       N *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]);
	  STACK_FREE(coords);
	  return N;
     }
}
Example #5
0
void
syntax_free (void)
{
  STACK_FREE (U8);
  STACK_FREE (props);
}
Example #6
0
void
jsp_early_error_free (void)
{
  STACK_FREE (size_t_stack);
  STACK_FREE (props);
}
Example #7
0
/* Distribute a loop from 0 to loopmax-1 over nthreads threads.
   proc(d) is called to execute a block of iterations from d->min
   to d->max-1.  d->thr_num indicate the number of the thread
   that is executing proc (from 0 to nthreads-1), and d->data is
   the same as the data parameter passed to X(spawn_loop).

   This function returns only after all the threads have completed. */
void X(spawn_loop)(int loopmax, int nthr,
		   spawn_function proc, void *data)
{
     int block_size;

     A(loopmax >= 0);
     A(nthr > 0);
     A(proc);

     if (!loopmax) return;

     /* Choose the block size and number of threads in order to (1)
        minimize the critical path and (2) use the fewest threads that
        achieve the same critical path (to minimize overhead).
        e.g. if loopmax is 5 and nthr is 4, we should use only 3
        threads with block sizes of 2, 2, and 1. */
     block_size = (loopmax + nthr - 1) / nthr;
     nthr = (loopmax + block_size - 1) / block_size;

     if (nthr <= 1) {
	  spawn_data d;
	  d.min = 0; d.max = loopmax;
	  d.thr_num = 0;
	  d.data = data;
	  proc(&d);
     }
     else {
#if defined(USING_COMPILER_THREADS)
	  spawn_data d;
#elif defined(HAVE_SEMAPHORES)
	  spawn_data d;
	  worker_data *w;
#else
	  spawn_data *d;
	  fftw_thr_id *tid;
#endif
	  int i;
	  
	  THREAD_ON; /* prevent debugging mode from failing under threads */

#if defined(USING_COMPILER_THREADS)
	  
#if defined(USING_SGIMP_THREADS)
#pragma parallel local(d,i)
	  {
#pragma pfor iterate(i=0; nthr; 1)
#elif defined(USING_OPENMP_THREADS)
#pragma omp parallel for private(d)
#endif	
	       for (i = 0; i < nthr; ++i) {
		    d.max = (d.min = i * block_size) + block_size;
                    if (d.max > loopmax)
                         d.max = loopmax;
		    d.thr_num = i;
		    d.data = data;
		    proc(&d);
	       }
#if defined(USING_SGIMP_THREADS)
	  }
#endif

#elif defined(HAVE_SEMAPHORES)

	  --nthr;
	  for (w = workers, i = 0; i < nthr; ++i) {
	       A(w);
	       w->d.max = (w->d.min = i * block_size) + block_size;
	       w->d.thr_num = i;
	       w->d.data = data;
	       w->proc = proc;
	       fftw_sem_post(&w->sid_ready);
	       w = w->next;
	  }
	  d.min = i * block_size;
	  d.max = loopmax;
	  d.thr_num = i;
	  d.data = data;
	  proc(&d);

	  for (w = workers, i = 0; i < nthr; ++i) {
	       A(w);
	       fftw_sem_wait(&w->sid_done);
	       w = w->next;
	  }

#else /* explicit thread spawning: */

	  STACK_MALLOC(spawn_data *, d, sizeof(spawn_data) * nthr);
	  STACK_MALLOC(fftw_thr_id *, tid, sizeof(fftw_thr_id) * (--nthr));

	  for (i = 0; i < nthr; ++i) {
	       d[i].max = (d[i].min = i * block_size) + block_size;
	       d[i].thr_num = i;
	       d[i].data = data;
	       fftw_thr_spawn(&tid[i], (fftw_thr_function) proc,
			      (void *) (d + i));
	  }
	  d[i].min = i * block_size;
	  d[i].max = loopmax;
	  d[i].thr_num = i;
	  d[i].data = data;
	  proc(&d[i]);
	  
	  for (i = 0; i < nthr; ++i)
	       fftw_thr_wait(tid[i]);

	  STACK_FREE(tid);
	  STACK_FREE(d);

#endif /* ! USING_COMPILER_THREADS */

	  THREAD_OFF; /* prevent debugging mode from failing under threads */
     }
}
Example #8
0
void NAME(char *UPLO, char *TRANS, char *DIAG,
	   blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blasint n    = *N;
  blasint lda  = *LDA;
  blasint incx = *INCX;

  blasint info;
  int uplo;
  int unit;
  int trans, buffer_size;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  unit  = -1;
  uplo  = -1;

  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 2;
  if (trans_arg == 'C') trans = 3;

  if (diag_arg  == 'U') unit  = 0;
  if (diag_arg  == 'N') unit  = 1;

  if (uplo_arg  == 'U') uplo  = 0;
  if (uplo_arg  == 'L') uplo  = 1;

  info = 0;

  if (incx == 0)          info =  8;
  if (lda  < MAX(1, n))   info =  6;
  if (n < 0)              info =  4;
  if (unit  < 0)          info =  3;
  if (trans < 0)          info =  2;
  if (uplo  < 0)          info =  1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
	   blasint n, FLOAT  *a, blasint lda, FLOAT  *x, blasint incx) {

  int trans, uplo, unit, buffer_size;
  blasint info;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_CNAME;

  unit  = -1;
  uplo  = -1;
  trans = -1;
  info  =  0;

  if (order == CblasColMajor) {
    if (Uplo == CblasUpper)         uplo  = 0;
    if (Uplo == CblasLower)         uplo  = 1;

    if (TransA == CblasNoTrans)     trans = 0;
    if (TransA == CblasTrans)       trans = 1;
    if (TransA == CblasConjNoTrans) trans = 2;
    if (TransA == CblasConjTrans)   trans = 3;

    if (Diag == CblasUnit)          unit  = 0;
    if (Diag == CblasNonUnit)       unit  = 1;

    info = -1;

    if (incx == 0)          info =  8;
    if (lda  < MAX(1, n))   info =  6;
    if (n < 0)              info =  4;
    if (unit  < 0)          info =  3;
    if (trans < 0)          info =  2;
    if (uplo  < 0)          info =  1;
  }

  if (order == CblasRowMajor) {
    if (Uplo == CblasUpper)         uplo  = 1;
    if (Uplo == CblasLower)         uplo  = 0;

    if (TransA == CblasNoTrans)     trans = 1;
    if (TransA == CblasTrans)       trans = 0;
    if (TransA == CblasConjNoTrans) trans = 3;
    if (TransA == CblasConjTrans)   trans = 2;

    if (Diag == CblasUnit)          unit  = 0;
    if (Diag == CblasNonUnit)       unit  = 1;

    info = -1;

    if (incx == 0)          info =  8;
    if (lda  < MAX(1, n))   info =  6;
    if (n < 0)              info =  4;
    if (unit  < 0)          info =  3;
    if (trans < 0)          info =  2;
    if (uplo  < 0)          info =  1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  if (n == 0) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  if (incx < 0 ) x -= (n - 1) * incx * 2;

#ifdef SMP
  // Calibrated on a Xeon E5-2630
  if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
    nthreads = num_cpu_avail(2);
    if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
      nthreads = 2;
  } else
      nthreads = 1;

  if(nthreads > 1) {
    buffer_size = n > 16 ? 0 : n * 4 + 40;
  }
  else
#endif
  {
    buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
    if(incx != 1)
      buffer_size += n * 2;
  }
  STACK_ALLOC(buffer_size, FLOAT, buffer);

#ifdef SMP
  if (nthreads == 1) {
#endif

  (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);

#ifdef SMP
  } else {

    (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads);

  }
#endif

  STACK_FREE(buffer);

  FUNCTION_PROFILE_END(4, n * n / 2 + n,  n * n);

  IDEBUG_END;

  return;
}
Example #9
0
static DerivedMesh *applyModifier(
        ModifierData *md, Object *ob,
        DerivedMesh *dm,
        ModifierApplyFlag UNUSED(flag))
{
	unsigned int i;
	DerivedMesh *result;
	const SolidifyModifierData *smd = (SolidifyModifierData *) md;

	MVert *mv, *mvert, *orig_mvert;
	MEdge *ed, *medge, *orig_medge;
	MLoop *ml, *mloop, *orig_mloop;
	MPoly *mp, *mpoly, *orig_mpoly;
	const unsigned int numVerts = (unsigned int)dm->getNumVerts(dm);
	const unsigned int numEdges = (unsigned int)dm->getNumEdges(dm);
	const unsigned int numFaces = (unsigned int)dm->getNumPolys(dm);
	const unsigned int numLoops = (unsigned int)dm->getNumLoops(dm);
	unsigned int newLoops = 0, newFaces = 0, newEdges = 0;

	/* only use material offsets if we have 2 or more materials  */
	const short mat_nr_max = ob->totcol > 1 ? ob->totcol - 1 : 0;
	const short mat_ofs = mat_nr_max ? smd->mat_ofs : 0;
	const short mat_ofs_rim = mat_nr_max ? smd->mat_ofs_rim : 0;

	/* use for edges */
	/* over-alloc new_vert_arr, old_vert_arr */
	unsigned int *new_vert_arr = NULL;
	STACK_DECLARE(new_vert_arr);

	unsigned int *new_edge_arr = NULL;
	STACK_DECLARE(new_edge_arr);

	unsigned int *old_vert_arr = MEM_callocN(sizeof(*old_vert_arr) * (size_t)numVerts, "old_vert_arr in solidify");

	unsigned int *edge_users = NULL;
	char *edge_order = NULL;

	float (*vert_nors)[3] = NULL;
	float (*face_nors)[3] = NULL;

	const bool need_face_normals = (smd->flag & MOD_SOLIDIFY_NORMAL_CALC) || (smd->flag & MOD_SOLIDIFY_EVEN);

	const float ofs_orig = -(((-smd->offset_fac + 1.0f) * 0.5f) * smd->offset);
	const float ofs_new  = smd->offset + ofs_orig;
	const float offset_fac_vg = smd->offset_fac_vg;
	const float offset_fac_vg_inv = 1.0f - smd->offset_fac_vg;
	const bool do_flip = (smd->flag & MOD_SOLIDIFY_FLIP) != 0;
	const bool do_clamp = (smd->offset_clamp != 0.0f);

	/* weights */
	MDeformVert *dvert, *dv = NULL;
	const int defgrp_invert = ((smd->flag & MOD_SOLIDIFY_VGROUP_INV) != 0);
	int defgrp_index;

	modifier_get_vgroup(ob, dm, smd->defgrp_name, &dvert, &defgrp_index);

	orig_mvert = dm->getVertArray(dm);
	orig_medge = dm->getEdgeArray(dm);
	orig_mloop = dm->getLoopArray(dm);
	orig_mpoly = dm->getPolyArray(dm);

	if (need_face_normals) {
		/* calculate only face normals */
		face_nors = MEM_mallocN(sizeof(*face_nors) * (size_t)numFaces, __func__);
		BKE_mesh_calc_normals_poly(
		            orig_mvert, (int)numVerts,
		            orig_mloop, orig_mpoly,
		            (int)numLoops, (int)numFaces,
		            face_nors, true);
	}

	STACK_INIT(new_vert_arr);
	STACK_INIT(new_edge_arr);

	if (smd->flag & MOD_SOLIDIFY_RIM) {
		BLI_bitmap *orig_mvert_tag = BLI_BITMAP_NEW(numVerts, __func__);
		unsigned int eidx;

#define INVALID_UNUSED ((unsigned int)-1)
#define INVALID_PAIR ((unsigned int)-2)

		new_vert_arr = MEM_mallocN(sizeof(*new_vert_arr) * (size_t)(numVerts * 2), __func__);
		new_edge_arr = MEM_mallocN(sizeof(*new_edge_arr) * (size_t)((numEdges * 2) + numVerts), __func__);

		edge_users = MEM_mallocN(sizeof(*edge_users) * (size_t)numEdges, "solid_mod edges");
		edge_order = MEM_mallocN(sizeof(*edge_order) * (size_t)numEdges, "solid_mod eorder");


		/* save doing 2 loops here... */
#if 0
		fill_vn_i(edge_users, numEdges, INVALID_UNUSED);
#endif

		for (eidx = 0, ed = orig_medge; eidx < numEdges; eidx++, ed++) {
			edge_users[eidx] = INVALID_UNUSED;
		}

		for (i = 0, mp = orig_mpoly; i < numFaces; i++, mp++) {
			MLoop *ml_prev;
			int j;

			ml = orig_mloop + mp->loopstart;
			ml_prev = ml + (mp->totloop - 1);

			for (j = 0; j < mp->totloop; j++, ml++) {
				/* add edge user */
				eidx = ml_prev->e;
				if (edge_users[eidx] == INVALID_UNUSED) {
					ed = orig_medge + eidx;
					BLI_assert(ELEM(ml_prev->v,    ed->v1, ed->v2) &&
					           ELEM(ml->v, ed->v1, ed->v2));
					edge_users[eidx] = (ml_prev->v > ml->v) == (ed->v1 < ed->v2) ? i : (i + numFaces);
					edge_order[eidx] = j;
				}
				else {
					edge_users[eidx] = INVALID_PAIR;
				}
				ml_prev = ml;
			}
		}

		for (eidx = 0, ed = orig_medge; eidx < numEdges; eidx++, ed++) {
			if (!ELEM(edge_users[eidx], INVALID_UNUSED, INVALID_PAIR)) {
				BLI_BITMAP_SET(orig_mvert_tag, ed->v1);
				BLI_BITMAP_SET(orig_mvert_tag, ed->v2);
				STACK_PUSH(new_edge_arr, eidx);
				newFaces++;
				newLoops += 4;
			}
		}

#undef INVALID_UNUSED
#undef INVALID_PAIR

		for (i = 0; i < numVerts; i++) {
			if (BLI_BITMAP_GET(orig_mvert_tag, i)) {
				old_vert_arr[i] = STACK_SIZE(new_vert_arr);
				STACK_PUSH(new_vert_arr, i);
				newEdges++;
			}
		}

		MEM_freeN(orig_mvert_tag);
	}

	if (smd->flag & MOD_SOLIDIFY_NORMAL_CALC) {
		vert_nors = MEM_callocN(sizeof(float) * (size_t)numVerts * 3, "mod_solid_vno_hq");
		dm_calc_normal(dm, face_nors, vert_nors);
	}

	result = CDDM_from_template(dm,
	                            (int)(numVerts * 2),
	                            (int)((numEdges * 2) + newEdges), 0,
	                            (int)((numLoops * 2) + newLoops),
	                            (int)((numFaces * 2) + newFaces));

	mpoly = CDDM_get_polys(result);
	mloop = CDDM_get_loops(result);
	medge = CDDM_get_edges(result);
	mvert = CDDM_get_verts(result);

	DM_copy_edge_data(dm, result, 0, 0, (int)numEdges);
	DM_copy_edge_data(dm, result, 0, (int)numEdges, (int)numEdges);

	DM_copy_vert_data(dm, result, 0, 0, (int)numVerts);
	DM_copy_vert_data(dm, result, 0, (int)numVerts, (int)numVerts);

	DM_copy_loop_data(dm, result, 0, 0, (int)numLoops);
	DM_copy_loop_data(dm, result, 0, (int)numLoops, (int)numLoops);

	DM_copy_poly_data(dm, result, 0, 0, (int)numFaces);
	DM_copy_poly_data(dm, result, 0, (int)numFaces, (int)numFaces);

	/* flip normals */
	mp = mpoly + numFaces;
	for (i = 0; i < dm->numPolyData; i++, mp++) {
		MLoop *ml2;
		unsigned int e;
		int j;

		ml2 = mloop + mp->loopstart + dm->numLoopData;
		for (j = 0; j < mp->totloop; j++) {
			CustomData_copy_data(&dm->loopData, &result->loopData, mp->loopstart + j,
			                     mp->loopstart + (mp->totloop - j - 1) + dm->numLoopData, 1);
		}

		if (mat_ofs) {
			mp->mat_nr += mat_ofs;
			CLAMP(mp->mat_nr, 0, mat_nr_max);
		}

		e = ml2[0].e;
		for (j = 0; j < mp->totloop - 1; j++) {
			ml2[j].e = ml2[j + 1].e;
		}
		ml2[mp->totloop - 1].e = e;

		mp->loopstart += dm->numLoopData;

		for (j = 0; j < mp->totloop; j++) {
			ml2[j].e += numEdges;
			ml2[j].v += numVerts;
		}
	}

	for (i = 0, ed = medge + numEdges; i < numEdges; i++, ed++) {
		ed->v1 += numVerts;
		ed->v2 += numVerts;
	}

	/* note, copied vertex layers don't have flipped normals yet. do this after applying offset */
	if ((smd->flag & MOD_SOLIDIFY_EVEN) == 0) {
		/* no even thickness, very simple */
		float scalar_short;
		float scalar_short_vgroup;

		/* for clamping */
		float *vert_lens = NULL;
		const float offset    = fabsf(smd->offset) * smd->offset_clamp;
		const float offset_sq = offset * offset;

		if (do_clamp) {
			vert_lens = MEM_mallocN(sizeof(float) * numVerts, "vert_lens");
			fill_vn_fl(vert_lens, (int)numVerts, FLT_MAX);
			for (i = 0; i < numEdges; i++) {
				const float ed_len_sq = len_squared_v3v3(mvert[medge[i].v1].co, mvert[medge[i].v2].co);
				vert_lens[medge[i].v1] = min_ff(vert_lens[medge[i].v1], ed_len_sq);
				vert_lens[medge[i].v2] = min_ff(vert_lens[medge[i].v2], ed_len_sq);
			}
		}

		if (ofs_new != 0.0f) {
			scalar_short = scalar_short_vgroup = ofs_new / 32767.0f;
			mv = mvert + (((ofs_new >= ofs_orig) == do_flip) ? numVerts : 0);
			dv = dvert;
			for (i = 0; i < numVerts; i++, mv++) {
				if (dv) {
					if (defgrp_invert) scalar_short_vgroup = 1.0f - defvert_find_weight(dv, defgrp_index);
					else scalar_short_vgroup = defvert_find_weight(dv, defgrp_index);
					scalar_short_vgroup = (offset_fac_vg + (scalar_short_vgroup * offset_fac_vg_inv)) * scalar_short;
					dv++;
				}
				if (do_clamp) {
					/* always reset becaise we may have set before */
					if (dv == NULL) {
						scalar_short_vgroup = scalar_short;
					}
					if (vert_lens[i] < offset_sq) {
						float scalar = sqrtf(vert_lens[i]) / offset;
						scalar_short_vgroup *= scalar;
					}
				}
				madd_v3v3short_fl(mv->co, mv->no, scalar_short_vgroup);
			}
		}

		if (ofs_orig != 0.0f) {
			scalar_short = scalar_short_vgroup = ofs_orig / 32767.0f;
			mv = mvert + (((ofs_new >= ofs_orig) == do_flip) ? 0 : numVerts); /* as above but swapped */
			dv = dvert;
			for (i = 0; i < numVerts; i++, mv++) {
				if (dv) {
					if (defgrp_invert) scalar_short_vgroup = 1.0f - defvert_find_weight(dv, defgrp_index);
					else scalar_short_vgroup = defvert_find_weight(dv, defgrp_index);
					scalar_short_vgroup = (offset_fac_vg + (scalar_short_vgroup * offset_fac_vg_inv)) * scalar_short;
					dv++;
				}
				if (do_clamp) {
					/* always reset becaise we may have set before */
					if (dv == NULL) {
						scalar_short_vgroup = scalar_short;
					}
					if (vert_lens[i] < offset_sq) {
						float scalar = sqrtf(vert_lens[i]) / offset;
						scalar_short_vgroup *= scalar;
					}
				}
				madd_v3v3short_fl(mv->co, mv->no, scalar_short_vgroup);
			}
		}

		if (do_clamp) {
			MEM_freeN(vert_lens);
		}
	}
	else {
#ifdef USE_NONMANIFOLD_WORKAROUND
		const bool check_non_manifold = (smd->flag & MOD_SOLIDIFY_NORMAL_CALC) != 0;
#endif
		/* same as EM_solidify() in editmesh_lib.c */
		float *vert_angles = MEM_callocN(sizeof(float) * numVerts * 2, "mod_solid_pair"); /* 2 in 1 */
		float *vert_accum = vert_angles + numVerts;
		unsigned int vidx;

		if (vert_nors == NULL) {
			vert_nors = MEM_mallocN(sizeof(float) * numVerts * 3, "mod_solid_vno");
			for (i = 0, mv = mvert; i < numVerts; i++, mv++) {
				normal_short_to_float_v3(vert_nors[i], mv->no);
			}
		}

		for (i = 0, mp = mpoly; i < numFaces; i++, mp++) {
			/* #BKE_mesh_calc_poly_angles logic is inlined here */
			float nor_prev[3];
			float nor_next[3];

			int i_curr = mp->totloop - 1;
			int i_next = 0;

			ml = &mloop[mp->loopstart];

			sub_v3_v3v3(nor_prev, mvert[ml[i_curr - 1].v].co, mvert[ml[i_curr].v].co);
			normalize_v3(nor_prev);

			while (i_next < mp->totloop) {
				float angle;
				sub_v3_v3v3(nor_next, mvert[ml[i_curr].v].co, mvert[ml[i_next].v].co);
				normalize_v3(nor_next);
				angle = angle_normalized_v3v3(nor_prev, nor_next);


				/* --- not related to angle calc --- */
				if (angle < FLT_EPSILON) {
					angle = FLT_EPSILON;
				}

				vidx = ml[i_curr].v;
				vert_accum[vidx] += angle;

#ifdef USE_NONMANIFOLD_WORKAROUND
				/* skip 3+ face user edges */
				if ((check_non_manifold == false) ||
				    LIKELY(((orig_medge[ml[i_curr].e].flag & ME_EDGE_TMP_TAG) == 0) &&
				           ((orig_medge[ml[i_next].e].flag & ME_EDGE_TMP_TAG) == 0)))
				{
					vert_angles[vidx] += shell_angle_to_dist(angle_normalized_v3v3(vert_nors[vidx], face_nors[i])) * angle;
				}
				else {
					vert_angles[vidx] += angle;
				}
#else
				vert_angles[vidx] += shell_angle_to_dist(angle_normalized_v3v3(vert_nors[vidx], face_nors[i])) * angle;
#endif
				/* --- end non-angle-calc section --- */


				/* step */
				copy_v3_v3(nor_prev, nor_next);
				i_curr = i_next;
				i_next++;
			}
		}

		/* vertex group support */
		if (dvert) {
			float scalar;

			dv = dvert;
			if (defgrp_invert) {
				for (i = 0; i < numVerts; i++, dv++) {
					scalar = 1.0f - defvert_find_weight(dv, defgrp_index);
					scalar = offset_fac_vg + (scalar * offset_fac_vg_inv);
					vert_angles[i] *= scalar;
				}
			}
			else {
				for (i = 0; i < numVerts; i++, dv++) {
					scalar = defvert_find_weight(dv, defgrp_index);
					scalar = offset_fac_vg + (scalar * offset_fac_vg_inv);
					vert_angles[i] *= scalar;
				}
			}
		}

		if (do_clamp) {
			float *vert_lens_sq = MEM_callocN(sizeof(float) * numVerts, "vert_lens");
			const float offset    = fabsf(smd->offset) * smd->offset_clamp;
			const float offset_sq = offset * offset;
			fill_vn_fl(vert_lens_sq, (int)numVerts, FLT_MAX);
			for (i = 0; i < numEdges; i++) {
				const float ed_len = len_squared_v3v3(mvert[medge[i].v1].co, mvert[medge[i].v2].co);
				vert_lens_sq[medge[i].v1] = min_ff(vert_lens_sq[medge[i].v1], ed_len);
				vert_lens_sq[medge[i].v2] = min_ff(vert_lens_sq[medge[i].v2], ed_len);
			}
			for (i = 0; i < numVerts; i++) {
				if (vert_lens_sq[i] < offset_sq) {
					float scalar = sqrtf(vert_lens_sq[i]) / offset;
					vert_angles[i] *= scalar;
				}
			}
			MEM_freeN(vert_lens_sq);
		}

		if (ofs_new) {
			mv = mvert + (((ofs_new >= ofs_orig) == do_flip) ? numVerts : 0);

			for (i = 0; i < numVerts; i++, mv++) {
				if (vert_accum[i]) { /* zero if unselected */
					madd_v3_v3fl(mv->co, vert_nors[i], ofs_new * (vert_angles[i] / vert_accum[i]));
				}
			}
		}

		if (ofs_orig) {
			/* same as above but swapped, intentional use of 'ofs_new' */
			mv = mvert + (((ofs_new >= ofs_orig) == do_flip) ? 0 : numVerts);

			for (i = 0; i < numVerts; i++, mv++) {
				if (vert_accum[i]) { /* zero if unselected */
					madd_v3_v3fl(mv->co, vert_nors[i], ofs_orig * (vert_angles[i] / vert_accum[i]));
				}
			}
		}

		MEM_freeN(vert_angles);
	}

	if (vert_nors)
		MEM_freeN(vert_nors);

	/* must recalculate normals with vgroups since they can displace unevenly [#26888] */
	if ((dm->dirty & DM_DIRTY_NORMALS) || (smd->flag & MOD_SOLIDIFY_RIM) || dvert) {
		result->dirty |= DM_DIRTY_NORMALS;
	}
	else {
		/* flip vertex normals for copied verts */
		mv = mvert + numVerts;
		for (i = 0; i < numVerts; i++, mv++) {
			negate_v3_short(mv->no);
		}
	}

	if (smd->flag & MOD_SOLIDIFY_RIM) {

		/* bugger, need to re-calculate the normals for the new edge faces.
		 * This could be done in many ways, but probably the quickest way
		 * is to calculate the average normals for side faces only.
		 * Then blend them with the normals of the edge verts.
		 *
		 * at the moment its easiest to allocate an entire array for every vertex,
		 * even though we only need edge verts - campbell
		 */

#define SOLIDIFY_SIDE_NORMALS

#ifdef SOLIDIFY_SIDE_NORMALS
		const bool do_side_normals = !(result->dirty & DM_DIRTY_NORMALS);
		/* annoying to allocate these since we only need the edge verts, */
		float (*edge_vert_nos)[3] = do_side_normals ? MEM_callocN(sizeof(float) * numVerts * 3, __func__) : NULL;
		float nor[3];
#endif
		const unsigned char crease_rim = smd->crease_rim * 255.0f;
		const unsigned char crease_outer = smd->crease_outer * 255.0f;
		const unsigned char crease_inner = smd->crease_inner * 255.0f;

		int *origindex_edge;
		int *orig_ed;
		unsigned int j;

		if (crease_rim || crease_outer || crease_inner) {
			result->cd_flag |= ME_CDFLAG_EDGE_CREASE;
		}

		/* add faces & edges */
		origindex_edge = result->getEdgeDataArray(result, CD_ORIGINDEX);
		ed = &medge[numEdges * 2];
		orig_ed = &origindex_edge[numEdges * 2];
		for (i = 0; i < newEdges; i++, ed++, orig_ed++) {
			ed->v1 = new_vert_arr[i];
			ed->v2 = new_vert_arr[i] + numVerts;
			ed->flag |= ME_EDGEDRAW;

			*orig_ed = ORIGINDEX_NONE;

			if (crease_rim) {
				ed->crease = crease_rim;
			}
		}

		/* faces */
		mp = mpoly + (numFaces * 2);
		ml = mloop + (numLoops * 2);
		j = 0;
		for (i = 0; i < newFaces; i++, mp++) {
			unsigned int eidx = new_edge_arr[i];
			unsigned int fidx = edge_users[eidx];
			int k1, k2;
			bool flip;

			if (fidx >= numFaces) {
				fidx -= numFaces;
				flip = true;
			}
			else {
				flip = false;
			}

			ed = medge + eidx;

			/* copy most of the face settings */
			DM_copy_poly_data(dm, result, (int)fidx, (int)((numFaces * 2) + i), 1);
			mp->loopstart = (int)(j + numLoops * 2);
			mp->flag = mpoly[fidx].flag;

			/* notice we use 'mp->totloop' which is later overwritten,
			 * we could lookup the original face but theres no point since this is a copy
			 * and will have the same value, just take care when changing order of assignment */
			k1 = mpoly[fidx].loopstart + (((edge_order[eidx] - 1) + mp->totloop) % mp->totloop);  /* prev loop */
			k2 = mpoly[fidx].loopstart +   (edge_order[eidx]);

			mp->totloop = 4;

			CustomData_copy_data(&dm->loopData, &result->loopData, k2, (int)(numLoops * 2 + j + 0), 1);
			CustomData_copy_data(&dm->loopData, &result->loopData, k1, (int)(numLoops * 2 + j + 1), 1);
			CustomData_copy_data(&dm->loopData, &result->loopData, k1, (int)(numLoops * 2 + j + 2), 1);
			CustomData_copy_data(&dm->loopData, &result->loopData, k2, (int)(numLoops * 2 + j + 3), 1);

			if (flip == FALSE) {
				ml[j].v = ed->v1;
				ml[j++].e = eidx;

				ml[j].v = ed->v2;
				ml[j++].e = numEdges * 2 + old_vert_arr[ed->v2];

				ml[j].v = ed->v2 + numVerts;
				ml[j++].e = eidx + numEdges;

				ml[j].v = ed->v1 + numVerts;
				ml[j++].e = numEdges * 2 + old_vert_arr[ed->v1];
			}
			else {
				ml[j].v = ed->v2;
				ml[j++].e = eidx;

				ml[j].v = ed->v1;
				ml[j++].e = numEdges * 2 + old_vert_arr[ed->v1];

				ml[j].v = ed->v1 + numVerts;
				ml[j++].e = eidx + numEdges;

				ml[j].v = ed->v2 + numVerts;
				ml[j++].e = numEdges * 2 + old_vert_arr[ed->v2];
			}

			origindex_edge[ml[j - 3].e] = ORIGINDEX_NONE;
			origindex_edge[ml[j - 1].e] = ORIGINDEX_NONE;

			/* use the next material index if option enabled */
			if (mat_ofs_rim) {
				mp->mat_nr += mat_ofs_rim;
				CLAMP(mp->mat_nr, 0, mat_nr_max);
			}
			if (crease_outer) {
				/* crease += crease_outer; without wrapping */
				unsigned char *cr = (unsigned char *)&(ed->crease);
				int tcr = *cr + crease_outer;
				*cr = tcr > 255 ? 255 : tcr;
			}

			if (crease_inner) {
				/* crease += crease_inner; without wrapping */
				unsigned char *cr = (unsigned char *)&(medge[numEdges + eidx].crease);
				int tcr = *cr + crease_inner;
				*cr = tcr > 255 ? 255 : tcr;
			}

#ifdef SOLIDIFY_SIDE_NORMALS
			if (do_side_normals) {
				normal_quad_v3(nor,
				               mvert[ml[j - 4].v].co,
				               mvert[ml[j - 3].v].co,
				               mvert[ml[j - 2].v].co,
				               mvert[ml[j - 1].v].co);

				add_v3_v3(edge_vert_nos[ed->v1], nor);
				add_v3_v3(edge_vert_nos[ed->v2], nor);
			}
#endif
		}

#ifdef SOLIDIFY_SIDE_NORMALS
		if (do_side_normals) {
			ed = medge + (numEdges * 2);
			for (i = 0; i < newEdges; i++, ed++) {
				float nor_cpy[3];
				short *nor_short;
				int k;

				/* note, only the first vertex (lower half of the index) is calculated */
				normalize_v3_v3(nor_cpy, edge_vert_nos[ed->v1]);

				for (k = 0; k < 2; k++) { /* loop over both verts of the edge */
					nor_short = mvert[*(&ed->v1 + k)].no;
					normal_short_to_float_v3(nor, nor_short);
					add_v3_v3(nor, nor_cpy);
					normalize_v3(nor);
					normal_float_to_short_v3(nor_short, nor);
				}
			}

			MEM_freeN(edge_vert_nos);
		}
#endif

		MEM_freeN(new_vert_arr);
		MEM_freeN(new_edge_arr);

		MEM_freeN(edge_users);
		MEM_freeN(edge_order);
	}

	STACK_FREE(new_vert_arr);
	STACK_FREE(new_edge_arr);

	if (old_vert_arr)
		MEM_freeN(old_vert_arr);

	if (face_nors)
		MEM_freeN(face_nors);

	if (numFaces == 0 && numEdges != 0) {
		modifier_setError(md, "Faces needed for useful output");
	}

	return result;
}